# Modelling

### Pipeline
- import the data
- replace null values
- separate categorical and numerical
- convert target to binary
- run PCA on numerical
- one hot encoding on categorical
- train test separation


### Imports

In [None]:
%load_ext autoreload

%autoreload 2

import os,sys,inspect
# Resolve the directory of the currently executing frame and put its parent
# directory at the front of the module search path.
# NOTE(review): presumably this is what lets the project-local `etl` package
# imported below resolve when the notebook lives in a subfolder - confirm.
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir) 

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

from sklearn.ensemble import RandomForestClassifier
import pickle

import numpy as np


from etl.null_value_replacer import NullValueReplacer
import math


### Load Data

In [None]:
# Load the raw training set into a DataFrame (path is relative to the
# notebook's working directory).
train_data = pd.read_csv("../data/loan-default-prediction/train_v2.csv")

In [None]:
# NullValueReplacer is a project-local transformer (etl.null_value_replacer).
# NOTE(review): presumably "median" fills missing values with a per-column
# median - confirm against the etl module.
null_value_replacer = NullValueReplacer("median")

# Fit on the training frame and replace it with the transformed result.
train_data = null_value_replacer.fit_transform(train_data)

In [None]:
# Collect the names of all object-dtype columns; these are treated as the
# categorical variables of the data set.
df_data_types = train_data.dtypes
cat_var = [
    column
    for column, dtype in dict(df_data_types).items()
    if dtype == 'object'
]

In [None]:
# Remove the columns listed in cat_var from train_data, mutating the frame
# in place (no new DataFrame is bound).
train_data.drop(columns=cat_var, inplace=True)

In [None]:
def get_columns_with_distinct_values(df, column_subset):
    """Return the columns of ``column_subset`` that do not duplicate an earlier one.

    Columns are visited in the order given. The first column of every group
    of identical columns is kept; later columns whose values match it row by
    row are treated as redundant and left out.

    Two columns count as identical only when every row matches (two missing
    values in the same row also count as a match). A zero *sum* of
    differences is deliberately not used as the criterion: differences can
    cancel out (``[1, 2, 3]`` vs ``[3, 2, 1]``) without the columns being
    equal.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to compare.
    column_subset : sequence of column labels
        Labels of the columns to examine (list or array).

    Returns
    -------
    list
        Labels of the retained columns, in their original order.
    """
    kept_columns = []
    # Set membership keeps the "already known duplicate" check O(1).
    redundant_columns = set()

    for i, col1 in enumerate(column_subset):
        if col1 in redundant_columns:
            continue
        kept_columns.append(col1)

        reference = df[col1]
        reference_missing = reference.isna()

        # Only later columns need checking; earlier ones were already
        # compared against this one when they were the reference.
        for col2 in column_subset[i + 1:]:
            if col2 == col1 or col2 in redundant_columns:
                continue
            candidate = df[col2]
            # NaN != NaN, so rows missing in both columns are matched explicitly.
            same_rows = (reference == candidate) | (reference_missing & candidate.isna())
            if same_rows.all():
                redundant_columns.add(col2)

    return kept_columns

# Keep one representative of every group of duplicate columns in the
# training frame.
columns_to_use = get_columns_with_distinct_values(
    df=train_data,
    column_subset=train_data.columns.values,
)

In [None]:
# Display how many columns remain after removing duplicates.
len(columns_to_use)

In [None]:
def resample_and_split(df, ratio=0.7, n_partitions=9, random_state=None):
    """Split ``df`` into several balanced training partitions and one test set.

    Rows are divided by the ``"loss"`` column into "lossless" rows
    (``loss == 0``) and "loss" rows (``loss > 0``); rows matching neither
    condition (negative or missing loss) are left out. Each class is shuffled
    and split separately, so ``ratio`` of each class goes to training and the
    rest to the test set.

    The lossless training rows are cut into ``n_partitions`` disjoint,
    equally sized slices. Every training partition is one such slice combined
    with *all* loss training rows, so the loss class is repeated in every
    partition. Lossless training rows left over after the integer division
    are not used.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; must contain a ``"loss"`` column.
    ratio : float, default 0.7
        Fraction of each class assigned to training.
    n_partitions : int, default 9
        Number of training partitions to build.
    random_state : int, numpy.random.RandomState or None, default None
        Seed (or generator) for all shuffles. ``None`` uses NumPy's global
        random state, so results then depend on ``np.random.seed``.

    Returns
    -------
    tuple[list[pandas.DataFrame], pandas.DataFrame]
        ``(train_partitions, test_data)``; every frame is shuffled and has a
        fresh ``0..n-1`` index.

    Raises
    ------
    ValueError
        If ``n_partitions`` is smaller than 1.
    """
    if n_partitions < 1:
        raise ValueError("n_partitions must be at least 1")

    if random_state is None:
        rng = None
        permute = np.random.permutation
    else:
        if isinstance(random_state, np.random.RandomState):
            rng = random_state
        else:
            rng = np.random.RandomState(random_state)
        permute = rng.permutation

    def _shuffled(frames):
        # Concatenate, shuffle the rows and discard the original index labels.
        combined = pd.concat(frames)
        return combined.sample(frac=1, random_state=rng).reset_index(drop=True)

    lossless_data = df[df["loss"] == 0]
    lossless_indices = permute(lossless_data.index.values)
    lossless_split = math.floor(len(lossless_indices) * ratio)

    loss_data = df[df["loss"] > 0]
    loss_indices = permute(loss_data.index.values)
    loss_split = math.floor(len(loss_indices) * ratio)

    # Everything past the split point of each class is held out for testing.
    test_data = _shuffled(
        [
            lossless_data.loc[lossless_indices[lossless_split:]],
            loss_data.loc[loss_indices[loss_split:]],
        ]
    )

    loss_train_data = loss_data.loc[loss_indices[:loss_split]]

    # Integer division: any remainder of lossless training rows is unused.
    partition_size = lossless_split // n_partitions
    train_partitions = []
    for i in range(n_partitions):
        chunk = lossless_indices[i * partition_size:(i + 1) * partition_size]
        train_partitions.append(
            _shuffled([lossless_data.loc[chunk], loss_train_data])
        )

    return train_partitions, test_data

# Build the training partitions and the hold-out test set from the
# de-duplicated columns only.
train_all, test_all = resample_and_split(df=train_data[columns_to_use])
    

### Random Forest Classifier

In [None]:
def train_stack_of_classifiers(list_of_df):
    """Fit one random forest per training frame and return them in order.

    For every frame in ``list_of_df`` the ``"id"`` and ``"loss"`` columns are
    removed to form the features, and ``"loss"`` is turned into a 0/1 label
    (non-zero loss becomes 1). Each forest uses 50 trees, the Gini criterion
    and ``min_samples_split=10``.

    Parameters
    ----------
    list_of_df : iterable of pandas.DataFrame
        Training frames, each containing ``"id"`` and ``"loss"`` columns.

    Returns
    -------
    list
        The fitted ``RandomForestClassifier`` objects, one per input frame.
    """
    fitted_models = []

    for partition in list_of_df:
        features = partition.drop(columns=["id", "loss"])
        labels = partition["loss"].astype("bool").astype("int").values.reshape(-1)

        model = RandomForestClassifier(
            n_estimators=50,
            criterion="gini",
            min_samples_split=10,
            verbose=1,
        )
        model.fit(features, y=labels)
        fitted_models.append(model)

    return fitted_models

# One fitted classifier per training partition.
trained_classifiers = train_stack_of_classifiers(list_of_df=train_all)
        
        

In [None]:
# Test features: everything except the row identifier and the target.
X_test = test_all.drop(["id", "loss"], axis="columns")

In [None]:
# Binary test target: casting to bool first maps every non-zero loss to True,
# the second cast turns that into 0/1 integers.
y_test = test_all["loss"].astype("bool").astype("int")

In [None]:
# Class predictions on the test features, one result per trained classifier.
predictions = [classifier.predict(X_test) for classifier in trained_classifiers]

In [None]:
# Each entry of `predictions` becomes one row of the frame, so summing down
# the rows (axis=0) and dividing by the number of entries gives, per test
# sample, the average of the individual predictions.
# DataFrame.sum is used directly because passing the builtin `sum` to
# DataFrame.agg is deprecated in recent pandas releases.
joined_prob = pd.DataFrame(data=predictions).sum(axis=0) / len(predictions)

In [None]:
# Display the averaged predictions rounded to the nearest integer.
np.around(joined_prob)

In [None]:
# Subtracting 0.15 before rounding raises the cut-off for predicting class 1:
# assuming joined_prob lies in [0, 1], the rounded value is 1 only when
# joined_prob is above roughly 0.65 instead of 0.5.
# precision_recall_fscore_support returns (precision, recall, fscore, support).
pre_recall= precision_recall_fscore_support(y_test, np.around(joined_prob-0.15))

In [None]:
# Display the precision / recall / f-score / support result computed above.
pre_recall