# Modelling

### Pipeline
- Load the data
- Replace null values
- Separate categorical and numerical columns
- Remove categorical and numerical columns with very high entropy
- Convert the target to binary
- Run PCA on the numerical columns
- One-hot encode the categorical columns
- Train/test split
- Grid search for the dense_nn configuration

### Imports

In [None]:
%load_ext autoreload

%autoreload 2

import os,sys,inspect
# Resolve the directory of the file backing the current frame, then derive its
# parent and grandparent directories.
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
grandparentdir = os.path.dirname(parentdir)
# Put both ancestors at the front of sys.path; the grandparent is inserted last
# and therefore takes precedence. Presumably this is what makes the project-local
# `src` package importable - confirm against the repository layout.
sys.path.insert(0,parentdir)
sys.path.insert(0,grandparentdir) 

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import precision_recall_fscore_support
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

import pickle

from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Lambda
from keras import regularizers
from keras.losses import mean_absolute_error

from keras.wrappers.scikit_learn import KerasClassifier


import math
import numpy as np

from src.data.etl.redundant_columns_remover import RedundantColumnsRemover



In [None]:
from src.data.etl.combined_transformer import CombinedTransformer

### Load Data

In [None]:
os.getcwd()

In [None]:
train_data = pd.read_csv("../../data/loan-default-prediction/train_v2.csv")

In [None]:
# Pull the identifier and the target out of the feature frame. `pop` returns the
# column and removes it from `train_data` in place, leaving only feature columns.
train_id = train_data.pop("id")
train_loss = train_data.pop("loss")

In [None]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split reproducible.
train_data, test_data, train_loss, test_loss = train_test_split(
    train_data,
    train_loss,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Preprocessing chain; the steps run in the order listed.
etl_pipeline = Pipeline(
    steps=[
        ("redundant_columns_remover", RedundantColumnsRemover()),
        # NOTE(review): the list is presumably the set of categorical column
        # names - confirm against CombinedTransformer.
        ("combined_transformer", CombinedTransformer(["f776", "f777", "f725"])),
        ("pca", PCA(svd_solver="full")),
        ("standard_scaler", StandardScaler()),
    ]
)
     

In [None]:
# Fit every pipeline step on the training split and transform it in one pass.
X_train = etl_pipeline.fit_transform(train_data)

In [None]:
# Apply the already-fitted pipeline to the held-out split (transform only, no re-fitting).
X_test = etl_pipeline.transform(test_data)

In [None]:
# Binary classification target: 1 where the loss is non-zero, 0 where it is zero.
y_train_class = train_loss.astype("bool").astype("int")

In [None]:
# Same binarisation for the held-out split: 1 where the loss is non-zero, else 0.
y_test_class = test_loss.astype("bool").astype("int")

## Classification

### Logistic Regression Classifier

In [None]:
# Baseline classifier with default hyper-parameters (`fit` returns the estimator itself).
logistic_regression = LogisticRegression().fit(X_train, y_train_class)
# Precision / recall / F-score / support on the held-out split.
precision_recall_fscore_support(
    y_true=y_test_class.values,
    y_pred=logistic_regression.predict(X_test),
)

### NN Classifier

TODO: compute the correlation between the target and the source (feature) columns.

In [None]:
def get_classification_model(input_dim=None):
    """Build and compile the dense binary classifier.

    Architecture: Dense(256) -> tanh -> Dense(128) -> tanh -> Dense(1) -> sigmoid.
    Both hidden layers carry an L1 + L2 kernel regulariser. The model is compiled
    with the Adam optimiser, binary cross-entropy loss and the accuracy metric.

    Args:
        input_dim: Number of input features. When omitted, the width of the
            notebook-level ``X_train`` is used, so zero-argument calls behave
            exactly as before.

    Returns:
        The compiled ``Sequential`` model.
    """
    if input_dim is None:
        # Backward-compatible default: size the input layer from the training matrix.
        input_dim = X_train.shape[1]

    model = Sequential()

    model.add(Dense(256, input_dim=input_dim, kernel_regularizer=regularizers.l1_l2(l1=0.005, l2=0.001)))
    model.add(Activation("tanh"))
    model.add(Dense(128, kernel_regularizer=regularizers.l1_l2(l1=0.005, l2=0.001)))
    model.add(Activation("tanh"))
    # Single sigmoid unit: output in (0, 1) for the binary target.
    model.add(Dense(1))
    model.add(Activation("sigmoid"))

    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model


classification_model = get_classification_model()

In [None]:
# Train the classifier. `class_weight` makes samples of class 1 count four times
# as much as class 0 in the loss.
# NOTE(review): the held-out split is also passed as validation data, so the
# validation metrics are not independent of the later test evaluation.
classification_model.fit(X_train, y_train_class, epochs=1000, batch_size=4096, validation_data=(X_test, y_test_class), class_weight={0:1., 1: 4.})

l1=0005
adagrad
4.0
66/74, 84/98

In [None]:
precision_recall_fscore_support(y_test_class.values, classification_model.predict_classes(X_test))

In [None]:
precision_recall_fscore_support(y_train_class.values, classification_model.predict_classes(X_train))

In [None]:
 classification_model.predict_proba(X_test)

## Regressor

In [None]:
foo = pd.DataFrame(data=train_loss.values, columns=["loss"])

In [None]:
# Feature rows whose loss is positive. The rows are taken from the transformed
# matrix `X_train`: wrapping it in a DataFrame gives a 0..n-1 index that lines up
# with the positional index of `foo`. The raw `train_data` frame keeps the
# shuffled labels from the train/test split, so positional indices are not valid
# labels for it, and it is not in the feature space the models are trained on.
X_r = pd.DataFrame(data=X_train).loc[foo[foo["loss"] > 0].index.values]

In [None]:
y_r = foo.loc[foo[foo["loss"] > 0].index.values]

In [None]:
y_r

In [None]:
X_train_r, X_test_r,y_train_r, y_test_r =  train_test_split(X_r, y_r, test_size=0.33, random_state=42)

In [None]:
def get_regression_model(input_dim=None):
    """Build and compile the dense regressor for the loss amount.

    Architecture: two blocks of Dense(1024) -> Dropout(0.2) -> tanh, followed by
    a single linear output unit. Both hidden layers carry a small L1 kernel
    regulariser (the L2 term is 0). The model is compiled with the Adagrad
    optimiser and mean absolute error as the loss.

    Args:
        input_dim: Number of input features. When omitted, the width of the
            notebook-level ``X_train_r`` is used, so zero-argument calls behave
            exactly as before.

    Returns:
        The compiled ``Sequential`` model.
    """
    if input_dim is None:
        # Backward-compatible default: size the input layer from the regression training matrix.
        input_dim = X_train_r.shape[1]

    model = Sequential()

    model.add(Dense(1024, input_dim=input_dim, kernel_regularizer=regularizers.l1_l2(l1=0.00001, l2=0.0)))
    model.add(Dropout(rate=0.2))
    model.add(Activation("tanh"))
    model.add(Dense(1024, kernel_regularizer=regularizers.l1_l2(l1=0.00001, l2=0.0)))
    model.add(Dropout(rate=0.2))
    model.add(Activation("tanh"))
    # Linear output unit: unbounded real-valued prediction.
    model.add(Dense(1))

    model.compile(optimizer='adagrad',
                  loss='mean_absolute_error')
    return model


regression_model = get_regression_model()

In [None]:
regression_model.fit(X_train_r, y_train_r, epochs=200, batch_size=1024, validation_data=(X_test_r, y_test_r))

In [None]:
# Flatten the network output to 1-D. `-1` lets NumPy infer the length, so the
# cell keeps working when the number of rows in the split changes.
regression_prediction = regression_model.predict(X_test_r).reshape(-1)

In [None]:
np.round(regression_prediction)[: 20]

In [None]:
y_test_r.head(20)

In [None]:
np.abs(regression_prediction - y_test_r["loss"].values).sum()/y_test_r.shape[0]

### Combined prediction

In [None]:
class CombinedModel():
    """Two-stage loss predictor: classify first, then estimate the loss amount.

    Rows the classifier labels 0 get a predicted loss of 0. Rows it labels 1 get
    the regression model's output, or a fixed value when ``constant_loss`` is set.
    """

    def __init__(self, classification_model, regression_model, constant_loss=None):
        """Store the two stage models.

        Args:
            classification_model: Object exposing ``predict_classes(X)`` that
                returns 0/1 labels, one per row.
            regression_model: Object exposing ``predict(X)`` that returns one
                value per row.
            constant_loss: Optional fixed loss assigned to every predicted
                default instead of the regression output (a simple baseline).
                ``None`` (the default) uses the regression model.
        """
        self.classification_model = classification_model
        self.regression_model = regression_model
        self.constant_loss = constant_loss

    def predict(self, X):
        """Return a 1-D float array with one predicted loss per row of ``X``.

        ``X`` must support integer-array row indexing (e.g. a NumPy array).
        """
        predicted_classes = self.classification_model.predict_classes(X)
        # First element of np.where: row positions, for both (n,) and (n, 1) label arrays.
        default_rows = np.where(predicted_classes == 1)[0]

        result = np.zeros(X.shape[0])
        if default_rows.size == 0:
            # No predicted defaults: skip the regressor rather than feed it an empty batch.
            return result

        if self.constant_loss is not None:
            result[default_rows] = self.constant_loss
        else:
            predicted_loss = self.regression_model.predict(X[default_rows])
            # Flatten (n, 1) regressor output so it fits the 1-D result vector.
            result[default_rows] = np.asarray(predicted_loss).reshape(-1)
        return result

In [None]:
combined_model = CombinedModel(classification_model, regression_model)


In [None]:
# Predict on the transformed training matrix: the classifier was fitted on
# `X_train`, and `CombinedModel.predict` indexes rows positionally, which the raw
# `train_data` DataFrame does not support.
combined_prediction = combined_model.predict(X_train)

In [None]:
combined_prediction

In [None]:
np.abs(combined_prediction - train_loss.values).sum()/combined_prediction.shape[0]


### Submission to Kaggle

In [None]:
#test_data = pd.read_csv("../data/loan-default-prediction/test_v2.csv")

In [None]:
#test_ids = test_data["id"]

In [None]:
#test_data.drop(columns=["id"], inplace=True)

In [None]:
# Run the held-out data through the fitted ETL pipeline. The pipeline bundles the
# column removal, combined transform, PCA and scaling steps, so no stand-alone
# transformer objects are needed here.
test_data = etl_pipeline.transform(test_data)

In [None]:
predictions = combined_model.predict(test_data)

In [None]:
# Ensure a flat 1-D vector; `-1` infers the length instead of hard-coding the row count.
predictions = predictions.reshape(-1)

In [None]:
predictions[:30]

In [None]:
test_loss.values[:30]

In [None]:
np.abs(predictions - test_loss.values).sum()/predictions.shape[0]


In [None]:
pd.DataFrame(data={"loss":predictions}, columns=["loss"])

In [None]:
to_submit = pd.concat([test_ids, pd.DataFrame(data={"loss":predictions}, columns=["loss"])], axis=1)

In [None]:
to_submit.to_csv("../data/to_submit_3.csv", columns=["id", "loss"], index=False)

In [None]:
to_submit

### Baselines for predictions

* [ ] do a baseline with mean absolute error against 0
* [ ] do a baseline with mae against existing loss distribution

In [None]:
train_loss.mean() # Equals the MAE of an all-zero prediction, provided no loss value is negative

In [None]:
def calculate_mean_abs_error(predictions, actuals):
    """Return the mean absolute error between ``predictions`` and ``actuals``.

    Both inputs are converted to NumPy arrays before subtracting, so values are
    compared position by position. pandas index labels are ignored, which avoids
    label alignment producing NaNs when the two inputs carry different indices.
    Plain Python sequences are accepted as well.

    Args:
        predictions: Array-like of predicted values.
        actuals: Array-like of true values, same length as ``predictions``.

    Returns:
        The mean of ``|predictions - actuals|``.
    """
    absolute_errors = np.abs(np.asarray(predictions) - np.asarray(actuals))
    return absolute_errors.mean()

In [None]:
# Constant-prediction baseline: every row is predicted as the actual mean training
# loss, computed from the data rather than typed in as a rounded constant.
calculate_mean_abs_error(np.full(train_loss.shape, train_loss.mean()), train_loss) # This is loss against mean value

In [None]:
# Empirical loss distribution: each distinct loss value with its relative
# frequency, sorted by value. Computed once and reused for both values and weights.
loss_distribution = train_loss.value_counts(normalize=True).sort_index()

# Draw one random loss per training row, following the empirical frequencies.
random_dist = np.random.choice(
    loss_distribution.index.values,
    train_loss.shape[0],
    p=loss_distribution.values)

In [None]:
calculate_mean_abs_error(random_dist, train_loss) # This is loss for random distribution with same prob values

In [None]:
train_loss[train_loss> 0].mean()