# Modelling

### Imports

In [None]:
%load_ext autoreload

%autoreload 2

import os,sys,inspect
# Put the parent of the notebook's working directory on sys.path so the
# project-local `etl` package can be imported.
# os.getcwd() is used rather than inspect.getfile(inspect.currentframe()):
# recent ipykernel versions compile each cell under a temporary file name, so
# the frame's file no longer resolves to the notebook directory.
# Assumes the kernel was started in the notebook's own directory (Jupyter default).
currentdir = os.getcwd()
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import precision_recall_fscore_support
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from catboost import CatBoostRegressor, CatBoostClassifier
import pickle

from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras import regularizers

from etl.null_value_replacer import NullValueReplacer


### Load Data

In [None]:
# Read the training CSV into a DataFrame; later cells use its "loss" column as the target.
train_data = pd.read_csv("../data/loan-default-prediction/train_v2.csv")

In [None]:
# Load the pickled collection of column names used below to select the features.
# The context manager closes the file even if unpickling raises.
# NOTE: pickle.load can execute arbitrary code from the file — only load
# artefacts produced by this project.
with open('../data/columns_to_consider', 'rb') as dbfile:
    columns_to_keep = pickle.load(dbfile)

In [None]:
# Project-local imputer configured with the "mean" strategy
# (presumably per-column mean imputation — confirm in etl/null_value_replacer).
null_value_replacer = NullValueReplacer("mean")

# NOTE: this rebinds train_data, so the raw frame is no longer available after this cell.
train_data = null_value_replacer.fit_transform(train_data)

In [None]:
# Feature matrix: only the columns listed in columns_to_keep.
X = train_data[columns_to_keep]
# Binary target: 1 where "loss" is non-zero, 0 otherwise (cast to bool, then to int).
y = train_data["loss"].astype("bool").astype("int")

### Logistic Regression Classifier

In [None]:
# Split BEFORE scaling so the scaler's min/max are learned from the training
# rows only; fitting on the full matrix would leak test-set statistics.
# The split depends only on the row count and random_state, so the row
# partition is the same as splitting the scaled matrix.
X_train_unscaled, X_test_unscaled, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

min_max_scaler = MinMaxScaler()
X_train_scaled = min_max_scaler.fit_transform(X_train_unscaled)
# Test rows are transformed with the training min/max, so values may fall
# slightly outside [0, 1].
X_test_scaled = min_max_scaler.transform(X_test_unscaled)

# Full matrix under the same (train-fitted) scaling; kept because a later cell
# inspects X_scaled.shape.
X_scaled = min_max_scaler.transform(X)


In [None]:
# Sanity check: (rows, features) of the scaled feature matrix.
X_scaled.shape

In [None]:
# Logistic regression with the library's default hyper-parameters.
logistic_regression = LogisticRegression()

In [None]:
# Fit on the scaled training split.
logistic_regression.fit(X_train_scaled, y_train)

In [None]:
# Hard class predictions (0/1) for the scaled test split.
prediction_logistic = logistic_regression.predict(X_test_scaled)

In [None]:
# Precision, recall, F-score and support, reported per class (no averaging is requested).
pre_recall_logictic= precision_recall_fscore_support(y_test.values, prediction_logistic)
# Display the metrics tuple as the cell output.
pre_recall_logictic

In [None]:
# Predictions are 0/1, so the sum is the number of test rows predicted as class 1.
prediction_logistic.sum()

### Dense NN

TODO: compute the correlation between the target column and the source (feature) columns.

In [None]:
# Build the class-rebalanced frame here so this cell runs top-to-bottom on a
# fresh kernel: all rows with a loss plus twice as many randomly drawn rows
# without one. The same construction appears again in the CatBoost section.
no_loss_data = train_data[train_data["loss"] == 0]
loss_data = train_data[train_data["loss"] != 0]
# Fixed seed so the drawn subset is reproducible across runs.
resampled_train_data = pd.concat(
    [no_loss_data.sample(n=loss_data.shape[0] * 2, random_state=42), loss_data]
)

# Features and binary target (1 where "loss" is non-zero) of the rebalanced frame.
X_resamples = resampled_train_data[columns_to_keep]
Y_resampled = resampled_train_data["loss"].astype("bool").astype("int")

In [None]:
# Separate scaler instance for the rebalanced data, independent of min_max_scaler.
min_max_scaler_on_resampled= MinMaxScaler()

In [None]:
# Scale the rebalanced feature frame. The input is `X_resamples` (the name the
# frame is actually bound to); reading `X_resampled` here raised NameError.
X_resampled = min_max_scaler_on_resampled.fit_transform(X_resamples)

In [None]:
# Empty sequential model; layers are appended in the next cell.
dense_nn_model=Sequential()

In [None]:
# Network layout: two 64-unit ReLU hidden layers, each with an L1 kernel penalty
# of 0.01 (L2 disabled), followed by one sigmoid output unit.
# Layers are constructed and appended in the listed order.
for _layer in (
    Dense(
        64,
        input_dim=X_train_scaled.shape[1],  # one input per scaled feature column
        kernel_regularizer=regularizers.l1_l2(l1=0.01, l2=0.0),
    ),
    Activation("relu"),
    Dense(64, kernel_regularizer=regularizers.l1_l2(l1=0.01, l2=0.0)),
    Activation("relu"),
    Dense(1),
    Activation("sigmoid"),
):
    dense_nn_model.add(_layer)

In [None]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked during training.
dense_nn_model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [None]:
# Train on the scaled training split (not the rebalanced data) for 100 epochs, batches of 32.
dense_nn_model.fit(X_train_scaled, y_train, epochs=100, batch_size=32)

In [None]:
# Number of test rows predicted as class 1.
# Sequential.predict_classes has been removed from Keras; for a single sigmoid
# output it was equivalent to thresholding predict() at 0.5, reproduced here.
(dense_nn_model.predict(X_test_scaled) > 0.5).astype("int32").sum()

In [None]:
# (rows, features) of the scaled test split, for comparison with the positive count above.
X_test_scaled.shape

### Catboost Classifier

In [None]:
# Rebalance the classes: keep every row with a loss and draw twice as many rows
# without one.
no_loss_data = train_data[train_data["loss"] == 0]
loss_data = train_data[train_data["loss"] != 0]

# Cap the draw at the number of available no-loss rows (sample() raises if n
# exceeds the population), and fix the seed so the subset — and every metric
# computed from it — is reproducible across runs.
n_no_loss_rows = min(len(no_loss_data), loss_data.shape[0] * 2)
resampled_train_data = pd.concat(
    [no_loss_data.sample(n=n_no_loss_rows, random_state=42), loss_data]
)

In [None]:
# Features and binary target (1 where "loss" is non-zero) of the rebalanced frame.
X_resamples = resampled_train_data[columns_to_keep]
Y_resampled = resampled_train_data["loss"].astype("bool").astype("int")

# 67/33 train/test split of the rebalanced data with a fixed seed.
X_train_resampled, X_test_resampled, y_train_resampled, y_test_resampled = train_test_split(X_resamples, Y_resampled, test_size=0.33, random_state=42)

In [None]:
# Inspect one of the columns that is passed to CatBoost as a categorical feature below.
X_train_resampled['f776']

In [None]:
    # CatBoost classifier limited to 100 boosting iterations; the listed columns
    # are declared as categorical features.
    cat_boost_classifier = CatBoostClassifier(
        iterations=100,
        cat_features=[
            "f776",
            "f777",
            "f725",
            "f2",
            "f5",
            "f73",
            "f403",
        ],
    )

In [None]:
# Fit on the rebalanced training split; reshape(-1) flattens the targets to 1-D arrays.
# The held-out rebalanced split is passed as the evaluation set, and plot=True
# requests CatBoost's training plot.
cat_boost_classifier.fit(
    X_train_resampled,
    y=y_train_resampled.values.reshape(-1),
    plot=True,
    eval_set=(X_test_resampled, y_test_resampled.values.reshape(-1))
)

In [None]:
# Class predictions for the rebalanced test split.
cat_boost_predictions = cat_boost_classifier.predict(X_test_resampled)

In [None]:
# Sum of the predicted labels; with 0/1 labels this is the number of rows predicted as class 1.
cat_boost_predictions.sum()

In [None]:
# Precision, recall, F-score and support, per class, on the rebalanced test split.
pre_recall_catboost= precision_recall_fscore_support(y_test_resampled, cat_boost_predictions)

In [None]:
# Display the CatBoost classifier metrics.
pre_recall_catboost

### Random Forest classifiers

In [None]:
# 500 entropy-criterion trees; a node needs at least 10 samples to be split.
# random_state is fixed (same seed as the train/test splits) so the bootstrap
# samples and feature draws — and therefore the metrics below — are
# reproducible across runs.
random_forest_classifier = RandomForestClassifier(
    n_estimators=500,
    criterion="entropy",
    min_samples_split=10,
    verbose=1,
    random_state=42,
)

In [None]:
# Split the unscaled features with the same test_size and seed as the scaled split above.
# NOTE: this rebinds y_train / y_test.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
# Fit the forest on the unscaled training split.
random_forest_classifier.fit(X_train, y_train)

In [None]:
# Class predictions (0/1) for the unscaled test split.
random_forest_predictions = random_forest_classifier.predict(X_test)

In [None]:
# Predictions are 0/1, so the sum is the number of test rows predicted as class 1.
random_forest_predictions.sum()

In [None]:
# Precision, recall, F-score and support, per class, for the random forest.
pre_recall_forest= precision_recall_fscore_support(y_test.values, random_forest_predictions)

In [None]:
# Display the random forest metrics.
pre_recall_forest

### CatBoost Regressor

In [None]:
# CatBoost regressor limited to 100 boosting iterations.
cat_boost_regressor = CatBoostRegressor(iterations=100)

In [None]:
# Fit on the unscaled split, with the test split as the evaluation set.
# NOTE(review): y_train / y_test here are the 0/1 labels derived from "loss",
# not the raw loss amounts — confirm a regressor on the binary target is intended.
cat_boost_regressor.fit(X_train, y=y_train, plot=True, eval_set=(X_test, y_test))

In [None]:
# Predict for the test row at position 17. X_test is a DataFrame, so X_test[17]
# would look up a *column* labelled 17 (KeyError); iloc selects by position,
# matching y_test.values[17] below. The double brackets keep a one-row frame.
cat_boost_regressor.predict(X_test.iloc[[17]])

In [None]:
# Show the features of the test row at position 17. X_test is a DataFrame, so
# plain X_test[17] is a column lookup by label; iloc selects the row by position.
X_test.iloc[17]

In [None]:
# Target value for the test row at position 17 (positional index into the underlying array).
y_test.values[17]

### Dense NN regressor