# Import Libraries

In [1]:
#Importing Libraries

import warnings
warnings.filterwarnings("ignore")
import pandas as pd #pandas to create small dataframes 
import json         #json library would be using to parse JSON Columns
from pandas.io.json import json_normalize #Library to normalize semi-structured JSON data into a flat table.
import os           #Library to use system level variable.
import matplotlib.pylab as plt #Plotting
from matplotlib import pyplot
import numpy as np  #Do aritmetic operations on arrays
import plotly.graph_objects as go #Graphing library. 
import gc           #Garbage Collector interface
gc.enable() #Enable automatic garbage collection.
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV #Hypertune parameters for model
from datetime import datetime, timedelta #The datetime module supplies classes for manipulating dates and times.
from sklearn import preprocessing #Will use this library to label encode categorical features.
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer, confusion_matrix, fbeta_score, classification_report 
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import tree
import lightgbm as lgb
from xgboost import XGBClassifier
import plotly
import plotly.graph_objs as go
from plotly.offline import *
from scipy import stats
import statsmodels.api as sms

# Show all columns
pd.options.display.max_columns = None

## Import csv

In [2]:
df_model = pd.read_csv("df_model.csv")

## X & Y

In [3]:
y_class = df_model.pop("y_class")
y_reg = df_model.pop('y_reg')
X = df_model

## Train/Test-Split

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X,y_class,random_state=12)

# Classification

## Baseline model

In [5]:
# baseline model all zero
y_baseline = np.zeros(len(y_test))
#y_baseline = y_test

In [6]:
confusion_matrix(y_test,y_baseline)
# TN, FP
# FN, TP

array([[94264,     0],
       [   33,     0]])

In [7]:
# Alles Null
print(classification_report(y_test,y_baseline))

              precision    recall  f1-score   support

       False       1.00      1.00      1.00     94264
        True       0.00      0.00      0.00        33

    accuracy                           1.00     94297
   macro avg       0.50      0.50      0.50     94297
weighted avg       1.00      1.00      1.00     94297



In [8]:
# accuracy baseline
(94341+0)/(94384)*100

99.95444143075098

## Random Forrest

In [9]:
# Create the model with 100 trees
model_rf = RandomForestClassifier(n_estimators=400, 
                               random_state=10, 
                               max_features = 'sqrt',
                               n_jobs=-1, 
                               verbose = 1)

# Fit on training data
model_rf.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   35.1s
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:  1.2min finished


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=400,
                       n_jobs=-1, oob_score=False, random_state=10, verbose=1,
                       warm_start=False)

In [10]:
y_pred = model_rf.predict(X_test)
confusion_matrix(y_test,y_pred)
# TN, FP
# FN, TP

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.8s
[Parallel(n_jobs=4)]: Done 400 out of 400 | elapsed:    1.5s finished


array([[94264,     0],
       [   31,     2]])

In [11]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

       False       1.00      1.00      1.00     94264
        True       1.00      0.06      0.11        33

    accuracy                           1.00     94297
   macro avg       1.00      0.53      0.56     94297
weighted avg       1.00      1.00      1.00     94297



### Random Forest Optimization through Random Search

In [13]:
# Hyperparameter grid
param_grid = {
    'n_estimators': np.linspace(250, 450, 10).astype(int),
    'max_depth': [None],
    'max_features': ['sqrt'],
    'max_leaf_nodes': [None]+ list(np.linspace(80, 100, 4).astype(int)),
    'min_samples_split': [1, 3, 5],
    'bootstrap': [True]
}

RSEED = 50

# Estimator for use in random search
estimator = RandomForestClassifier(random_state = RSEED)

# Create the random search model
rs = RandomizedSearchCV(estimator, param_grid, n_jobs = -1, 
                        scoring = 'roc_auc', cv = 3, 
                        n_iter = 10, verbose = 1, random_state=RSEED)

# Fit 
rs.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed: 10.4min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
               

In [14]:
# Best Params
rs.best_params_

{'n_estimators': 294,
 'min_samples_split': 3,
 'max_leaf_nodes': 86,
 'max_features': 'sqrt',
 'max_depth': None,
 'bootstrap': True}

### Use Best Model

In [15]:
best_model = rs.best_estimator_
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)
confusion_matrix(y_test,y_pred)
# TN, FP
# FN, TP

array([[94264,     0],
       [   31,     2]])

## xgboost

In [16]:
# fit model to x
model_xgb = XGBClassifier()
model_xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [17]:
y_pred = model_xgb.predict(X_test)
confusion_matrix(y_test,y_pred)
# TN, FP
# FN, TP

array([[94264,     0],
       [   31,     2]])

### xgboost Optimization through GridSearchCV

In [19]:
param_grid_xgb = {'xgbclassifier__max_depth':[2, 3],
                  'xgbclassifier__learning_rate':[0.1],
                  'xgbclassifier__n_estimators':[3, 50],
                  'xgbclassifier__booster':["gbtree"],
                  'xgbclassifier__gamma':[4]}

In [20]:
#f = 'gain'
grid_xgb = GridSearchCV(model_xgb, param_grid_xgb, return_train_score=False,n_jobs=3)
grid_xgb.fit(X_train, y_train)
print(grid_xgb.best_params_)

{'xgbclassifier__booster': 'gbtree', 'xgbclassifier__gamma': 4, 'xgbclassifier__learning_rate': 0.1, 'xgbclassifier__max_depth': 2, 'xgbclassifier__n_estimators': 3}


In [21]:
y_pred_GS=grid_xgb.predict(X_test)

In [22]:
confusion_matrix(y_test,y_pred_GS)

array([[94264,     0],
       [   31,     2]])

## SVC

In [26]:
model_svc = SVC()
model_svc.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [27]:
y_pred = model_svc.predict(X_test)

In [28]:
confusion_matrix(y_test,y_pred)

array([[94264,     0],
       [   33,     0]])

## KNN

In [29]:
model_knn = KNeighborsClassifier()
model_knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [30]:
y_pred = model_knn.predict(X_test)

In [31]:
confusion_matrix(y_test,y_pred)

array([[94264,     0],
       [   33,     0]])

## Logistic Regression

In [32]:
model_logit = LogisticRegression(max_iter=1000)
model_logit.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [33]:
y_pred = model_knn.predict(X_test)

In [34]:
confusion_matrix(y_test,y_pred)

array([[94264,     0],
       [   33,     0]])

## AdaBoost

In [35]:
model_ada = AdaBoostClassifier()
model_ada.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=None)

In [36]:
y_pred = model_knn.predict(X_test)

In [37]:
confusion_matrix(y_test,y_pred)

array([[94264,     0],
       [   33,     0]])

## LGBM

In [38]:
# metric = 'binary_logloss', 
model_lgb = lgb.LGBMClassifier(colsample_bytree = 0.8,
                               n_estimators = 222,
                               learning_rate = 0.05,
                               objective = 'binary',
                               min_child_samples = 20,
                               max_leaves = 250, 
                               boosting_type = 'gbdt', 
                               metric = 'binary_logloss', 
                               subsample = 0.7,
                               num_leaves = 16)
model_lgb.fit(X_train, y_train)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.8,
               importance_type='split', learning_rate=0.05, max_depth=-1,
               max_leaves=250, metric='binary_logloss', min_child_samples=20,
               min_child_weight=0.001, min_split_gain=0.0, n_estimators=222,
               n_jobs=-1, num_leaves=16, objective='binary', random_state=None,
               reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=0.7,
               subsample_for_bin=200000, subsample_freq=0)

In [39]:
y_pred = model_lgb.predict(X_test)
confusion_matrix(y_test,y_pred)

array([[94096,   168],
       [   31,     2]])

### RandomSearch

In [40]:
# Create parameters to search
gridParams = gridParams = gridParams = {
    'learning_rate': [0.35],    
    'n_estimators': [100,110],           
    'num_leaves': [16],           
    'boosting_type' : ['dart'],
    'reg_alpha' : [1],                    
    'reg_lambda' : [1],                   
    'objective' : ['binary'],              
    'metric' : ['binary_logloss'],        
    'colsample_bytree' : [0.8],     
    'subsample' : [0.7],             
    'max_leaves': [100,105],           
    'min_child_samples' : [18,20]            
     }

In [41]:
#Define LightGBM Classifier model
model_lgb_gs = lgb.LGBMClassifier()

In [42]:
#RandomizedSearchCV to hypertune the parameters
randCV = RandomizedSearchCV(model_lgb_gs, gridParams,
                    cv=3,
                    n_jobs=1)

# Fit 
randCV.fit(X_train, y_train)

RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=LGBMClassifier(boosting_type='gbdt',
                                            class_weight=None,
                                            colsample_bytree=1.0,
                                            importance_type='split',
                                            learning_rate=0.1, max_depth=-1,
                                            min_child_samples=20,
                                            min_child_weight=0.001,
                                            min_split_gain=0.0,
                                            n_estimators=100, n_jobs=-1,
                                            num_leaves=31, objective=None,
                                            random_state=None, reg_alpha=0.0,
                                            reg_lambda=0.0, sile...
                   param_distributions={'boosting_type': ['dart'],
                                        'colsample_bytree'

In [43]:
randCV.best_estimator_

LGBMClassifier(boosting_type='dart', class_weight=None, colsample_bytree=0.8,
               importance_type='split', learning_rate=0.35, max_depth=-1,
               max_leaves=100, metric='binary_logloss', min_child_samples=18,
               min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
               n_jobs=-1, num_leaves=16, objective='binary', random_state=None,
               reg_alpha=1, reg_lambda=1, silent=True, subsample=0.7,
               subsample_for_bin=200000, subsample_freq=0)

In [44]:
best_model_lgb = randCV.best_estimator_

In [45]:
y_pred = best_model_lgb.predict(X_test)
confusion_matrix(y_test,y_pred)

array([[94242,    22],
       [   31,     2]])

In [46]:
y_pred_sub = best_model_lgb.predict(X_test_sub)
y_pred_sub.sum()

NameError: name 'X_test_sub' is not defined

##  Decision Tree

In [None]:
model_DecTree = tree.DecisionTreeClassifier()
model_DecTree.fit(X_train, y_train)

In [None]:
y_pred = model_knn.predict(X_test)

In [None]:
confusion_matrix(y_test,y_pred)