In [11]:
!pip install vecstack



In [0]:
from vecstack import stacking
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score
from imblearn.over_sampling import SMOTE
from collections import Counter

import warnings
warnings.filterwarnings("ignore")

In [14]:
from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive

trainfile = r'/gdrive/My Drive/Assignments/Assignment3/RevisedHomesiteTrain.csv'
testfile = r'/gdrive/My Drive/Assignments/Assignment3/RevisedHomesiteTest.csv'

train_data = pd.read_csv(trainfile)
test_data = pd.read_csv(testfile)




Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive


In [15]:
#checking the data
print(train_data.shape)
print(test_data.shape)
print(train_data.head())
print(test_data.head())


(65000, 596)
(173836, 596)
   CoverageField11A  ...  QuoteConversion_Flag
0                 2  ...                     0
1                 5  ...                     0
2                 4  ...                     0
3                15  ...                     0
4                 4  ...                     0

[5 rows x 596 columns]
   CoverageField11A  CoverageField11B  ...  GeographicField64_TX  GeographicField64
0                13                22  ...                     0                 IL
1                 4                 5  ...                     0                 NJ
2                 3                 3  ...                     0                 NJ
3                 5                 9  ...                     0                 TX
4                12                21  ...                     0                 CA

[5 rows x 596 columns]


In [0]:
#splitting train_data
X_train = train_data.iloc[:,:-1]
Y_train = train_data["QuoteConversion_Flag"]
print(X_train.head())
print(Y_train.head())

#removing the last column of test data as it is already hot encoded
X_test = test_data.iloc[:,:-1]
print(X_test.head())

In [17]:
#Decision tree without SMOTE
dtc = DecisionTreeClassifier()
dtc = dtc.fit(X_train, Y_train)
dtc_cv_score = cross_val_score(dtc, X_train, Y_train, cv=10, scoring = "roc_auc" )
print("Mean AUC score for decision tree without SMOTE sampling technique:")
print(dtc_cv_score.mean())
resDF = pd.DataFrame(dtc.predict(X_test), columns= ['QuoteConversion_Flag'])
resultDTC = pd.concat([X_test['QuoteNumber'], resDF], axis = 1)
resultDTC.to_csv("/gdrive/My Drive/Assignments/Assignment3/resultDTC.csv", index = None)


Mean AUC score for decision tree without SMOTE sampling technique:
0.8135516116445698


In [18]:
#balancing the train data using SMOTE methodology
print("Original dataset shape %s" % Counter(Y_train))
sm = SMOTE(sampling_strategy = 'float', ratio = 0.6)
X_res, Y_res = sm.fit_resample(X_train, Y_train)
print("Resampled Sample shape %s" % Counter(Y_res))


Original dataset shape Counter({0: 52738, 1: 12262})
Resampled Sample shape Counter({0: 52738, 1: 31642})


In [19]:
#Using SMOTE data on multiple models:
#Decision Trees without tuning
clf = DecisionTreeClassifier()
clf = clf.fit(X_res, Y_res)
clf_cv_score = cross_val_score(clf, X_res, Y_res, cv = 10, scoring = "roc_auc")
print("Mean AUC Score for normal Decision tree after SMOTE:")
print(clf_cv_score.mean())
resDF = pd.DataFrame(clf.predict(X_test), columns= ['QuoteConversion_Flag'])
resultDTC = pd.concat([X_test['QuoteNumber'], resDF], axis = 1)
resultDTC.to_csv("/gdrive/My Drive/Assignments/Assignment3/resultDTCAfterSMOTE.csv", index = None)


Mean AUC Score for normal Decision tree after SMOTE:
0.7822020310114891


In [0]:
#Decision Trees with Hyperparameter tuning
parameters = {'criterion': ['gini', 'entropy'], 'min_samples_split' : range(50,100,10),'max_depth': range(10,100,10)}
clf_random = RandomizedSearchCV(clf, parameters, scoring = "roc_auc", n_iter = 15, cv = 5)
clf_random = clf_random.fit(X_res, Y_res)
random_param = clf_random.best_params_


In [21]:
#decision tree classifier with new optimum parameters
clf_tuned = DecisionTreeClassifier(**random_param)
clf_tuned=clf_tuned.fit(X_res,Y_res)
clf_cv_score = cross_val_score(clf_tuned, X_res, Y_res, cv = 10, scoring = "roc_auc")
print("Mean AUC Score for tuned Decision tree after SMOTE:")
print(clf_cv_score.mean())

resDF = pd.DataFrame(clf_tuned.predict(X_test), columns= ['QuoteConversion_Flag'])
resultDTC = pd.concat([X_test['QuoteNumber'], resDF], axis = 1)
resultDTC.to_csv("/gdrive/My Drive/Assignments/Assignment3/resultTunedDTCAfterSMOTE.csv", index = None)


Mean AUC Score for tuned Decision tree after SMOTE:
0.8489199105253225


In [0]:
#Random Forest Trees with Hyperparameter tuning
parameters = {'criterion': ['gini', 'entropy'], 'min_samples_split' : range(50,100,10),'max_depth': range(10,100,10)}
rfc_random = RandomizedSearchCV(clf, parameters, scoring = "roc_auc", cv = 5)
rfc_random = rfc_random.fit(X_res, Y_res)
random_param = rfc_random.best_params_


In [23]:
#Random Forest tree classifier with new optimum parameters
rfc_tuned = RandomForestClassifier(**random_param)
rfc_tuned = rfc_tuned.fit(X_res,Y_res)
rfc_cv_score = cross_val_score(rfc_tuned, X_res, Y_res, cv = 10, scoring = "roc_auc")
print("Mean AUC Score for tuned Random Forest Tree after SMOTE:")
print(rfc_cv_score.mean())

resDF = pd.DataFrame(rfc_tuned.predict(X_test), columns= ['QuoteConversion_Flag'])
resultRFC = pd.concat([X_test['QuoteNumber'], resDF], axis = 1)
resultRFC.to_csv("/gdrive/My Drive/Assignments/Assignment3/resultTunedRFCAfterSMOTE.csv", index = None)

Mean AUC Score for tuned Random Forest Tree after SMOTE:
0.9665957010878434


In [24]:
#K nearest Neighbours
KNN_Classifier = KNeighborsClassifier(n_neighbors = 5)
KNN_Classifier = KNN_Classifier.fit(X_res, Y_res)
KNN_cv_score = cross_val_score(KNN_Classifier, X_res, Y_res, cv = 10, scoring = "roc_auc")
print("Mean AUC Score for KNN after SMOTE:")
print(KNN_cv_score.mean())

resDF = pd.DataFrame(KNN_Classifier.predict(X_test), columns= ['QuoteConversion_Flag'])
resultKNN = pd.concat([X_test['QuoteNumber'], resDF], axis = 1)
resultKNN.to_csv("/gdrive/My Drive/Assignments/Assignment3/resultTunedKNNAfterSMOTE.csv", index = None)

Mean AUC Score for KNN after SMOTE:
0.15454472896083074


In [25]:
#Multilayer Perceptron classifier
MLPClf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
MLPClf = MLPClf.fit(X_res,Y_res)
MLP_cv_score = cross_val_score(MLPClf, X_res, Y_res, cv = 10, scoring = "roc_auc")
print("Mean AUC Score for MLP after SMOTE:")
print(MLP_cv_score.mean())

resDF = pd.DataFrame(MLPClf.predict(X_test), columns= ['QuoteConversion_Flag'])
resultMLP = pd.concat([X_test['QuoteNumber'], resDF], axis = 1)
resultMLP.to_csv("/gdrive/My Drive/Assignments/Assignment3/resultMLPAfterSMOTE.csv", index = None)


Mean AUC Score for MLP after SMOTE:
0.5


In [0]:
#Support Vector machines 
SVM_clf = LinearSVC()
SVM_clf = SVM_clf.fit(X_res, Y_res)
#SVM_cv_score = cross_val_score(SVM_clf, X_res, Y_res, cv = 10, scoring = "roc_auc")
#print("Mean AUC Score for SVM after SMOTE:")
#print(SVM_cv_score.mean())

resDF = pd.DataFrame(SVM_clf.predict(X_test), columns= ['QuoteConversion_Flag'])
resultSVM = pd.concat([X_test['QuoteNumber'], resDF], axis = 1)
resultSVM.to_csv("/gdrive/My Drive/Assignments/Assignment3/resultSVMAfterSMOTE.csv", index = None)


In [0]:
#Gradient Boosting Clssifier
GB_clf = GradientBoostingClassifier()
GB_clf = GB_clf.fit(X_res, Y_res)
#GB_cv_score = cross_val_score(GB_clf, X_res, Y_res, cv = 10, scoring = "roc_auc")
#print("Mean AUC Score for SVM after SMOTE")
#print(GB_cv_score.mean())

resDF = pd.DataFrame(GB_clf.predict(X_test), columns = ['QuoteConversion_Flag'])
resultGB = pd.concat([X_test['QuoteNumber'], resDF], axis = 1)
resultGB.to_csv("/gdrive/My Drive/Assignments/Assignment3/resultGBAfterSMOTE.csv", index = None)


In [50]:
#stacking all the models
models = [clf_tuned, rfc_tuned, GB_clf]

S_train, S_test = stacking(models, 
                           X_res, Y_res, X_test, 
                           regression = False,
                           mode = 'oof_pred_bag',
                           needs_proba = False,
                           save_dir = None,
                           metric = accuracy_score,
                           n_folds = 4,
                           stratified = True,
                           shuffle= True,
                           random_state = 0,
                           verbose = 2)


task:         [classification]
n_classes:    [2]
metric:       [accuracy_score]
mode:         [oof_pred_bag]
n_models:     [3]

model  0:     [DecisionTreeClassifier]
    fold  0:  [0.92842245]
    fold  1:  [0.92562571]
    fold  2:  [0.92173130]
    fold  3:  [0.92248981]
    ----
    MEAN:     [0.92456732] + [0.00266183]
    FULL:     [0.92456743]

model  1:     [RandomForestClassifier]
    fold  0:  [0.91074137]
    fold  1:  [0.90638036]
    fold  2:  [0.91741728]
    fold  3:  [0.90902626]
    ----
    MEAN:     [0.91089132] + [0.00407547]
    FULL:     [0.91089121]

model  2:     [GradientBoostingClassifier]
    fold  0:  [0.94183732]
    fold  1:  [0.93918278]
    fold  2:  [0.93908220]
    fold  3:  [0.93533706]
    ----
    MEAN:     [0.93885984] + [0.00231458]
    FULL:     [0.93885992]



In [32]:
#checking the data
print(S_train)
print(S_test)

[[0 0 0]
 [0 0 0]
 [0 0 0]
 ...
 [1 1 1]
 [1 1 1]
 [1 1 1]]
[[0 0 0]
 [0 0 0]
 [0 0 0]
 ...
 [1 1 1]
 [0 0 0]
 [0 0 0]]


In [0]:
#constructing second level model on top of the stacked model results
#Random Forest Trees on stacked results with Hyperparameter tuning
RFC = RandomForestClassifier()
parameters = {'criterion': ['gini', 'entropy'], 'min_samples_split' : range(50,100,10)}
rfc_random = RandomizedSearchCV(RFC, parameters, scoring = "roc_auc", cv = 5)
rfc_random = rfc_random.fit(S_train, Y_res)
random_param = rfc_random.best_params_


In [0]:
#using Random Forest Classifier on the stacked results
modelRFC = RandomForestClassifier(**random_param)
modelRFC = modelRFC.fit(S_train, Y_res)
resDF = pd.DataFrame(modelRFC.predict(S_test), columns= ['QuoteConversion_Flag'])

resultModelRFC = pd.concat([X_test['QuoteNumber'], resDF], axis = 1)
resultModelRFC.to_csv("/gdrive/My Drive/Assignments/Assignment3/StackedResultRFC.csv", index = None)


In [0]:
#using Gradient Boosting Classifier on the stacked results
modelGBC = GradientBoostingClassifier()
modelGBC = modelGBC.fit(S_train, Y_res)
resDF = pd.DataFrame(modelGBC.predict(S_test), columns = ["QuoteConversion_Flag"])

resultModelGBC = pd.concat([X_test['QuoteNumber'], resDF], axis = 1)
resultModelGBC.to_csv("/gdrive/My Drive/Assignments/Assignment3/StackedResultGBC.csv", index = None)

In [0]:
#using Decision Tree Classifier on the stacked results
#Decision Trees with Hyperparameter tuning
clf = DecisionTreeClassifier()
parameters = {'criterion': ['gini', 'entropy'], 'min_samples_split' : range(50,100,10)}
clf_random = RandomizedSearchCV(clf, parameters, scoring = "roc_auc", n_iter = 15, cv = 5)
clf_random = clf_random.fit(X_res, Y_res)
random_param = clf_random.best_params_

In [0]:
#decision tree classifier with new optimum parameters
clf_tuned = DecisionTreeClassifier(**random_param)
clf_tuned=clf_tuned.fit(S_train,Y_res)
#clf_cv_score = cross_val_score(clf_tuned, X_res, Y_res, cv = 10, scoring = "roc_auc")
#print("Mean AUC Score for tuned Decision tree:")
#print(clf_cv_score.mean())

resDF = pd.DataFrame(clf_tuned.predict(S_test), columns= ['QuoteConversion_Flag'])
resultDTC = pd.concat([X_test['QuoteNumber'], resDF], axis = 1)
resultDTC.to_csv("/gdrive/My Drive/Assignments/Assignment3/StackedResultDTC.csv", index = None)
