# Model selection

In [9]:
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn.metrics as metrics
import sklearn as sk
from sklearn.svm import SVC
import pickle

In [10]:
def classificationMetrics(y, yhat):
    """Summarise how well predictions ``yhat`` match the true labels ``y``.

    Returns a dict with the keys 'Accuracy', 'Precision', 'Recall',
    'f1-score', 'Log-loss' and 'AUC'. Precision, recall and F1 are taken at
    index 1 of the per-class arrays returned by
    ``precision_recall_fscore_support`` (presumably the positive class for
    0/1 labels -- TODO confirm).

    NOTE(review): the same ``yhat`` is passed to ``log_loss`` and
    ``roc_auc_score``; if callers supply hard class labels rather than
    probabilities/scores those two entries may not mean what their names
    suggest -- confirm with callers.
    """
    precision, recall, f1, _support = metrics.precision_recall_fscore_support(y, yhat)
    summary = {}
    summary['Accuracy'] = metrics.accuracy_score(y, yhat)
    summary['Precision'] = precision[1]
    summary['Recall'] = recall[1]
    summary['f1-score'] = f1[1]
    summary['Log-loss'] = metrics.log_loss(y, yhat)
    summary['AUC'] = metrics.roc_auc_score(y, yhat)
    return summary

def AUC(y, yhat, y_dev, yhat_dev):
    """Return ``roc_auc_score`` for the train pair and the dev pair.

    The result is a dict with keys 'AUC' (from ``y``/``yhat``) and
    'AUC_dev' (from ``y_dev``/``yhat_dev``).

    NOTE(review): the model cells in this notebook pass ``.predict()``
    output (hard labels) as ``yhat``; ``roc_auc_score`` is documented for
    scores/probabilities -- confirm that label-based AUC is intended.
    """
    train_auc = metrics.roc_auc_score(y, yhat)
    dev_auc = metrics.roc_auc_score(y_dev, yhat_dev)
    return {'AUC': train_auc, 'AUC_dev': dev_auc}

def accuracy(y, yhat, y_dev, yhat_dev):
    """Return ``accuracy_score`` for the train pair and the dev pair.

    The result is a dict with keys 'accuracy' (from ``y``/``yhat``) and
    'accuracy_dev' (from ``y_dev``/``yhat_dev``).
    """
    pairs = ((y, yhat), (y_dev, yhat_dev))
    scores = [metrics.accuracy_score(truth, pred) for truth, pred in pairs]
    return dict(zip(('accuracy', 'accuracy_dev'), scores))

## Data

In [11]:
# Load the pre-built train and dev partitions.
# The files are opened in `with` blocks so the handles are closed as soon as
# unpickling finishes (a bare `open(...)` passed to pickle.load is never closed).
# NOTE(review): pickle.load can execute arbitrary code from the file -- only
# load partition files that come from a trusted source.
with open("../input/ff-partition/train", "rb") as fh:
    train = pickle.load(fh)
with open("../input/ff-partition/dev", "rb") as fh:
    dev = pickle.load(fh)
print(train.shape)
dev.shape

(269118, 13)


(67280, 13)

In [12]:
# Load the held-out test partition.
# The `with` block closes the file handle once unpickling is done.
# NOTE(review): pickle.load can execute arbitrary code from the file -- only
# load partition files that come from a trusted source.
with open("../input/ff-partition/test", "rb") as fh:
    test = pickle.load(fh)
test.shape

(144171, 13)

In [13]:
# Count of each value of the `success` target in train (printed) and dev
# (displayed as the cell result), to compare the class balance of the two.
print(train['success'].value_counts())
dev['success'].value_counts()

0    216958
1     52160
Name: success, dtype: int64


0    54212
1    13068
Name: success, dtype: int64

In [14]:
# Separate predictors from the target for the train and dev partitions.
# `success` is the target; `split` is dropped along with it so it is not
# used as a feature.
X = train.drop(columns=['success','split'])
y = train['success']
X_dev = dev.drop(columns=['success','split'])
y_dev = dev['success']

In [15]:
# Separate predictors from the target for the held-out test partition.
# This previously read from `dev`, which made X_test/y_test a copy of the dev
# split and left the loaded `test` frame unused.
# Assumes `test` carries the same `success` and `split` columns as train/dev
# (all three partitions have 13 columns) -- TODO confirm.
X_test = test.drop(['success','split'], axis = 1)
y_test = test['success']

## Supervised Models

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
#from sklearn.neighbors import KNeighborsClassifier

In [17]:
# Empty results table; each model section below adds one row to it.
models_list = pd.DataFrame()

### Logistic Regression

In [18]:
# Logistic regression (liblinear solver, up to 1000 iterations, fixed seed)
# fitted on the training features.
mod1 = LogisticRegression(random_state=111, max_iter=1000, solver='liblinear')
mod1.fit(X,y)

# Class predictions on the training set and on the dev set.
pred1 = mod1.predict(X)
pred1a = mod1.predict(X_dev)

In [19]:
# Train/dev AUC for logistic regression.
# NOTE(review): pred1/pred1a come from .predict() (hard labels), not from
# probabilities or decision scores -- confirm label-based AUC is intended.
res = AUC(y, pred1, y_dev, pred1a)

In [20]:
# Add the logistic-regression row to the results table.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0.
model_dict = {'model': "Logistic Regression"}
models_list = pd.concat(
    [models_list, pd.DataFrame([{**model_dict, **res}])],
    ignore_index=True,
)
models_list

Unnamed: 0,model,AUC,AUC_dev
0,Logistic Regression,0.724205,0.726058


In [21]:
# Cross-tabulate true labels (rows) against logistic-regression predictions
# (columns): training set printed, dev set displayed as the cell result.
print(pd.crosstab(y, pred1))
pd.crosstab(y_dev, pred1a)

col_0         0      1
success               
0        209983   6975
1         27094  25066


col_0,0,1
success,Unnamed: 1_level_1,Unnamed: 2_level_1
0,52450,1762
1,6735,6333


### Decision Tree

In [22]:
# Decision tree with default hyperparameters (only the seed is set), fitted
# on the training features. The bare fit() call displays the estimator.
mod2 = DecisionTreeClassifier(random_state=111)
mod2.fit(X,y)

DecisionTreeClassifier(random_state=111)

In [23]:
# Class predictions on train and dev, then train/dev AUC for the decision tree.
# NOTE(review): AUC is computed from .predict() hard labels, not from
# probabilities or scores -- confirm this is intended.
pred2 = mod2.predict(X)
pred2a= mod2.predict(X_dev)
res = AUC(y, pred2, y_dev, pred2a)

In [24]:
# Add the decision-tree row to the results table.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0.
model_dict = {'model': "Decision Tree"}
models_list = pd.concat(
    [models_list, pd.DataFrame([{**model_dict, **res}])],
    ignore_index=True,
)
models_list

Unnamed: 0,model,AUC,AUC_dev
0,Logistic Regression,0.724205,0.726058
1,Decision Tree,0.997054,0.846983


In [25]:
# Cross-tabulate true labels (rows) against decision-tree predictions
# (columns): training set printed, dev set displayed as the cell result.
print(pd.crosstab(y, pred2))
pd.crosstab(y_dev, pred2a)

col_0         0      1
success               
0        216944     14
1           304  51856


col_0,0,1
success,Unnamed: 1_level_1,Unnamed: 2_level_1
0,51054,3158
1,3238,9830


### Random Forest

In [26]:
# Random forest with default hyperparameters (only the seed is set), fitted
# on the training features. The bare fit() call displays the estimator.
mod3 = RandomForestClassifier(random_state=111)
mod3.fit(X,y)

RandomForestClassifier(random_state=111)

In [27]:
# Class predictions on train and dev, then train/dev AUC for the random forest.
# NOTE(review): AUC is computed from .predict() hard labels, not from
# probabilities or scores -- confirm this is intended.
pred3 = mod3.predict(X)
pred3a = mod3.predict(X_dev)
res = AUC(y, pred3, y_dev, pred3a)

In [28]:
# Add the random-forest row to the results table.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0.
model_dict = {'model': "RandomForest"}
models_list = pd.concat(
    [models_list, pd.DataFrame([{**model_dict, **res}])],
    ignore_index=True,
)
models_list

Unnamed: 0,model,AUC,AUC_dev
0,Logistic Regression,0.724205,0.726058
1,Decision Tree,0.997054,0.846983
2,RandomForest,0.998284,0.888859


In [29]:
# Cross-tabulate true labels (rows) against random-forest predictions
# (columns): training set printed, dev set displayed as the cell result.
print(pd.crosstab(y, pred3))
pd.crosstab(y_dev, pred3a)

col_0         0      1
success               
0        216775    183
1           135  52025


col_0,0,1
success,Unnamed: 1_level_1,Unnamed: 2_level_1
0,51450,2762
1,2239,10829


### Adaptive Boosting (ADABoost)

In [30]:
# AdaBoost with default hyperparameters (only the seed is set), fitted on the
# training features. The bare fit() call displays the estimator.
mod4 = AdaBoostClassifier(random_state=111)
mod4.fit(X,y)

AdaBoostClassifier(random_state=111)

In [31]:
# Class predictions on train and dev, then train/dev AUC for AdaBoost.
# NOTE(review): AUC is computed from .predict() hard labels, not from
# probabilities or scores -- confirm this is intended.
pred4 = mod4.predict(X)
pred4a = mod4.predict(X_dev)
res = AUC(y, pred4, y_dev, pred4a)

In [32]:
# Add the AdaBoost row to the results table.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0.
model_dict = {'model': "ADABoost"}
models_list = pd.concat(
    [models_list, pd.DataFrame([{**model_dict, **res}])],
    ignore_index=True,
)
models_list

Unnamed: 0,model,AUC,AUC_dev
0,Logistic Regression,0.724205,0.726058
1,Decision Tree,0.997054,0.846983
2,RandomForest,0.998284,0.888859
3,ADABoost,0.897438,0.897649


In [33]:
# Cross-tabulate true labels (rows) against AdaBoost predictions (columns):
# training set printed, dev set displayed as the cell result.
print(pd.crosstab(y, pred4))
pd.crosstab(y_dev, pred4a)

col_0         0      1
success               
0        204807  12151
1          7778  44382


col_0,0,1
success,Unnamed: 1_level_1,Unnamed: 2_level_1
0,51171,3041
1,1942,11126


### Gradient Boosting Machine (GBM)

In [34]:
# Gradient boosting with default hyperparameters (only the seed is set),
# fitted on the training features. The bare fit() call displays the estimator.
mod5 = GradientBoostingClassifier(random_state=111)
mod5.fit(X,y)

GradientBoostingClassifier(random_state=111)

In [35]:
# Class predictions on train and dev, then train/dev AUC for gradient boosting.
# NOTE(review): AUC is computed from .predict() hard labels, not from
# probabilities or scores -- confirm this is intended.
pred5 = mod5.predict(X)
pred5a = mod5.predict(X_dev)
res = AUC(y, pred5, y_dev, pred5a)

In [36]:
# Add the gradient-boosting row to the results table.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0.
model_dict = {'model': "GBM"}
models_list = pd.concat(
    [models_list, pd.DataFrame([{**model_dict, **res}])],
    ignore_index=True,
)
models_list

Unnamed: 0,model,AUC,AUC_dev
0,Logistic Regression,0.724205,0.726058
1,Decision Tree,0.997054,0.846983
2,RandomForest,0.998284,0.888859
3,ADABoost,0.897438,0.897649
4,GBM,0.900312,0.900673


In [37]:
# Cross-tabulate true labels (rows) against gradient-boosting predictions
# (columns): training set printed, dev set displayed as the cell result.
print(pd.crosstab(y, pred5))
pd.crosstab(y_dev, pred5a)

col_0         0      1
success               
0        204511  12447
1          7407  44753


col_0,0,1
success,Unnamed: 1_level_1,Unnamed: 2_level_1
0,51084,3128
1,1842,11226


### Support Vector Machine (SVM)

In [38]:
# Support vector classifier with default hyperparameters (only the seed is
# set), fitted on the training features. Uses the SVC name imported at the
# top of the notebook; it is the same class as sk.svm.SVC.
# The bare fit() call displays the estimator.
mod6 = SVC(random_state=111)
mod6.fit(X, y)

SVC(random_state=111)

In [39]:
# Class predictions on train and dev, then train/dev AUC for the SVM.
# NOTE(review): AUC is computed from .predict() hard labels, not from
# decision scores -- confirm this is intended.
pred6 = mod6.predict(X)
pred6a = mod6.predict(X_dev)
res = AUC(y, pred6, y_dev, pred6a)

In [40]:
# Add the SVM row to the results table.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0.
model_dict = {'model': "SVM"}
models_list = pd.concat(
    [models_list, pd.DataFrame([{**model_dict, **res}])],
    ignore_index=True,
)
models_list

Unnamed: 0,model,AUC,AUC_dev
0,Logistic Regression,0.724205,0.726058
1,Decision Tree,0.997054,0.846983
2,RandomForest,0.998284,0.888859
3,ADABoost,0.897438,0.897649
4,GBM,0.900312,0.900673
5,SVM,0.748914,0.749516


In [41]:
# Cross-tabulate true labels (rows) against SVM predictions (columns):
# training set printed, dev set displayed as the cell result.
print(pd.crosstab(y, pred6))
pd.crosstab(y_dev, pred6a)

col_0         0      1
success               
0        208979   7979
1         24275  27885


col_0,0,1
success,Unnamed: 1_level_1,Unnamed: 2_level_1
0,52189,2023
1,6059,7009


## Model Selection

In [42]:
# Rank the candidate models from best to worst dev-set AUC.
models_list.sort_values(by='AUC_dev', ascending=False)

Unnamed: 0,model,AUC,AUC_dev
4,GBM,0.900312,0.900673
3,ADABoost,0.897438,0.897649
2,RandomForest,0.998284,0.888859
1,Decision Tree,0.997054,0.846983
5,SVM,0.748914,0.749516
0,Logistic Regression,0.724205,0.726058


The gradient boosting machine (GBM) gives the best result on the dev set (AUC_dev ≈ 0.901), with near-identical train and dev AUC.