# Model selection

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn.metrics as metrics
import sklearn as sk
from sklearn.svm import SVC

In [4]:
def classificationMetrics(y, yhat, yprob=None):
    """Collect the usual binary-classification metrics in one dict.

    Parameters
    ----------
    y : array-like
        True labels.
    yhat : array-like
        Predicted hard labels.
    yprob : array-like, optional
        Predicted probability (or score) of the positive class. When given,
        it is used for 'Log-loss' and 'AUC', which are defined on scores
        rather than on thresholded labels. When omitted, both fall back to
        ``yhat`` so existing two-argument calls return exactly what they
        did before.

    Returns
    -------
    dict
        Keys 'Accuracy', 'Precision', 'Recall', 'f1-score', 'Log-loss' and
        'AUC'. Precision, recall and f1 are taken at index 1 of the
        per-class arrays, i.e. for the second label in sorted order.
    """
    # Hard labels collapse every prediction to 0 or 1, which makes log-loss
    # and AUC far less informative; prefer real scores when available.
    scores = yhat if yprob is None else yprob
    precision, recall, f1, _support = metrics.precision_recall_fscore_support(y, yhat)
    return {
        'Accuracy': metrics.accuracy_score(y, yhat),
        'Precision': precision[1],
        'Recall': recall[1],
        'f1-score': f1[1],
        'Log-loss': metrics.log_loss(y, scores),
        'AUC': metrics.roc_auc_score(y, scores),
    }

def AUC(y, yhat, y_dev, yhat_dev):
    """Return ROC AUC for the training pair and for the dev pair.

    ``y``/``yhat`` are the training labels and predictions, ``y_dev``/
    ``yhat_dev`` the dev ones. The result is a dict with keys 'AUC' and
    'AUC_dev'.
    """
    auc_train = metrics.roc_auc_score(y, yhat)
    auc_dev = metrics.roc_auc_score(y_dev, yhat_dev)
    return {'AUC': auc_train, 'AUC_dev': auc_dev}

def accuracy(y, yhat, y_dev, yhat_dev):
    """Return accuracy for the training pair and for the dev pair.

    ``y``/``yhat`` are the training labels and predictions, ``y_dev``/
    ``yhat_dev`` the dev ones. The result is a dict with keys 'accuracy'
    and 'accuracy_dev'.
    """
    acc_train = metrics.accuracy_score(y, yhat)
    acc_dev = metrics.accuracy_score(y_dev, yhat_dev)
    return {'accuracy': acc_train, 'accuracy_dev': acc_dev}

## Data

In [7]:
import pickle
# NOTE: unpickling can execute arbitrary code, so only load partition files
# produced by a trusted pipeline.
# The context managers close each file handle as soon as it has been read;
# the bare open() calls used before left them open for the kernel's lifetime.
with open("../input/ff-partition/train", "rb") as train_file:
    train = pickle.load(train_file)
with open("../input/ff-partition/dev", "rb") as dev_file:
    dev = pickle.load(dev_file)
print(train.shape)
dev.shape

In [8]:
# NOTE: unpickling can execute arbitrary code, so only load partition files
# produced by a trusted pipeline.
# The context manager closes the file handle once the partition is loaded.
with open("../input/ff-partition/test", "rb") as test_file:
    test = pickle.load(test_file)
test.shape

In [9]:
# Count how often each `success` value occurs in the train and dev partitions.
# The train counts are printed; the dev counts are the cell's displayed output.
print(train['success'].value_counts())
dev['success'].value_counts()

In [10]:
# Features are every column except the `success` target and the `split`
# column; the target is kept separately for each partition.
X, y = train.drop(columns=['success', 'split']), train['success']
X_dev, y_dev = dev.drop(columns=['success', 'split']), dev['success']

In [11]:
# Build the test matrices from the `test` partition. The previous version
# sliced `dev` here, so anything scored on X_test/y_test would silently have
# re-measured the dev split.
# NOTE(review): assumes `test` has the same `success` and `split` columns as
# train/dev — confirm against the partitioning notebook.
X_test = test.drop(['success','split'], axis = 1)
y_test = test['success']

## Supervised Models

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
#from sklearn.neighbors import KNeighborsClassifier

In [13]:
# Results table: starts empty and gains one row per fitted model in the
# sections below, then is sorted by dev accuracy at the end.
models_list = pd.DataFrame()

### Logistic Regression

In [14]:
# Logistic regression with the liblinear solver; the seed is fixed for
# repeatable fits.
mod1 = LogisticRegression(solver='liblinear', max_iter=1000, random_state=111)
mod1.fit(X, y)

# Hard-label predictions on the training data and on the dev partition.
pred1, pred1a = mod1.predict(X), mod1.predict(X_dev)

In [15]:
# Train vs. dev accuracy for the logistic regression.
res = accuracy(y=y, yhat=pred1, y_dev=y_dev, yhat_dev=pred1a)

In [16]:
# Add this model's scores as a new row of the results table.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# with a one-row frame works on every pandas version.
model_dict = {'model': "Logistic Regression"}
models_list = pd.concat([models_list, pd.DataFrame([{**model_dict, **res}])],
                        ignore_index=True)
models_list

In [17]:
# Confusion tables for the logistic regression: rows are the true `success`
# labels, columns the predicted labels. The training table is printed; the
# dev table is the cell's displayed output.
print(pd.crosstab(y, pred1))
pd.crosstab(y_dev, pred1a)

### Decision Tree

In [18]:
# Decision tree with scikit-learn's default hyperparameters; only the seed is
# fixed so the fit is repeatable.
mod2 = DecisionTreeClassifier(random_state=111)
mod2.fit(X,y)

In [19]:
# Predict on the training data and on the dev partition, then compare the
# two accuracies.
pred2, pred2a = mod2.predict(X), mod2.predict(X_dev)
res = accuracy(y=y, yhat=pred2, y_dev=y_dev, yhat_dev=pred2a)

In [20]:
# Add this model's scores as a new row of the results table.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# with a one-row frame works on every pandas version.
model_dict = {'model': "Decision Tree"}
models_list = pd.concat([models_list, pd.DataFrame([{**model_dict, **res}])],
                        ignore_index=True)
models_list

In [21]:
# Confusion tables for the decision tree: rows are the true `success` labels,
# columns the predicted labels. The training table is printed; the dev table
# is the cell's displayed output.
print(pd.crosstab(y, pred2))
pd.crosstab(y_dev, pred2a)

### Random Forest

In [22]:
# Random forest with scikit-learn's default hyperparameters; only the seed is
# fixed so the fit is repeatable.
mod3 = RandomForestClassifier(random_state=111)
mod3.fit(X,y)

In [23]:
# Predict on the training data and on the dev partition, then compare the
# two accuracies.
pred3, pred3a = mod3.predict(X), mod3.predict(X_dev)
res = accuracy(y=y, yhat=pred3, y_dev=y_dev, yhat_dev=pred3a)

In [24]:
# Add this model's scores as a new row of the results table.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# with a one-row frame works on every pandas version.
model_dict = {'model': "RandomForest"}
models_list = pd.concat([models_list, pd.DataFrame([{**model_dict, **res}])],
                        ignore_index=True)
models_list

In [25]:
# Confusion tables for the random forest: rows are the true `success` labels,
# columns the predicted labels. The training table is printed; the dev table
# is the cell's displayed output.
print(pd.crosstab(y, pred3))
pd.crosstab(y_dev, pred3a)

### Adaptive Boosting (ADABoost)

In [26]:
# AdaBoost with scikit-learn's default hyperparameters; only the seed is
# fixed so the fit is repeatable.
mod4 = AdaBoostClassifier(random_state=111)
mod4.fit(X,y)

In [27]:
# Predict on the training data and on the dev partition, then compare the
# two accuracies.
pred4, pred4a = mod4.predict(X), mod4.predict(X_dev)
res = accuracy(y=y, yhat=pred4, y_dev=y_dev, yhat_dev=pred4a)

In [28]:
# Add this model's scores as a new row of the results table.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# with a one-row frame works on every pandas version.
model_dict = {'model': "ADABoost"}
models_list = pd.concat([models_list, pd.DataFrame([{**model_dict, **res}])],
                        ignore_index=True)
models_list

In [29]:
# Confusion tables for AdaBoost: rows are the true `success` labels, columns
# the predicted labels. The training table is printed; the dev table is the
# cell's displayed output.
print(pd.crosstab(y, pred4))
pd.crosstab(y_dev, pred4a)

### Gradient Boosting Machine (GBM)

In [30]:
# Gradient boosting with scikit-learn's default hyperparameters; only the
# seed is fixed so the fit is repeatable.
mod5 = GradientBoostingClassifier(random_state=111)
mod5.fit(X,y)

In [31]:
# Predict on the training data and on the dev partition, then compare the
# two accuracies.
pred5, pred5a = mod5.predict(X), mod5.predict(X_dev)
res = accuracy(y=y, yhat=pred5, y_dev=y_dev, yhat_dev=pred5a)

In [32]:
# Add this model's scores as a new row of the results table.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# with a one-row frame works on every pandas version.
model_dict = {'model': "GBM"}
models_list = pd.concat([models_list, pd.DataFrame([{**model_dict, **res}])],
                        ignore_index=True)
models_list

In [33]:
# Confusion tables for gradient boosting: rows are the true `success` labels,
# columns the predicted labels. The training table is printed; the dev table
# is the cell's displayed output.
print(pd.crosstab(y, pred5))
pd.crosstab(y_dev, pred5a)

### Support Vector Machine (SVM)

In [34]:
# Use the SVC class imported alongside the other estimators instead of
# reaching through `sk.svm`, which only resolves because the sklearn.svm
# submodule happens to have been imported elsewhere in the notebook.
# NOTE(review): SVC's default RBF kernel is sensitive to feature scale —
# confirm that X is standardised upstream before trusting this score.
mod6 = SVC(random_state=111)
mod6.fit(X,y)

In [35]:
# Predict on the training data and on the dev partition, then compare the
# two accuracies.
pred6, pred6a = mod6.predict(X), mod6.predict(X_dev)
res = accuracy(y=y, yhat=pred6, y_dev=y_dev, yhat_dev=pred6a)

In [36]:
# Add this model's scores as a new row of the results table.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# with a one-row frame works on every pandas version.
model_dict = {'model': "SVM"}
models_list = pd.concat([models_list, pd.DataFrame([{**model_dict, **res}])],
                        ignore_index=True)
models_list

In [37]:
# Confusion tables for the SVM: rows are the true `success` labels, columns
# the predicted labels. The training table is printed; the dev table is the
# cell's displayed output.
print(pd.crosstab(y, pred6))
pd.crosstab(y_dev, pred6a)

## Model Selection

In [38]:
# Rank the candidate models by dev-set accuracy, best first.
models_list.sort_values(by='accuracy_dev', ascending=False)

The gradient boosting machine (GBM) gives the best results, i.e. the highest accuracy on the dev set.