In [None]:
from sklearn.metrics import confusion_matrix, classification_report, auc, roc_curve, f1_score
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict, GridSearchCV, RandomizedSearchCV, train_test_split
import catboost as ctb
import lightgbm as lgb
from sklearn.externals import joblib
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegressionCV
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')


%matplotlib inline

In [None]:
train = pd.read_csv('train_modified.csv')

In [None]:
test = pd.read_csv('test_modified.csv')

In [None]:
# Separate the label column from the predictor columns of the training frame.
target = train['Survived']
features = train.drop(columns=['Survived'])

Stratified K-fold cross validation

In [None]:
# 5-fold stratified splitter; the same `kf` object is passed as `cv` to every
# cross-validated search in this notebook so all models share one fold scheme.
kf = StratifiedKFold(n_splits=5)

# Classification Models

##  Logistic Regression Model

In [None]:
# Logistic regression that cross-validates over the listed candidate C values
# on the shared `kf` folds, scoring each candidate by accuracy.
lr = LogisticRegressionCV(Cs=[0.01, 0.1, 1, 10, 500],
                          cv=kf, scoring='accuracy', max_iter=2000, refit=True)

In [None]:
lr.fit(features, target)

In [None]:
# ROC curve of the logistic regression model over all decision thresholds.
# Column 1 of predict_proba holds the probability estimates of the positive class.
# NOTE(review): the scores are computed on `features`, the same rows `lr` was
# fit on, so this ROC (and any AUC derived from it) is in-sample and likely
# optimistic compared with the cross-validated accuracy.
y_scores_lr = lr.predict_proba(features)[:, 1]
lr_fpr, lr_tpr, lr_thresholds = roc_curve(target, y_scores_lr)

In [None]:
# Finding the AUC for the logistic classification model.
lr_auc = auc(x=lr_fpr, y=lr_tpr)

In [None]:
# Cross-validated accuracy of the selected C.
# `lr.scores_[1]` is the (n_folds, n_Cs) grid of fold scores for class label 1.
# Averaging over the folds gives one mean score per C, and the maximum of those
# is the score of the C that was picked. This mirrors `best_score_` from
# GridSearchCV, which is what the other models report; a plain mean over the
# whole grid would also average in the rejected C values.
lr_acc = lr.scores_[1].mean(axis=0).max()

In [None]:
print('Area Under Curve: {}, Accuracy: {}'.format(lr_auc, lr_acc))

Make prediction for test data

In [None]:
# Predict Survived for the test rows (every column except PassengerId) with the
# logistic regression model, then pair each prediction with its PassengerId.
# The frame is named y_pred_lr so it cannot be confused with the Naive Bayes
# predictions, which use y_pred_nb.
y_pred_lr = pd.DataFrame(lr.predict(
    test.drop(columns=['PassengerId'])), columns=['Survived'], dtype='int64')
lr_model = pd.concat([test.PassengerId, y_pred_lr], axis=1)

## Gaussian Naive Bayes Model

In [None]:
nb = GaussianNB()

In [None]:
# Candidate class-prior pairs tried by the Naive Bayes grid search,
# ranging from a 0.7/0.3 split to the mirrored 0.3/0.7 split.
nb_params = {
    'priors': [
        [0.7, 0.3],
        [0.6, 0.4],
        [0.5, 0.5],
        [0.4, 0.6],
        [0.3, 0.7],
    ],
}

In [None]:
gs_nb = GridSearchCV(nb, param_grid=nb_params,
                     scoring='accuracy', cv=kf, refit=True)

In [None]:
gs_nb.fit(features, target)

In [None]:
# ROC curve of the tuned Naive Bayes model over all decision thresholds.
# Column 1 of predict_proba holds the probability estimates of the positive class.
# NOTE(review): the scores are computed on `features`, the same rows `gs_nb`
# was fit on, so this ROC is in-sample and likely optimistic.
y_scores_nb = gs_nb.predict_proba(features)[:, 1]
nb_fpr, nb_tpr, nb_thresholds = roc_curve(target, y_scores_nb)

In [None]:
# Finding the AUC for the naive bayes classification model.
nb_auc = auc(x=nb_fpr, y=nb_tpr)

In [None]:
nb_acc = gs_nb.best_score_

In [None]:
print('Area Under Curve: {}, Accuracy: {}'.format(nb_auc, nb_acc))

Make prediction for test data

In [None]:
y_pred_nb = pd.DataFrame(gs_nb.predict(
    test.drop(columns=['PassengerId'])), columns=['Survived'], dtype='int64')

In [None]:
nb_model = pd.concat([test.PassengerId, y_pred_nb], axis=1)

## KNN Classification Model

In [None]:
knn = KNeighborsClassifier()

In [None]:
# Search space for the KNN grid search: one entry per tuned constructor argument.
knn_params = {
    'n_neighbors': list(range(3, 21)),
    'weights': ['uniform', 'distance'],
    'leaf_size': [20, 30, 40, 50, 60],
    'p': [1, 2, 3],
}

In [None]:
gs_knn = GridSearchCV(knn, param_grid=knn_params,
                      scoring='accuracy', cv=kf, refit=True)

In [None]:
gs_knn.fit(features, target)

In [None]:
joblib.dump(gs_knn, 'knnmodel.pkl')

In [None]:
gs_knn = joblib.load('knnmodel.pkl')

In [None]:
# ROC curve of the tuned KNN model over all decision thresholds.
# Column 1 of predict_proba holds the probability estimates of the positive class.
# NOTE(review): the scores are computed on `features`, the same rows `gs_knn`
# was fit on, so this ROC is in-sample and likely optimistic.
y_scores_knn = gs_knn.predict_proba(features)[:, 1]
knn_fpr, knn_tpr, knn_thresholds = roc_curve(target, y_scores_knn)

In [None]:
# Finding the AUC for the KNN classification model.
knn_auc = auc(x=knn_fpr, y=knn_tpr)

In [None]:
knn_acc = gs_knn.best_score_

In [None]:
print('Area Under Curve: {}, Accuracy: {}'.format(knn_auc, knn_acc))

Make prediction for test data

In [None]:
y_pred_knn = pd.DataFrame(gs_knn.predict(
    test.drop(columns=['PassengerId'])), columns=['Survived'], dtype='int64')

In [None]:
knn_model = pd.concat([test.PassengerId, y_pred_knn], axis=1)

## Support Vector Machine Classification model

In [None]:
svm = SVC(probability=True)

In [None]:
# Search space for the SVM hyper-parameter search.
# Only the rbf kernel is searched. `degree` is deliberately absent: it applies
# only to the polynomial kernel, so listing it would make the search evaluate
# every real configuration several times without changing any score.
svm_params = {
    'C': [0.1, 1, 10, 100, 500],
    'kernel': ['rbf'],
    'gamma': [0.05, 0.1, 1, 5],
    'shrinking': [True, False],
}

In [None]:
rs_svm = RandomizedSearchCV(svm, param_distributions=svm_params,
                            scoring='accuracy', cv=kf, refit=True, n_iter=2000)

In [None]:
rs_svm.fit(features, target)

In [None]:
joblib.dump(rs_svm, 'svmmodel.pkl')

In [None]:
rs_svm = joblib.load('svmmodel.pkl')

In [None]:
# ROC curve of the tuned SVM model over all decision thresholds.
# Column 1 of predict_proba holds the probability estimates of the positive class.
# NOTE(review): the scores are computed on `features`, the same rows `rs_svm`
# was fit on, so this ROC and AUC are in-sample and likely optimistic.
y_scores_svm = rs_svm.predict_proba(features)[:, 1]
svm_fpr, svm_tpr, svm_thresholds = roc_curve(target, y_scores_svm)
# Finding the AUC for the SVM classification model.
svm_auc = auc(x=svm_fpr, y=svm_tpr)

In [None]:
svm_acc = rs_svm.best_score_

In [None]:
print('Area Under Curve: {}, Accuracy: {}'.format(svm_auc, svm_acc))

Make Prediction for test data

In [None]:
y_pred_svm = pd.DataFrame(rs_svm.predict(
    test.drop(columns=['PassengerId'])), columns=['Survived'], dtype='int64')

In [None]:
svm_model = pd.concat([test.PassengerId, y_pred_svm], axis=1)

## Decision Tree Classification Model

In [None]:
dt = DecisionTreeClassifier()

In [None]:
# Search space for the decision tree grid search.
dt_params = {
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [2, 5, 10],
}

In [None]:
gs_dt = GridSearchCV(dt, param_grid=dt_params,
                     scoring='accuracy', cv=kf, refit=True)

In [None]:
gs_dt.fit(features, target)

In [None]:
# ROC curve of the tuned decision tree over all decision thresholds.
# Column 1 of predict_proba holds the probability estimates of the positive class.
# NOTE(review): the scores are computed on `features`, the same rows `gs_dt`
# was fit on, so this ROC and AUC are in-sample and likely optimistic.
y_scores_dt = gs_dt.predict_proba(features)[:, 1]
dt_fpr, dt_tpr, dt_thresholds = roc_curve(target, y_scores_dt)
# Finding the AUC for the Decision Tree classification model.
dt_auc = auc(x=dt_fpr, y=dt_tpr)

In [None]:
dt_acc = gs_dt.best_score_

In [None]:
print('Area Under Curve: {}, Accuracy: {}'.format(dt_auc, dt_acc))

Make Prediction for test data

In [None]:
# Predict Survived for the test rows (every column except PassengerId) with the
# tuned decision tree. The fitted search object for this model is `gs_dt`.
y_pred_dt = pd.DataFrame(gs_dt.predict(
    test.drop(columns=['PassengerId'])), columns=['Survived'], dtype='int64')

In [None]:
dt_model = pd.concat([test.PassengerId, y_pred_dt], axis=1)

## Random Forest Classification Model

In [None]:
rf = RandomForestClassifier()

In [None]:
# Search space for the random forest hyper-parameter search.
rf_params = {
    'n_estimators': [10, 100, 200],
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [2, 5, 10],
    'max_features': ['sqrt', 'log2', None],
    # Two distinct weightings, one favouring each class. Listing the same
    # weighting twice would only make the search evaluate it twice.
    # NOTE(review): the mirrored 0.4/0.6 weighting is assumed to be the
    # intended second candidate; confirm.
    'class_weight': [{0: 0.6, 1: 0.4}, {0: 0.4, 1: 0.6}],
}

In [None]:
rs_rf = RandomizedSearchCV(rf, param_distributions=rf_params,
                           scoring='accuracy', cv=kf, refit=True, n_iter=2000)

In [None]:
rs_rf.fit(features, target)

In [None]:
joblib.dump(rs_rf, 'randomdorestmodel.pkl')

In [None]:
rs_rf = joblib.load('randomdorestmodel.pkl')

In [None]:
# ROC curve of the tuned random forest over all decision thresholds.
# Column 1 of predict_proba holds the probability estimates of the positive class.
# NOTE(review): the scores are computed on `features`, the same rows `rs_rf`
# was fit on, so this ROC and AUC are in-sample and likely optimistic.
y_scores_rf = rs_rf.predict_proba(features)[:, 1]
rf_fpr, rf_tpr, rf_thresholds = roc_curve(target, y_scores_rf)
# Finding the AUC for the Random Forest classification model.
rf_auc = auc(x=rf_fpr, y=rf_tpr)

In [None]:
rf_acc = rs_rf.best_score_

In [None]:
print('Area Under Curve: {}, Accuracy: {}'.format(rf_auc, rf_acc))

Make Prediction for test data

In [None]:
y_pred_rf = pd.DataFrame(rs_rf.predict(
    test.drop(columns=['PassengerId'])), columns=['Survived'], dtype='int64')

In [None]:
rf_model = pd.concat([test.PassengerId, y_pred_rf], axis=1)

## Bagging Classification Model

In [None]:
bg = BaggingClassifier()

In [None]:
bg_params = {'n_estimators': [10, 100, 500]}

In [None]:
gs_bg = GridSearchCV(bg, param_grid=bg_params,
                     scoring='accuracy', cv=kf, refit=True)

In [None]:
gs_bg.fit(features, target)

In [None]:
# ROC curve of the tuned bagging model over all decision thresholds.
# Column 1 of predict_proba holds the probability estimates of the positive class.
# NOTE(review): the scores are computed on `features`, the same rows `gs_bg`
# was fit on, so this ROC and AUC are in-sample and likely optimistic.
y_scores_bg = gs_bg.predict_proba(features)[:, 1]
bg_fpr, bg_tpr, bg_thresholds = roc_curve(target, y_scores_bg)
# Finding the AUC for the Bagging classification model.
bg_auc = auc(x=bg_fpr, y=bg_tpr)

In [None]:
bg_acc = gs_bg.best_score_

In [None]:
print('Area Under Curve: {}, Accuracy: {}'.format(bg_auc, bg_acc))

Make Prediction for test data

In [None]:
y_pred_bg = pd.DataFrame(gs_bg.predict(
    test.drop(columns=['PassengerId'])), columns=['Survived'], dtype='int64')

In [None]:
bg_model = pd.concat([test.PassengerId, y_pred_bg], axis=1)

## Adaboost Classifier

In [None]:
ada = AdaBoostClassifier()

In [None]:
# Search space for the AdaBoost grid search.
ada_params = {
    'n_estimators': [100, 500, 1000, 10000],
    'learning_rate': [0.1, 0.5, 0.7, 1],
}

In [None]:
gs_ada = GridSearchCV(ada, param_grid=ada_params, cv=kf,
                      scoring='accuracy', refit=True)

In [None]:
gs_ada.fit(features, target)

In [None]:
joblib.dump(gs_ada, 'adaboost.pkl')

In [None]:
gs_ada = joblib.load('adaboost.pkl')

In [None]:
# ROC curve of the tuned AdaBoost model over all decision thresholds.
# Column 1 of predict_proba holds the probability estimates of the positive class.
# NOTE(review): the scores are computed on `features`, the same rows `gs_ada`
# was fit on, so this ROC and AUC are in-sample and likely optimistic.
y_scores_ada = gs_ada.predict_proba(features)[:, 1]
ada_fpr, ada_tpr, ada_thresholds = roc_curve(target, y_scores_ada)
# Finding the AUC for the AdaBoost classification model.
ada_auc = auc(x=ada_fpr, y=ada_tpr)

In [None]:
ada_acc = gs_ada.best_score_

In [None]:
# Report the AdaBoost metrics computed in this section.
print('Area Under Curve: {}, Accuracy: {}'.format(ada_auc, ada_acc))

Make Predictions for test data

In [None]:
y_pred_ada = pd.DataFrame(gs_ada.predict(
    test.drop(columns=['PassengerId'])), columns=['Survived'], dtype='int64')
ada_model = pd.concat([test.PassengerId, y_pred_ada], axis=1)

## Gradient Boost Classifier

In [None]:
gboost = GradientBoostingClassifier(
    validation_fraction=0.1, n_iter_no_change=20, tol=0.005)

In [None]:
# Search space for the gradient boosting grid search.
gboost_params = {
    'learning_rate': [0.1, 0.2, 0.3],
    'n_estimators': [100, 300, 500, 1000],
    'max_features': [None, 'log2', 'sqrt'],
}

In [None]:
gs_gboost = GridSearchCV(gboost, param_grid=gboost_params,
                         cv=kf, scoring='accuracy', refit=True)

In [None]:
gs_gboost.fit(features, target)

In [None]:
gs_gboost.best_params_

In [None]:
# ROC curve of the tuned gradient boosting model over all decision thresholds.
# Column 1 of predict_proba holds the probability estimates of the positive class.
# NOTE(review): the scores are computed on `features`, the same rows
# `gs_gboost` was fit on, so this ROC and AUC are in-sample and likely optimistic.
y_scores_gboost = gs_gboost.predict_proba(features)[:, 1]
gboost_fpr, gboost_tpr, gboost_thresholds = roc_curve(target, y_scores_gboost)
# Finding the AUC for the Gradient Boost classification model.
gboost_auc = auc(x=gboost_fpr, y=gboost_tpr)

In [None]:
gboost_acc = gs_gboost.best_score_

In [None]:
print('Area Under Curve: {}, Accuracy: {}'.format(gboost_auc, gboost_acc))

Make Predictions for test data

In [None]:
y_pred_gboost = pd.DataFrame(gs_gboost.predict(
    test.drop(columns=['PassengerId'])), columns=['Survived'], dtype='int64')
gboost_model = pd.concat([test.PassengerId, y_pred_gboost], axis=1)

## XGBoost Classifier

In [None]:
xgboost = xgb.XGBClassifier()

In [None]:
# Search space for the XGBoost grid search.
xgb_params = {
    'booster': ['gbtree', 'gblinear'],
    'learning_rate': [0.2, 0.3, 0.4],
    'gamma': [0, 0.01, 0.001],
}

In [None]:
gs_xgb = GridSearchCV(xgboost, param_grid=xgb_params,
                      cv=kf, scoring='accuracy', refit=True)

In [None]:
gs_xgb.fit(features, target)

In [None]:
gs_xgb.best_params_

Make Predictions for test data

In [None]:
y_pred_xgboost = pd.DataFrame(gs_xgb.predict(
    test.drop(columns=['PassengerId'])), columns=['Survived'], dtype='int64')
xgboost_model = pd.concat([test.PassengerId, y_pred_xgboost], axis=1)

## LightGBM Classifier

In [None]:
lgboost = lgb.LGBMClassifier()

In [None]:
lgboost.fit(features, target)

Make Prediction for test data

In [None]:
y_pred_lgboost = pd.DataFrame(lgboost.predict(
    test.drop(columns=['PassengerId'])), columns=['Survived'], dtype='int64')
lgboost_model = pd.concat([test.PassengerId, y_pred_lgboost], axis=1)

## Catboost Classifier

In [None]:
ctboost = ctb.CatBoostClassifier(iterations=200, learning_rate=0.1, depth=10)

In [None]:
ctboost.fit(features, target)

Make Prediction for test data

In [None]:
y_pred_ctboost = pd.DataFrame(ctboost.predict(
    test.drop(columns=['PassengerId'])), columns=['Survived'], dtype='int64')
ctboost_model = pd.concat([test.PassengerId, y_pred_ctboost], axis=1)

## Voting Classifier

In [None]:
# Soft-voting ensemble over all the models trained above.
# For the tuned models the already-selected `best_estimator_` is passed rather
# than the search object itself. VotingClassifier re-fits a copy of every
# estimator it is given, so passing the GridSearchCV / RandomizedSearchCV
# objects would repeat every hyper-parameter search inside `v.fit`.
v = VotingClassifier(estimators=[
    ('lr', lr),
    ('NB', gs_nb.best_estimator_),
    ('KNN', gs_knn.best_estimator_),
    ('SVM', rs_svm.best_estimator_),
    ('DT', gs_dt.best_estimator_),
    ('RF', rs_rf.best_estimator_),
    ('BG', gs_bg.best_estimator_),
    ('AdaBoost', gs_ada.best_estimator_),
    ('GBM', gs_gboost.best_estimator_),
    ('XGBM', gs_xgb.best_estimator_),
    ('LightGBM', lgboost),
    ('CatBoost', ctboost)],
    voting='soft')

In [None]:
v.fit(features, target)

In [None]:
joblib.dump(v, 'votingclassifier.pkl')

In [None]:
v = joblib.load('votingclassifier.pkl')

In [None]:
# ROC curve of the voting ensemble over all decision thresholds.
# Column 1 of predict_proba holds the probability estimates of the positive class.
# NOTE(review): the scores are computed on `features`, the same rows `v` was
# fit on, so this ROC and AUC are in-sample and likely optimistic.
y_scores_v = v.predict_proba(features)[:, 1]
v_fpr, v_tpr, v_thresholds = roc_curve(target, y_scores_v)
# Finding the AUC for the Voting classification model.
v_auc = auc(x=v_fpr, y=v_tpr)

In [None]:
print('Area Under Curve: {}'.format(v_auc))

Make Prediction for test data

In [None]:
y_pred_v = pd.DataFrame(v.predict(test.drop(columns=['PassengerId'])), columns=[
                        'Survived'], dtype='int64')

In [None]:
v_model = pd.concat([test.PassengerId, y_pred_v], axis=1)

## Stacking

In [None]:
# Hold out 30% of the training rows; the base models are fit on the remaining
# 70% and the stacking meta-model is fit on their predictions for the hold-out.
# `stratify` keeps the Survived class ratio the same in both parts, and the
# fixed `random_state` makes the split, and so the stacking results, repeatable.
x_train, x_validate, y_train, y_validate = train_test_split(
    features, target, test_size=0.3, stratify=target, random_state=42)

In [None]:
# Base (level-0) models for stacking, built with fixed hyper-parameters.
# NOTE(review): this cell rebinds `lr`, `nb`, `knn`, `svm`, `bg`, `gboost`,
# `xgboost`, `lgboost`, `ctboost` and `rf`, which earlier sections already use
# for other model objects. After this cell runs those names refer to the
# stacking base models, so re-running an earlier cell out of order will use
# the wrong object.
lr = LogisticRegressionCV(cv=kf)
nb = GaussianNB()
knn = KNeighborsClassifier(
    n_neighbors=14, leaf_size=20, p=1, weights='uniform')
svm = SVC(kernel='rbf', gamma=0.1, degree=1, C=500, shrinking=True)
# `gb` and `gboost` are both gradient boosting models with different settings.
gb = GradientBoostingClassifier(n_estimators=200, learning_rate=0.5)
adab = AdaBoostClassifier(n_estimators=500, learning_rate=0.7)
bg = BaggingClassifier(n_estimators=100)
gboost = GradientBoostingClassifier(
    validation_fraction=0.1, n_iter_no_change=20, tol=0.005)
xgboost = xgb.XGBClassifier()
lgboost = lgb.LGBMClassifier()
ctboost = ctb.CatBoostClassifier(iterations=200, learning_rate=0.1, depth=10)
rf = RandomForestClassifier()

In [None]:
lr.fit(x_train, y_train)
nb.fit(x_train, y_train)
knn.fit(x_train, y_train)
svm.fit(x_train, y_train)
gb.fit(x_train, y_train)
adab.fit(x_train, y_train)
bg.fit(x_train, y_train)
gboost.fit(x_train, y_train)
xgboost.fit(x_train, y_train)
lgboost.fit(x_train, y_train)
ctboost.fit(x_train, y_train)
rf.fit(x_train, y_train)

In [None]:
pred1 = lr.predict(x_validate)
pred2 = nb.predict(x_validate)
pred3 = knn.predict(x_validate)
pred4 = svm.predict(x_validate)
pred5 = gb.predict(x_validate)
pred6 = adab.predict(x_validate)
pred7 = bg.predict(x_validate)
pred8 = gboost.predict(x_validate)
pred9 = xgboost.predict(x_validate)
pred10 = lgboost.predict(x_validate)
pred11 = ctboost.predict(x_validate)
pred12 = rf.predict(x_validate)

In [None]:
# Base-model predictions for the test set; stacked together they become the
# meta-model's inputs at prediction time.
# PassengerId is dropped once up front instead of once per model.
test_features = test.drop(columns=['PassengerId'])
test_pred1 = lr.predict(test_features)
test_pred2 = nb.predict(test_features)
test_pred3 = knn.predict(test_features)
test_pred4 = svm.predict(test_features)
test_pred5 = gb.predict(test_features)
test_pred6 = adab.predict(test_features)
test_pred7 = bg.predict(test_features)
test_pred8 = gboost.predict(test_features)
test_pred9 = xgboost.predict(test_features)
test_pred10 = lgboost.predict(test_features)
test_pred11 = ctboost.predict(test_features)
test_pred12 = rf.predict(test_features)

In [None]:
# One column per base model, one row per validation sample.
validation_columns = (
    pred1, pred2, pred3, pred4, pred5, pred6,
    pred7, pred8, pred9, pred10, pred11, pred12,
)
stacked_predictions = np.column_stack(validation_columns)

In [None]:
# One column per base model, one row per test sample, in the same model order
# as `stacked_predictions`.
test_columns = (
    test_pred1, test_pred2, test_pred3, test_pred4, test_pred5, test_pred6,
    test_pred7, test_pred8, test_pred9, test_pred10, test_pred11, test_pred12,
)
stacked_test_predictions = np.column_stack(test_columns)

In [None]:
# Meta model
meta_model = LogisticRegressionCV(cv=kf)

In [None]:
meta_model.fit(stacked_predictions, y_validate)

Make predictions for test data

In [None]:
y_pred_stack = pd.DataFrame(meta_model.predict(
    stacked_test_predictions), columns=['Survived'], dtype='int64')

In [None]:
stack_model = pd.concat([test.PassengerId, y_pred_stack], axis=1)

# Models Comparison

## Model scores

In [None]:
# Summary table with one row per model: (display name, AUC, accuracy).
# Each name sits next to its values so a row label cannot drift out of line
# with its numbers.
score_rows = [
    ('Logistic Regression', lr_auc, lr_acc),
    ('Naive Bayes', nb_auc, nb_acc),
    ('KNN', knn_auc, knn_acc),
    ('Decision Tree', dt_auc, dt_acc),
    ('Random Forest', rf_auc, rf_acc),
    ('SVM', svm_auc, svm_acc),
    ('Bagging', bg_auc, bg_acc),
    ('AdaBoost', ada_auc, ada_acc),
    ('Gradient Boosting', gboost_auc, gboost_acc),
    # No accuracy was computed for the voting ensemble. NaN (rather than a
    # string placeholder) keeps the Accuracy column numeric.
    ('Voting', v_auc, np.nan),
]
pd.DataFrame([row[1:] for row in score_rows],
             columns=['AUC', 'Accuracy'],
             index=[row[0] for row in score_rows])

## Plotting the ROC curve

In [None]:
# Overlay the ROC curve of every model that has one, plus the chance diagonal.
# Each entry: (legend prefix, false-positive rates, true-positive rates, AUC, colour).
roc_curves = [
    ('LR', lr_fpr, lr_tpr, lr_auc, 'b'),
    ('NB', nb_fpr, nb_tpr, nb_auc, 'g'),
    ('KNN', knn_fpr, knn_tpr, knn_auc, 'orange'),
    ('SVM', svm_fpr, svm_tpr, svm_auc, 'y'),
    ('DT', dt_fpr, dt_tpr, dt_auc, 'brown'),
    ('RF', rf_fpr, rf_tpr, rf_auc, 'grey'),
    ('BG', bg_fpr, bg_tpr, bg_auc, 'black'),
    ('Ada', ada_fpr, ada_tpr, ada_auc, 'pink'),
    ('GBM', gboost_fpr, gboost_tpr, gboost_auc, 'c'),
    ('Voting', v_fpr, v_tpr, v_auc, 'purple'),
]

plt.figure(figsize=(8, 5))
for curve_name, curve_fpr, curve_tpr, curve_auc, curve_color in roc_curves:
    plt.plot(curve_fpr, curve_tpr, color=curve_color,
             label='%s_AUC = %0.2f' % (curve_name, curve_auc))
# The legend is built before the diagonal is drawn, so the unlabeled
# chance line never appears in it.
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.title('ROC Curve')
plt.show()