In [None]:
import sklearn.metrics
from sklearn.model_selection import StratifiedKFold, validation_curve ,cross_val_score, cross_val_predict, RandomizedSearchCV, train_test_split
import sklearn.feature_selection
import catboost as ctb
import lightgbm as lgb
from sklearn.externals import joblib
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
import sklearn.ensemble as ens
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegressionCV, PassiveAggressiveClassifier, LogisticRegression
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')


%matplotlib inline

In [None]:
# Training data; it must contain the 'Survived' label column that is split off below.
train = pd.read_csv('train_modified.csv')

In [None]:
# Test data the fitted models are asked to predict on.
test = pd.read_csv('test_modified.csv')

In [None]:
# Identifier table; it is concatenated column-wise with predictions to build submissions.
passengerID = pd.read_csv('ID.csv')

In [None]:
# Predictor matrix (every column except the label) and the label vector.
features = train.drop(columns=['Survived'])
target = train.Survived

# Classification Models

##  Logistic Regression Model - Baseline model

### Train/Validation set

First, we want to tune our model such that we minimize the variance, which is sensitivity of the prediction score to the change in training set, so, we will use cross-validation. We will use the validation curve to help us choose the best number of folds.

In [None]:
# Treat the number of internal CV folds of LogisticRegressionCV as the
# hyperparameter under study: validation_curve sets `cv` on a clone of `lr`
# for every value in `param_range` and scores it with its own default splitter.
lr = LogisticRegressionCV(Cs=10, scoring='accuracy', max_iter=3000, refit=True)
param_name = 'cv'
param_range = list(range(3, 21))
# param_name / param_range are passed by keyword: newer scikit-learn releases
# declare them keyword-only, so the positional form raises a TypeError there.
train_score, test_score = validation_curve(
    lr, features, target, param_name=param_name, param_range=param_range, cv=None)

In [None]:
train_score_mean = np.mean(train_score, axis= 1)
test_score_mean = np.mean(test_score, axis= 1)

# Plot number of folds VS. cross-validated scores for training and Validation sets.
plt.figure()
plt.xlabel("Number of folds")
plt.ylabel("Cross validated accuracy score")
plt.plot(np.arange(3,21), train_score_mean)
plt.plot(np.arange(3,21), test_score_mean, color = 'red')

In [None]:
train_test_diff = train_score_mean - test_score_mean

# Plot number of folds VS. difference of cross-validated scores between train and Dev sets.
plt.figure()
plt.xlabel("Number of folds")
plt.ylabel("Diff. Cross validated accuracy score")
plt.plot(np.arange(3,21), train_test_diff)

It seems that the minimum variance is obtained at K = 7 folds.

Fit the model using the best CV value.

In [None]:
logistic_reg = LogisticRegressionCV(
    Cs=10, cv= 7, scoring='accuracy', max_iter=3000, refit=True)

In [None]:
logistic_reg.fit(features, target)

In [None]:
# Finding the ROC curve for different threshold values.
# probability estimates of the positive class.
y_scores_lr = logistic_reg.predict_proba(features)[:, 1]
lr_fpr, lr_tpr, lr_thresholds = sklearn.metrics.roc_curve(target, y_scores_lr)

In [None]:
# Finding the AUC for the logistic classification model.
lr_auc = sklearn.metrics.auc(x=lr_fpr, y=lr_tpr)

In [None]:
lr_acc = np.mean(logistic_reg.scores_[1])

In [None]:
print('Area Under Curve: {}, Accuracy: {}'.format(lr_auc, lr_acc))

### Feature selection

Here, we will try different methods to select the features with the highest explanatory power. We will try the following methods and then select the best one:
1. VarianceThreshold
2. SelectKBest
3. RFECV
4. SelectFromModel

#### VarianceThreshold method

In [None]:
threshold = np.arange(1, 10, 0.5) *1e-1

In [None]:
scores = []
for i in threshold:
    selector = sklearn.feature_selection.VarianceThreshold(threshold= i)
    selected_features = selector.fit_transform(features)
    logistic_reg.fit(selected_features, target)
    y_pred = logistic_reg.predict(selected_features)
    scores.append(sklearn.metrics.accuracy_score(target, y_pred))

# Plot variance threshold VS. cross-validated scores for training sets.
plt.figure()
plt.xlabel("variance threshold")
plt.ylabel("Cross validated accuracy score")
plt.plot(np.arange(1, 10, 0.5) *1e-1, np.array(scores))

In [None]:
np.max(np.array(scores))

The highest accuracy is obtained after excluding features whose variance is less than 0.1.

#### SelectKbest method

In [None]:
number_of_features = list(range(1,17))

In [None]:
scores_k = []
for i in number_of_features:
    selector = sklearn.feature_selection.SelectKBest(k=i)
    selected_features = selector.fit_transform(features, target)
    logistic_reg.fit(selected_features, target)
    y_pred = logistic_reg.predict(selected_features)
    scores_k.append(sklearn.metrics.accuracy_score(target, y_pred))

# Plot number of selected features VS. cross-validated scores for training sets.
plt.figure()
plt.xlabel("Number of Selected Features")
plt.ylabel("Cross validated accuracy score")    
plt.plot(list(range(1,17)), scores_k)

In [None]:
print("Maximum accuracy score is :", max(scores_k))

In [None]:
print("Optimal number of features :", np.argmax(np.array(scores_k)) + 1)

The highest accuracy score is obtained after selecting the best 14 features.

#### RFECV method

In [None]:
selector = sklearn.feature_selection.RFECV(logistic_reg, step= 1, cv= 5)
selector.fit(features, target)

# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(selector.grid_scores_) + 1), selector.grid_scores_)

In [None]:
print("Optimal number of features : %d" % selector.n_features_)

In [None]:
print("Maximum accuracy score is :", np.max(selector.grid_scores_))

#### SelectFromModel method

In [None]:
threshold = np.arange(1, 5, 0.1) *1e-1

In [None]:
scores_sfm = []
for i in threshold:
    selector = sklearn.feature_selection.SelectFromModel(logistic_reg, threshold= i)
    selector.fit(features, target)
    selected_features = features.loc[:, selector.get_support()]
    logistic_reg.fit(selected_features, target)
    y_pred = logistic_reg.predict(selected_features)
    scores_sfm.append(sklearn.metrics.accuracy_score(target, y_pred))

# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Threshold Value")
plt.ylabel("Cross validation score")    
plt.plot(np.arange(1, 5, 0.1) *1e-1, scores_sfm)

In [None]:
print("Maximum accuracy score is :", np.max(np.array(scores_sfm)))

In [None]:
print("Optimal threshold :", threshold[np.argmax(np.array(scores_sfm))])

We conclude the best feature selection method is SelectFromModel with threshold = 0.28.

In [None]:
# Reuse the threshold that won the sweep instead of a hand-copied literal: the
# literal used here (0.25) disagreed with the optimum stated in the text (0.28).
best_threshold = threshold[int(np.argmax(scores_sfm))]
selector = sklearn.feature_selection.SelectFromModel(
    logistic_reg, threshold= best_threshold)
selector.fit(features, target)
selected_features = features.loc[:, selector.get_support()]
logistic_reg.fit(selected_features, target)

Make prediction for test data

In [None]:
y_pred_nb = pd.DataFrame(logistic_reg.predict(
    test.loc[:, selector.get_support()]), columns=['Survived'], dtype='int64')

In [None]:
lr_model = pd.concat([passengerID, y_pred_nb], axis=1)

In [None]:
lr_model.to_csv('logistic.csv', index= False)

## Gaussian Naive Bayes Model

In [None]:
nb = GaussianNB()

In [None]:
nb_params = {'priors': [[0.7, 0.3], [0.6, 0.4],
                        [0.5, 0.5], [0.4, 0.6], [0.3, 0.7]]}

In [None]:
gs_nb = GridSearchCV(nb, param_grid=nb_params,
                     scoring='accuracy', cv=kf, refit=True)

In [None]:
gs_nb.fit(features, target)

In [None]:
# Finding the ROC curve for different threshold values.
# probability estimates of the positive class.
y_scores_nb = gs_nb.predict_proba(features)[:, 1]
nb_fpr, nb_tpr, nb_thresholds = roc_curve(target, y_scores_nb)

In [None]:
# Finding the AUC for the naive bayes classification model.
nb_auc = auc(x=nb_fpr, y=nb_tpr)

In [None]:
nb_acc = gs_nb.best_score_

In [None]:
print('Area Under Curve: {}, Accuracy: {}'.format(nb_auc, nb_acc))

Make prediction for test data

In [None]:
y_pred_nb = pd.DataFrame(gs_nb.predict(
    test.drop(columns=['PassengerId'])), columns=['Survived'], dtype='int64')

In [None]:
nb_model = pd.concat([test.PassengerId, y_pred_nb], axis=1)

## KNN Classification Model

### Train/Validation set

First, we want to tune our model such that we minimize the variance, which is sensitivity of the prediction score to the change in training set, so, we will use cross-validation. We will use the validation curve to help us choose the best number of neighbours (K).

In [None]:
# Validation curve over the number of neighbours (3..20), scored with 10-fold CV.
knn = KNeighborsClassifier()
param_name = 'n_neighbors'
param_range = list(range(3, 21))
# Keyword form: newer scikit-learn releases make param_name/param_range
# keyword-only, so passing param_name positionally raises there.
train_score, test_score = validation_curve(
    knn, features, target, param_name=param_name, param_range=param_range, cv= 10)

In [None]:
train_score_mean = np.mean(train_score, axis= 1)
test_score_mean = np.mean(test_score, axis= 1)

# Plot number of neighbours VS. cross-validated scores for training and Validation sets.
plt.figure()
plt.xlabel("Number of neighbours")
plt.ylabel("Cross validated accuracy score")
plt.plot(np.arange(3,21), train_score_mean, color = 'blue')
plt.plot(np.arange(3,21), test_score_mean, color = 'red')

In [None]:
train_test_diff = train_score_mean - test_score_mean

# Plot number of folds VS. difference of cross-validated scores between train and Dev sets.
plt.figure()
plt.xlabel("Number of neighbours")
plt.ylabel("Diff. Cross validated accuracy score")
plt.plot(np.arange(3,21), train_test_diff)

It seems that the minimum variance is obtained at number of neighbours K = 16.

### Feature selection

Here, we will try different methods to select the features with the highest explanatory power. We will try the following methods and then select the best one:
1. VarianceThreshold
2. SelectKBest
3. RFECV
4. SelectFromModel

#### VarianceThreshold method

In [None]:
threshold = [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]

In [None]:
scores = []
for i in threshold:
    selector = sklearn.feature_selection.VarianceThreshold(threshold= i)
    selected_features = selector.fit_transform(features)
    knn.fit(selected_features, target)
    y_pred = knn.predict(selected_features)
    scores.append(sklearn.metrics.accuracy_score(target, y_pred))

# Plot variance threshold VS. cross-validated scores for training sets.
plt.figure()
plt.xlabel("variance threshold")
plt.ylabel("Cross validated accuracy score")
plt.plot([0.001, 0.005, 0.01, 0.05, 0.1, 0.2], np.array(scores))

In [None]:
np.max(np.array(scores))

#### SelectKbest method

In [None]:
number_of_features = list(range(1,13))

In [None]:
scores_k = []
for i in number_of_features:
    selector = sklearn.feature_selection.SelectKBest(k=i)
    selected_features = selector.fit_transform(features, target)
    knn.fit(selected_features, target)
    y_pred = knn.predict(selected_features)
    scores_k.append(sklearn.metrics.accuracy_score(target, y_pred))

# Plot number of selected features VS. cross-validated scores for training sets.
plt.figure()
plt.xlabel("Number of Selected Features")
plt.ylabel("Cross validated accuracy score")    
plt.plot(list(range(1,13)), scores_k)

In [None]:
print("Maximum accuracy score is :", max(scores_k))

In [None]:
print("Optimal number of features :", np.argmax(np.array(scores_k)) + 1)

We conclude that the highest accuracy is obtained after excluding features whose variance is less than 0.05.

Fit the model with the selected features.

In [None]:
# Keep only the features whose variance exceeds 0.05 ...
selector = sklearn.feature_selection.VarianceThreshold(threshold=0.05)
selected_features = selector.fit_transform(features)
# ... and fit a 14-neighbour classifier on them.
# NOTE(review): the narrative above reports K = 16 while 14 is used here; confirm.
knn = KNeighborsClassifier(n_neighbors=14)
knn.fit(selected_features, target)

#### KNN hyperparameter tuning

We'll use randomized search to tune the hyperparamters of KNN.

In [None]:
knn_params = {'n_neighbors': [14] , 'weights': [
    'uniform', 'distance'], 'leaf_size': [20, 30, 40, 50, 60], 'p': [1, 2, 3]}

In [None]:
rs_knn = RandomizedSearchCV(knn, param_distributions= knn_params,
                      scoring='accuracy', cv= 20, n_iter= 100, refit=True)

In [None]:
rs_knn.fit(selected_features, target)

In [None]:
rs_knn.best_params_

In [None]:
rs_knn.best_score_

In [None]:
# Finding the ROC curve for different threshold values.
# probability estimates of the positive class.
y_scores_knn = rs_knn.predict_proba(selected_features)[:, 1]
knn_fpr, knn_tpr, knn_thresholds = sklearn.metrics.roc_curve(target, y_scores_knn)

In [None]:
# Finding the AUC for the naive bayes classification model.
knn_auc = sklearn.metrics.auc(x=knn_fpr, y=knn_tpr)

In [None]:
knn_acc = rs_knn.best_score_

In [None]:
print('Area Under Curve: {}, Accuracy: {}'.format(knn_auc, knn_acc))

Make prediction for test data

In [None]:
y_pred_knn = pd.DataFrame(rs_knn.predict(
    test.loc[:, selector.get_support()]), columns=['Survived'], dtype='int64')

In [None]:
knn_model = pd.concat([passengerID, y_pred_knn], axis=1)

In [None]:
knn_model.to_csv('knn.csv', index= False)

## Support Vector Machine Classification model

In [None]:
svm = SVC(probability=True)

In [None]:
svm_params = {'C': [0.1, 1, 10, 100, 500], 'kernel': ['rbf'], 'degree': [
    1, 2, 3, 4], 'gamma': [0.05, 0.1, 1, 5], 'shrinking': [True, False]}

In [None]:
rs_svm = RandomizedSearchCV(svm, param_distributions=svm_params,
                            scoring='accuracy', cv=kf, refit=True, n_iter=2000)

In [None]:
rs_svm.fit(features, target)

In [None]:
# Persist the fitted search object to disk.
joblib.dump(rs_svm, 'svmmodel.pkl')

In [None]:
# Reload it from disk. joblib files are pickles: only load files you created yourself.
rs_svm = joblib.load('svmmodel.pkl')

In [None]:
# Finding the ROC curve for different threshold values.
# probability estimates of the positive class.
y_scores_svm = rs_svm.predict_proba(features)[:, 1]
svm_fpr, svm_tpr, svm_thresholds = roc_curve(target, y_scores_svm)
# Finding the AUC for the SVM classification model.
svm_auc = auc(x=svm_fpr, y=svm_tpr)

In [None]:
svm_acc = rs_svm.best_score_

In [None]:
print('Area Under Curve: {}, Accuracy: {}'.format(svm_auc, svm_acc))

Make Prediction for test data

In [None]:
y_pred_svm = pd.DataFrame(rs_svm.predict(
    test.drop(columns=['PassengerId'])), columns=['Survived'], dtype='int64')

In [None]:
svm_model = pd.concat([test.PassengerId, y_pred_svm], axis=1)

## Decision Tree Classification Model

In [None]:
# Unconstrained decision tree (library defaults) used by the feature-selection sweeps below.
dt = DecisionTreeClassifier()

### Feature selection for decision trees

#### VarianceThreshold method

In [None]:
threshold = [0.001, 0.01,0.1,0.5]

In [None]:
scores = []
for i in threshold:
    selector = sklearn.feature_selection.VarianceThreshold(threshold= i)
    selected_features = selector.fit_transform(features)
    dt.fit(selected_features, target)
    y_pred = dt.predict(selected_features)
    scores.append(sklearn.metrics.accuracy_score(target, y_pred))
plt.plot([0.001, 0.01,0.1,0.5], np.array(scores))

In [None]:
np.max(np.array(scores))

The highest accuracy is obtained after excluding features whose variance is less than 0.1.

#### SelectKbest method

In [None]:
number_of_features = list(range(1,17))

In [None]:
scores_k = []
for i in number_of_features:
    selector = sklearn.feature_selection.SelectKBest(k=i)
    selected_features = selector.fit_transform(features, target)
    dt.fit(selected_features, target)
    y_pred = dt.predict(selected_features)
    scores_k.append(sklearn.metrics.accuracy_score(target, y_pred))
plt.plot(list(range(1,17)), scores_k)

In [None]:
max(scores_k)

In [None]:
print("Optimal number of features :", np.argmax(np.array(scores_k)) + 1)

The highest accuracy score is obtained after selecting the best 12 features.

#### RFECV method

In [None]:
selector = sklearn.feature_selection.RFECV(dt, step= 1, cv= 7)
selector.fit(features, target)

# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(selector.grid_scores_) + 1), selector.grid_scores_)

In [None]:
print("Optimal number of features : %d" % selector.n_features_)

In [None]:
np.max(selector.grid_scores_)

We conclude the VarianceThreshold and SelectKbest methods results in the same accuracy score.

We will use the SelectKbest method with K = 12.

In [None]:
# Fit the 12-best-features selector, then project the training matrix onto it.
selector = sklearn.feature_selection.SelectKBest(k=12).fit(features, target)
selected_features = selector.transform(features)

#### Decision tree hyperparameter tuning

We will use randomized search method and we will follow coarse-to-fine strategy.

In [None]:
dt_params = {'criterion': ['gini'], 'min_samples_split': [
    14, 15,16], 'max_features': ['auto', 'log2', None]}

In [None]:
gs_dt = RandomizedSearchCV(dt, param_distributions= dt_params,
                     scoring='accuracy', cv= StratifiedKFold(7), refit=True, n_iter= 50)

In [None]:
gs_dt.fit(selected_features, target)

In [None]:
gs_dt.best_score_

In [None]:
gs_dt.best_params_

#### Variance check

We will use the max_depth of tree to minimize the variance. We'll use the validation curve to select the best value of max_depth.

In [None]:
# Validation curve over the tree depth (1..10) with 7-fold CV, using the
# split/feature settings chosen above.
dt = DecisionTreeClassifier(min_samples_split= 15, max_features= None)
param_name = 'max_depth'
param_range = list(range(1, 11))
# Keyword form: param_name/param_range are keyword-only on newer scikit-learn.
train_score, test_score = validation_curve(
    dt, selected_features, target, param_name=param_name,
    param_range=param_range, cv= 7)

In [None]:
train_score_mean = np.mean(train_score, axis= 1)
test_score_mean = np.mean(test_score, axis= 1)
plt.plot(np.arange(1,11), train_score_mean)
plt.plot(np.arange(1,11), test_score_mean, color = 'red')

In [None]:
train_test_diff = train_score_mean - test_score_mean
plt.plot(np.arange(1,11), train_test_diff)

From the above graphs, we find that the minimum is at max_depth of 1 but the bias is high. So, we will choose max_depth of 4 because the variance is reasonable and the bias is much lower than that of max_depth = 1.

In [None]:
dt = DecisionTreeClassifier(min_samples_split= 15, max_features= None, max_depth= 4)

In [None]:
dt.fit(selected_features,target)

In [None]:
# Finding the ROC curve for different threshold values.
# probability estimates of the positive class.
y_scores_dt = dt.predict_proba(selected_features)[:, 1]
dt_fpr, dt_tpr, dt_thresholds = sklearn.metrics.roc_curve(target, y_scores_dt)
# Finding the AUC for the Decision Tree classification model.
dt_auc = sklearn.metrics.auc(x=dt_fpr, y=dt_tpr)

In [None]:
dt_acc = gs_dt.best_score_

In [None]:
print('Area Under Curve: {}, Accuracy: {}'.format(dt_auc, dt_acc))

Make Prediction for test data

In [None]:
y_pred_dt = pd.DataFrame(dt.predict(
    test.loc[:,selector.get_support()]), columns=['Survived'], dtype='int64')

In [None]:
dt_model = pd.concat([passengerID, y_pred_dt], axis=1)

In [None]:
dt_model.to_csv('dt.csv', index= False)

## Random Forest Classification Model

In [None]:
rf = RandomForestClassifier()

In [None]:
rf_params = {'n_estimators': [10, 100, 200], 'criterion': ['gini', 'entropy'], 'min_samples_split': [
    2, 5, 10], 'max_features': ['sqrt', 'log2', None], 'class_weight': [{0: 0.6, 1: 0.4}, {0: 0.6, 1: 0.4}]}

In [None]:
rs_rf = RandomizedSearchCV(rf, param_distributions=rf_params,
                           scoring='accuracy', cv=kf, refit=True, n_iter=2000)

In [None]:
rs_rf.fit(features, target)

In [None]:
# Persist the fitted search object to disk.
joblib.dump(rs_rf, 'randomdorestmodel.pkl')

In [None]:
# Reload it from disk. joblib files are pickles: only load files you created yourself.
rs_rf = joblib.load('randomdorestmodel.pkl')

In [None]:
# Finding the ROC curve for different threshold values.
# probability estimates of the positive class.
y_scores_rf = rs_rf.predict_proba(features)[:, 1]
rf_fpr, rf_tpr, rf_thresholds = roc_curve(target, y_scores_rf)
# Finding the AUC for the Random Forest classification model.
rf_auc = auc(x=rf_fpr, y=rf_tpr)

In [None]:
rf_acc = rs_rf.best_score_

In [None]:
print('Area Under Curve: {}, Accuracy: {}'.format(rf_auc, rf_acc))

Make Prediction for test data

In [None]:
y_pred_rf = pd.DataFrame(rs_rf.predict(
    test.drop(columns=['PassengerId'])), columns=['Survived'], dtype='int64')

In [None]:
rf_model = pd.concat([test.PassengerId, y_pred_rf], axis=1)

## Bagging Classification Model

In [None]:
bg = BaggingClassifier()

In [None]:
bg_params = {'n_estimators': [10, 100, 500]}

In [None]:
gs_bg = GridSearchCV(bg, param_grid=bg_params,
                     scoring='accuracy', cv=kf, refit=True)

In [None]:
gs_bg.fit(features, target)

In [None]:
# Finding the ROC curve for different threshold values.
# probability estimates of the positive class.
y_scores_bg = gs_bg.predict_proba(features)[:, 1]
bg_fpr, bg_tpr, bg_thresholds = roc_curve(target, y_scores_bg)
# Finding the AUC for the Bagging classification model.
bg_auc = auc(x=bg_fpr, y=bg_tpr)

In [None]:
bg_acc = gs_bg.best_score_

In [None]:
print('Area Under Curve: {}, Accuracy: {}'.format(bg_auc, bg_acc))

Make Prediction for test data

In [None]:
y_pred_bg = pd.DataFrame(gs_bg.predict(
    test.drop(columns=['PassengerId'])), columns=['Survived'], dtype='int64')

In [None]:
bg_model = pd.concat([test.PassengerId, y_pred_bg], axis=1)

## Adaboost Classifier

In [None]:
ada = AdaBoostClassifier()

In [None]:
ada_params = {'n_estimators': [100, 500, 1000,
                               10000], 'learning_rate': [0.1, 0.5, 0.7, 1]}

In [None]:
gs_ada = GridSearchCV(ada, param_grid=ada_params, cv=kf,
                      scoring='accuracy', refit=True)

In [None]:
gs_ada.fit(features, target)

In [None]:
# Persist the fitted search object to disk.
joblib.dump(gs_ada, 'adaboost.pkl')

In [None]:
# Reload it from disk. joblib files are pickles: only load files you created yourself.
gs_ada = joblib.load('adaboost.pkl')

In [None]:
# Finding the ROC curve for different threshold values.
# probability estimates of the positive class.
y_scores_ada = gs_ada.predict_proba(features)[:, 1]
ada_fpr, ada_tpr, ada_thresholds = roc_curve(target, y_scores_ada)
# Finding the AUC for the AdaBoost classification model.
ada_auc = auc(x=ada_fpr, y=ada_tpr)

In [None]:
ada_acc = gs_ada.best_score_

In [None]:
print('Area Under Curve: {}, Accuracy: {}'.format(bg_auc, bg_acc))

Make Predictions for test data

In [None]:
# Drop PassengerId only if `test` actually has it, and take the identifiers from
# the shared `passengerID` table, as the other submission cells do.
y_pred_ada = pd.DataFrame(gs_ada.predict(
    test.drop(columns=['PassengerId'], errors='ignore')),
    columns=['Survived'], dtype='int64')
ada_model = pd.concat([passengerID, y_pred_ada], axis=1)

## Gradient Boost Classifier

### Train/Validation set

Here, we want to tune our model to minimize the variance, which is sensitivity of the prediction score to the change in training set, so, we will use cross-validation. We will use the validation curve to help us choose the best validation fraction of the model.

In [None]:
# Validation curve over the early-stopping hold-out fraction (0.10 .. 0.50).
# validation_fraction is only consulted when n_iter_no_change is set; with the
# default (None) every point of the curve would fit the very same model.
# NOTE(review): a patience of 10 rounds is an assumption; tune as needed.
gboost = ens.GradientBoostingClassifier(n_iter_no_change=10)
param_name = 'validation_fraction'
param_range = np.arange(1, 5.5, 0.5)*1e-1
# Keyword form: param_name/param_range are keyword-only on newer scikit-learn.
train_score, test_score = validation_curve(
    gboost, features, target, param_name=param_name,
    param_range=param_range, cv=10)

In [None]:
train_score_mean = np.mean(train_score, axis= 1)
test_score_mean = np.mean(test_score, axis= 1)

# Plot validation fraction VS. cross-validated scores for training and Validation sets.
plt.figure()
plt.xlabel("Validation fraction")
plt.ylabel("Cross validated accuracy score")
plt.plot(np.arange(1,5.5, 0.5)*1e-1, train_score_mean, color = 'blue')
plt.plot(np.arange(1,5.5, 0.5)*1e-1, test_score_mean, color = 'red')


In [None]:
train_test_diff = train_score_mean - test_score_mean

# validation fraction VS. difference of cross-validated scores between train and Dev sets.
plt.figure()
plt.xlabel("Validation fraction")
plt.ylabel("Diff. Cross validated accuracy score")
plt.plot(np.arange(1,5.5, 0.5)*1e-1, train_test_diff)

It seems the minimum variance is at validation fraction = 0.45.<br/>
We will choose validation fraction of 0.45.

### Feature selection for Gradient Boost

#### VarianceThreshold method

Note: the following parameter values are leftover notes and are not used by the code below:
`validation_fraction=0.35, n_iter_no_change=150, tol=0.001, random_state=10`

In [None]:
gboost = ens.GradientBoostingClassifier(validation_fraction= 0.45)

In [None]:
threshold = [0.001, 0.01,0.1,0.5]

In [None]:
scores = []
for i in threshold:
    selector = sklearn.feature_selection.VarianceThreshold(threshold= i)
    selected_features = selector.fit_transform(features)
    gboost.fit(selected_features, target)
    y_pred = gboost.predict(selected_features)
    scores.append(sklearn.metrics.accuracy_score(target, y_pred))
plt.plot([0.001, 0.01,0.1,0.5], np.array(scores))

In [None]:
np.max(np.array(scores))

The highest accuracy is obtained after excluding features whose variance is less than 0.001.

#### SelectKbest method

In [None]:
number_of_features = list(range(1,13))

In [None]:
scores_k = []
for i in number_of_features:
    selector = sklearn.feature_selection.SelectKBest(k=i)
    selected_features = selector.fit_transform(features, target)
    gboost.fit(selected_features, target)
    y_pred = gboost.predict(selected_features)
    scores_k.append(sklearn.metrics.accuracy_score(target, y_pred))
plt.plot(list(range(1,13)), scores_k)

In [None]:
max(scores_k)

In [None]:
print("Optimal number of features :", np.argmax(np.array(scores_k)) + 1)

The highest accuracy score is obtained after selecting the best 11 features.

#### RFECV method

In [None]:
selector = sklearn.feature_selection.RFECV(gboost, step= 1, cv= 7)
selector.fit(features, target)

# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(selector.grid_scores_) + 1), selector.grid_scores_)

In [None]:
print("Optimal number of features : %d" % selector.n_features_)

In [None]:
np.max(selector.grid_scores_)

#### SelectFromModel method

In [None]:
threshold = np.arange(1, 10, 0.1) *1e-2

In [None]:
scores_sfm = []
for i in threshold:
    selector = sklearn.feature_selection.SelectFromModel(gboost, threshold= i)
    selector.fit(features, target)
    selected_features = features.loc[:, selector.get_support()]
    gboost.fit(selected_features, target)
    y_pred = gboost.predict(selected_features)
    scores_sfm.append(sklearn.metrics.accuracy_score(target, y_pred))

# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Threshold Value")
plt.ylabel("Cross validation score")    
plt.plot(np.arange(1, 10, 0.1) *1e-2, scores_sfm)

In [None]:
print("Maximum accuracy score is :", np.max(np.array(scores_sfm)))

In [None]:
print("Optimal threshold :", threshold[np.argmax(np.array(scores_sfm))])

We conclude that SelectFromModel method results in the highest accuracy score with threshold = 0.018.

In [None]:
# Keep the features whose importance reaches 0.018 and refit the model on them.
selector = sklearn.feature_selection.SelectFromModel(gboost, threshold=0.018)
selector.fit(features, target)
selected_features = selector.transform(features)
gboost.fit(selected_features, target)

### GB hyperparameter tuning

Again, we will use randomized search and we'll follow a coarse to fine strategy.

In [None]:
gboost_params = {'learning_rate': [0.1 , 0.2, 0.25 ], 'n_estimators': [
    50, 100, 200], 'max_features': [None, 'log2', 'sqrt'],
                'loss': ['deviance', 'exponential']}

In [None]:
rs_gboost = RandomizedSearchCV(gboost, param_distributions= gboost_params,
                         cv= 10, scoring='accuracy', refit=True, n_iter= 100)

In [None]:
rs_gboost.fit(selected_features, target)

In [None]:
rs_gboost.best_params_

In [None]:
# Finding the ROC curve for different threshold values.
# probability estimates of the positive class.
y_scores_gboost = rs_gboost.predict_proba(selected_features)[:, 1]
gboost_fpr, gboost_tpr, gboost_thresholds = sklearn.metrics.roc_curve(
    target, y_scores_gboost)
# Finding the AUC for the Gradient Boost classification model.
gboost_auc = sklearn.metrics.auc(x=gboost_fpr, y=gboost_tpr)

In [None]:
gboost_acc = rs_gboost.best_score_

In [None]:
print('Area Under Curve: {}, Accuracy: {}'.format(gboost_auc, gboost_acc))

Make Predictions for test data

In [None]:
y_pred_gboost = pd.DataFrame(rs_gboost.predict(
    test.loc[:,selector.get_support()]), columns=['Survived'], dtype='int64')
gboost_model = pd.concat([passengerID, y_pred_gboost], axis=1)

In [None]:
gboost_model.to_csv('gboost.csv', index= False)

## XGBoost Classifier

### Feature selection for XGBoost

#### VarianceThreshold method

In [None]:
xgboost = xgb.XGBClassifier()

In [None]:
threshold = [0.001, 0.01,0.1,0.5]

In [None]:
scores = []
for i in threshold:
    selector = sklearn.feature_selection.VarianceThreshold(threshold= i)
    selected_features = selector.fit_transform(features)
    xgboost.fit(selected_features, target)
    y_pred = xgboost.predict(selected_features)
    scores.append(sklearn.metrics.accuracy_score(target, y_pred))
plt.plot([0.001, 0.01,0.1,0.5], np.array(scores))

In [None]:
np.max(np.array(scores))

The highest accuracy is obtained after excluding features whose variance is less than 0.001.

#### SelectKbest method

In [None]:
number_of_features = list(range(1,13))

In [None]:
scores_k = []
for i in number_of_features:
    selector = sklearn.feature_selection.SelectKBest(k=i)
    selected_features = selector.fit_transform(features, target)
    xgboost.fit(selected_features, target)
    y_pred = xgboost.predict(selected_features)
    scores_k.append(sklearn.metrics.accuracy_score(target, y_pred))
plt.plot(list(range(1,13)), scores_k)

In [None]:
max(scores_k)

In [None]:
print("Optimal number of features :", np.argmax(np.array(scores_k)) + 1)

The highest accuracy score is obtained after selecting the best 11 features.

#### RFECV method

In [None]:
selector = sklearn.feature_selection.RFECV(xgboost, step= 1, cv= 7)
selector.fit(features, target)

# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(selector.grid_scores_) + 1), selector.grid_scores_)

In [None]:
print("Optimal number of features : %d" % selector.n_features_)

In [None]:
np.max(selector.grid_scores_)

#### SelectFromModel method

In [None]:
threshold = np.arange(1, 10, 0.1) *1e-2

In [None]:
scores_sfm = []
for i in threshold:
    selector = sklearn.feature_selection.SelectFromModel(xgboost, threshold= i)
    selector.fit(features, target)
    selected_features = features.loc[:, selector.get_support()]
    xgboost.fit(selected_features, target)
    y_pred = xgboost.predict(selected_features)
    scores_sfm.append(sklearn.metrics.accuracy_score(target, y_pred))

# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Threshold Value")
plt.ylabel("Cross validation score")    
plt.plot(np.arange(1, 10, 0.1) *1e-2, scores_sfm)

In [None]:
print("Maximum accuracy score is :", np.max(np.array(scores_sfm)))

In [None]:
print("Optimal threshold :", threshold[np.argmax(np.array(scores_sfm))])

We conclude that SelectKBest method results in the highest accuracy score with K = 12, which is the total number of features.

Fit the model with all features

In [None]:
# Refit XGBoost on the full, unreduced feature matrix.
xgboost.fit(features, target)

In [None]:
# Accuracy on the training rows themselves (an optimistic, in-sample figure).
xgboost.score(features, target)

Make Predictions for test data

In [None]:
y_pred_xgboost = pd.DataFrame(xgboost.predict(test), columns=['Survived'], dtype='int64')
xgboost_model = pd.concat([passengerID, y_pred_xgboost], axis=1)

In [None]:
xgboost_model.to_csv('xgb.csv', index= False)

## LightGBM Classifier

### Feature selection for LightGBM

#### VarianceThreshold method

In [None]:
lgboost = lgb.LGBMClassifier()

In [None]:
threshold = [0.001, 0.01,0.1,0.5]

In [None]:
scores = []
for i in threshold:
    selector = sklearn.feature_selection.VarianceThreshold(threshold= i)
    selected_features = selector.fit_transform(features)
    lgboost.fit(selected_features, target)
    y_pred = lgboost.predict(selected_features)
    scores.append(sklearn.metrics.accuracy_score(target, y_pred))
plt.plot([0.001, 0.01,0.1,0.5], np.array(scores))

In [None]:
np.max(np.array(scores))

The highest accuracy is obtained after excluding features whose variance is less than 0.001.

#### SelectKbest method

In [None]:
number_of_features = list(range(1,13))

In [None]:
scores_k = []
for i in number_of_features:
    selector = sklearn.feature_selection.SelectKBest(k=i)
    selected_features = selector.fit_transform(features, target)
    lgboost.fit(selected_features, target)
    y_pred = lgboost.predict(selected_features)
    scores_k.append(sklearn.metrics.accuracy_score(target, y_pred))
plt.plot(list(range(1,13)), scores_k)

In [None]:
max(scores_k)

In [None]:
# scores_k[0] corresponds to k = 1, hence the + 1 to turn the index into k.
print("Optimal number of features :", np.argmax(np.array(scores_k)) + 1)

The highest accuracy score is obtained after selecting the best 11 features.

#### RFECV method

In [None]:
# Recursive feature elimination with 7-fold cross-validation: LightGBM ranks
# the features and one feature is removed per step.
selector = sklearn.feature_selection.RFECV(estimator=lgboost, step=1, cv=7)
selector.fit(features, target)

# Plot number of features VS. cross-validation scores
rfecv_scores = selector.grid_scores_
feature_counts = range(1, len(rfecv_scores) + 1)
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(feature_counts, rfecv_scores)

In [None]:
# Number of features retained by the fitted RFECV selector.
print("Optimal number of features : %d" % selector.n_features_)

In [None]:
# Best cross-validation score recorded by RFECV over all feature counts.
np.max(selector.grid_scores_)

#### SelectFromModel method

In [None]:
# Candidate importance thresholds swept by the SelectFromModel loop below
# (rebinds the `threshold` list used earlier for VarianceThreshold).
threshold = [0.001, 0.01, 0.05, 0.1 , 0.5]

In [None]:
# Accuracy of LightGBM on the features kept by SelectFromModel at each
# importance threshold.
# NOTE: every score is measured on the same rows the model was fitted on, so
# it is a training accuracy; the axis label below says so explicitly.
# NOTE(review): LightGBM's default importances are split counts, so these
# sub-1 thresholds may all keep the same set of features -- confirm.
scores_sfm = []
for importance_cutoff in threshold:
    selector = sklearn.feature_selection.SelectFromModel(lgboost, threshold=importance_cutoff)
    selector.fit(features, target)
    selected_features = features.loc[:, selector.get_support()]
    lgboost.fit(selected_features, target)
    y_pred = lgboost.predict(selected_features)
    scores_sfm.append(sklearn.metrics.accuracy_score(target, y_pred))

# Plot threshold value VS. training accuracy, using the thresholds that were
# actually swept rather than a repeated literal list.
plt.figure()
plt.xlabel("Threshold Value")
plt.ylabel("Training accuracy score")
plt.plot(threshold, scores_sfm)

In [None]:
# Best score reached over all evaluated importance thresholds.
print("Maximum accuracy score is :", np.max(np.array(scores_sfm)))

In [None]:
# np.argmax returns the first maximum, so ties resolve to the smallest threshold.
print("Optimal threshold :", threshold[np.argmax(np.array(scores_sfm))])

We conclude that SelectKBest method results in the highest accuracy score with K = 11.

In [None]:
# Keep the 11 highest-scoring features (SelectKBest) and refit LightGBM on
# that reduced training matrix.
selector = sklearn.feature_selection.SelectKBest(k=11)
selector.fit(features, target)
selected_features = selector.transform(features)
lgboost.fit(selected_features, target)

Make Prediction for test data

In [None]:
# Apply the fitted selector's column mask to the test frame, predict, and
# place the labels next to the passenger IDs.
kept_columns = selector.get_support()
lgboost_test_labels = lgboost.predict(test.loc[:, kept_columns])
y_pred_lgboost = pd.DataFrame(
    lgboost_test_labels, columns=['Survived'], dtype='int64')
lgboost_model = pd.concat([passengerID, y_pred_lgboost], axis=1)

In [None]:
# Write the ID/prediction table to disk without the pandas row index.
lgboost_model.to_csv('lgboost.csv', index= False)

## Catboost Classifier

### Feature selection for CatBoost

#### VarianceThreshold method

In [None]:
# CatBoost classifier with library-default hyperparameters.
ctboost = ctb.CatBoostClassifier()

In [None]:
# Candidate variance cut-offs swept by the VarianceThreshold loop below.
threshold = [0.001, 0.01,0.1,0.5]

In [None]:
# Accuracy of CatBoost after dropping features whose variance is below each
# candidate cut-off.
# NOTE: every score is measured on the same rows the model was fitted on, so
# it is a training accuracy, not a cross-validated estimate.
scores = []
for variance_cutoff in threshold:
    selector = sklearn.feature_selection.VarianceThreshold(threshold=variance_cutoff)
    selected_features = selector.fit_transform(features)
    ctboost.fit(selected_features, target)
    y_pred = ctboost.predict(selected_features)
    scores.append(sklearn.metrics.accuracy_score(target, y_pred))

# Plot against the cut-offs that were actually swept (not a repeated literal),
# so the x axis cannot drift out of sync with `threshold`.
plt.figure()
plt.xlabel("Variance threshold")
plt.ylabel("Training accuracy score")
plt.plot(threshold, np.array(scores))

In [None]:
# Highest accuracy reached across the variance cut-offs.
np.max(np.array(scores))

The highest accuracy is obtained after excluding features whose variance is less than 0.001.

#### SelectKbest method

In [None]:
# Candidate values of k for SelectKBest: 1 through 12 inclusive.
number_of_features = list(range(1,13))

In [None]:
# Accuracy of CatBoost when trained on the k best features (SelectKBest with
# its default scoring function) for every candidate k.
# NOTE: every score is measured on the same rows the model was fitted on, so
# it is a training accuracy, not a cross-validated estimate.
scores_k = []
for k_best in number_of_features:
    selector = sklearn.feature_selection.SelectKBest(k=k_best)
    selected_features = selector.fit_transform(features, target)
    ctboost.fit(selected_features, target)
    y_pred = ctboost.predict(selected_features)
    scores_k.append(sklearn.metrics.accuracy_score(target, y_pred))

# Plot against the k values that were actually swept (not a repeated literal).
plt.figure()
plt.xlabel("Number of features selected (k)")
plt.ylabel("Training accuracy score")
plt.plot(number_of_features, scores_k)

In [None]:
# Highest accuracy reached across the candidate k values.
max(scores_k)

In [None]:
# scores_k[0] corresponds to k = 1, hence the + 1 to turn the index into k.
print("Optimal number of features :", np.argmax(np.array(scores_k)) + 1)

The highest accuracy score is obtained after selecting the best 12 features.

#### RFECV method

In [None]:
# Recursive feature elimination with 7-fold cross-validation: CatBoost ranks
# the features and one feature is removed per step.
selector = sklearn.feature_selection.RFECV(estimator=ctboost, step=1, cv=7)
selector.fit(features, target)

# Plot number of features VS. cross-validation scores
rfecv_scores = selector.grid_scores_
feature_counts = range(1, len(rfecv_scores) + 1)
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(feature_counts, rfecv_scores)

In [None]:
# Number of features retained by the fitted RFECV selector.
print("Optimal number of features : %d" % selector.n_features_)

In [None]:
# Best cross-validation score recorded by RFECV over all feature counts.
np.max(selector.grid_scores_)

#### SelectFromModel method

In [None]:
# Candidate importance thresholds swept by the SelectFromModel loop below
# (rebinds the `threshold` list used earlier for VarianceThreshold).
threshold = [0.001, 0.01, 0.05, 0.1 , 0.5]

In [None]:
# Accuracy of CatBoost on the features kept by SelectFromModel at each
# importance threshold.
# This cell belongs to the CatBoost section, so the selector, the refit and the
# predictions all use `ctboost` (previously they used `lgboost` by copy-paste,
# which re-measured LightGBM instead of CatBoost).
# NOTE: every score is measured on the same rows the model was fitted on, so
# it is a training accuracy; the axis label below says so explicitly.
scores_sfm = []
for importance_cutoff in threshold:
    selector = sklearn.feature_selection.SelectFromModel(ctboost, threshold=importance_cutoff)
    selector.fit(features, target)
    selected_features = features.loc[:, selector.get_support()]
    ctboost.fit(selected_features, target)
    y_pred = ctboost.predict(selected_features)
    scores_sfm.append(sklearn.metrics.accuracy_score(target, y_pred))

# Plot threshold value VS. training accuracy, using the thresholds that were
# actually swept rather than a repeated literal list.
plt.figure()
plt.xlabel("Threshold Value")
plt.ylabel("Training accuracy score")
plt.plot(threshold, scores_sfm)

In [None]:
# Best score reached over all evaluated importance thresholds.
print("Maximum accuracy score is :", np.max(np.array(scores_sfm)))

In [None]:
# np.argmax returns the first maximum, so ties resolve to the smallest threshold.
print("Optimal threshold :", threshold[np.argmax(np.array(scores_sfm))])

We conclude that SelectKBest method results in the highest accuracy score with K = 12, which is the total number of features, so the model is fitted with all features.

In [None]:
# Refit on the full training frame, keeping every feature column.
ctboost.fit(features, target)

Make Prediction for test data

In [None]:
# Predict the test-set labels with CatBoost and place them next to the
# passenger IDs.
# The IDs are loaded separately into `passengerID` and the earlier prediction
# cells treat `test` as a pure feature frame, so `PassengerId` is dropped only
# if it happens to be present (errors='ignore') and the IDs come from
# `passengerID`, matching the XGBoost and LightGBM cells.
ctboost_test_features = test.drop(columns=['PassengerId'], errors='ignore')
y_pred_ctboost = pd.DataFrame(
    ctboost.predict(ctboost_test_features), columns=['Survived'], dtype='int64')
ctboost_model = pd.concat([passengerID, y_pred_ctboost], axis=1)

## Voting Classifier

In [None]:
# Soft-voting ensemble over every model tuned above: the predicted class
# probabilities of all members are averaged.
# The class is reached through the `ens` alias (sklearn.ensemble) from the
# notebook's import cell, because the bare name `VotingClassifier` is not
# imported there.
# NOTE(review): soft voting needs predict_proba on every member -- confirm the
# SVM search (`rs_svm`) was built with probability=True.
v = ens.VotingClassifier(
    estimators=[
        ('lr', lr), ('NB', gs_nb), ('KNN', gs_knn), ('SVM', rs_svm), ('DT', gs_dt),
        ('RF', rs_rf), ('BG', gs_bg), ('AdaBoost', gs_ada), ('GBM', gs_gboost),
        ('XGBM', gs_xgb), ('LightGBM', lgboost), ('CatBoost', ctboost)],
    voting='soft')

In [None]:
# Fit every member of the voting ensemble on the full training set.
v.fit(features, target)

In [None]:
# Persist the fitted ensemble so it does not have to be retrained.
joblib.dump(v, 'votingclassifier.pkl')

In [None]:
# Reload the fitted ensemble from disk. joblib.load unpickles the file, so
# only load a file that this notebook itself produced.
v = joblib.load('votingclassifier.pkl')

In [None]:
# Finding the ROC curve for different threshold values.
# probability estimates of the positive class.
y_scores_v = v.predict_proba(features)[:, 1]
v_fpr, v_tpr, v_thresholds = roc_curve(target, y_scores_v)
# Finding the AUC for the Voting classification model.
v_auc = auc(x=v_fpr, y=v_tpr)

In [None]:
# Report the voting ensemble's AUC computed above.
print('Area Under Curve: {}'.format(v_auc))

Make Prediction for test data

In [None]:
# Predict the test-set labels with the voting ensemble.
# `PassengerId` is dropped only if present (errors='ignore'): the earlier
# prediction cells treat `test` as a pure feature frame, in which case an
# unconditional drop would raise KeyError.
voting_test_features = test.drop(columns=['PassengerId'], errors='ignore')
y_pred_v = pd.DataFrame(
    v.predict(voting_test_features), columns=['Survived'], dtype='int64')

In [None]:
# Pair the predictions with the passenger IDs. The IDs come from the
# separately loaded `passengerID` frame, as in the XGBoost and LightGBM cells,
# instead of a `test.PassengerId` column that those cells imply does not exist.
v_model = pd.concat([passengerID, y_pred_v], axis=1)

## Stacking

In [None]:
# Hold out 30% of the training data: base models are fitted on the 70% part
# and their predictions on the held-out part train the meta model.
# A fixed random_state makes the split (and everything downstream) repeatable
# across kernel restarts; stratify keeps the class ratio equal in both parts.
x_train, x_validate, y_train, y_validate = train_test_split(
    features, target, test_size=0.3, random_state=42, stratify=target)

In [None]:
# Base learners for the stacking ensemble.
# The sklearn.ensemble classes are reached through the `ens` alias from the
# import cell, because their bare names are not imported there.
# NOTE: this rebinds `lr`, `xgboost`, `lgboost` and `ctboost` from the earlier
# sections to fresh, unfitted estimators.
lr = LogisticRegressionCV(cv=kf)  # `kf` is the CV splitter defined earlier in the notebook
nb = GaussianNB()
knn = KNeighborsClassifier(
    n_neighbors=14, leaf_size=20, p=1, weights='uniform')
# `degree` only affects the polynomial kernel; it is ignored by 'rbf'.
svm = SVC(kernel='rbf', gamma=0.1, degree=1, C=500, shrinking=True)
gb = ens.GradientBoostingClassifier(n_estimators=200, learning_rate=0.5)
adab = ens.AdaBoostClassifier(n_estimators=500, learning_rate=0.7)
bg = ens.BaggingClassifier(n_estimators=100)
# Second gradient-boosting variant: stops early once the score on a 10%
# internal validation split fails to improve by `tol` for 20 iterations.
gboost = ens.GradientBoostingClassifier(
    validation_fraction=0.1, n_iter_no_change=20, tol=0.005)
xgboost = xgb.XGBClassifier()
lgboost = lgb.LGBMClassifier()
ctboost = ctb.CatBoostClassifier(iterations=200, learning_rate=0.1, depth=10)
rf = ens.RandomForestClassifier()

In [None]:
# Fit every base learner on the training part of the hold-out split, in the
# same order as they were defined.
base_models = (lr, nb, knn, svm, gb, adab, bg, gboost,
               xgboost, lgboost, ctboost, rf)
for base_model in base_models:
    base_model.fit(x_train, y_train)

In [None]:
# Predictions of every base learner on the held-out part; these become the
# meta model's training features (pred1 ... pred12, in model order).
base_models = (lr, nb, knn, svm, gb, adab, bg, gboost,
               xgboost, lgboost, ctboost, rf)
(pred1, pred2, pred3, pred4, pred5, pred6,
 pred7, pred8, pred9, pred10, pred11, pred12) = [
    base_model.predict(x_validate) for base_model in base_models]

In [None]:
# Predictions of every base learner on the test set (test_pred1 ...
# test_pred12, in model order); these are the meta model's inputs at
# prediction time.
# `PassengerId` is dropped once, and only if present (errors='ignore'): the
# earlier prediction cells treat `test` as a pure feature frame, in which case
# an unconditional drop would raise KeyError.
stacking_test_features = test.drop(columns=['PassengerId'], errors='ignore')
test_pred1 = lr.predict(stacking_test_features)
test_pred2 = nb.predict(stacking_test_features)
test_pred3 = knn.predict(stacking_test_features)
test_pred4 = svm.predict(stacking_test_features)
test_pred5 = gb.predict(stacking_test_features)
test_pred6 = adab.predict(stacking_test_features)
test_pred7 = bg.predict(stacking_test_features)
test_pred8 = gboost.predict(stacking_test_features)
test_pred9 = xgboost.predict(stacking_test_features)
test_pred10 = lgboost.predict(stacking_test_features)
test_pred11 = ctboost.predict(stacking_test_features)
test_pred12 = rf.predict(stacking_test_features)

In [None]:
# One column per base learner: the meta model's training matrix.
validation_prediction_columns = (
    pred1, pred2, pred3, pred4, pred5, pred6,
    pred7, pred8, pred9, pred10, pred11, pred12,
)
stacked_predictions = np.column_stack(validation_prediction_columns)

In [None]:
# One column per base learner: the meta model's input matrix for the test set.
test_prediction_columns = (
    test_pred1, test_pred2, test_pred3, test_pred4, test_pred5, test_pred6,
    test_pred7, test_pred8, test_pred9, test_pred10, test_pred11, test_pred12,
)
stacked_test_predictions = np.column_stack(test_prediction_columns)

In [None]:
# Meta model: a cross-validated logistic regression that learns how to
# combine the base learners' predictions (`kf` is defined earlier in the notebook).
meta_model = LogisticRegressionCV(cv=kf)

In [None]:
# Train the meta model on the base learners' held-out predictions.
meta_model.fit(stacked_predictions, y_validate)

Make predictions for test data

In [None]:
# Final stacked prediction: the meta model applied to the base learners'
# test-set predictions.
stacked_test_labels = meta_model.predict(stacked_test_predictions)
y_pred_stack = pd.DataFrame(
    stacked_test_labels, columns=['Survived'], dtype='int64')

In [None]:
# Pair the predictions with the passenger IDs. The IDs come from the
# separately loaded `passengerID` frame, as in the XGBoost and LightGBM cells,
# instead of a `test.PassengerId` column that those cells imply does not exist.
stack_model = pd.concat([passengerID, y_pred_stack], axis=1)

# Models Comparison

## Models score

In [None]:
# Summary table with one row per model: AUC and accuracy.
# The voting ensemble has no accuracy figure, hence the 'NA' placeholder.
model_names = [
    'Logistic Regression', 'Naive Bayes', 'KNN', 'Decision Tree',
    'Random Forest', 'SVM', 'Bagging', 'AdaBoost', 'Voting',
]
model_scores = [
    (lr_auc, lr_acc),
    (nb_auc, nb_acc),
    (knn_auc, knn_acc),
    (dt_auc, dt_acc),
    (rf_auc, rf_acc),
    (svm_auc, svm_acc),
    (bg_auc, bg_acc),
    (ada_auc, ada_acc),
    (v_auc, 'NA'),
]
pd.DataFrame(model_scores, columns=['AUC', 'Accuracy'], index=model_names)

## Plotting the ROC curve

In [None]:
# Overlay the ROC curves of all models on one figure.
# Each entry: (false-positive rates, true-positive rates, colour, legend label).
roc_curves = [
    (lr_fpr, lr_tpr, 'b', 'LR_AUC = %0.2f' % lr_auc),
    (nb_fpr, nb_tpr, 'g', 'NB_AUC = %0.2f' % nb_auc),
    (knn_fpr, knn_tpr, 'orange', 'KNN_AUC = %0.2f' % knn_auc),
    (svm_fpr, svm_tpr, 'y', 'SVM_AUC = %0.2f' % svm_auc),
    (dt_fpr, dt_tpr, 'brown', 'DT_AUC = %0.2f' % dt_auc),
    (rf_fpr, rf_tpr, 'grey', 'RF_AUC = %0.2f' % rf_auc),
    (bg_fpr, bg_tpr, 'black', 'BG_AUC = %0.2f' % bg_auc),
    (ada_fpr, ada_tpr, 'pink', 'Ada_AUC = %0.2f' % ada_auc),
    (v_fpr, v_tpr, 'purple', 'Voting_AUC = %0.2f' % v_auc),
]

plt.figure(figsize=(8, 5))
for curve_fpr, curve_tpr, curve_colour, curve_label in roc_curves:
    plt.plot(curve_fpr, curve_tpr, curve_colour, label=curve_label)
# The legend is built before the diagonal is drawn, so the chance line
# (which has no label) stays out of it.
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
# Single title call: the earlier 'Receiver Operating Characteristic Curve'
# title was always overwritten by this one, so it has been removed.
plt.title('ROC Curve')
plt.show()