In [38]:
# packages
# import libraries 
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from sklearn.neighbors import KNeighborsClassifier



# Introduction


In [39]:
# Load training and test data
train_data = pd.read_csv('../cleaned_data/train_data_simple_clean.csv', index_col = 'PassengerId')
test_data = pd.read_csv('../cleaned_data/test_data_simple_clean.csv', index_col = 'PassengerId')
                         
# Identify target variable and drop it from 
target = train_data['Survived']
X = train_data.drop(['Survived'], axis = 1)

# Preview Data
display(train_data.tail())
display(test_data.head())

Unnamed: 0_level_0,Age,Fare,Parch,Pclass,Sex,SibSp,Embarked_Q,Embarked_S,Survived
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
887,27.0,13.0,0,2,0,0,0,1,0
888,19.0,30.0,0,1,1,0,0,1,1
889,29.881138,23.45,2,3,1,1,0,1,0
890,26.0,30.0,0,1,0,0,0,0,1
891,32.0,7.75,0,3,0,0,1,0,0


Unnamed: 0_level_0,Age,Fare,Parch,Pclass,Sex,SibSp,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
892,34.5,7.8292,0,3,0,0,1,0
893,47.0,7.0,0,3,1,1,0,1
894,62.0,9.6875,0,2,0,0,1,0
895,27.0,8.6625,0,3,0,0,0,1
896,22.0,12.2875,1,3,1,1,0,1


In [40]:
# Partition the dataset in train + validation sets
X_train, X_test, y_train, y_test = train_test_split(X, target, test_size = 0.3)
print("X_train : " + str(X_train.shape))
print("X_test : " + str(X_test.shape))
print("y_train : " + str(y_train.shape))
print("y_test : " + str(y_test.shape))

display(X_train.head())

X_train : (623, 8)
X_test : (268, 8)
y_train : (623,)
y_test : (268,)


Unnamed: 0_level_0,Age,Fare,Parch,Pclass,Sex,SibSp,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
400,28.0,12.65,0,2,1,0,0,1
359,29.881138,7.8792,0,3,1,0,1,0
640,29.881138,16.1,0,3,0,1,0,1
326,36.0,135.6333,0,1,1,0,0,0
740,29.881138,7.8958,0,3,0,0,0,1


# Logistic Regression

In [41]:
# Logistic Regression
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

# get preditictions for training and test set.
y_train_pred = log_reg.predict(X_train)
y_test_pred = log_reg.predict(X_test)


print(accuracy_score(y_test_pred, y_test))


# get predictions
log_reg_preds = log_reg.predict(test_data)



log_reg_submission = pd.DataFrame({'PassengerId':test_data.index, 'Survived':log_reg_preds})
log_reg_submission = log_reg_submission.set_index('PassengerId')

print(log_reg_submission.head())

log_reg_submission.to_csv('../submissions/logistic_regression_simple.csv')

0.8171641791044776
             Survived
PassengerId          
892                 0
893                 0
894                 0
895                 0
896                 1


In [42]:
# logistic regression with grid search 

grid={"C":np.logspace(-3,3,7), "penalty":["l1","l2"]}# l1 lasso l2 ridge
logreg=LogisticRegression()
logreg_cv=GridSearchCV(logreg, grid, cv=5)
logreg_cv.fit(X_train,y_train)

# get preditictions for training and test set.
y_train_pred = logreg_cv.predict(X_train)
y_test_pred = logreg_cv.predict(X_test)


print(accuracy_score(y_train_pred, y_train))
print(logreg_cv.best_params_)


# get predictions
logreg_cv_preds = logreg_cv.predict(test_data)


logreg_cv_submission = pd.DataFrame({'PassengerId':test_data.index, 'Survived':logreg_cv_preds})
logreg_cv_submission = logreg_cv_submission.set_index('PassengerId')

print(logreg_cv_submission.head())

logreg_cv_submission.to_csv('../submissions/logistic_regression_cv.csv')

0.7945425361155698
{'C': 0.1, 'penalty': 'l2'}
             Survived
PassengerId          
892                 0
893                 0
894                 0
895                 0
896                 1


# Decision Trees

In [43]:
decision_tree = DecisionTreeClassifier(max_depth= 4)

decision_tree.fit(X_train, y_train)

# get preditictions for training and test set.
y_train_pred = decision_tree.predict(X_train)
y_test_pred = decision_tree.predict(X_test)


print(accuracy_score(y_test_pred, y_test))

# get predictions
decision_preds = decision_tree.predict(test_data)


decision_submission = pd.DataFrame({'PassengerId':test_data.index, 'Survived':decision_preds})
decision_submission = decision_submission.set_index('PassengerId')

print(logreg_cv_submission.head())

decision_submission.to_csv('../submissions/desision_tree.csv')

0.832089552238806
             Survived
PassengerId          
892                 0
893                 0
894                 0
895                 0
896                 1


In [44]:
# Decision tree with a grid search

grid = {"max_depth" : [2, 3, 4, 5, 6, 7, 8]}

decision_cv=GridSearchCV(decision_tree, grid, cv=5)
decision_cv.fit(X_train,y_train)

# get preditictions for training and test set.
y_train_pred = decision_cv.predict(X_train)
y_test_pred = decision_cv.predict(X_test)


print(accuracy_score(y_train_pred, y_train))
print(decision_cv.best_params_)


# get predictions
decision_cv_preds = decision_cv.predict(test_data)


decision_cv_submission = pd.DataFrame({'PassengerId':test_data.index, 'Survived':decision_cv_preds})
decision_cv_submission = decision_cv_submission.set_index('PassengerId')

print(decision_cv_submission.head())

decision_cv_submission.to_csv('../submissions/decision_cv.csv')

0.8202247191011236
{'max_depth': 3}
             Survived
PassengerId          
892                 0
893                 1
894                 0
895                 0
896                 1


# XGBoost

In [45]:
xgb_tree = xgb.XGBClassifier()

xgb_tree.fit(X_train, y_train)

# get preditictions for training and test set.
y_train_pred = xgb_tree.predict(X_train)
y_test_pred = xgb_tree.predict(X_test)


print(accuracy_score(y_test_pred, y_test))

# get predictions
xgb_preds = xgb_tree.predict(test_data)


xgb_submission = pd.DataFrame({'PassengerId':test_data.index, 'Survived':xgb_preds})
xgb_submission = xgb_submission.set_index('PassengerId')

print(xgb_submission.head(10))

xgb_submission.to_csv('../submissions/xgb.csv')

0.8507462686567164
             Survived
PassengerId          
892                 0
893                 0
894                 0
895                 0
896                 0
897                 0
898                 1
899                 0
900                 1
901                 0


In [46]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 


# Decision tree with a grid search
grid = {"max_depth" : [2, 3, 4, 5, 6, 7, 8],
        "learning_rate" : [0.05, 0.075, 0.1, 0.125, 0.15, 0.2]}

xgb_cv = GridSearchCV(xgb_tree, grid, cv = 5)
xgb_cv.fit(X_train,y_train)

# get preditictions for training and test set.
y_train_pred = xgb_cv.predict(X_train)
y_test_pred = xgb_cv.predict(X_test)


print(accuracy_score(y_test_pred, y_test))
print(xgb_cv.best_params_)


# get predictions
xgb_cv_preds = xgb_cv.predict(test_data)


xgb_cv_submission = pd.DataFrame({'PassengerId':test_data.index, 'Survived':xgb_cv_preds})
xgb_cv_submission = xgb_cv_submission.set_index('PassengerId')

print(xgb_cv_submission.head())

xgb_cv_submission.to_csv('../submissions/xgb_cv.csv')

0.8432835820895522
{'learning_rate': 0.075, 'max_depth': 3}
             Survived
PassengerId          
892                 0
893                 0
894                 0
895                 0
896                 0


# KNN Classifier

In [47]:
knn = KNeighborsClassifier(n_neighbors=5)

knn.fit(X_train, y_train)

# get preditictions for training and test set.
y_train_pred = knn.predict(X_train)
y_test_pred = knn.predict(X_test)


print(accuracy_score(y_train_pred, y_train))
print(accuracy_score(y_test_pred, y_test))


# get predictions
knn_preds = knn.predict(test_data)

knn_submission = pd.DataFrame({'PassengerId':test_data.index, 'Survived':knn_preds})
knn_submission = knn_submission.set_index('PassengerId')

print(knn_submission.head(10))

knn_submission.to_csv('../submissions/knn.csv')

0.7865168539325843
0.7052238805970149
             Survived
PassengerId          
892                 0
893                 0
894                 0
895                 0
896                 0
897                 0
898                 1
899                 1
900                 0
901                 1


# Ensemble Model

In [49]:
w1 = 1/3
w2 = 1/3
w3 = 1/3
w4 = 1/4

ensemble_sub = (w1 * logreg_cv_submission) + (w2 * decision_cv_submission) + (w3 * xgb_cv_submission)
ensemble_sub['Survived'] = ensemble_sub['Survived'].apply(lambda x: 0 if x < 0.5 else 1)

display(ensemble_sub.head(20))

ensemble_sub.to_csv('../submissions/ensemble.csv')

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,0
894,0
895,0
896,1
897,0
898,1
899,0
900,1
901,0
