# 1. Importing Libraries

In [1]:
# Main
import numpy as np
import pandas as pd

# Data Visualization
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

# Prediction
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV, train_test_split
from sklearn.ensemble import StackingClassifier

# Other
import warnings
warnings.filterwarnings("ignore")

# 2. Import Data

In [1]:
# Read the train and test CSV files, in that order, from the relative data directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (r"../data/train.csv", r"../data/test.csv")
)

# Keep both frames in one list so each preprocessing step can loop over them.
df_list = [train, test]

NameError: name 'pd' is not defined

# 3. Exploratory Data Analysis

In [3]:
# Per-column null counts for the training frame, plus their share of all rows (in %).
null_mask = train.isnull()
missing = null_mask.sum()
missing_percentage = (missing / null_mask.count() * 100).round(1)
missing_data = pd.concat(
    [missing, missing_percentage],
    keys=['Total Missing', '% Missing'],
    axis=1,
)
missing_data

Unnamed: 0,Total Missing,% Missing
PassengerId,0,0.0
Survived,0,0.0
Pclass,0,0.0
Name,0,0.0
Sex,0,0.0
Age,177,19.9
SibSp,0,0.0
Parch,0,0.0
Ticket,0,0.0
Fare,0,0.0


In [4]:
# Per-column null counts for the test frame, plus their share of all rows (in %).
null_mask = test.isnull()
missing = null_mask.sum()
missing_percentage = (missing / null_mask.count() * 100).round(1)
missing_data = pd.concat(
    [missing, missing_percentage],
    keys=['Total Missing', '% Missing'],
    axis=1,
)
missing_data

Unnamed: 0,Total Missing,% Missing
PassengerId,0,0.0
Pclass,0,0.0
Name,0,0.0
Sex,0,0.0
Age,86,20.6
SibSp,0,0.0
Parch,0,0.0
Ticket,0,0.0
Fare,1,0.2
Cabin,327,78.2


# 4. Feature Engineering

### 4.1. Sex

In [5]:
# Encode Sex as an integer in both frames: male -> 0, female -> 1.
sex_dict = dict(male=0, female=1)
for df in df_list:
    df["Sex"] = df["Sex"].map(sex_dict)

### 4.2. Embarked and Fare

In [6]:
# Impute missing values: training rows without an Embarked value get "S", and
# test rows without a Fare get the median test fare.
# The filled Series is assigned back to the column. Calling fillna(inplace=True)
# on train["Embarked"] is chained assignment on an intermediate Series, which
# pandas deprecates and which does not update the frame under Copy-on-Write.
train["Embarked"] = train["Embarked"].fillna("S")
test["Fare"] = test["Fare"].fillna(test["Fare"].median())

In [7]:
# Replace the Embarked letters with integer codes in both frames: S -> 0, C -> 1, Q -> 2.
embarked_dict = dict(S=0, C=1, Q=2)
for df in df_list:
    df['Embarked'] = df['Embarked'].map(embarked_dict)

### 4.3. Family Size

In [8]:
# Derive two family features in both frames.
for df in df_list:
    # FamSize = SibSp + Parch + 1 (the + 1 presumably counts the passenger themselves).
    df["FamSize"] = df["SibSp"] + df["Parch"] + 1
    # isAlone is 1 when FamSize is exactly 1 and 0 otherwise; computed as a
    # vectorised comparison instead of a per-row lambda.
    df["isAlone"] = (df["FamSize"] == 1).astype(int)

In [9]:
# Pull the title out of Name: the run of letters that follows a space and ends at a period.
# The pattern is a raw string so that "\." reaches the regex engine as written;
# in a normal string literal "\." is an invalid escape sequence that Python warns about.
for df in df_list:
    df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

### 4.4. Name

In [10]:
# Collapse the extracted titles into four integer codes: Mr, Miss and Mrs each get
# their own code, and every other listed title shares code 3.
rare_titles = ("Master", "Dr", "Rev", "Col", "Major", "Mlle", "Countess",
               "Ms", "Lady", "Jonkheer", "Don", "Dona", "Mme", "Capt", "Sir")
title_dict = {"Mr": 0, "Miss": 1, "Mrs": 2, **dict.fromkeys(rare_titles, 3)}

# Series.map leaves any title that is not a key of title_dict as NaN.
for df in df_list:
    df['Title'] = df['Title'].map(title_dict)

### 4.5. Age

In [11]:
# Fill each missing age with the median age of the rows sharing the same Title code
# (Mr, Mrs, Miss, Others). Medians are computed separately within train and within test.
# The filled Series is assigned back to the column. Calling fillna(inplace=True)
# on train["Age"] is chained assignment on an intermediate Series, which pandas
# deprecates and which does not update the frame under Copy-on-Write.
train["Age"] = train["Age"].fillna(train.groupby("Title")["Age"].transform("median"))
test["Age"] = test["Age"].fillna(test.groupby("Title")["Age"].transform("median"))

In [12]:
def calc_age(row):
    """Return the age-bucket code (0-4) for one passenger row.

    Buckets by ``row.Age``: <= 16 -> 0, (16, 26] -> 1, (26, 36] -> 2,
    (36, 62] -> 3, anything else -> 4. A NaN age fails every comparison
    and therefore also lands in bucket 4.
    """
    # Upper bounds are inclusive and checked in ascending order, so the first
    # bound that row.Age does not exceed identifies the bucket.
    upper_bounds = (16, 26, 36, 62)
    for bucket, bound in enumerate(upper_bounds):
        if row.Age <= bound:
            return bucket
    return len(upper_bounds)
    
# Overwrite the numeric Age with its bucket code, row by row, in both frames.
# Re-running this cell would bucket the already-bucketed codes.
for df in df_list:
    df["Age"] = df.apply(calc_age, axis="columns")

### 4.6. Final Adjustments

In [13]:
# Drop unnecessary variables
features_drop = ['Ticket', 'SibSp', 'Parch', 'Name', "Cabin"]

# train additionally loses PassengerId; test keeps it because the submission
# cell reads test["PassengerId"].
train = train.drop(columns=features_drop + ['PassengerId'])
test = test.drop(columns=features_drop)

In [14]:
# Separate the label column from the predictors.
target = train['Survived']
train_data = train.drop(columns='Survived')

# 5. Modelling

### 5.1. Pre-Modelling Adjustments

#### 5.1.1 Cross Validation (K-Fold)

In [15]:
# One shared splitter: 10 shuffled folds with a fixed seed, passed as cv to the
# cross_val_score calls in the prediction cells.
k_fold = KFold(n_splits=10, random_state=0, shuffle=True)

#### 5.1.2. Split Data

In [26]:
# Columns fed to every model.
features = ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'FamSize', 'isAlone', 'Title']

# Full training matrix and labels, plus the matching test matrix.
X_train, X_test = train[features], test[features]
y_train = train["Survived"]

# Hold out 20% of the training rows (fixed seed); the grid-search cell fits on
# the remaining 80% (X_training / y_training).
X_training, X_valid, y_training, y_valid = train_test_split(
    X_train,
    y_train,
    random_state=0,
    test_size=0.2,
)

### 5.2. Prediction

#### 5.2.1. SVC

In [17]:
# Cross-validate an SVC (only random_state set) on the shared k_fold splitter
# and report the mean accuracy as a percentage.
scoring = "accuracy"
svc = SVC(random_state=1)
score = cross_val_score(svc, X_train, y_train, scoring=scoring, cv=k_fold, n_jobs=1)
print(score)

acc_svc = round(score.mean() * 100, 2)
print("\nScore:", acc_svc)

[0.66666667 0.76404494 0.69662921 0.59550562 0.75280899 0.70786517
 0.70786517 0.6741573  0.59550562 0.64044944]

Score: 68.01


#### 5.2.2. kNN

In [18]:
# Cross-validate a 13-neighbour kNN classifier on the shared k_fold splitter
# and report the mean accuracy as a percentage.
scoring = "accuracy"
knn = KNeighborsClassifier(n_neighbors=13)
score = cross_val_score(knn, X_train, y_train, scoring=scoring, cv=k_fold, n_jobs=1)
print(score)

acc_knn = round(score.mean() * 100, 2)
print("\nScore:", acc_knn)

[0.74444444 0.7752809  0.73033708 0.75280899 0.76404494 0.75280899
 0.7752809  0.7752809  0.75280899 0.80898876]

Score: 76.32


#### 5.2.3. Decision Tree

In [19]:
# Cross-validate a decision tree (fixed seed) on the shared k_fold splitter
# and report the mean accuracy as a percentage.
scoring = "accuracy"
dc = DecisionTreeClassifier(random_state=1)
score = cross_val_score(dc, X_train, y_train, scoring=scoring, cv=k_fold, n_jobs=1)
print(score)

acc_dt = round(score.mean() * 100, 2)
print("\nScore:", acc_dt)

[0.76666667 0.7752809  0.80898876 0.7752809  0.79775281 0.78651685
 0.84269663 0.80898876 0.76404494 0.82022472]

Score: 79.46


#### 5.2.4. Random Forest

In [20]:
# Cross-validate a 13-tree random forest (fixed seed) on the shared k_fold splitter
# and report the mean accuracy as a percentage.
scoring = "accuracy"
rf = RandomForestClassifier(n_estimators=13, random_state=1)
score = cross_val_score(rf, X_train, y_train, scoring=scoring, cv=k_fold, n_jobs=1)
print(score)

acc_rf = round(score.mean() * 100, 2)
print("\nScore:", acc_rf)

[0.76666667 0.84269663 0.79775281 0.83146067 0.80898876 0.78651685
 0.76404494 0.82022472 0.7752809  0.79775281]

Score: 79.91


#### 5.2.5. Naive Bayes

In [21]:
# Cross-validate Gaussian naive Bayes on the shared k_fold splitter
# and report the mean accuracy as a percentage.
scoring = "accuracy"
gnb = GaussianNB()
score = cross_val_score(gnb, X_train, y_train, scoring=scoring, cv=k_fold, n_jobs=1)
print(score)

acc_gnb = round(score.mean() * 100, 2)
print("\nScore:", acc_gnb)

[0.81111111 0.76404494 0.80898876 0.78651685 0.79775281 0.82022472
 0.84269663 0.82022472 0.82022472 0.85393258]

Score: 81.26


#### 5.2.6. Linear SVC

In [22]:
# Cross-validate a LinearSVC (fixed seed) on the shared k_fold splitter
# and report the mean accuracy as a percentage.
scoring = "accuracy"
linear_svc = LinearSVC(random_state=1)
score = cross_val_score(linear_svc, X_train, y_train, scoring=scoring, cv=k_fold, n_jobs=1)
print(score)

acc_linsvc = round(score.mean() * 100, 2)
print("\nScore:", acc_linsvc)

[0.82222222 0.7752809  0.79775281 0.78651685 0.79775281 0.83146067
 0.80898876 0.83146067 0.85393258 0.83146067]

Score: 81.37


#### 5.2.7. Logistic Regression

In [23]:
# Cross-validate logistic regression (fixed seed) on the shared k_fold splitter
# and report the mean accuracy as a percentage.
scoring = "accuracy"
logreg = LogisticRegression(random_state=1)
score = cross_val_score(logreg, X_train, y_train, scoring=scoring, cv=k_fold, n_jobs=1)
print(score)

acc_logreg = round(score.mean() * 100, 2)
print("\nScore:", acc_logreg)

[0.83333333 0.7752809  0.80898876 0.78651685 0.79775281 0.83146067
 0.82022472 0.85393258 0.82022472 0.85393258]

Score: 81.82


#### 5.2.8. XGBoost

In [24]:
# Cross-validate a 10-round XGBoost classifier on the shared k_fold splitter
# and report the mean accuracy as a percentage.
# eval_metric is 'logloss', the binary log-loss that matches
# objective="binary:logistic"; 'mlogloss' is the multi-class variant and does
# not fit a two-class target.
xgboost = XGBClassifier(random_state=1, objective="binary:logistic", n_estimators=10, eval_metric='logloss')
scoring = "accuracy"
score = cross_val_score(xgboost, X_train, y_train, cv=k_fold, n_jobs=1, scoring=scoring)
print(score)
acc_xg = round(np.mean(score)*100, 2)
print("\nScore:", acc_xg)

[0.83333333 0.84269663 0.85393258 0.80898876 0.86516854 0.83146067
 0.84269663 0.82022472 0.78651685 0.83146067]

Score: 83.16


### 5.3 Model Performance

In [25]:
# Collect the cross-validated accuracies, one (model name, accuracy) pair per row.
results = [
    ("SVC", acc_svc),
    ("Linear SVC", acc_linsvc),
    ("Random Forest", acc_rf),
    ("Logistic Regression", acc_logreg),
    ("K Nearest Neighbors", acc_knn),
    ("Gaussian Naive Bayes", acc_gnb),
    ("Decision Tree", acc_dt),
    ("XGBClassifier", acc_xg),
]
model_performance = pd.DataFrame(results, columns=["Model", "Accuracy"])

# Display the ranking, best model first.
model_performance.sort_values(by="Accuracy", ascending=False)

Unnamed: 0,Model,Accuracy
7,XGBClassifier,83.16
3,Logistic Regression,81.82
1,Linear SVC,81.37
5,Gaussian Naive Bayes,81.26
2,Random Forest,79.91
6,Decision Tree,79.46
4,K Nearest Neighbors,76.32
0,SVC,68.01


In [29]:
# Stack five of the models fitted above and cross-validate the ensemble on the
# shared k_fold splitter. `scoring` is the value set by the preceding model cells.
estimators = [
    ('Logistic Regression', logreg),
    ('XGBoost', xgboost),
    ('Naive Bayes', gnb),
    ('Linear SVC', linear_svc),
    ('Random Forest', rf),
]
stack = StackingClassifier(estimators=estimators)

score = cross_val_score(stack, X_train, y_train, scoring=scoring, cv=k_fold, n_jobs=1)
print(score)

acc_stack = round(score.mean() * 100, 2)
print("\nScore:", acc_stack)

[0.83333333 0.83146067 0.83146067 0.83146067 0.83146067 0.85393258
 0.84269663 0.82022472 0.82022472 0.86516854]

Score: 83.61


### 5.4. Tuning Parameters

In [None]:
# Grid-search LogisticRegression hyper-parameters on the 80% training portion
# of the hold-out split, scoring each candidate by accuracy.
# NOTE(review): despite the rf_ prefix, rf_clf is a logistic regression.
rf_clf = LogisticRegression(random_state=1)

# NOTE(review): the grid is a full cross product, so it includes penalty/solver
# pairs that LogisticRegression may reject — confirm how GridSearchCV treats
# those failed fits in the installed scikit-learn version.
parameters = dict(
    penalty=['l1', 'l2', 'elasticnet', 'none'],
    C=np.logspace(1, -1),  # from 10**1 down to 10**-1, using np.logspace's default sample count
    solver=['liblinear', "newton-cg", "lbfgs", "sag", "saga"],
)

grid_cv = GridSearchCV(
    rf_clf,
    parameters,
    scoring=make_scorer(accuracy_score),
).fit(X_training, y_training)

# Keep the winning estimator, its mean CV score and its parameters.
best_estimator, best_score, best_params = (
    grid_cv.best_estimator_,
    grid_cv.best_score_,
    grid_cv.best_params_,
)

### 5.5. Best Model

In [None]:
# Refit the grid-search winner on the full training predictors and labels.
# The fit call stays the last expression so the cell still displays the estimator.
best_model = grid_cv.best_estimator_
best_model.fit(X=train_data, y=target)

# 6. Submission

In [33]:
# Predictors for the test set: everything in `test` except the passenger id.
test_data = test.drop(columns="PassengerId").copy()

# Train the stacked ensemble on the full training set, then predict the test rows.
prediction = stack.fit(train_data, target).predict(test_data)

In [34]:
# Pair each test passenger id with the stacked model's prediction.
submission = pd.DataFrame(
    dict(PassengerId=test["PassengerId"], Survived=prediction)
)
submission.head(10)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
5,897,0
6,898,1
7,899,0
8,900,1
9,901,0


In [35]:
# Write the submission frame to the data directory as CSV, without the index column.
submission.to_csv(r"../data/submission.csv", index=False)