In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, roc_auc_score
import matplotlib.pyplot as plt

In [6]:
train = pd.read_csv("train.csv")
train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [7]:
test = pd.read_csv("test.csv")
test

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


Data Exploration

In [117]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [5]:
train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [6]:
survival_rate = train['Survived'].sum()/len(train)
print(f"The total survival rate is {float(round(survival_rate,3))}")

The total survival rate is 0.384


In [7]:
women = train.loc[train.Sex == 'female']
women

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C
...,...,...,...,...,...,...,...,...,...,...,...,...
880,881,1,2,"Shelley, Mrs. William (Imanita Parrish Hall)",female,25.0,0,1,230433,26.0000,,S
882,883,0,3,"Dahlberg, Miss. Gerda Ulrika",female,22.0,0,0,7552,10.5167,,S
885,886,0,3,"Rice, Mrs. William (Margaret Norton)",female,39.0,0,5,382652,29.1250,,Q
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S


In [8]:
woman_survivial_rate = women["Survived"].sum()/len(women)
print(f"The survival rate of women is {float(round(woman_survivial_rate,3))}")

The survival rate of women is 0.742


In [9]:
men = train.loc[train.Sex == 'male']
men

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
883,884,0,2,"Banfield, Mr. Frederick James",male,28.0,0,0,C.A./SOTON 34068,10.5000,,S
884,885,0,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.0500,,S
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [10]:
man_survival_rate = men["Survived"].sum()/len(men)
print(f"The survival rate of men is {float(round(man_survival_rate,3))}")

The survival rate of men is 0.189


The survival rate of men is significantly lower than the survival rate of women

In [11]:
for pclass in train["Pclass"].unique():
    class_data = train[train["Pclass"] == pclass]

    pclass_survival_rate = class_data["Survived"].sum()/len(class_data)

    print(f"The survival rate for class {pclass} is {float(round(pclass_survival_rate,3))}")

The survival rate for class 3 is 0.242
The survival rate for class 1 is 0.63
The survival rate for class 2 is 0.473


In [12]:
for port in train["Embarked"].unique():
    port_data = train[train["Embarked"] == port]

    port_survival_rate = port_data["Survived"].sum()/len(port_data)

    print(f"The survival rate for port {port} is {float(round(port_survival_rate,3))}")

The survival rate for port S is 0.337
The survival rate for port C is 0.554
The survival rate for port Q is 0.39
The survival rate for port nan is nan


  port_survival_rate = port_data["Survived"].sum()/len(port_data)


Data Preprocessing

Addition of new features

In [8]:
train["Title"] = train['Name'].apply(lambda x: x.split(',')[1].split('.')[0].strip())
test["Title"] = test['Name'].apply(lambda x: x.split(',')[1].split('.')[0].strip())

In [9]:
train["FamilySize"] = train['SibSp'] + train['Parch'] + 1
test["FamilySize"] = test['SibSp'] + test['Parch'] + 1

In [10]:
train["IsAlone"] = (train["FamilySize"] == 1).astype(int)
test["IsAlone"] = (test["FamilySize"] == 1).astype(int)

In [11]:
train["AgeGroup"] = pd.cut(train["Age"],
                           bins = [0, 12, 18, 35, 60, np.inf],
                           labels = ['Child', 'Teenager', 'Young Adult', 'Adult', 'Senior'])

test["AgeGroup"] = pd.cut(test["Age"],
                           bins = [0, 12, 18, 35, 60, np.inf],
                           labels = ['Child', 'Teenager', 'Young Adult', 'Adult', 'Senior'])

Drop Unnecessary Columns

In [12]:
columns_to_drop_train = ['Name', 'Age', 'Ticket', 'Cabin']
columns_to_drop_test = ['Name', 'Age', 'Ticket', 'Cabin']
train = train.drop(columns= columns_to_drop_train)
test = test.drop(columns= columns_to_drop_test)

One-Hot Encoding

In [13]:
train['Sex'] = train["Sex"].map({'male' : 0, 'female' : 1})
test['Sex'] = test["Sex"].map({'male' : 0, 'female' : 1})

In [14]:
train = pd.get_dummies(train, columns= ['Embarked', 'Title', 'AgeGroup'], prefix= ['Emb', 'Title', 'Age'])
test = pd.get_dummies(test, columns= ['Embarked', 'Title', 'AgeGroup'], prefix= ['Emb', 'Title', 'Age'])

In [15]:
train_columns = set(train.columns)
test_columns = set(test.columns)

In [16]:
common_columns = list(train_columns.intersection(test_columns))
common_columns

['Title_Ms',
 'Title_Dr',
 'Pclass',
 'Title_Rev',
 'Parch',
 'Emb_Q',
 'Age_Young Adult',
 'SibSp',
 'Age_Senior',
 'FamilySize',
 'Emb_C',
 'Title_Master',
 'Age_Child',
 'Age_Teenager',
 'PassengerId',
 'Emb_S',
 'Sex',
 'IsAlone',
 'Title_Mrs',
 'Title_Miss',
 'Age_Adult',
 'Title_Col',
 'Title_Mr',
 'Fare']

Model Creation

In [38]:
y = train['Survived']
x = train.loc[:, train.columns != 'Survived']

In [39]:
x = x[common_columns]
common_test = test[common_columns]

In [41]:
#Check for correct dimensionality
print("X Train Shape:", x.shape)
print("Y Train Shape:", y.shape)
print("Real Test Shape:", common_test.shape)

X Train Shape: (891, 24)
Y Train Shape: (891,)
Real Test Shape: (418, 24)


In [42]:
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy= 'median')),
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(random_state= 42))
])

In [43]:
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [2, 4, 6, 8, 10],
    'classifier__min_samples_split': [2, 5, 7, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
    'classifier__max_features': ['auto', 'sqrt']
}

In [44]:
cv = StratifiedKFold(n_splits= 5, shuffle= True, random_state= 42)

In [45]:
grid_search = GridSearchCV(pipeline, param_grid, cv= cv, scoring= 'accuracy', n_jobs= -1, verbose= 2)

grid_search.fit(x, y)

Fitting 5 folds for each of 360 candidates, totalling 1800 fits


900 fits failed out of a total of 1800.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
557 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\phill\.conda\envs\machine_learning\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\phill\.conda\envs\machine_learning\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\phill\.conda\envs\machine_learning\Lib\site-packages\sklearn\pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "c:

In [46]:
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

Best parameters: {'classifier__max_depth': 6, 'classifier__max_features': 'sqrt', 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 200}
Best cross-validation score: 0.8372606867114432


In [47]:
best_model = grid_search.best_estimator_

In [48]:
y_pred = best_model.predict(x)

In [49]:
accuracy = accuracy_score(y, y_pred)
accuracy

0.8529741863075196

In [50]:
y_test = best_model.predict(common_test)

In [52]:
predictions_df = pd.DataFrame({
    'PassengerId': common_test['PassengerId'],
    'Survived': y_test
})

predictions_df

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [53]:
predictions_df.to_csv('titanic_predictions.csv', index= False)