In [2]:
# download titanic datasets

import os
import urllib.request

TITANIC_PATH = os.path.join("datasets", "titanic")
DOWNLOAD_URL = "https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/titanic/"

def fetch_titanic_data(url=DOWNLOAD_URL, path=TITANIC_PATH):
    if not os.path.isdir(path):
        os.makedirs(path)
    for filename in ("train.csv", "test.csv"):
        filepath = os.path.join(path, filename)
        if not os.path.isfile(filepath):
            print("Downloading: ", filename)
            urllib.request.urlretrieve(url + filename, filepath)
fetch_titanic_data()

In [3]:
# load titanic datasets
import pandas as pd

def load_titanic_data(filename, titanic_path=TITANIC_PATH):
    csv_path = os.path.join(titanic_path, filename)
    return pd.read_csv(csv_path)

In [4]:
train_data = load_titanic_data("train.csv")
test_data = load_titanic_data("test.csv")

In [5]:
# assign PassengerId column as index

train_data = train_data.set_index("PassengerId")
test_data = test_data.set_index("PassengerId")

In [6]:
train_data.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
train_data.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699113,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526507,1.102743,0.806057,49.693429
min,0.0,1.0,0.4167,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [8]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [9]:
# Survived is our target label in this supervised binary classification
# only 38% survivied
# mean age 29 
# mean fare 32, probably very expensive in those times

# cabin has more then 70% null values, we will drop it
# sex, Pclass, Embarked are categorical variables
# age has some null, we can use the median age to replace them
# name and ticket can also be dropped

In [10]:
train_data["Survived"].value_counts()

0    549
1    342
Name: Survived, dtype: int64

In [11]:
# from 891 passangers, 549 did not survive and 342 survived

train_data["Pclass"].value_counts()

3    491
1    216
2    184
Name: Pclass, dtype: int64

In [12]:
train_data["Sex"].value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [13]:
train_data[(train_data["Sex"] == "female") & (train_data["Survived"] == 1)].shape[0]

233

In [14]:
train_data[(train_data["Sex"] == "male") & (train_data["Survived"] == 1)].shape[0]

109

In [15]:
# notice 74% of women survived: probably rescued first
# while only 18% of the men were able to survive 

In [16]:
train_data[(train_data["Age"] < 18)].shape[0]

113

In [17]:
train_data[(train_data["Age"] < 18) & (train_data["Survived"] == 1)].shape[0]

61

In [18]:
train_data[(train_data["Pclass"] == 1) & (train_data["Survived"] == 1)].shape[0]

136

In [19]:
train_data[(train_data["Pclass"] == 2) & (train_data["Survived"] == 1)].shape[0]

87

In [20]:
train_data[(train_data["Pclass"] == 3) & (train_data["Survived"] == 1)].shape[0]

119

In [21]:
# notice only 53% of underage and children were rescued
# 62% of class 1 pessangers survived, 47% of class 2 passangers survived, while only 24% of class 3 passangers survived

train_data["Embarked"].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [22]:
train_data[(train_data["Embarked"] == "Q") & (train_data["Survived"] == 1)].shape[0]

30

In [23]:
train_data[(train_data["Embarked"] == "C") & (train_data["Survived"] == 1)].shape[0]

93

In [24]:
train_data[(train_data["Embarked"] == "S") & (train_data["Survived"] == 1)].shape[0]

217

In [25]:
# 38% Q saved, 55% C saved, 33% S saved

# let's get our data pipeline ready for model
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# numerical variable pipeline
num_pipeline = Pipeline([("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())])

In [26]:
# categorical variable pipeline
from sklearn.preprocessing import OneHotEncoder

cat_pipeline = Pipeline([("imputer", SimpleImputer(strategy="most_frequent")), ("cat_encoder", OneHotEncoder(sparse=False))])

In [27]:
from sklearn.compose import ColumnTransformer

num_attribs = ["Age", "SibSp", "Parch", "Fare"]
cat_attribs = ["Pclass", "Sex", "Embarked"]

data_pipeline = ColumnTransformer([("num", num_pipeline, num_attribs), ("cat", cat_pipeline, cat_attribs)])

In [28]:
X_train = data_pipeline.fit_transform(train_data[num_attribs+cat_attribs])
X_train

array([[-0.56573582,  0.43279337, -0.47367361, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.6638609 ,  0.43279337, -0.47367361, ...,  1.        ,
         0.        ,  0.        ],
       [-0.25833664, -0.4745452 , -0.47367361, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [-0.10463705,  0.43279337,  2.00893337, ...,  0.        ,
         0.        ,  1.        ],
       [-0.25833664, -0.4745452 , -0.47367361, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.20276213, -0.4745452 , -0.47367361, ...,  0.        ,
         1.        ,  0.        ]])

In [29]:
# also our target variable
y_train = train_data["Survived"]

In [30]:
# let's try two binary classifiers, RandomForest and SVM

from sklearn.ensemble import RandomForestClassifier

forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
forest_clf.fit(X_train, y_train)

RandomForestClassifier(random_state=42)

In [31]:
# evaludate the model with cross_validation_score

from sklearn.model_selection import cross_val_score

forest_scores = cross_val_score(forest_clf, X_train, y_train, cv=10)
forest_scores.mean()

0.8137578027465668

In [32]:
# score is good, we can now use our test set to make predictions

X_test = data_pipeline.transform(test_data[num_attribs+cat_attribs])
y_pred = forest_clf.predict(X_test)

In [33]:
y_pred

array([0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [34]:
# we can also try SVC

from sklearn.svm import SVC

svc_clf = SVC(gamma="auto")
svc_scores = cross_val_score(svc_clf, X_train, y_train, cv=10)
svc_scores.mean()

0.8249313358302123

In [35]:
# better score! let's use svc and try grid search for better hyperparameters

from sklearn.model_selection import GridSearchCV

param_grid = [{
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto', 0.1, 1],
    'degree': [2, 3, 4]
}]

svc_grid_search = GridSearchCV(svc_clf, param_grid, cv=5, scoring="accuracy", return_train_score=True)
svc_grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=SVC(gamma='auto'),
             param_grid=[{'C': [0.1, 1, 10], 'degree': [2, 3, 4],
                          'gamma': ['scale', 'auto', 0.1, 1],
                          'kernel': ['linear', 'rbf', 'poly']}],
             return_train_score=True, scoring='accuracy')

In [47]:
svc_grid_search.best_params_

{'C': 10, 'degree': 4, 'gamma': 'auto', 'kernel': 'poly'}

In [51]:
best_svc = SVC(C= 10, degree= 4, gamma = 'auto', kernel= 'poly')

In [52]:
best_svc.fit(X_train, y_train)

SVC(C=10, degree=4, gamma='auto', kernel='poly')

In [53]:
best_svc_scores = cross_val_score(best_svc, X_train, y_train, cv=10)
best_svc_scores.mean()

0.8260549313358302

In [54]:
y_pred = best_svc.predict(X_test)
y_pred

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [55]:
result_df = pd.DataFrame({'Index': test_data.index, 'Array': y_pred})

result_df.to_csv('/Users/chiragmacwan/ML/titanic_predictions.csv', index=False)

print("Result CSV file created successfully.")    

Result CSV file created successfully.
