In [1]:
import statistics

import numpy as np
import pandas as pd

In [2]:
raw_train_df = pd.read_csv('titanic/train.csv')

### data wrangling

In [3]:
raw_train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
raw_train_df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [5]:
raw_train_df.drop(['Name'], axis=1, inplace=True)

In [6]:
raw_train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,male,35.0,0,0,373450,8.05,,S


In [7]:
raw_train_df['Sex'].unique()

array(['male', 'female'], dtype=object)

In [8]:
raw_train_df['Sex'].replace(['male', 'female'], [0, 1], inplace=True)

In [9]:
raw_train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,0,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,1,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,1,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,1,35.0,1,0,113803,53.1,C123,S
4,5,0,3,0,35.0,0,0,373450,8.05,,S


In [10]:
raw_train_df['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [11]:
raw_train_df['Embarked'] = pd.factorize(raw_train_df['Embarked'])[0]

In [12]:
raw_train_df.drop(['PassengerId', 'Cabin', 'Ticket'], axis=1, inplace=True)

In [13]:
raw_train_df['Survived'].replace(0, -1, inplace=True)

In [14]:
age_mean = raw_train_df['Age'].mean()
raw_train_df['Age'].replace(np.nan, age_mean, inplace=True)

In [15]:
raw_train_df.head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,-1,3,0,22.0,1,0,7.25,0
1,1,1,1,38.0,1,0,71.2833,1
2,1,3,1,26.0,0,0,7.925,0
3,1,1,1,35.0,1,0,53.1,0
4,-1,3,0,35.0,0,0,8.05,0
5,-1,3,0,29.699118,0,0,8.4583,2
6,-1,1,0,54.0,0,0,51.8625,0
7,-1,3,0,2.0,3,1,21.075,0
8,1,3,1,27.0,0,2,11.1333,0
9,1,2,1,14.0,1,0,30.0708,1


### Converting the dataframe to numpy

In [16]:
input_matrix = raw_train_df.drop(['Survived'], axis=1).to_numpy()

In [17]:
labels_vector = raw_train_df['Survived'].to_numpy()
labels_vector = labels_vector.reshape(labels_vector.shape + (1,))

In [18]:
data_train = np.append(input_matrix, labels_vector, axis=1)

In [19]:
data_train

array([[ 3.        ,  0.        , 22.        , ...,  7.25      ,
         0.        , -1.        ],
       [ 1.        ,  1.        , 38.        , ..., 71.2833    ,
         1.        ,  1.        ],
       [ 3.        ,  1.        , 26.        , ...,  7.925     ,
         0.        ,  1.        ],
       ...,
       [ 3.        ,  1.        , 29.69911765, ..., 23.45      ,
         0.        , -1.        ],
       [ 1.        ,  0.        , 26.        , ..., 30.        ,
         1.        ,  1.        ],
       [ 3.        ,  0.        , 32.        , ...,  7.75      ,
         2.        , -1.        ]])

### Hyper-parameter tuning

In [20]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

import ms_svm as mss
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [21]:
print("Training the model with different hyper-parameters...")
for sigma in (1, 2, 3, 5, 8, 10):
    Input = [('scale',StandardScaler()), ('model', mss.Svm(mss.RbfKernel(sigma)))]
    pipe = Pipeline(Input)

    np.random.shuffle(data_train)

    X_train, X_test, y_train, y_test = train_test_split(data_train[:,0:-1], data_train[:,-1:], train_size=.75)

    pipe.fit(X_train, y_train)

    score = accuracy_score(y_test, pipe.predict(X_test))
    print("Accuracy score for s={0} is {1}".format(sigma, score))

Training the model with different hyper-parameters...
Stage 1 | 5 Loss: [7232.98405731]
Stage 2 | 5 Loss: [-298.03283271]
Stage 3 | 5 Loss: [-387.27821858]
Stage 4 | 5 Loss: [-452.76833298]
Stage 5 | 5 Loss: [-461.25192658]
Final Loss: [-461.95821532]
Accuracy score for s=1 is 0.7937219730941704
Stage 1 | 5 Loss: [15434.50312284]
Stage 2 | 5 Loss: [-422.66863005]
Stage 3 | 5 Loss: [-428.85551129]
Stage 4 | 5 Loss: [-492.66337168]
Stage 5 | 5 Loss: [-510.37096382]
Final Loss: [-511.32100451]
Accuracy score for s=2 is 0.8116591928251121
Stage 1 | 5 Loss: [16217.9327367]
Stage 2 | 5 Loss: [-452.87390853]
Stage 3 | 5 Loss: [-541.66801679]
Stage 4 | 5 Loss: [-554.25065492]
Stage 5 | 5 Loss: [-571.3364539]
Final Loss: [-572.43425471]
Accuracy score for s=3 is 0.820627802690583
Stage 1 | 5 Loss: [13684.9282593]
Stage 2 | 5 Loss: [-445.56977294]
Stage 3 | 5 Loss: [-605.77577846]
Stage 4 | 5 Loss: [-641.40429161]
Stage 5 | 5 Loss: [-640.08292178]
Final Loss: [-643.07502555]
Accuracy score for s

Svm with RBF kernel with sigma = 5 gave the best result, i.e. 0.8565 accuracy on unseen data. Btw, it took too long to compute as of 8 dec 2022 (3 hours or so) on my M1 mac.

### Final model

In [52]:
x_train = data_train[:,0:-1]
y_train = data_train[:,-1:]

In [53]:
x_train.shape, y_train.shape

((891, 7), (891, 1))

In [54]:
scale = StandardScaler()
model = mss.Svm(mss.RbfKernel(5))
model.iterations_per_stage = 1200

In [55]:
Input = [('scale', scale), ('model', model)]
pipe = Pipeline(Input)

In [59]:
pipe.fit(x_train, y_train)

Stage 1 | 5 Loss: [28188.11970934]
Stage 2 | 5 Loss: [-572.67926035]
Stage 3 | 5 Loss: [-786.45743142]
Stage 4 | 5 Loss: [-838.92113743]
Stage 5 | 5 Loss: [-843.446947]
Final Loss: [-847.47890801]


In [60]:
accuracy_score(y_train, pipe.predict(x_train))

0.8316498316498316

### Predicting

In [61]:
test_df = pd.read_csv('titanic/test.csv')

In [62]:
test_df.drop(['Name'], axis=1, inplace=True)
test_df['Sex'].replace(['male', 'female'], [0, 1], inplace=True)
test_df['Embarked'] = pd.factorize(test_df['Embarked'])[0]
test_df.drop(['Cabin', 'Ticket'], axis=1, inplace=True)
age_mean = test_df['Age'].mean()
test_df['Age'].replace(np.nan, age_mean, inplace=True)

In [63]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,0,34.5,0,0,7.8292,0
1,893,3,1,47.0,1,0,7.0,1
2,894,2,0,62.0,0,0,9.6875,0
3,895,3,0,27.0,0,0,8.6625,1
4,896,3,1,22.0,1,1,12.2875,1


In [64]:
test_df.describe()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
count,418.0,418.0,418.0,418.0,418.0,418.0,417.0,418.0
mean,1100.5,2.26555,0.363636,30.27259,0.447368,0.392344,35.627188,1.133971
std,120.810458,0.841838,0.481622,12.634534,0.89676,0.981429,55.907576,0.580452
min,892.0,1.0,0.0,0.17,0.0,0.0,0.0,0.0
25%,996.25,1.0,0.0,23.0,0.0,0.0,7.8958,1.0
50%,1100.5,3.0,0.0,30.27259,0.0,0.0,14.4542,1.0
75%,1204.75,3.0,1.0,35.75,1.0,0.0,31.5,1.0
max,1309.0,3.0,1.0,76.0,8.0,9.0,512.3292,2.0


In [65]:
fare_mean = test_df['Fare'].mean()
test_df['Fare'].replace(np.nan, fare_mean, inplace=True)

In [66]:
test_df.describe()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
count,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0
mean,1100.5,2.26555,0.363636,30.27259,0.447368,0.392344,35.627188,1.133971
std,120.810458,0.841838,0.481622,12.634534,0.89676,0.981429,55.8405,0.580452
min,892.0,1.0,0.0,0.17,0.0,0.0,0.0,0.0
25%,996.25,1.0,0.0,23.0,0.0,0.0,7.8958,1.0
50%,1100.5,3.0,0.0,30.27259,0.0,0.0,14.4542,1.0
75%,1204.75,3.0,1.0,35.75,1.0,0.0,31.5,1.0
max,1309.0,3.0,1.0,76.0,8.0,9.0,512.3292,2.0


In [67]:
X = test_df.drop(['PassengerId'], axis=1).to_numpy()
ids = test_df['PassengerId'].to_numpy()
ids = ids.reshape(ids.shape + (1,))

In [79]:
y_predicted = pipe.predict(X)

In [81]:
y_predicted

array([[-1.],
       [ 1.],
       [-1.],
       [-1.],
       [ 1.],
       [-1.],
       [ 1.],
       [-1.],
       [ 1.],
       [-1.],
       [-1.],
       [-1.],
       [ 1.],
       [-1.],
       [ 1.],
       [ 1.],
       [-1.],
       [-1.],
       [ 1.],
       [ 1.],
       [-1.],
       [ 1.],
       [ 1.],
       [-1.],
       [ 1.],
       [-1.],
       [ 1.],
       [-1.],
       [-1.],
       [-1.],
       [-1.],
       [-1.],
       [ 1.],
       [ 1.],
       [-1.],
       [-1.],
       [ 1.],
       [ 1.],
       [-1.],
       [-1.],
       [-1.],
       [-1.],
       [-1.],
       [ 1.],
       [ 1.],
       [-1.],
       [-1.],
       [-1.],
       [ 1.],
       [ 1.],
       [-1.],
       [-1.],
       [ 1.],
       [ 1.],
       [-1.],
       [-1.],
       [-1.],
       [-1.],
       [-1.],
       [ 1.],
       [-1.],
       [-1.],
       [-1.],
       [ 1.],
       [-1.],
       [ 1.],
       [ 1.],
       [-1.],
       [-1.],
       [-1.],
       [ 1.],
      

### Saving the predictions

In [85]:
predicted_data = np.append(ids, y_predicted, axis=1).astype(int)

In [86]:
predicted_data

array([[ 892,   -1],
       [ 893,    1],
       [ 894,   -1],
       [ 895,   -1],
       [ 896,    1],
       [ 897,   -1],
       [ 898,    1],
       [ 899,   -1],
       [ 900,    1],
       [ 901,   -1],
       [ 902,   -1],
       [ 903,   -1],
       [ 904,    1],
       [ 905,   -1],
       [ 906,    1],
       [ 907,    1],
       [ 908,   -1],
       [ 909,   -1],
       [ 910,    1],
       [ 911,    1],
       [ 912,   -1],
       [ 913,    1],
       [ 914,    1],
       [ 915,   -1],
       [ 916,    1],
       [ 917,   -1],
       [ 918,    1],
       [ 919,   -1],
       [ 920,   -1],
       [ 921,   -1],
       [ 922,   -1],
       [ 923,   -1],
       [ 924,    1],
       [ 925,    1],
       [ 926,   -1],
       [ 927,   -1],
       [ 928,    1],
       [ 929,    1],
       [ 930,   -1],
       [ 931,   -1],
       [ 932,   -1],
       [ 933,   -1],
       [ 934,   -1],
       [ 935,    1],
       [ 936,    1],
       [ 937,   -1],
       [ 938,   -1],
       [ 939,

In [74]:
dataframe = pd.DataFrame(predicted_data)
dataframe.columns = ['PassengerId', 'Survived']

In [87]:
dataframe['Survived'].replace(-1, 0, inplace=True)

In [90]:
dataframe.head(10)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
5,897,0
6,898,1
7,899,0
8,900,1
9,901,0


In [89]:
dataframe.to_csv("titanic/predictions.csv", index=False)