In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder, OneHotEncoder



In [52]:
data = pd.read_csv('data/train.csv')

In [53]:
y = data['Survived']
X = data.drop('Survived', axis = 1)

In [54]:
X.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [55]:
to_normalize = []

### 1. PassengerId

In [56]:
X.drop(columns = 'PassengerId', inplace = True)

### 2. Pclass

In [57]:
X['Pclass'].isna().values.any()

False

### 3. Name

In [58]:
X.drop(columns = 'Name', inplace = True)

### 4. Sex

In [59]:
X['Sex'].isna().values.sum()

0

In [60]:
X['Female'] = (X['Sex'] == 'female').astype(int)
X.drop(columns = 'Sex', inplace = True)

In [61]:
X.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Female
0,3,22.0,1,0,A/5 21171,7.25,,S,0
1,1,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,1,35.0,1,0,113803,53.1,C123,S,1
4,3,35.0,0,0,373450,8.05,,S,0


### 5. Age

In [62]:
print(X['Age'].isna().values.sum())
X['Age'][X['Age'].isna()] = X['Age'].median()

177


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Age'][X['Age'].isna()] = X['Age'].median()


In [63]:
to_normalize.append('Age')

### 6. SibSp

In [65]:
X['SibSp'].isna().values.any()

False

In [66]:
to_normalize.append('SibSp')

### 7. Parch

In [67]:
X['Parch'].isna().values.any()

False

In [68]:
to_normalize.append('Parch')

### 8. Ticket

In [69]:
X.drop(columns = 'Ticket', inplace = True)

### 9. Fare

In [70]:
print(X['Fare'].isna().values.sum())
X['Fare'].describe()
to_normalize.append('Fare')

0


### 10. Cabin

In [71]:
len(X)

891

In [72]:
with_cabins = X['Cabin'][X['Cabin'].isna() == False]
X['deck'] = with_cabins.str[0]

In [73]:
X.drop(columns = 'Cabin', inplace = True)

In [74]:
X.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Embarked,Female,deck
0,3,22.0,1,0,7.25,S,0,
1,1,38.0,1,0,71.2833,C,1,C
2,3,26.0,0,0,7.925,S,1,
3,1,35.0,1,0,53.1,S,1,C
4,3,35.0,0,0,8.05,S,0,


In [75]:
X.deck.value_counts().sort_index()

A    15
B    47
C    59
D    33
E    32
F    13
G     4
T     1
Name: deck, dtype: int64

In [76]:
X['deck'][X['deck'].isna()] = 'XX'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['deck'][X['deck'].isna()] = 'XX'


In [77]:
deck_encoder = OrdinalEncoder(categories = [['XX', 'G', 'F', 'E', 'D', 'C', 'B', 'A', 'T']])
X['deck'] = deck_encoder.fit_transform(X['deck'].values.reshape(-1,1))

### 11. Embarked

In [78]:
X['Embarked'].isna().values.sum() # 2
X.loc[X['Embarked'].isna(), 'Embarked'] = "S"

In [79]:
onehot_encoder = OneHotEncoder(drop = 'first')
new_embarked = onehot_encoder.fit_transform(X['Embarked'].values.reshape(-1,1)).toarray()
X[onehot_encoder.get_feature_names_out()] = new_embarked
X.drop(columns = 'Embarked', inplace = True)

In [80]:
X

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Female,deck,x0_Q,x0_S
0,3,22.0,1,0,7.2500,0,0.0,0.0,1.0
1,1,38.0,1,0,71.2833,1,5.0,0.0,0.0
2,3,26.0,0,0,7.9250,1,0.0,0.0,1.0
3,1,35.0,1,0,53.1000,1,5.0,0.0,1.0
4,3,35.0,0,0,8.0500,0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...
886,2,27.0,0,0,13.0000,0,0.0,0.0,1.0
887,1,19.0,0,0,30.0000,1,6.0,0.0,1.0
888,3,28.0,1,2,23.4500,1,0.0,0.0,1.0
889,1,26.0,0,0,30.0000,0,5.0,0.0,0.0


## Normalize

In [81]:
normalizer = MinMaxScaler()
X[to_normalize] = normalizer.fit_transform(X[to_normalize])


In [82]:
X

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Female,deck,x0_Q,x0_S
0,3,0.271174,0.125,0.000000,0.014151,0,0.0,0.0,1.0
1,1,0.472229,0.125,0.000000,0.139136,1,5.0,0.0,0.0
2,3,0.321438,0.000,0.000000,0.015469,1,0.0,0.0,1.0
3,1,0.434531,0.125,0.000000,0.103644,1,5.0,0.0,1.0
4,3,0.434531,0.000,0.000000,0.015713,0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...
886,2,0.334004,0.000,0.000000,0.025374,0,0.0,0.0,1.0
887,1,0.233476,0.000,0.000000,0.058556,1,6.0,0.0,1.0
888,3,0.346569,0.125,0.333333,0.045771,1,0.0,0.0,1.0
889,1,0.321438,0.000,0.000000,0.058556,0,5.0,0.0,0.0


In [83]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True)

# SVM

In [85]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, f1_score

In [86]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV

parameters = {
    'C': (0.5, 1, 5, 10),
    'kernel': ('rbf', 'sigmoid'),
    'gamma': np.linspace(0.1, 1, 10)
}

clf = SVC()
stratified_10_fold_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
grid_search_estimator = GridSearchCV(clf, parameters, scoring='f1', cv=stratified_10_fold_cv, return_train_score=False,
                                    verbose = 2)
grid_search_estimator.fit(X_train,y_train)

Fitting 10 folds for each of 80 candidates, totalling 800 fits
[CV] END .......................C=0.5, gamma=0.1, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.5, gamma=0.1, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.5, gamma=0.1, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.5, gamma=0.1, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.5, gamma=0.1, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.5, gamma=0.1, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.5, gamma=0.1, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.5, gamma=0.1, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.5, gamma=0.1, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.5, gamma=0.1, kernel=rbf; total time=   0.0s
[CV] END ...................C=0.5, gamma=0.1, kernel=sigmoid; total time=   0.0s
[CV] END ...................C=0.5, gamma=0.1, 

[CV] END .......................C=0.5, gamma=0.6, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.5, gamma=0.6, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.5, gamma=0.6, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.5, gamma=0.6, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.5, gamma=0.6, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.5, gamma=0.6, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.5, gamma=0.6, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.5, gamma=0.6, kernel=rbf; total time=   0.0s
[CV] END ...................C=0.5, gamma=0.6, kernel=sigmoid; total time=   0.0s
[CV] END ...................C=0.5, gamma=0.6, kernel=sigmoid; total time=   0.0s
[CV] END ...................C=0.5, gamma=0.6, kernel=sigmoid; total time=   0.0s
[CV] END ...................C=0.5, gamma=0.6, kernel=sigmoid; total time=   0.0s
[CV] END ...................

[CV] END .....................C=1, gamma=0.1, kernel=sigmoid; total time=   0.0s
[CV] END .....................C=1, gamma=0.1, kernel=sigmoid; total time=   0.0s
[CV] END .....................C=1, gamma=0.1, kernel=sigmoid; total time=   0.0s
[CV] END .....................C=1, gamma=0.1, kernel=sigmoid; total time=   0.0s
[CV] END .....................C=1, gamma=0.1, kernel=sigmoid; total time=   0.0s
[CV] END .....................C=1, gamma=0.1, kernel=sigmoid; total time=   0.0s
[CV] END .....................C=1, gamma=0.1, kernel=sigmoid; total time=   0.0s
[CV] END .....................C=1, gamma=0.1, kernel=sigmoid; total time=   0.0s
[CV] END .....................C=1, gamma=0.1, kernel=sigmoid; total time=   0.0s
[CV] END .........................C=1, gamma=0.2, kernel=rbf; total time=   0.0s
[CV] END .........................C=1, gamma=0.2, kernel=rbf; total time=   0.0s
[CV] END .........................C=1, gamma=0.2, kernel=rbf; total time=   0.0s
[CV] END ...................

[CV] END .....................C=1, gamma=0.6, kernel=sigmoid; total time=   0.0s
[CV] END .....................C=1, gamma=0.6, kernel=sigmoid; total time=   0.0s
[CV] END ..........C=1, gamma=0.7000000000000001, kernel=rbf; total time=   0.0s
[CV] END ..........C=1, gamma=0.7000000000000001, kernel=rbf; total time=   0.0s
[CV] END ..........C=1, gamma=0.7000000000000001, kernel=rbf; total time=   0.0s
[CV] END ..........C=1, gamma=0.7000000000000001, kernel=rbf; total time=   0.0s
[CV] END ..........C=1, gamma=0.7000000000000001, kernel=rbf; total time=   0.0s
[CV] END ..........C=1, gamma=0.7000000000000001, kernel=rbf; total time=   0.0s
[CV] END ..........C=1, gamma=0.7000000000000001, kernel=rbf; total time=   0.0s
[CV] END ..........C=1, gamma=0.7000000000000001, kernel=rbf; total time=   0.0s
[CV] END ..........C=1, gamma=0.7000000000000001, kernel=rbf; total time=   0.0s
[CV] END ..........C=1, gamma=0.7000000000000001, kernel=rbf; total time=   0.0s
[CV] END ......C=1, gamma=0.

[CV] END .........................C=5, gamma=0.2, kernel=rbf; total time=   0.0s
[CV] END .....................C=5, gamma=0.2, kernel=sigmoid; total time=   0.0s
[CV] END .....................C=5, gamma=0.2, kernel=sigmoid; total time=   0.0s
[CV] END .....................C=5, gamma=0.2, kernel=sigmoid; total time=   0.0s
[CV] END .....................C=5, gamma=0.2, kernel=sigmoid; total time=   0.0s
[CV] END .....................C=5, gamma=0.2, kernel=sigmoid; total time=   0.0s
[CV] END .....................C=5, gamma=0.2, kernel=sigmoid; total time=   0.0s
[CV] END .....................C=5, gamma=0.2, kernel=sigmoid; total time=   0.0s
[CV] END .....................C=5, gamma=0.2, kernel=sigmoid; total time=   0.0s
[CV] END .....................C=5, gamma=0.2, kernel=sigmoid; total time=   0.0s
[CV] END .....................C=5, gamma=0.2, kernel=sigmoid; total time=   0.0s
[CV] END .........C=5, gamma=0.30000000000000004, kernel=rbf; total time=   0.0s
[CV] END .........C=5, gamma

[CV] END ......C=5, gamma=0.7000000000000001, kernel=sigmoid; total time=   0.0s
[CV] END ......C=5, gamma=0.7000000000000001, kernel=sigmoid; total time=   0.0s
[CV] END ......C=5, gamma=0.7000000000000001, kernel=sigmoid; total time=   0.0s
[CV] END ......C=5, gamma=0.7000000000000001, kernel=sigmoid; total time=   0.0s
[CV] END ......C=5, gamma=0.7000000000000001, kernel=sigmoid; total time=   0.0s
[CV] END ......C=5, gamma=0.7000000000000001, kernel=sigmoid; total time=   0.0s
[CV] END ......C=5, gamma=0.7000000000000001, kernel=sigmoid; total time=   0.0s
[CV] END ......C=5, gamma=0.7000000000000001, kernel=sigmoid; total time=   0.0s
[CV] END ......C=5, gamma=0.7000000000000001, kernel=sigmoid; total time=   0.0s
[CV] END .........................C=5, gamma=0.8, kernel=rbf; total time=   0.0s
[CV] END .........................C=5, gamma=0.8, kernel=rbf; total time=   0.0s
[CV] END .........................C=5, gamma=0.8, kernel=rbf; total time=   0.0s
[CV] END ...................

[CV] END ........C=10, gamma=0.30000000000000004, kernel=rbf; total time=   0.0s
[CV] END ........C=10, gamma=0.30000000000000004, kernel=rbf; total time=   0.0s
[CV] END ........C=10, gamma=0.30000000000000004, kernel=rbf; total time=   0.0s
[CV] END ........C=10, gamma=0.30000000000000004, kernel=rbf; total time=   0.0s
[CV] END ........C=10, gamma=0.30000000000000004, kernel=rbf; total time=   0.0s
[CV] END ........C=10, gamma=0.30000000000000004, kernel=rbf; total time=   0.0s
[CV] END ........C=10, gamma=0.30000000000000004, kernel=rbf; total time=   0.0s
[CV] END ........C=10, gamma=0.30000000000000004, kernel=rbf; total time=   0.0s
[CV] END ........C=10, gamma=0.30000000000000004, kernel=rbf; total time=   0.0s
[CV] END ........C=10, gamma=0.30000000000000004, kernel=rbf; total time=   0.0s
[CV] END ....C=10, gamma=0.30000000000000004, kernel=sigmoid; total time=   0.0s
[CV] END ....C=10, gamma=0.30000000000000004, kernel=sigmoid; total time=   0.0s
[CV] END ....C=10, gamma=0.3

[CV] END ........................C=10, gamma=0.8, kernel=rbf; total time=   0.0s
[CV] END ........................C=10, gamma=0.8, kernel=rbf; total time=   0.0s
[CV] END ........................C=10, gamma=0.8, kernel=rbf; total time=   0.0s
[CV] END ........................C=10, gamma=0.8, kernel=rbf; total time=   0.0s
[CV] END ........................C=10, gamma=0.8, kernel=rbf; total time=   0.0s
[CV] END ........................C=10, gamma=0.8, kernel=rbf; total time=   0.0s
[CV] END ....................C=10, gamma=0.8, kernel=sigmoid; total time=   0.0s
[CV] END ....................C=10, gamma=0.8, kernel=sigmoid; total time=   0.0s
[CV] END ....................C=10, gamma=0.8, kernel=sigmoid; total time=   0.0s
[CV] END ....................C=10, gamma=0.8, kernel=sigmoid; total time=   0.0s
[CV] END ....................C=10, gamma=0.8, kernel=sigmoid; total time=   0.0s
[CV] END ....................C=10, gamma=0.8, kernel=sigmoid; total time=   0.0s
[CV] END ...................

In [87]:
results = pd.DataFrame(grid_search_estimator.cv_results_)
print("best score is {} with params {}".format(grid_search_estimator.best_score_, grid_search_estimator.best_params_))
display(results)
estimator = grid_search_estimator.best_estimator_

best score is 0.7331992943934832 with params {'C': 0.5, 'gamma': 0.7000000000000001, 'kernel': 'rbf'}


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_gamma,param_kernel,params,split0_test_score,split1_test_score,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.015925,0.008856,0.009623,0.007228,0.5,0.1,rbf,"{'C': 0.5, 'gamma': 0.1, 'kernel': 'rbf'}",0.666667,0.765957,...,0.809524,0.734694,0.750000,0.727273,0.590909,0.500000,0.666667,0.692598,0.086355,40
1,0.011690,0.006316,0.006251,0.007656,0.5,0.1,sigmoid,"{'C': 0.5, 'gamma': 0.1, 'kernel': 'sigmoid'}",0.625000,0.711111,...,0.772727,0.708333,0.680851,0.723404,0.608696,0.538462,0.636364,0.670262,0.064548,42
2,0.014717,0.005287,0.002255,0.004668,0.5,0.2,rbf,"{'C': 0.5, 'gamma': 0.2, 'kernel': 'rbf'}",0.695652,0.775510,...,0.702703,0.734694,0.682927,0.711111,0.636364,0.565217,0.723404,0.702758,0.063273,38
3,0.015853,0.006137,0.004557,0.004372,0.5,0.2,sigmoid,"{'C': 0.5, 'gamma': 0.2, 'kernel': 'sigmoid'}",0.555556,0.526316,...,0.717949,0.512821,0.411765,0.214286,0.486486,0.400000,0.529412,0.479903,0.122865,46
4,0.002371,0.006867,0.012901,0.005523,0.5,0.3,rbf,"{'C': 0.5, 'gamma': 0.30000000000000004, 'kern...",0.750000,0.800000,...,0.702703,0.772727,0.666667,0.714286,0.722222,0.600000,0.714286,0.719965,0.053894,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,0.018857,0.009434,0.002052,0.004762,10,0.8,sigmoid,"{'C': 10, 'gamma': 0.8, 'kernel': 'sigmoid'}",0.235294,0.294118,...,0.375000,0.622222,0.444444,0.285714,0.210526,0.157895,0.341463,0.342822,0.131308,67
76,0.013798,0.007819,0.003978,0.006974,10,0.9,rbf,"{'C': 10, 'gamma': 0.9, 'kernel': 'rbf'}",0.790698,0.800000,...,0.736842,0.772727,0.684211,0.727273,0.722222,0.585366,0.750000,0.728729,0.058369,7
77,0.019012,0.006794,0.001970,0.004718,10,0.9,sigmoid,"{'C': 10, 'gamma': 0.9, 'kernel': 'sigmoid'}",0.187500,0.250000,...,0.333333,0.622222,0.352941,0.258065,0.222222,0.533333,0.358974,0.353036,0.131526,66
78,0.015641,0.004573,0.000000,0.000000,10,1.0,rbf,"{'C': 10, 'gamma': 1.0, 'kernel': 'rbf'}",0.761905,0.826087,...,0.736842,0.772727,0.684211,0.727273,0.722222,0.585366,0.750000,0.728458,0.059773,8
