In [1]:
import pandas as pd
import numpy as np
# import Titanic data
df = pd.read_csv("../data/titanicDataSet.csv")

# drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

# choose only columns we may want to use in Analysis
df = df[['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare','Boarded']]

# choose only columns where data is complete for all features
df = df[(df['Pclass'].notnull()) & (df['Age'].notnull()) & (df['SibSp'].notnull()) & (df['Parch'].notnull()) & (df['Fare'].notnull()) & (df['Boarded'].notnull()) & (df['Sex'].notnull())]

In [2]:
# train/test Set
df_train = df[df['Survived'].notnull()]
df_test = df[df['Survived'].isnull()]

# set features and target
X_train = df_train.drop('Survived', axis=1)
y_train = df_train['Survived']
X_test = df_test.drop('Survived', axis=1)
y_test = df_test['Survived']
feature_column_names = X_train.columns

# get indices for train/test sets
index_values_train = X_train.index
index_values_test = X_test.index


# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [3]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.utils import to_categorical


# separate Categorical and Numeric data to encode categorical data
categorical_X_train = df_train[['Sex', 'Boarded']]
numeric_X_train = df_train[['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']]
categorical_X_test = df_test[['Sex', 'Boarded']]
numeric_X_test = df_test[['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']]

# encode categorical data (and reindex rows)
ordinal_encoder = OrdinalEncoder()
ordinal_encoder.fit(categorical_X_train)
cat_encoded_X_train = ordinal_encoder.transform(categorical_X_train)
cat_encoded_X_test = ordinal_encoder.transform(categorical_X_test)

# make dataframes with indices associated with numeric data
cat_encoded_X_train = pd.DataFrame(data = cat_encoded_X_train, index = index_values_train, columns = ['Sex', 'Boarded'])
cat_encoded_X_test = pd.DataFrame(data = cat_encoded_X_test, index = index_values_test, columns = ['Sex', 'Boarded'])

# join categorical encoded data with numeric data
encoded_X_train = numeric_X_train.join(cat_encoded_X_train)
encoded_X_test = numeric_X_test.join(cat_encoded_X_test)

# Encode train target values
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)

# scale X data
from sklearn.preprocessing import MinMaxScaler
X_scaler = MinMaxScaler().fit(encoded_X_train)
X_train_scaled = X_scaler.transform(encoded_X_train)
X_test_scaled = X_scaler.transform(encoded_X_test)

# RFE Selection
from sklearn.svm import SVC 
from sklearn.feature_selection import RFE
model = SVC(kernel='linear')
selector = RFE(model, n_features_to_select=7, step=1)
selector = selector.fit(X_train_scaled, y_train)
ss = selector.support_
selectedFeatures = list(feature_column_names[ss])

# make X of only important parameters
X_train = X_train[selectedFeatures]
X_test = X_test[selectedFeatures]

# rescale to X of only important parameters
X_scaler = MinMaxScaler().fit(encoded_X_train)
X_train_scaled = X_scaler.transform(encoded_X_train)
X_test_scaled = X_scaler.transform(encoded_X_test)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


# Train the Model



In [4]:
# Fit model
model = SVC(kernel='linear')
model.fit(X_train_scaled, y_train)

# Model Accuracy
print('Train Acc: %.3f' % model.score(X_train_scaled, y_train))

Train Acc: 0.780


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [5]:
# Create the GridSearchCV model from SVC model
from sklearn.model_selection import GridSearchCV
parameters = {
                'gamma':['scale', 'auto'],
                'degree': [2,3],
                'C':[1, 5, 10]
            }
# Train the model with GridSearch
print(X_train_scaled)
print(y_train)
grid = GridSearchCV(model, parameters)
grid.fit(X_train_scaled, y_train)

[[1.         0.27117366 0.2        ... 0.01415106 1.         1.        ]
 [0.         0.4722292  0.2        ... 0.13913574 0.         0.33333333]
 [1.         0.32143755 0.         ... 0.01546857 0.         1.        ]
 ...
 [0.         0.23347575 0.         ... 0.0585561  0.         1.        ]
 [0.         0.32143755 0.         ... 0.0585561  1.         0.33333333]
 [1.         0.39683338 0.         ... 0.01512699 1.         0.66666667]]
0      0.0
1      1.0
2      1.0
3      1.0
4      0.0
      ... 
885    0.0
886    0.0
887    1.0
889    1.0
890    0.0
Name: Survived, Length: 714, dtype: float64


GridSearchCV(estimator=SVC(kernel='linear'),
             param_grid={'C': [1, 5, 10], 'degree': [2, 3],
                         'gamma': ['scale', 'auto']})

In [461]:
# get parameters to optimize SVC model
bestParamsDict = grid.best_params_
bestModel = SVC(kernel='linear', gamma=bestParamsDict['gamma'], C=bestParamsDict['C'], degree=bestParamsDict['degree'])
bestModel.fit(X_train_scaled, y_train)

# Model Accuracy
print('Train Acc: %.3f' % bestModel.score(X_train_scaled, y_train))

# Calculate classification report
from sklearn.metrics import classification_report
print(classification_report(y_true = y_train, y_pred = bestModel.predict(X_train_scaled), target_names=['Survived', 'Deceased']))

Train Acc: 0.780
              precision    recall  f1-score   support

    Survived       0.79      0.85      0.82       424
    Deceased       0.75      0.68      0.72       290

    accuracy                           0.78       714
   macro avg       0.77      0.76      0.77       714
weighted avg       0.78      0.78      0.78       714



In [463]:
# predict all test value target
# make dataframes with indices associated with numeric data
encoded_X_train = pd.DataFrame(data = X_train_scaled, index = index_values_train, columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare','Boarded'])
encoded_X_test = pd.DataFrame(data = X_test_scaled, index = index_values_test, columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare','Boarded'])
print(encoded_X_train.shape)
print(encoded_X_test.shape)

index_values_train_array = list(index_values_train.array)
index_values_test_array = list(index_values_test.array)
index_values_array = index_values_train_array + index_values_test_array

encoded_X = encoded_X_train.append(encoded_X_test)
print(encoded_X.shape)
#encoded_X_test = pd.DataFrame(data = y_pred, index = index_values_array, columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare','Boarded'])
encoded_X = encoded_X.sort_index(axis=0)
y_pred = bestModel.predict(encoded_X)

print(y_pred.shape)

(714, 7)
(328, 7)
(1042, 7)
(1042,)


In [464]:
df['predicted_survival'] = y_pred
print(df)

      Survived  Pclass     Sex   Age  SibSp  Parch      Fare      Boarded  \
0          0.0       3    male  22.0      1      0    7.2500  Southampton   
1          1.0       1  female  38.0      1      0   71.2833    Cherbourg   
2          1.0       3  female  26.0      0      0    7.9250  Southampton   
3          1.0       1  female  35.0      1      0   53.1000  Southampton   
4          0.0       3    male  35.0      0      0    8.0500  Southampton   
...        ...     ...     ...   ...    ...    ...       ...          ...   
1300       NaN       3  female   3.0      1      1   13.7750  Southampton   
1302       NaN       1  female  37.0      1      0   90.0000  Southampton   
1303       NaN       3  female  28.0      0      0    7.7750  Southampton   
1305       NaN       1  female  39.0      0      0  108.9000    Cherbourg   
1306       NaN       3    male  38.5      0      0    7.2500  Southampton   

      predicted_survival  
0                    0.0  
1                    

In [465]:
df.to_csv('svm_results.csv')