# Module Imports

In [1]:
import os
import pandas as pd
import numpy as np

# Loading the data

##### Using OS module to determine filepaths in OS independent manner as backslashes used on Windows and forward slashes used on Unix

In [2]:
working_directory = os.path.abspath('.')

In [3]:
training_data_location = os.path.join(working_directory, 'Data', 'train.csv')
testing_data_location = os.path.join(working_directory, 'Data', 'test.csv')

In [4]:
titanic_training_data = pd.read_csv(training_data_location)
titanic_testing_data = pd.read_csv(testing_data_location)

# Initial data exploration

In [5]:
titanic_training_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Training data consist of the following columns:

    1) Passenger ID: Assuming this is unique
    2) Survived: This is a true(1)/false(0) of whether or not the passenger has survived
    3) Pclass: class of ticket, this is either lower (3), middle (2) or upper (1)
    4) Name: Passenger name as a string. It may be worth splitting passenger name into surnames to see if we can group by families
    5) Sex: categorical of either male or female
    6) SibSp: This is the number of siblings and spouses on-board the titanic
    7) Parch: This is the number of parents and children on-board the titanic
    8) Ticket number: I assume this is unique --> I wonder if there is any processing that can be done to extract more information.
    9) Fare: price paid for ticket
    10) Cabin: Cabin number, maybe this is something to do with location upon the titanic..may require some processing. Or maybe we can make a new column which represents whether or not a person has a Cabin.
    11) Embarked: Location at which person embarked the titanic. S is Southampton, Q is queenstown and C is Cherbourg


## Data Preparation

Based off previous analysis the Cabin column needs to be processed to obtain information about whether or not a person had a cabin and also the location of their cabin.

In [6]:
titanic_training_data['Cabin'] = titanic_training_data.Cabin.str.get(0).fillna('NoCabin')
titanic_training_data.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,NoCabin,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,C


In [7]:
titanic_testing_data['Cabin'] = titanic_testing_data.Cabin.str.get(0).fillna('NoCabin')
titanic_testing_data.head(2)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,NoCabin,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,NoCabin,S


In [8]:
titanic_training_data['Family_Name'] = titanic_training_data.Name.str.split(',').str.get(0)
titanic_testing_data['Family_Name'] = titanic_testing_data.Name.str.split(',').str.get(0)

Worked out family names incase I want to include that in model.

## Model Selection

As I don't know what type of estimator I should be using I used the SciKit-Learn cheat sheet to choose LinearSVC (https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html). The reason for this is that we have available to us more then 50 labelled training set rows and we wish to classify them into two categories which are whether or not a person survived.

SVC stands for support vector classification. It looks to find a plane betweeen the extremes of different categories.

#### Importing SVM library

In [9]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.impute  import SimpleImputer
from sklearn.preprocessing import StandardScaler

#### Data preprocessing

##### Remove Cabin T as looks like outlier.

In [10]:
titanic_training_data = titanic_training_data[titanic_training_data.Cabin!='T']
titanic_training_data.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Family_Name
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,NoCabin,S,Braund
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,C,Cumings
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,NoCabin,S,Heikkinen
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C,S,Futrelle
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,NoCabin,S,Allen


First want to decide which features to use. Based off previous analysis I would like to use:

    1) Sex
    2) Ticket class
    3) Cabin
    
As these seem to be the biggest influences.

In [11]:
feature_columns = ['Sex', 'Cabin', 'Pclass']
categorical_columns = ['Cabin', 'Sex', 'Pclass']
X_data = titanic_training_data[feature_columns]
X_data_kaggle = titanic_testing_data[feature_columns]
Y_data = titanic_training_data['Survived']

##### Ensure categorical columns are actually categorical

In [12]:
X_data[categorical_columns].astype('category')
X_data_kaggle[categorical_columns].astype('category')
X_data.head(1)
X_data_kaggle.head(1)

Unnamed: 0,Sex,Cabin,Pclass
0,male,NoCabin,3


Encode categorical data

In [13]:
X_data = pd.get_dummies(X_data, drop_first=False, columns=categorical_columns)
X_data_kaggle = pd.get_dummies(X_data_kaggle, drop_first=False, columns=categorical_columns)
X_data.head(1)

Unnamed: 0,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_NoCabin,Sex_female,Sex_male,Pclass_1,Pclass_2,Pclass_3
0,0,0,0,0,0,0,0,1,0,1,0,0,1


The X data is encoded, however, need to drop one column from each category.

In [14]:
columns_to_drop = ['Sex_male', 'Cabin_A', 'Pclass_1']
X_data = X_data.drop(columns_to_drop, axis=1)
X_data_kaggle = X_data_kaggle.drop(columns_to_drop, axis=1)
X_data.head(1)

Unnamed: 0,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_NoCabin,Sex_female,Pclass_2,Pclass_3
0,0,0,0,0,0,0,1,0,0,1


#### Impute the data to remove NaNs

In [15]:
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

In [16]:
imputer.fit(X_data)

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='mean', verbose=0)

In [17]:
X_data = imputer.transform(X_data)
X_data_kaggle = imputer.transform(X_data_kaggle)

In [18]:
scaler = StandardScaler()

In [19]:
processed_X_data = scaler.fit_transform(X_data)

In [20]:
processed_X_data_kaggle = scaler.transform(X_data_kaggle)

#### Fitting a Linear SVM model

#### Try to use cross validation

In [21]:
from sklearn.model_selection import cross_val_score

In [22]:
from sklearn.pipeline import make_pipeline

In [23]:
classifier = LinearSVC(max_iter = 100000)
clf = make_pipeline(StandardScaler(), classifier)
scores = cross_val_score(clf, processed_X_data, Y_data, cv=10, n_jobs=-1)
print(scores)
np.mean(scores)

[0.77777778 0.77777778 0.7752809  0.83146067 0.7752809  0.76404494
 0.76404494 0.75280899 0.81818182 0.77272727]


0.7809385994779252

##### Splitting the data into training and testing set

In [24]:
classifier = LinearSVC(max_iter = 100000)

In [25]:
classifier.fit(processed_X_data, Y_data)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=100000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

Predict Kaggle Dataset

In [26]:
y_pred_kaggle = classifier.predict(X_data_kaggle)

In [27]:
kaggle_final_df = pd.DataFrame({'PassengerID':titanic_testing_data.PassengerId,'Survived':y_pred_kaggle})
kaggle_final_df.head()

Unnamed: 0,PassengerID,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


Export dataframe to csv

In [28]:
kaggle_final_df.to_csv('linar_SVM_prediction_sex_cabin_class.csv', index=False)