# Module Imports

In [1]:
import os

import pandas as pd
import numpy as np

# Loading the data

##### Using OS module to determine filepaths in OS independent manner as backslashes used on Windows and forward slashes used on Unix

In [2]:
working_directory = os.path.abspath('.')

In [3]:
training_data_location = os.path.join(working_directory, 'Data', 'train.csv')
testing_data_location = os.path.join(working_directory, 'Data', 'test.csv')

In [4]:
titanic_training_data = pd.read_csv(training_data_location)
titanic_testing_data = pd.read_csv(testing_data_location)

# Initial data exploration

In [5]:
titanic_training_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Training data consist of the following columns:

    1) Passenger ID: Assuming this is unique
    2) Survived: This is a true(1)/false(0) of whether or not the passenger has survived
    3) Pclass: class of ticket, this is either lower (3), middle (2) or upper (1)
    4) Name: Passenger name as a string. It may be worth splitting passenger name into surnames to see if we can group by families
    5) Sex: categorical of either male or female
    6) SibSp: This is the number of siblings and spouses on-board the titanic
    7) Parch: This is the number of parents and children on-board the titanic
    8) Ticket number: I assume this is unique --> I wonder if there is any processing that can be done to extract more information.
    9) Fare: price paid for ticket
    10) Cabin: Cabin number, maybe this is something to do with location upon the titanic..may require some processing. Or maybe we can make a new column which represents whether or not a person has a Cabin.
    11) Embarked: Location at which person embarked the titanic. S is Southampton, Q is queenstown and C is Cherbourg


## Data Preparation

Based off previous analysis the Cabin column needs to be processed to obtain information about whether or not a person had a cabin and also the location of their cabin.

In [6]:
titanic_training_data['Cabin'] = titanic_training_data.Cabin.str.get(0).fillna('NoCabin')
titanic_training_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,NoCabin,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,NoCabin,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,NoCabin,S


In [7]:
titanic_testing_data['Cabin'] = titanic_testing_data.Cabin.str.get(0).fillna('NoCabin')
titanic_testing_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,NoCabin,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,NoCabin,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,NoCabin,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,NoCabin,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,NoCabin,S


In [8]:
titanic_training_data['Family_Name'] = titanic_training_data.Name.str.split(',').str.get(0)
titanic_testing_data['Family_Name'] = titanic_testing_data.Name.str.split(',').str.get(0)

Worked out family names incase I want to include that in model.

## First Model

As I don't know what type of estimator I should be using I used the SciKit-Learn cheat sheet to choose LinearSVC (https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html). The reason for this is that we have available to us more then 50 labelled training set rows and we wish to classify them into two categories which are whether or not a person survived.

SVC stands for support vector classification. It looks to find a plane betweeen the extremes of different categories.

#### Importing SVM library

In [9]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

#### Data preprocessing

#### First remove all NaNs

In [10]:
titanic_training_data = titanic_training_data.dropna()
titanic_testing_data = titanic_testing_data.dropna()

#### Remove Cabin T as looks like outlier.

In [31]:
titanic_training_data.Cabin!='T'

0      True
1      True
2      True
3      True
4      True
6      True
7      True
8      True
9      True
10     True
11     True
12     True
13     True
14     True
15     True
16     True
18     True
20     True
21     True
22     True
23     True
24     True
25     True
27     True
30     True
33     True
34     True
35     True
37     True
38     True
       ... 
856    True
857    True
858    True
860    True
861    True
862    True
864    True
865    True
866    True
867    True
869    True
870    True
871    True
872    True
873    True
874    True
875    True
876    True
877    True
879    True
880    True
881    True
882    True
883    True
884    True
885    True
886    True
887    True
889    True
890    True
Name: Cabin, Length: 712, dtype: bool

First want to decide which features to use. Based off previous analysis I would like to use:

    1) Sex
    2) Ticket class
    3) Siblings and spouses
    4) Parents and children
    5) Age
    6) Cabin
    7) Fare cost

In [11]:
feature_columns = ['Sex', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Age', 'Embarked']
categorical_columns = ['Cabin', 'Sex', 'Embarked']
X_data = titanic_training_data[feature_columns]
X_data_kaggle = titanic_testing_data[feature_columns]
Y_data = titanic_training_data['Survived']

Encode categorical data

In [12]:
X_data = pd.get_dummies(X_data, drop_first=False)
X_data_kaggle = pd.get_dummies(X_data_kaggle, drop_first=False)
X_data.head(1)

Unnamed: 0,SibSp,Parch,Fare,Age,Sex_female,Sex_male,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_NoCabin,Cabin_T,Embarked_C,Embarked_Q,Embarked_S
0,1,0,7.25,22.0,0,1,0,0,0,0,0,0,0,1,0,0,0,1


The X data is encoded, however, need to drop one column from each category.

In [13]:
columns_to_drop = ['Sex_male', 'Cabin_A', 'Embarked_C']
X_data = X_data.drop(columns_to_drop, axis=1)
X_data_kaggle = X_data_kaggle.drop(columns_to_drop, axis=1)
X_data.head(1)

Unnamed: 0,SibSp,Parch,Fare,Age,Sex_female,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_NoCabin,Cabin_T,Embarked_Q,Embarked_S
0,1,0,7.25,22.0,0,0,0,0,0,0,0,1,0,0,1


In [27]:
X_data_kaggle.head(1)

Unnamed: 0,SibSp,Parch,Fare,Age,Sex_female,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_NoCabin,Embarked_Q,Embarked_S
0,0,0,7.8292,34.5,0,0,0,0,0,0,0,1,1,0


In [24]:
scaler = StandardScaler()

In [25]:
processed_X_data = scaler.fit_transform(X_data)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [26]:
processed_X_data_kaggle = scaler.transform(X_data_kaggle)

  """Entry point for launching an IPython kernel.


ValueError: operands could not be broadcast together with shapes (331,14) (15,) (331,14) 

#### Fitting a Linear SVM model

##### Splitting the data into training and testing set

In [16]:
X_train, X_test, Y_train, Y_test = train_test_split(processed_X_data, Y_data, test_size = 0.2, random_state = 99)

In [17]:
classifier = LinearSVC(max_iter = 10000)

In [18]:
classifier.fit(X_train, Y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=10000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [19]:
y_pred = classifier.predict(X_test)

In [20]:
from sklearn.metrics import confusion_matrix

In [21]:
confusion_matrix(y_pred, Y_test)

array([[72, 21],
       [13, 37]], dtype=int64)