In [5]:
# Importing dependencies
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn import svm

from sklearn.metrics import accuracy_score

import warnings 
warnings.filterwarnings("ignore")

%matplotlib inline

In [6]:
# Function to transform the titles other than Mr, Mrs, Miss, Master into these using age, sex,
# and titles extracted from Name Columns

def new_titles(dataFrame):
    titles = dataFrame.Title.values.tolist()
    sex = dataFrame.Sex.values.tolist()
    age = dataFrame.Age.values.tolist()
    title_new = []
    for i in range(len(titles)):
        if sex[i] == 'male':
            if age[i] > 18:
                if titles[i] in ['Mr', 'Master','Mrs', 'Miss']:
                    title_new.append(titles[i])
                else:
                    title_new.append('Mr')

            else:
                if titles[i] in ['Mr', 'Master','Mrs', 'Miss']:
                    title_new.append(titles[i])
                else:
                    title_new.append('Master')
        else:
            if age[i] > 18:
                if titles[i] in ['Mr', 'Master','Mrs', 'Miss']:
                    title_new.append(titles[i])
                else:
                    title_new.append('Mrs')
            else:
                if titles[i] in ['Mr', 'Master','Mrs', 'Miss']:
                    title_new.append(titles[i])
                else:
                    title_new.append('Miss')
    dataFrame['Title'] = title_new

In [8]:
# Reading the Dataset
df_train = pd.read_csv(r'Data/train.csv')
df_test = pd.read_csv(r'Data/test.csv')
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [9]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [10]:
df_train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [11]:
df_train.groupby(['Survived']).mean()

Unnamed: 0_level_0,PassengerId,Pclass,Age,SibSp,Parch,Fare
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,447.016393,2.531876,30.626179,0.553734,0.32969,22.117887
1,444.368421,1.950292,28.34369,0.473684,0.464912,48.395408


In [12]:
# Extracting Title from Name using Reg Expressions
df_train['Title'] = df_train['Name'].str.extract('(\s[A-Z]\w{0,})')
df_train['Title'] = df_train['Title'].str.strip()
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr


In [13]:
# Dropping Name Column
df_train.drop(['Name'], axis=1, inplace= True)
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,male,22.0,1,0,A/5 21171,7.25,,S,Mr
1,2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs
2,3,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss
3,4,1,1,female,35.0,1,0,113803,53.1,C123,S,Mrs
4,5,0,3,male,35.0,0,0,373450,8.05,,S,Mr


In [14]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Sex          891 non-null    object 
 4   Age          714 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Ticket       891 non-null    object 
 8   Fare         891 non-null    float64
 9   Cabin        204 non-null    object 
 10  Embarked     889 non-null    object 
 11  Title        891 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [15]:
df_train.groupby(['Sex', 'Title']).mean()['Survived']

Sex     Title      
female  Castellana     1.000000
        Countess       1.000000
        Dr             1.000000
        Gordon         1.000000
        Impe           0.000000
        Messemaeker    1.000000
        Miss           0.703911
        Mlle           1.000000
        Mme            1.000000
        More           1.000000
        Mrs            0.801653
        Ms             1.000000
        Planke         0.000000
male    Billiard       0.000000
        Capt           0.000000
        Carlo          0.000000
        Castellana     0.000000
        Col            0.500000
        Cruyssen       0.000000
        Don            0.000000
        Dr             0.333333
        Gordon         1.000000
        Impe           0.000000
        Jonkheer       0.000000
        Major          0.500000
        Manent         1.000000
        Master         0.575000
        Melkebeke      0.000000
        Mr             0.157058
        Mulder         1.000000
        Pelsmaeker  

In [16]:
# Replacing the missing age value with the median of corresponding Titles
df_train.at[(df_train.Age.isna()) & (df_train.Title == 'Mr'), 'Age'] = df_train[df_train.Title == 'Mr'].Age.median()
df_train.at[(df_train.Age.isna()) & (df_train.Title == 'Miss'), 'Age'] = df_train[df_train.Title == 'Miss'].Age.median()
df_train.at[(df_train.Age.isna()) & (df_train.Title == 'Mrs'), 'Age'] = df_train[df_train.Title == 'Mrs'].Age.median()
df_train.at[(df_train.Age.isna()) & (df_train.Title == 'Master'), 'Age'] = df_train[df_train.Title == 'Master'].Age.median()
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Sex          891 non-null    object 
 4   Age          888 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Ticket       891 non-null    object 
 8   Fare         891 non-null    float64
 9   Cabin        204 non-null    object 
 10  Embarked     889 non-null    object 
 11  Title        891 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [17]:
df_train[df_train.Age.isnull()]

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
547,548,1,2,male,,0,0,SC/PARIS 2146,13.8625,,C,Manent
766,767,0,1,male,,0,0,112379,39.6,,C,Dr
868,869,0,3,male,,0,0,345777,9.5,,S,Melkebeke


In [18]:
df_train.Age.fillna(df_train[df_train.Title == 'Mr'].Age.median(), inplace= True)
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Sex          891 non-null    object 
 4   Age          891 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Ticket       891 non-null    object 
 8   Fare         891 non-null    float64
 9   Cabin        204 non-null    object 
 10  Embarked     889 non-null    object 
 11  Title        891 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [19]:
# using funtion to change titles to mr, miss, ...
new_titles(df_train)
df_train.Title.unique()

array(['Mr', 'Mrs', 'Miss', 'Master'], dtype=object)

In [20]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,male,22.0,1,0,A/5 21171,7.25,,S,Mr
1,2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs
2,3,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss
3,4,1,1,female,35.0,1,0,113803,53.1,C123,S,Mrs
4,5,0,3,male,35.0,0,0,373450,8.05,,S,Mr


In [21]:
# Converting categorical variables to numerical 
df_train['Title'].replace(['Mr', 'Mrs', 'Master', 'Miss'], [1,2,3,4], inplace=True)
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,male,22.0,1,0,A/5 21171,7.25,,S,1
1,2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C,2
2,3,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,4
3,4,1,1,female,35.0,1,0,113803,53.1,C123,S,2
4,5,0,3,male,35.0,0,0,373450,8.05,,S,1


In [22]:
# Transforming Cabin Columns, if cabin present set 1 or 0
df_train.at[df_train.Cabin.isna() == False, 'Cabin'] = 1
df_train.at[df_train.Cabin.isna() == True, 'Cabin'] = 0
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,male,22.0,1,0,A/5 21171,7.25,0,S,1
1,2,1,1,female,38.0,1,0,PC 17599,71.2833,1,C,2
2,3,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,0,S,4
3,4,1,1,female,35.0,1,0,113803,53.1,1,S,2
4,5,0,3,male,35.0,0,0,373450,8.05,0,S,1


In [23]:
# Filling missing Embarked values with Mode vlaues
df_train['Embarked'].fillna(df_train['Embarked'].mode()[0], inplace= True)

In [24]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Sex          891 non-null    object 
 4   Age          891 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Ticket       891 non-null    object 
 8   Fare         891 non-null    float64
 9   Cabin        891 non-null    object 
 10  Embarked     891 non-null    object 
 11  Title        891 non-null    int64  
dtypes: float64(2), int64(6), object(4)
memory usage: 83.7+ KB


In [25]:
# Replacing Categorical variables like Sex, Embarked into numerical discrete variables
df_train['Sex'] = df_train['Sex'].replace(['male','female'],[1,0]) 
df_train['Embarked'] = df_train['Embarked'].replace(['C','Q','S'], [1,2,3])
df_train.Cabin = df_train.Cabin.astype(int)
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Sex          891 non-null    int64  
 4   Age          891 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Ticket       891 non-null    object 
 8   Fare         891 non-null    float64
 9   Cabin        891 non-null    int32  
 10  Embarked     891 non-null    int64  
 11  Title        891 non-null    int64  
dtypes: float64(2), int32(1), int64(8), object(1)
memory usage: 80.2+ KB


In [26]:
# Using the Pearson method to compute the correlation between the variables
df_train.corr(method='pearson')

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Title
PassengerId,1.0,-0.005007,-0.035144,0.042939,0.04119,-0.057527,-0.001652,0.012658,0.019919,0.013128,-0.080753
Survived,-0.005007,1.0,-0.338481,-0.543351,-0.077786,-0.035322,0.081629,0.257307,0.316912,-0.167675,0.460385
Pclass,-0.035144,-0.338481,1.0,0.1319,-0.352147,0.083081,0.018443,-0.5495,-0.725541,0.162098,-0.026602
Sex,0.042939,-0.543351,0.1319,1.0,0.103887,-0.114631,-0.245489,-0.182333,-0.140391,0.108262,-0.799059
Age,0.04119,-0.077786,-0.352147,0.103887,1.0,-0.265187,-0.187636,0.097828,0.244745,-0.015621,-0.38747
SibSp,-0.057527,-0.035322,0.083081,-0.114631,-0.265187,1.0,0.414838,0.159651,-0.04046,0.06823,0.227445
Parch,-0.001652,0.081629,0.018443,-0.245489,-0.187636,0.414838,1.0,0.216225,0.036987,0.039798,0.257884
Fare,0.012658,0.257307,-0.5495,-0.182333,0.097828,0.159651,0.216225,1.0,0.482075,-0.224719,0.160659
Cabin,0.019919,0.316912,-0.725541,-0.140391,0.244745,-0.04046,0.036987,0.482075,1.0,-0.160196,0.075743
Embarked,0.013128,-0.167675,0.162098,0.108262,-0.015621,0.06823,0.039798,-0.224719,-0.160196,1.0,-0.091926


In [27]:
# Creating new features 
df_train['Family_Size'] = df_train['SibSp'] + df_train['Parch'] + 1
df_train['Age*PClass'] = df_train['Age'] * df_train['Pclass']

In [28]:
df_train.corr(method='pearson')

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Title,Family_Size,Age*PClass
PassengerId,1.0,-0.005007,-0.035144,0.042939,0.04119,-0.057527,-0.001652,0.012658,0.019919,0.013128,-0.080753,-0.040143,0.013446
Survived,-0.005007,1.0,-0.338481,-0.543351,-0.077786,-0.035322,0.081629,0.257307,0.316912,-0.167675,0.460385,0.016639,-0.339394
Pclass,-0.035144,-0.338481,1.0,0.1319,-0.352147,0.083081,0.018443,-0.5495,-0.725541,0.162098,-0.026602,0.065997,0.502362
Sex,0.042939,-0.543351,0.1319,1.0,0.103887,-0.114631,-0.245489,-0.182333,-0.140391,0.108262,-0.799059,-0.200988,0.200416
Age,0.04119,-0.077786,-0.352147,0.103887,1.0,-0.265187,-0.187636,0.097828,0.244745,-0.015621,-0.38747,-0.274986,0.571202
SibSp,-0.057527,-0.035322,0.083081,-0.114631,-0.265187,1.0,0.414838,0.159651,-0.04046,0.06823,0.227445,0.890712,-0.228527
Parch,-0.001652,0.081629,0.018443,-0.245489,-0.187636,0.414838,1.0,0.216225,0.036987,0.039798,0.257884,0.783111,-0.15258
Fare,0.012658,0.257307,-0.5495,-0.182333,0.097828,0.159651,0.216225,1.0,0.482075,-0.224719,0.160659,0.217138,-0.348808
Cabin,0.019919,0.316912,-0.725541,-0.140391,0.244745,-0.04046,0.036987,0.482075,1.0,-0.160196,0.075743,-0.009175,-0.39812
Embarked,0.013128,-0.167675,0.162098,0.108262,-0.015621,0.06823,0.039798,-0.224719,-0.160196,1.0,-0.091926,0.066516,0.140008


In [29]:
df_train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked', 'Title', 'Family_Size',
       'Age*PClass'],
      dtype='object')

In [30]:
df_train_X = df_train[['Pclass','Title','Sex','Age','Age*PClass', 'Parch', 'Fare', 'Cabin', 'Embarked']].values
df_train_y = df_train['Survived'].values

In [31]:
print(len(df_train_X), len(df_train_y))

891 891


In [33]:
# Normalizing the dataset 
df_train_X = preprocessing.StandardScaler().fit(df_train_X).transform(df_train_X.astype(float))

In [34]:
# Splitting the dataset into test-train with 20% testing size
X_train, X_test, y_train, y_test = train_test_split(df_train_X, df_train_y, test_size = 0.2, random_state=4)
print(len(X_train), len(y_train))
print(len(X_test), len(y_test))

712 712
179 179


In [35]:
# Decision Tree Classifier
tree = DTC(criterion='entropy', max_depth=4)
tree.fit(X_train, y_train)
yhat_DTC = tree.predict(X_test)
yhat_DTC[:5]

array([0, 0, 1, 1, 0], dtype=int64)

In [36]:
print('Accuracy of the DTC Model ', accuracy_score(y_test, yhat_DTC))

Accuracy of the DTC Model  0.8491620111731844


In [37]:
# Support Vector Machine
clf = svm.SVC(kernel='rbf')
clf.fit(X_train, y_train)
yhat_SVM = clf.predict(X_test)
yhat_SVM[:5]

array([0, 0, 1, 1, 0], dtype=int64)

In [38]:
print('Accuracy of the SVM Model ', accuracy_score(y_test, yhat_SVM))

Accuracy of the SVM Model  0.8715083798882681


## Preparing for Submission

In [39]:
df_test.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [40]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [41]:
df_test['Title'] = df_test['Name'].str.extract('(\s[A-Z]\w{0,})')
df_test['Title'] = df_test['Title'].str.strip()
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,Mr
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,Mrs
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,Mr
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,Mr
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,Mrs


In [42]:
df_test.drop(['Name'], axis=1, inplace= True)
df_test.at[(df_test.Age.isna()) & (df_test.Title == 'Mr'), 'Age'] = df_test[df_test.Title == 'Mr'].Age.median()
df_test.at[(df_test.Age.isna()) & (df_test.Title == 'Miss'), 'Age'] = df_test[df_test.Title == 'Miss'].Age.median()
df_test.at[(df_test.Age.isna()) & (df_test.Title == 'Mrs'), 'Age'] = df_test[df_test.Title == 'Mrs'].Age.median()
df_test.at[(df_test.Age.isna()) & (df_test.Title == 'Master'), 'Age'] = df_test[df_test.Title == 'Master'].Age.median()
df_test.Age.fillna(df_test[df_test.Title == 'Mr'].Age.median(), inplace= True)

In [43]:
# using funtion to change titles to mr, miss, ...
new_titles(df_test)
df_test.Title.unique()

array(['Mr', 'Mrs', 'Miss', 'Master'], dtype=object)

In [44]:
df_test['Fare'].fillna(df_test['Fare'].median(), inplace = True)
df_test['Title'].replace(['Mr', 'Mrs', 'Master', 'Miss'], [1,2,3,4], inplace=True)
df_test.at[df_test.Cabin.isna() == False, 'Cabin'] = 1
df_test.at[df_test.Cabin.isna() == True, 'Cabin'] = 0
df_test['Embarked'].fillna(df_test['Embarked'].mode()[0], inplace= True)
df_test['Sex'] = df_test['Sex'].replace(['male','female'],[1,0]) 
df_test['Embarked'] = df_test['Embarked'].replace(['C','Q','S'], [1,2,3])
df_test.Cabin = df_test.Cabin.astype(int)

In [45]:
# Feature Engineering
df_test['Family_Size'] = df_test['SibSp'] + df_test['Parch'] + 1
df_test['Age*PClass'] = df_test['Age'] * df_test['Pclass']

In [46]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Family_Size,Age*PClass
0,892,3,1,34.5,0,0,330911,7.8292,0,2,1,1,103.5
1,893,3,0,47.0,1,0,363272,7.0,0,3,2,2,141.0
2,894,2,1,62.0,0,0,240276,9.6875,0,2,1,1,124.0
3,895,3,1,27.0,0,0,315154,8.6625,0,3,1,1,81.0
4,896,3,0,22.0,1,1,3101298,12.2875,0,3,2,3,66.0


In [48]:
df_test = df_test[['Pclass','Title','Sex','Age','Age*PClass', 'Parch', 'Fare', 'Cabin', 'Embarked']].values
df_test = preprocessing.StandardScaler().fit(df_test).transform(df_test.astype(float))
df_test[:5]

array([[ 0.87348191, -0.7141265 ,  0.75592895,  0.37520893,  1.46000699,
        -0.4002477 , -0.49741333, -0.52752958, -0.47091535],
       [ 0.87348191,  0.14860436, -1.32287566,  1.34286276,  2.78158205,
        -0.4002477 , -0.51227801, -0.52752958,  0.70076689],
       [-0.31581919, -0.7141265 ,  0.75592895,  2.50404735,  2.18246802,
        -0.4002477 , -0.46410047, -0.52752958, -0.47091535],
       [ 0.87348191, -0.7141265 ,  0.75592895, -0.20538337,  0.66706196,
        -0.4002477 , -0.48247516, -0.52752958,  0.70076689],
       [ 0.87348191,  0.14860436, -1.32287566, -0.5924449 ,  0.13843193,
         0.61989583, -0.4174915 , -0.52752958,  0.70076689]])

In [49]:
# Decision Tree Classifier
tree = DTC(criterion='entropy', max_depth=4)
tree.fit(df_train_X, df_train_y)
yhat_sub = tree.predict(df_test)
yhat_sub[:5]

array([0, 0, 0, 0, 1], dtype=int64)

In [None]:
df = pd.read_csv(r'gender_submission.csv')
df.head()

In [95]:
submission_23 = pd.DataFrame({'PassengerId': df.PassengerId.tolist(), 'Survived': yhat_sub.tolist()})
submission_23.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [96]:
submission_23.to_csv('submission_23.csv',index=False)

In [161]:
# Support Vector Machine
clf = svm.SVC(kernel='rbf')
clf.fit(df_train_X, df_train_y)
yhat_SVM_sub = clf.predict(df_test)
yhat_SVM_sub[:5]

array([0, 0, 0, 0, 0], dtype=int64)

In [47]:
submission_22 = pd.DataFrame({'PassengerId': df.PassengerId.tolist(), 'Survived': yhat_SVM_sub.tolist()})
submission_22.to_csv('submission_22.csv',index=False)