# Sinking of the Titanic

In [1]:
# import necessary modules
import pandas as pd
import os
import matplotlib.pyplot as plt

In [2]:
%matplotlib notebook

In [3]:
# reading train.csv and test.csv files
cwd = os.getcwd()
path = os.path.join(cwd, 'data')
file_test = 'test.csv'
file_train = 'train.csv'
test = pd.read_csv(os.path.join(path, file_test))
train = pd.read_csv(os.path.join(path, file_train))

In [4]:
# exploring test and train datasets
print(type(test))
print('Test shape:', test.shape)
print('Train shape:', train.shape)
print('Train dataset feature:', train.columns)
train.head(10)

<class 'pandas.core.frame.DataFrame'>
Test shape: (418, 11)
Train shape: (891, 12)
Train dataset feature: Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [5]:
#     PassengerID - A column added by Kaggle to identify each row and make submissions easier
#     Survived - Whether the passenger survived or not and the value we are predicting (0=No, 1=Yes)
#     Pclass - The class of the ticket the passenger purchased (1=1st, 2=2nd, 3=3rd)
#     Sex - The passenger's sex
#     Age - The passenger's age in years
#     SibSp - The number of siblings or spouses the passenger had aboard the Titanic
#     Parch - The number of parents or children the passenger had aboard the Titanic
#     Ticket - The passenger's ticket number
#     Fare - The fare the passenger paid
#     Cabin - The passenger's cabin number
#     Embarked - The port where the passenger embarked (C=Cherbourg, Q=Queenstown, S=Southampton)

In [6]:
# sex ratio to survival
sex_pivot = train.pivot_table(index="Sex",values="Survived")
sex_pivot.plot.bar()
plt.show()

<IPython.core.display.Javascript object>

In [7]:
# the same stuff with passenger class
sex_pivot = train.pivot_table(index="Pclass",values="Survived")
sex_pivot.plot.bar()

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x7fce5ec44390>

In [8]:
# explore Age column
train['Age'].describe()

count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64

In [9]:
# relationship between age and survival
fig, (ax1, ax2) = plt.subplots(1,2)
survived = train[train["Survived"] == 1]
died = train[train["Survived"] == 0]
ax1.hist(survived["Age"], color='red', bins=40);
ax2.hist(died["Age"], bins = 40);

<IPython.core.display.Javascript object>

  keep = (tmp_a >= first_edge)
  keep &= (tmp_a <= last_edge)


In [10]:
# separate age into a categorical feature by dividing it into ranges
def process_age(df,cut_points,label_names):
    df["Age"] = df["Age"].fillna(-0.5) # replacement of null by -0.5
    df["Age_categories"] = pd.cut(df["Age"],cut_points,labels=label_names)
    return df

In [11]:
cut_points = [-1,0,18,100]
label_names = ["Missing","Child","Adult"]

train = process_age(train,cut_points,label_names)
test = process_age(test,cut_points,label_names)

In [12]:
train.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_categories
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Adult
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Adult
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Adult
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Adult
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Adult
5,6,0,3,"Moran, Mr. James",male,-0.5,0,0,330877,8.4583,,Q,Missing
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,Adult
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S,Child
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S,Adult
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C,Child


In [13]:
cut_points = [-1,0, 5, 12, 18, 35, 60, 100]
label_names = ["Missing", 'Infant', "Child", 'Teenager', "Young Adult", 'Adult', 'Senior']

train = process_age(train,cut_points,label_names)
test = process_age(test,cut_points,label_names)

age_pivot = train.pivot_table(index="Age_categories", values='Survived')
age_pivot.plot.bar()

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x7fce5ea71828>

In [15]:
train.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_categories
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Young Adult
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Adult
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Young Adult
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Young Adult
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Young Adult
5,6,0,3,"Moran, Mr. James",male,-0.5,0,0,330877,8.4583,,Q,Missing
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,Adult
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S,Infant
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S,Young Adult
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C,Teenager


In [16]:
def create_dummies(df,column_name):
    dummies = pd.get_dummies(df[column_name],prefix=column_name)
    df = pd.concat([df,dummies],axis=1)
    return df

In [17]:
# using the create_dummies() function to create dummy variables for Pclass, Sex and Age_categories
# in order to remove relationship between the numbers (in case of Pclass)
train = create_dummies(train,"Pclass")
test = create_dummies(test,"Pclass")
train = create_dummies(train,"Sex")
test = create_dummies(test,"Sex")
train = create_dummies(train,"Age_categories")
test = create_dummies(test,"Age_categories")
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,Pclass_3,Sex_female,Sex_male,Age_categories_Missing,Age_categories_Infant,Age_categories_Child,Age_categories_Teenager,Age_categories_Young Adult,Age_categories_Adult,Age_categories_Senior
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,...,1,0,1,0,0,0,0,1,0,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,...,0,1,0,0,0,0,0,0,1,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,...,1,1,0,0,0,0,0,1,0,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,...,0,1,0,0,0,0,0,1,0,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,...,1,0,1,0,0,0,0,1,0,0


In [92]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.svm import SVC

In [107]:
columns = ['Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_female', 'Sex_male',
       'Age_categories_Missing','Age_categories_Infant',
       'Age_categories_Child', 'Age_categories_Teenager',
       'Age_categories_Young Adult', 'Age_categories_Adult',
       'Age_categories_Senior']

X = train[columns]
y = train['Survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# clf = LogisticRegression(solver='lbfgs', C=4).fit(X_train, y_train)
clf = SVC(gamma=1).fit(X_train, y_train)

print('Test shape:', X_test.shape)
print('Train shape:', X_train.shape)

Test shape: (179, 12)
Train shape: (712, 12)


In [108]:
scores = cross_val_score(clf, X, y, cv=10)
np.mean(scores)

0.7991462376574736

In [89]:
test_prediction = clf.predict(test[columns])

In [90]:
test_ids = test['PassengerId']
submission_df = {'PassengerId': test_ids,
                'Survived': test_prediction}
submission = pd.DataFrame(submission_df)

submission.to_csv('titanic_submission_svc.csv', index=False)