# Titani Data Analysis

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
df = pd.read_csv('D:/Datasets/Titanic/train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
df.shape

(891, 12)

In [4]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [5]:
num_feat = [feature for feature in df.columns if df[feature].dtype!='O']
num_feat

['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

In [6]:
disc_feat = [feature for feature in num_feat if len(df[feature].unique())<10]
disc_feat

['Survived', 'Pclass', 'SibSp', 'Parch']

In [7]:
cont_feat = [feature for feature in num_feat if feature not in disc_feat]
cont_feat

['PassengerId', 'Age', 'Fare']

In [8]:
def median_impute(data, feature):
    median = data[feature].median()
    data[feature] = data[feature].fillna(median)
    return data

In [9]:
df = median_impute(df, 'Age')

In [10]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [11]:
cat_feat = [feature for feature in df.columns if feature not in num_feat]
cat_feat

['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']

In [12]:
for feature in cat_feat:
    print(feature, ':',len(df[feature].unique()))

Name : 891
Sex : 2
Ticket : 681
Cabin : 148
Embarked : 4


In [13]:
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

In [14]:
df['Embarked'].isnull().sum()

0

In [15]:
df['Cabin'].unique()

array([nan, 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
       'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
       'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
       'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
       'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62 C64',

In [16]:
df['Cabin'].fillna('Missing', inplace=True)

In [17]:
df['Cabin'] = df['Cabin'].astype(str).str[0]

In [18]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,M,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,M,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,M,S


In [19]:
df.drop(columns=['Name', 'Ticket'], inplace=True)

In [20]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,1,0,3,male,22.0,1,0,7.25,M,S
1,2,1,1,female,38.0,1,0,71.2833,C,C
2,3,1,3,female,26.0,0,0,7.925,M,S
3,4,1,1,female,35.0,1,0,53.1,C,S
4,5,0,3,male,35.0,0,0,8.05,M,S


In [21]:
data = pd.read_csv('formatedTitanicTest.csv')
data.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,892,3,male,34.5,0,0,7.8292,M,Q
1,893,3,female,47.0,1,0,7.0,M,S
2,894,2,male,62.0,0,0,9.6875,M,Q
3,895,3,male,27.0,0,0,8.6625,M,S
4,896,3,female,22.0,1,1,12.2875,M,S


In [22]:
conc = pd.concat([df, data], axis=0)
conc.shape

(1309, 10)

In [23]:
conc.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
413,1305,,3,male,27.0,0,0,8.05,M,S
414,1306,,1,female,39.0,0,0,108.9,C,C
415,1307,,3,male,38.5,0,0,7.25,M,S
416,1308,,3,male,27.0,0,0,8.05,M,S
417,1309,,3,male,27.0,1,1,22.3583,M,C


In [24]:
conc = pd.get_dummies(conc, columns=['Sex', 'Cabin', 'Embarked'], drop_first=True)

In [25]:
conc.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_M,Cabin_T,Embarked_Q,Embarked_S
0,1,0.0,3,22.0,1,0,7.25,1,0,0,0,0,0,0,1,0,0,1
1,2,1.0,1,38.0,1,0,71.2833,0,0,1,0,0,0,0,0,0,0,0
2,3,1.0,3,26.0,0,0,7.925,0,0,0,0,0,0,0,1,0,0,1
3,4,1.0,1,35.0,1,0,53.1,0,0,1,0,0,0,0,0,0,0,1
4,5,0.0,3,35.0,0,0,8.05,1,0,0,0,0,0,0,1,0,0,1


In [26]:
conc.shape

(1309, 18)

In [27]:
train_data = conc.iloc[:891, :]
test_data = conc.iloc[891:, :]

In [28]:
test_data.shape

(418, 18)

In [29]:
test_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_M,Cabin_T,Embarked_Q,Embarked_S
0,892,,3,34.5,0,0,7.8292,1,0,0,0,0,0,0,1,0,1,0
1,893,,3,47.0,1,0,7.0,0,0,0,0,0,0,0,1,0,0,1
2,894,,2,62.0,0,0,9.6875,1,0,0,0,0,0,0,1,0,1,0
3,895,,3,27.0,0,0,8.6625,1,0,0,0,0,0,0,1,0,0,1
4,896,,3,22.0,1,1,12.2875,0,0,0,0,0,0,0,1,0,0,1


In [30]:
test_data.drop(columns='Survived', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [31]:
test_data.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex_male,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_M,Cabin_T,Embarked_Q,Embarked_S
0,892,3,34.5,0,0,7.8292,1,0,0,0,0,0,0,1,0,1,0
1,893,3,47.0,1,0,7.0,0,0,0,0,0,0,0,1,0,0,1
2,894,2,62.0,0,0,9.6875,1,0,0,0,0,0,0,1,0,1,0
3,895,3,27.0,0,0,8.6625,1,0,0,0,0,0,0,1,0,0,1
4,896,3,22.0,1,1,12.2875,0,0,0,0,0,0,0,1,0,0,1


In [32]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_M,Cabin_T,Embarked_Q,Embarked_S
0,1,0.0,3,22.0,1,0,7.25,1,0,0,0,0,0,0,1,0,0,1
1,2,1.0,1,38.0,1,0,71.2833,0,0,1,0,0,0,0,0,0,0,0
2,3,1.0,3,26.0,0,0,7.925,0,0,0,0,0,0,0,1,0,0,1
3,4,1.0,1,35.0,1,0,53.1,0,0,1,0,0,0,0,0,0,0,1
4,5,0.0,3,35.0,0,0,8.05,1,0,0,0,0,0,0,1,0,0,1


In [33]:
x = train_data.drop(columns=['Survived', 'PassengerId'])
y = train_data['Survived']

In [34]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100)

In [35]:
clf.fit(x, y)

RandomForestClassifier()

In [36]:
x_test = test_data.drop(columns=['PassengerId'])

In [37]:
y_pred = clf.predict(x_test)

In [38]:
sub = pd.read_csv('D:/Datasets/Titanic/gender_submission.csv')
sub.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [39]:
y_pred = y_pred.astype('int64')

In [40]:
sub.drop(columns=['Survived'], inplace=True)

In [41]:
pred = pd.DataFrame(y_pred, columns=['Survived'])

In [42]:
sub = pd.concat([sub, pred], axis=1)
sub.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,1
4,896,0


In [43]:
sub.shape

(418, 2)

In [44]:
sub.to_csv('gender_submission.csv', index=False)