# Business Question

use machine learning to create a model that predicts which passengers survived the Titanic shipwreck. we’ll gain access to two similar datasets that include passenger information like name, age, gender, socio-economic class, etc. One dataset is titled `train.csv` and the other is titled `test.csv`.

Train.csv will contain the details of a subset of the passengers on board (891 to be exact) and importantly, will reveal whether they survived or not, also known as the “ground truth”.

The `test.csv` dataset contains similar information but does not disclose the “ground truth” for each passenger. It’s your job to predict these outcomes

# importing

In [39]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier


import warnings
warnings.filterwarnings("ignore")

# Data Understanding

In [3]:
train_df=pd.read_csv("train.csv")
test_df=pd.read_csv("test.csv")
combine = [train_df, test_df]

In [4]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
train_df.columns.values

array(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'], dtype=object)

In [6]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [7]:
print(test_df.shape)
print(train_df.shape)

(418, 11)
(891, 12)


In [8]:
train_df.info()
print('_'*50,"\n")
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
__________________________________________________ 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0

In [9]:
train_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [10]:
train_df.describe(include="O")

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Braund, Mr. Owen Harris",male,347082,B96 B98,S
freq,1,577,7,4,644


In [11]:
for col in train_df.columns:
    print(col,":",train_df[col].nunique())
    print(train_df[col].value_counts().nlargest(5))
    print('\n' + '*' * 20 + '\n')

PassengerId : 891
1      1
599    1
588    1
589    1
590    1
Name: PassengerId, dtype: int64

********************

Survived : 2
0    549
1    342
Name: Survived, dtype: int64

********************

Pclass : 3
3    491
1    216
2    184
Name: Pclass, dtype: int64

********************

Name : 891
Braund, Mr. Owen Harris             1
Boulos, Mr. Hanna                   1
Frolicher-Stehli, Mr. Maxmillian    1
Gilinski, Mr. Eliezer               1
Murdlin, Mr. Joseph                 1
Name: Name, dtype: int64

********************

Sex : 2
male      577
female    314
Name: Sex, dtype: int64

********************

Age : 88
24.0    30
22.0    27
18.0    26
19.0    25
28.0    25
Name: Age, dtype: int64

********************

SibSp : 7
0    608
1    209
2     28
4     18
3     16
Name: SibSp, dtype: int64

********************

Parch : 7
0    678
1    118
2     80
5      5
3      5
Name: Parch, dtype: int64

********************

Ticket : 681
347082      7
CA. 2343    7
1601        7
31012

In [12]:
from IPython.display import display
for feature in ['Pclass','Sex','SibSp','Parch']:
    
    display(train_df[[feature, 'Survived']].groupby([feature], as_index=False).mean().sort_values(by='Survived', ascending=False))

Unnamed: 0,Pclass,Survived
0,1,0.62963
1,2,0.472826
2,3,0.242363


Unnamed: 0,Sex,Survived
0,female,0.742038
1,male,0.188908


Unnamed: 0,SibSp,Survived
1,1,0.535885
2,2,0.464286
0,0,0.345395
3,3,0.25
4,4,0.166667
5,5,0.0
6,8,0.0


Unnamed: 0,Parch,Survived
3,3,0.6
1,1,0.550847
2,2,0.5
0,0,0.343658
5,5,0.2
4,4,0.0
6,6,0.0


# Data Wrangling

In [13]:
print("Before", train_df.shape, test_df.shape, combine[0].shape, combine[1].shape)

train_df = train_df.drop(['Ticket', 'Cabin'], axis=1)
test_df = test_df.drop(['Ticket', 'Cabin'], axis=1)
combine = [train_df, test_df]

"After", train_df.shape, test_df.shape, combine[0].shape, combine[1].shape

Before (891, 12) (418, 11) (891, 12) (418, 11)


('After', (891, 10), (418, 9), (891, 10), (418, 9))

In [14]:

for df in combine:
    
    df['Title'] = df.Name.str.extract(' ([A-Za-z]+)\.', expand=False)

pd.crosstab(train_df['Title'], train_df['Sex'])



Sex,female,male
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Capt,0,1
Col,0,2
Countess,1,0
Don,0,1
Dr,1,6
Jonkheer,0,1
Lady,1,0
Major,0,2
Master,0,40
Miss,182,0


In [15]:
for df in combine: df['Title'] = df['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

df['Title'] = df['Title'].replace('Mlle', 'Miss')
df['Title'] = df['Title'].replace('Ms', 'Miss')
df['Title'] = df['Title'].replace('Mme', 'Mrs')
train_df[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()

Unnamed: 0,Title,Survived
0,Master,0.575
1,Miss,0.697802
2,Mlle,1.0
3,Mme,1.0
4,Mr,0.156673
5,Mrs,0.792
6,Ms,1.0
7,Rare,0.347826


In [16]:

for df in combine:
    df['Title']=df['Title'].map({"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5})
    df['Title'] = df['Title'].fillna(0)

train_df['Title'].value_counts()
    

1.0    517
2.0    182
3.0    125
4.0     40
5.0     23
0.0      4
Name: Title, dtype: int64

In [17]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,1.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,3.0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S,2.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S,3.0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S,1.0


In [18]:
train_df = train_df.drop(['Name', 'PassengerId'], axis=1)
test_df = test_df.drop(['Name'], axis=1)
combine = [train_df, test_df]
train_df.shape, test_df.shape

((891, 9), (418, 9))

In [19]:
for df in combine:
    df.Sex=(df.Sex=='female').astype(int)

train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0,3,0,22.0,1,0,7.25,S,1.0
1,1,1,1,38.0,1,0,71.2833,C,3.0
2,1,3,1,26.0,0,0,7.925,S,2.0
3,1,1,1,35.0,1,0,53.1,S,3.0
4,0,3,0,35.0,0,0,8.05,S,1.0


In [20]:
for df in combine:
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

train_df[['FamilySize', 'Survived']].groupby(['FamilySize'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,FamilySize,Survived
3,4,0.724138
2,3,0.578431
1,2,0.552795
6,7,0.333333
0,1,0.303538
4,5,0.2
5,6,0.136364
7,8,0.0
8,11,0.0


In [21]:
train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,FamilySize
0,0,3,0,22.0,1,0,7.25,S,1.0,2
1,1,1,1,38.0,1,0,71.2833,C,3.0,2
2,1,3,1,26.0,0,0,7.925,S,2.0,1
3,1,1,1,35.0,1,0,53.1,S,3.0,2
4,0,3,0,35.0,0,0,8.05,S,1.0,1


In [22]:
for df in combine:
    df['IsAlone'] = 0
    df.loc[df['FamilySize'] == 1, 'IsAlone'] = 1
    

train_df[['IsAlone', 'Survived']].groupby(['IsAlone'], as_index=False).mean()

Unnamed: 0,IsAlone,Survived
0,0,0.50565
1,1,0.303538


In [23]:
train_df = train_df.drop(['Parch', 'SibSp', 'FamilySize'], axis=1)
test_df = test_df.drop(['Parch', 'SibSp', 'FamilySize'], axis=1)
combine = [train_df, test_df]

In [24]:
train_df

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,IsAlone
0,0,3,0,22.0,7.2500,S,1.0,0
1,1,1,1,38.0,71.2833,C,3.0,0
2,1,3,1,26.0,7.9250,S,2.0,1
3,1,1,1,35.0,53.1000,S,3.0,0
4,0,3,0,35.0,8.0500,S,1.0,1
...,...,...,...,...,...,...,...,...
886,0,2,0,27.0,13.0000,S,5.0,1
887,1,1,1,19.0,30.0000,S,2.0,1
888,0,3,1,,23.4500,S,2.0,0
889,1,1,0,26.0,30.0000,C,1.0,1


In [25]:

for df in combine:
    df['Embarked'] = df['Embarked'].fillna(train_df.Embarked.dropna().mode()[0])
    
train_df[['Embarked', 'Survived']].groupby(['Embarked'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,Embarked,Survived
0,C,0.553571
1,Q,0.38961
2,S,0.339009


In [26]:
for df in combine:
    df['Embarked'] = df['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)

train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,IsAlone
0,0,3,0,22.0,7.25,0,1.0,0
1,1,1,1,38.0,71.2833,1,3.0,0
2,1,3,1,26.0,7.925,0,2.0,1
3,1,1,1,35.0,53.1,0,3.0,0
4,0,3,0,35.0,8.05,0,1.0,1


In [27]:
test_df['Fare'].fillna(test_df['Fare'].dropna().median(), inplace=True)
test_df.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare,Embarked,Title,IsAlone
0,892,3,0,34.5,7.8292,2,1,1
1,893,3,1,47.0,7.0,0,3,0
2,894,2,0,62.0,9.6875,2,1,1
3,895,3,0,27.0,8.6625,0,1,1
4,896,3,1,22.0,12.2875,0,3,0


In [28]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()

for df in combine:
    testdf = df[df['Age'].isnull()==True]
    traindf = df[df['Age'].isnull()==False]
    y = traindf['Age']
        
    traindf.drop("Age",axis=1,inplace=True)
        
    lr.fit(traindf,y)
    testdf.drop("Age",axis=1,inplace=True)
    pred = lr.predict(testdf)
    df.loc[df['Age'].isnull()==True,'Age']= pred
    df["Age"]=df['Age'].astype(int)

In [29]:
train_df.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
Fare        0
Embarked    0
Title       0
IsAlone     0
dtype: int64

In [30]:
test_df.isnull().sum()

PassengerId    0
Pclass         0
Sex            0
Age            0
Fare           0
Embarked       0
Title          0
IsAlone        0
dtype: int64

In [31]:
train_df

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,IsAlone
0,0,3,0,22,7.2500,0,1.0,0
1,1,1,1,38,71.2833,1,3.0,0
2,1,3,1,26,7.9250,0,2.0,1
3,1,1,1,35,53.1000,0,3.0,0
4,0,3,0,35,8.0500,0,1.0,1
...,...,...,...,...,...,...,...,...
886,0,2,0,27,13.0000,0,5.0,1
887,1,1,1,19,30.0000,0,2.0,1
888,0,3,1,22,23.4500,0,2.0,0
889,1,1,0,26,30.0000,1,1.0,1


In [32]:
train_df['AgeRange'] = pd.cut(train_df['Age'], 5)
train_df[['AgeRange', 'Survived']].groupby(['AgeRange'], as_index=False).mean().sort_values(by='AgeRange', ascending=True)

Unnamed: 0,AgeRange,Survived
0,"(-0.08, 16.0]",0.598214
1,"(16.0, 32.0]",0.33125
2,"(32.0, 48.0]",0.388128
3,"(48.0, 64.0]",0.434783
4,"(64.0, 80.0]",0.090909


In [33]:
for df in combine:    
    df.loc[ df['Age'] <= 16, 'Age'] = 0
    df.loc[(df['Age'] > 16) & (df['Age'] <= 32), 'Age'] = 1
    df.loc[(df['Age'] > 32) & (df['Age'] <= 48), 'Age'] = 2
    df.loc[(df['Age'] > 48) & (df['Age'] <= 64), 'Age'] = 3
    df.loc[ df['Age'] > 64, 'Age']
train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,IsAlone,AgeRange
0,0,3,0,1,7.25,0,1.0,0,"(16.0, 32.0]"
1,1,1,1,2,71.2833,1,3.0,0,"(32.0, 48.0]"
2,1,3,1,1,7.925,0,2.0,1,"(16.0, 32.0]"
3,1,1,1,2,53.1,0,3.0,0,"(32.0, 48.0]"
4,0,3,0,2,8.05,0,1.0,1,"(32.0, 48.0]"


In [34]:
train_df = train_df.drop(['AgeRange'], axis=1)
combine = [train_df, test_df]

In [35]:
train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,IsAlone
0,0,3,0,1,7.25,0,1.0,0
1,1,1,1,2,71.2833,1,3.0,0
2,1,3,1,1,7.925,0,2.0,1
3,1,1,1,2,53.1,0,3.0,0
4,0,3,0,2,8.05,0,1.0,1


In [36]:
train_df['FareRange'] = pd.qcut(train_df['Fare'], 4)
train_df[['FareRange', 'Survived']].groupby(['FareRange'], as_index=False).mean().sort_values(by='FareRange', ascending=True)

Unnamed: 0,FareRange,Survived
0,"(-0.001, 7.91]",0.197309
1,"(7.91, 14.454]",0.303571
2,"(14.454, 31.0]",0.454955
3,"(31.0, 512.329]",0.581081


In [37]:
for df in combine:
    df.loc[ df['Fare'] <= 7.91, 'Fare'] = 0
    df.loc[(df['Fare'] > 7.91) & (df['Fare'] <= 14.454), 'Fare'] = 1
    df.loc[(df['Fare'] > 14.454) & (df['Fare'] <= 31), 'Fare']   = 2
    df.loc[ df['Fare'] > 31, 'Fare'] = 3
    df['Fare'] = df['Fare'].astype(int)

train_df = train_df.drop(['FareRange'], axis=1)
combine = [train_df, test_df]
    
train_df.head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,IsAlone
0,0,3,0,1,0,0,1.0,0
1,1,1,1,2,3,1,3.0,0
2,1,3,1,1,1,0,2.0,1
3,1,1,1,2,3,0,3.0,0
4,0,3,0,2,1,0,1.0,1
5,0,3,0,1,1,2,1.0,1
6,0,1,0,3,3,0,1.0,1
7,0,3,0,0,2,0,4.0,0
8,1,3,1,1,1,0,3.0,0
9,1,2,1,0,2,1,3.0,0


In [38]:
X_train = train_df.drop("Survived", axis=1)
Y_train = train_df["Survived"]
X_test  = test_df.drop("PassengerId", axis=1).copy()
X_train.shape, Y_train.shape, X_test.shape

((891, 7), (891,), (418, 7))

In [40]:
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, Y_train)
Y_pred = random_forest.predict(X_test)
random_forest.score(X_train, Y_train)
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
acc_random_forest

87.32