# Titanic Data
## Random Forests

In [2]:
# Load Packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier


In [3]:
## Load Data

# Load seaborn's bundled 'titanic' example dataset into a DataFrame and preview
# the first rows to see which columns are available.
Titanic = sns.load_dataset('titanic')
Titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


## Data Wrangling

### Recode String Data

In [4]:
# Show dtypes and non-null counts per column: this identifies the text columns
# that need recoding and the columns that contain missing values.
Titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


### Drop Rows with Missing Data

In [5]:
# Keep only the rows that have a value in every column, then re-check the
# non-null counts to confirm nothing is missing any more.
Titanic = Titanic.dropna()
Titanic.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 182 entries, 1 to 889
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     182 non-null    int64   
 1   pclass       182 non-null    int64   
 2   sex          182 non-null    object  
 3   age          182 non-null    float64 
 4   sibsp        182 non-null    int64   
 5   parch        182 non-null    int64   
 6   fare         182 non-null    float64 
 7   embarked     182 non-null    object  
 8   class        182 non-null    category
 9   who          182 non-null    object  
 10  adult_male   182 non-null    bool    
 11  deck         182 non-null    category
 12  embark_town  182 non-null    object  
 13  alive        182 non-null    object  
 14  alone        182 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 18.2+ KB


In [6]:
# List the column names one per line.
for column_name in Titanic.columns.tolist():
    print(column_name)

survived
pclass
sex
age
sibsp
parch
fare
embarked
class
who
adult_male
deck
embark_town
alive
alone


In [7]:
# Columns that are not used as model features in this notebook.
unused_columns = ['fare', 'adult_male', 'embarked', 'embark_town', 'alive', 'alone']
Titanic = Titanic.drop(columns=unused_columns)
Titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,class,who,deck
1,1,1,female,38.0,1,0,First,woman,C
3,1,1,female,35.0,1,0,First,woman,C
6,0,1,male,54.0,0,0,First,man,E
10,1,3,female,4.0,1,1,Third,child,G
11,1,1,female,58.0,0,0,First,woman,C


In [8]:
# Encode sex as a 0/1 indicator: 1 for "female", 0 for anything else.
Titanic['sex'] = Titanic['sex'].eq("female").astype(int)
Titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,class,who,deck
1,1,1,1,38.0,1,0,First,woman,C
3,1,1,1,35.0,1,0,First,woman,C
6,0,1,0,54.0,0,0,First,man,E
10,1,3,1,4.0,1,1,Third,child,G
11,1,1,1,58.0,0,0,First,woman,C


In [9]:
# Count how many passengers are recorded on each deck letter.
Titanic['deck'].value_counts()

C    51
B    43
D    31
E    30
A    12
F    11
G     4
Name: deck, dtype: int64

In [9]:
def deck_recode(series):
    """Map a single deck letter to its alphabetical position as a digit string.

    Despite the parameter name, this receives one value at a time (it is passed
    to ``Series.apply``), not a whole Series.

    Parameters
    ----------
    series : str
        Deck letter, one of 'A' through 'G'.

    Returns
    -------
    str or None
        '1' for 'A' up to '7' for 'G'. Any other value yields None.
    """
    # Lookup table instead of an if-chain; 'D' previously mapped to ' 4' with a
    # stray leading space, which showed up as a malformed label in value_counts().
    deck_codes = {
        'A': '1',
        'B': '2',
        'C': '3',
        'D': '4',
        'E': '5',
        'F': '6',
        'G': '7',
    }
    # Non-string input (e.g. NaN) never matched a letter before, so it still
    # yields None rather than raising on an unhashable value.
    if not isinstance(series, str):
        return None
    return deck_codes.get(series)

# Replace each deck letter with its code, then check the new distribution.
recoded_deck = Titanic['deck'].apply(deck_recode)
Titanic['deck'] = recoded_deck

Titanic['deck'].value_counts()

3     51
2     43
 4    31
5     30
1     12
6     11
7      4
Name: deck, dtype: int64

In [10]:
# Preview the frame after the sex and deck recodes.
Titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,class,who,deck
1,1,1,1,38.0,1,0,First,woman,3
3,1,1,1,35.0,1,0,First,woman,3
6,0,1,0,54.0,0,0,First,man,5
10,1,3,1,4.0,1,1,Third,child,7
11,1,1,1,58.0,0,0,First,woman,3


In [15]:
# NOTE(review): the saved output for this cell is an empty Series, yet 'who' had
# 182 non-null values after dropna(). The output looks stale (the cell counter
# also jumps from 10 to 15) — re-run from a fresh kernel to confirm.
Titanic.who.value_counts()

Series([], Name: who, dtype: int64)

In [18]:
# Drop the remaining text columns before modelling:
#   - 'class' mirrors 'pclass' in the previewed rows (First/Third vs 1/3);
#   - 'who' holds strings (woman/man/child), which the scikit-learn estimators
#     fitted later cannot convert to numbers.
# The saved outputs already show 'who' removed, but no remaining cell dropped
# it, so a fresh Restart & Run All would carry it into the feature matrix.
Titanic = Titanic.drop(columns=['class', 'who'])
Titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,deck
1,1,1,1,38.0,1,0,3
3,1,1,1,35.0,1,0,3
6,0,1,0,54.0,0,0,5
10,1,3,1,4.0,1,1,7
11,1,1,1,58.0,0,0,3


In [20]:
# Cast both columns to integers in one step: age loses its fractional part and
# the deck codes stop being text.
Titanic = Titanic.astype({'age': int, 'deck': int})

In [21]:
# Confirm that every remaining column is numeric and has no missing values
# before splitting into features and target.
Titanic.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 182 entries, 1 to 889
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   survived  182 non-null    int64
 1   pclass    182 non-null    int64
 2   sex       182 non-null    int32
 3   age       182 non-null    int32
 4   sibsp     182 non-null    int64
 5   parch     182 non-null    int64
 6   deck      182 non-null    int32
dtypes: int32(3), int64(4)
memory usage: 9.2 KB


## Define X and Y Variables

In [27]:
# Target: the survival flag. Features: every other remaining column.
y = Titanic['survived']
x = Titanic.drop(columns=['survived'])

## Train Test Split

In [28]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=76,
)

## Decision Tree

In [29]:
# Baseline model: one decision tree with default settings and a fixed seed.
decisionTree = DecisionTreeClassifier(random_state=76)
decisionTree.fit(X=x_train, y=y_train)

In [33]:
# Predict survival for the held-out test rows with the fitted tree.
treePredictions = decisionTree.predict(x_test)

In [34]:
# Per-class precision/recall/F1 and overall accuracy for the decision tree.
tree_report = classification_report(y_true=y_test, y_pred=treePredictions)
print(tree_report)

              precision    recall  f1-score   support

           0       0.60      0.63      0.62        19
           1       0.80      0.78      0.79        36

    accuracy                           0.73        55
   macro avg       0.70      0.70      0.70        55
weighted avg       0.73      0.73      0.73        55



# Decision Tree: 73% accuracy (weighted-average F1: 0.73)

## Random Forest

In [30]:
# Ensemble of 500 trees, seeded the same way as the single-tree baseline.
forest = RandomForestClassifier(random_state=76, n_estimators=500)
forest.fit(X=x_train, y=y_train)

In [31]:
# Score the forest on the held-out rows: confusion matrix first, then the
# per-class report.
forestPredictions = forest.predict(x_test)
for forest_metric in (confusion_matrix, classification_report):
    print(forest_metric(y_test, forestPredictions))

[[11  8]
 [ 8 28]]
              precision    recall  f1-score   support

           0       0.58      0.58      0.58        19
           1       0.78      0.78      0.78        36

    accuracy                           0.71        55
   macro avg       0.68      0.68      0.68        55
weighted avg       0.71      0.71      0.71        55



# Random Forest: 71% accuracy (weighted-average F1: 0.71)