# Exploring Titanic Data

We will try to predict which passengers survived the sinking of the Titanic.

In [2]:
import pandas as pd
import numpy as np


# Load the labelled training set and preview its first five rows.
train_df = pd.read_csv(filepath_or_buffer="./train.csv")
train_df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Load the test set (the passengers to predict) and preview its first five rows.
test_df = pd.read_csv(filepath_or_buffer="./test.csv")
test_df.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


NOTE: the test data has no `Survived` column; that is the value we are trying to predict.

In [4]:
# Count the missing values in each column of the training data.
pd.isnull(train_df).sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [36]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV


# preprocess
# Re-read both CSVs so the preprocessing below starts from unmodified data.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ("./train.csv", "./test.csv"))

def preprocess(train, test):
    """Run every feature-engineering step on the train and test frames.

    The steps run in this order: names, age, cabin_num, cabin, embarked,
    fam_size, ticket, dummies. Returns the processed ``(train, test)`` pair.
    """
    # In-place steps. age() groups by the Name_Title column that names() creates,
    # so names() has to run first for each frame.
    for frame in (train, test):
        names(frame)
        age(frame)

    # cabin_num() takes both frames at once and hands back new ones.
    train, test = cabin_num(train, test)
    train, test = cabin(train), cabin(test)

    # Remaining in-place steps.
    for frame in (train, test):
        embarked(frame)
        fam_size(frame)
        ticket(frame)

    return dummies(train, test)

## names
def names(df):
    """Derive ``Name_Len`` and ``Name_Title`` from ``Name``, then drop ``Name``.

    ``df`` is modified in place; nothing is returned.
    """
    def _title_of(full_name):
        # First whitespace-separated token of the text following the first comma.
        after_comma = full_name.split(',')[1]
        return after_comma.split()[0]

    full_names = df['Name']
    df['Name_Len'] = full_names.map(len)
    df['Name_Title'] = full_names.map(_title_of)
    del df['Name']

## age
def age(df):
    """Flag missing ages and impute them, modifying ``df`` in place.

    Adds ``Age_Null_Flag`` (1 where ``Age`` was missing, otherwise 0) and fills
    missing ``Age`` values. Requires the ``Name_Title`` and ``Pclass`` columns.

    Missing ages are filled with the first available of:
      1. the mean age of passengers sharing the same title and class,
      2. the mean age of passengers sharing the same title,
      3. the mean age of the whole frame.
    The fallbacks cover groups in which no age is known, which would otherwise
    leave NaN behind. Nothing is returned.
    """
    df['Age_Null_Flag'] = df['Age'].isnull().astype('int64')

    # All candidates are computed from the original ages, before any filling.
    candidates = (
        df.groupby(['Name_Title', 'Pclass'])['Age'].transform('mean'),
        df.groupby('Name_Title')['Age'].transform('mean'),
        df['Age'].mean(),
    )

    filled = df['Age']
    for candidate in candidates:
        filled = filled.fillna(candidate)
    df['Age'] = filled

## cabin
def cabin_num(train, test):
    """Add one-hot columns for the binned cabin number to both frames.

    The cabin number is the integer that follows the first character of the
    last space-separated entry in ``Cabin`` (``'F G73'`` -> 73). Missing cabins
    and entries with no digits after the letter have no number.

    Train numbers are split into 3 quantile bins; test numbers are cut with
    the bin edges found on train. Each frame gains one ``Cabin_num_<interval>``
    column per bin; rows without a number, or outside the train range, are 0
    in all of them. ``Cabin`` itself is kept.

    Returns new ``(train, test)`` frames; the inputs are not modified.
    """
    def _last_cabin_number(cabin):
        if pd.isnull(cabin):
            return np.nan
        digits = str(cabin).split(' ')[-1][1:]
        return int(digits) if digits != '' else np.nan

    train_numbers = train['Cabin'].apply(_last_cabin_number)
    test_numbers = test['Cabin'].apply(_last_cabin_number)

    # Bin edges are learned on train only and reused for test.
    train_bins, edges = pd.qcut(train_numbers, 3, retbins=True)
    test_bins = pd.cut(test_numbers, bins=edges, include_lowest=True)

    train = pd.concat((train, pd.get_dummies(train_bins, prefix='Cabin_num')), axis=1)
    test = pd.concat((test, pd.get_dummies(test_bins, prefix='Cabin_num')), axis=1)
    return train, test

def cabin(df):
    """Replace ``Cabin`` with ``Cabin_Letter``, the first character of its text form.

    Each value goes through ``str()`` first, so a missing (NaN) cabin yields
    ``'n'``. ``df`` is modified in place and also returned.
    """
    def _first_char(value):
        return str(value)[0]

    df['Cabin_Letter'] = df['Cabin'].map(_first_char)
    del df['Cabin']
    return df

## embarked
def embarked(df):
    """Fill missing ``Embarked`` values with ``'S'``; ``df`` is modified in place."""
    ports = df['Embarked']
    df['Embarked'] = ports.fillna(value='S')

## family size
def fam_size(df):
    """Replace ``SibSp`` and ``Parch`` with a categorical ``Fam_Size`` column.

    With ``n = SibSp + Parch``: ``'Solo'`` when n == 0, ``'Nuclear'`` when
    n <= 3, otherwise ``'Big'``. ``df`` is modified in place.
    """
    relatives = df['SibSp'] + df['Parch']
    # np.select takes the first matching condition, so the order matters.
    df['Fam_Size'] = np.select(
        [relatives == 0, relatives <= 3],
        ['Solo', 'Nuclear'],
        default='Big',
    )
    for source_column in ('SibSp', 'Parch'):
        del df[source_column]

## ticket length
def ticket(df):
    """Replace ``Ticket`` with ``Ticket_Len``, the length of each ticket string.

    ``df`` is modified in place; nothing is returned.
    """
    ticket_strings = df['Ticket']
    df['Ticket_Len'] = ticket_strings.map(len)
    del df['Ticket']

def dummies(train, test, columns=('Pclass', 'Sex', 'Embarked', 'Cabin_Letter', 'Name_Title', 'Fam_Size')):
    """One-hot encode ``columns`` in both frames, keeping only shared categories.

    Each listed column is cast to strings and replaced by indicator columns
    named ``<column>_<value>``. Only values that occur in BOTH frames get an
    indicator, in the order they first appear in ``train``, so the two results
    end up with the same indicator columns in the same order.

    Parameters
    ----------
    train, test : DataFrame
        Frames that both contain every column in ``columns``.
    columns : iterable of str
        Names of the categorical columns to encode.

    Returns
    -------
    (train, test) : new frames; the inputs are not modified.
    """
    for column in columns:
        train_values = train[column].apply(str)
        test_values = test[column].apply(str)

        # Build the test-side lookup once instead of once per train category.
        test_categories = set(test_values.unique())
        good_cols = [column + '_' + value
                     for value in train_values.unique()
                     if value in test_categories]

        train = pd.concat((train, pd.get_dummies(train_values, prefix=column)[good_cols]), axis=1)
        test = pd.concat((test, pd.get_dummies(test_values, prefix=column)[good_cols]), axis=1)

        # train/test are now the concatenated copies, so this leaves the inputs alone.
        del train[column]
        del test[column]

    return train, test

# Turn the raw frames into model-ready feature frames.
train_df, test_df = preprocess(train_df, test_df)

# Discard every row that still contains a missing value (both frames, in place).
# NOTE(review): rows removed from test_df here never reach rf.predict later on.
test_df.dropna(axis=0, how='any', inplace=True)
train_df.dropna(axis=0, how='any', inplace=True)

# Spot-check one preprocessed test row.
print(test_df.iloc[87])

PassengerId                   979.00
Age                            18.00
Fare                            8.05
Name_Len                       26.00
Age_Null_Flag                   0.00
Cabin_num_[2, 28.667]           0.00
Cabin_num_(28.667, 65.667]      0.00
Cabin_num_(65.667, 148]         0.00
Ticket_Len                      9.00
Pclass_3                        1.00
Pclass_1                        0.00
Pclass_2                        0.00
Sex_male                        0.00
Sex_female                      1.00
Embarked_S                      1.00
Embarked_C                      0.00
Embarked_Q                      0.00
Cabin_Letter_n                  1.00
Cabin_Letter_C                  0.00
Cabin_Letter_E                  0.00
Cabin_Letter_G                  0.00
Cabin_Letter_D                  0.00
Cabin_Letter_A                  0.00
Cabin_Letter_B                  0.00
Cabin_Letter_F                  0.00
Name_Title_Mr.                  0.00
Name_Title_Mrs.                 0.00
N

In [78]:
# HYPERPARAMS
# Grid search kept for reference (commented out so the cell does not re-run it).
# rf = RandomForestClassifier(max_features='sqrt',
#                                 oob_score=True,
#                                 random_state=1,
#                                 n_jobs=-1)

# param_grid = { "criterion"   : ["gini", "entropy"],
#              "min_samples_leaf" : [1,5,10],
#              "min_samples_split" : [2, 4, 10, 12, 16],
#              "n_estimators": [50, 100, 400, 700, 1000]}

# gs = GridSearchCV(estimator=rf,
#                   param_grid=param_grid,
#                   scoring='accuracy',
#                   cv=3,
#                   n_jobs=-1)

# gs = gs.fit(train_df.iloc[:, 2:], train_df.iloc[:, 1])


# # INSPECT BEST PARAMS
# print(gs.best_score_)
# print(gs.best_params_)


# FIT MODEL
# Presumably these values came from the grid search above - TODO confirm.
# max_features='sqrt' is what 'auto' meant for classifiers; the 'auto' alias
# was deprecated in scikit-learn 1.1 and removed in 1.3.
rf = RandomForestClassifier(criterion='entropy',
                             n_estimators=50,
                             min_samples_split=16,
                             min_samples_leaf=1,
                             max_features='sqrt',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
# Column 0 is PassengerId, column 1 is the Survived label, the rest are features.
rf.fit(train_df.iloc[:, 2:], train_df.iloc[:, 1])
# Out-of-bag accuracy estimate.
print("%.4f" % rf.oob_score_)


# INSPECT FEATURE RANK
# Pair each feature column with its importance, most important first.
# The table is bound to a name and printed explicitly: a bare expression in the
# middle of a cell is evaluated but never displayed.
feature_rank = pd.DataFrame({'variable': train_df.columns[2:],
                             'importance': rf.feature_importances_},
                            columns=['variable', 'importance'])
feature_rank = feature_rank.sort_values(by='importance', ascending=False)
print(feature_rank)


# GENERATE SUBMISSION FILE
# Assign the filled column back; an inplace fillna on a column selection is not
# guaranteed to update test_df itself.
test_df['Fare'] = test_df['Fare'].fillna(train_df['Fare'].mean())

# Key every prediction by the PassengerId of the row it was computed from.
# Do not align on the index: test_df keeps its original row labels after
# dropna(), while a fresh prediction frame is numbered 0..n-1, so index-based
# alignment can pair predictions with the wrong passengers.
scored = pd.Series(rf.predict(test_df.iloc[:, 1:]),
                   index=test_df['PassengerId'].values,
                   name='Survived')

import math  # not used in this cell; left in place in case another cell relies on it

# Passengers whose rows were dropped from test_df have no prediction but still
# get a row in the submission, so the full id list is re-read from the raw file.
all_ids = pd.read_csv("./test.csv", usecols=['PassengerId'])['PassengerId'].values

# Placeholder for passengers without a prediction (same value as the earlier
# fillna(1.0)). NOTE(review): this is a guess, not a model output.
MISSING_PREDICTION = 1

predictions = pd.DataFrame({'PassengerId': all_ids,
                            'Survived': scored.reindex(all_ids).fillna(MISSING_PREDICTION).values},
                           columns=['PassengerId', 'Survived'])

predictions['PassengerId'] = predictions['PassengerId'].astype(np.int32)
predictions['Survived'] = predictions['Survived'].astype(np.int32)
# print(predictions)
predictions.to_csv('predictions.csv', sep=",", index=False)

0.8215


In [77]:
import math

# fill any NA values
# Assign the result back; an inplace fillna on a column selection is not
# guaranteed to update the predictions frame itself.
predictions['Survived'] = predictions['Survived'].fillna(1.0)

# catch any passengerID's not set
# A missing id is set to the previous row's id + 1. Rows are addressed purely
# by position (DataFrame.set_value no longer exists in pandas >= 1.0, and
# mixing index labels with positional lookups is only correct for a 0..n-1 index).
id_col = predictions.columns.get_loc('PassengerId')
for pos in range(len(predictions)):
    if pd.isnull(predictions.iat[pos, id_col]):
        if pos == 0:
            # No previous row to count from; position -1 would silently wrap
            # around to the last row.
            raise ValueError("first row has no PassengerId and no previous row to derive it from")
        predictions.iat[pos, id_col] = predictions.iat[pos - 1, id_col] + 1

predictions['PassengerId'] = predictions['PassengerId'].astype(np.int32)
predictions['Survived'] = predictions['Survived'].astype(np.int32)
# print(predictions)
predictions.to_csv('predictions.csv', sep=",", index=False)