# Titanic Project - Work in Progress

# Finished Code

In [1]:
import numpy as np
import pandas as pd
import os
import re
from os.path import join
from matplotlib import pyplot as plt

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def find(s, ch):
    return np.array([i for i, ltr in enumerate(s) if ltr == ch]).astype(int)

In [2]:
# Loading the dataframe
def load():
    
    # Reading data into dataframe
    df = pd.read_csv('train.csv')
    df = df.rename(columns = {c:c.lower() for c in df.columns})
    
    # Sorting gender column
    if 'male' in df['sex'].values:
        df['sex'] = (df['sex']=='male').astype(int).values
    
    # Sorting df['age'] containing string 'Master' in column 'name' NaN values
    masterAge_mean = df['age'].loc[df['name'].str.contains('Master')].mean()
    masterAge_mean = round(masterAge_mean, 2)
    df['age'].loc[df['name'].str.contains('Master')] = df['age'].loc[df['name'].str.contains('Master')].fillna(masterAge_mean)
    
#     Sorting cabin column
#     cabin_letters=['A','B','C','D','E','F','G']
#     for letter in cabin_letters:
#         df[f'cabin_{letter}'] = df['cabin'].apply(lambda x: type(x)==type('') and letter in x) 
#         df[f'cabin_{letter}'] = df[f'cabin_{letter}'].astype(int)
#     df['cabin_number'] = df['cabin'].apply(lambda x: re.findall(r'[0-9]+', x) if type(x)==type('') else [np.nan]) 
#     df['cabin_number'] = df['cabin_number'].apply(lambda x: x[0] if len(x)>0 else np.nan) 

    return df

In [3]:
# Loading and displaying data
df = load()

# One-hot encoding 'embarked', 'pclass' columns
# Embarked column
df_embarked   = pd.get_dummies(df.embarked)
embarked_cols = list(df_embarked.columns)
df_embarked = df_embarked.rename(columns = {c: f'embarked_{c}' for c in embarked_cols})

# pclass column
df_pclass   = pd.get_dummies(df.pclass)
pclass_cols = list(df_pclass.columns)
df_pclass   = df_pclass.rename(columns = {c: f'pclass_{c}' for c in pclass_cols})

# Removing unnecessary columns
removed_cols = ['name','passengerid','ticket','embarked','pclass','cabin', 'cabin_number','survived']
df = pd.concat([df, df_embarked, df_pclass], axis=1)
df = df[[c for c in df if c not in removed_cols] + ['survived']]


# Displaying data
df.head()

# with pd.option_context('display.max_rows', None, 'display.max_columns', None):
#     print(df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


Unnamed: 0,sex,age,sibsp,parch,fare,embarked_C,embarked_Q,embarked_S,pclass_1,pclass_2,pclass_3,survived
0,1,22.0,1,0,7.25,0,0,1,0,0,1,0
1,0,38.0,1,0,71.2833,1,0,0,1,0,0,1
2,0,26.0,0,0,7.925,0,0,1,0,0,1,1
3,0,35.0,1,0,53.1,0,0,1,1,0,0,1
4,1,35.0,0,0,8.05,0,0,1,0,0,1,0


In [4]:
# Isolating features containing NaNs
nan_features = []
for c in df.columns:
    if df[c].isnull().values.any(): 
        nan_features.append(c)

# Percentage of NaNs for each column with NaNs
for f in nan_features:
    nan_percent = 100*df[f].isnull().sum() /len(df)
    print('The feature',f,f'is {nan_percent:.3} percent NaN values\n') 

The feature age is 19.4 percent NaN values



In [5]:
# Converting NaNs to other values
def sortnans(dftr, dfcv, mode = 'zero'):
    if mode=='zero':
        dftr = dftr.fillna(0)
        dfcv = dfcv.fillna(0)
    # Calculating the mean for features that can be and replacing NaNs with the mean
    if mode=='mean':
        for nf in nan_features:
            if df[nf].dtype==('int64') or df[nf].dtype==('float64'):
                # Mean of the nf column in training dataframe (don't use cv dataframe as this fits data to cv data)
                mean   = dftr[nf].mean()
                # New columns with NaNs=mean
                newCol_tr = dftr[nf].fillna(mean) 
                newCol_cv = dfcv[nf].fillna(mean)
                # Replace old cols containing NaNs
                dftr.loc[:,nf] = newCol_tr
                dfcv.loc[:,nf] = newCol_cv
    
    return dftr, dfcv

# Processing dataframe
def process(df, sortnan_mode):
    
    dftr = df[:int(len(df)*.75)] 
    dfcv = df[int(len(df)*.75):]
    
    dftr, dfcv = sortnans(dftr,dfcv,mode=sortnan_mode)
    
    features = df.columns[:-1]
    
    return dftr, dfcv, features

In [6]:
# Training and cross validation data and features
dftr_zero, dfcv_zero, features = process(df, sortnan_mode = 'zero')
dftr_mean, dfcv_mean, features = process(df, sortnan_mode = 'mean')

# Features but removing final 3 columns
#features = features[:-3]

# X training data
Xtr_zero = dftr_zero[features].values
Xtr_mean = dftr_mean[features].values
# X cross validation data
Xcv_zero = dfcv_zero[features].values
Xcv_mean = dfcv_mean[features].values

# X data dictionary
X_modes = {
    'Zero': [Xtr_zero,Xcv_zero],
    'Mean': [Xtr_mean,Xcv_mean]}

# y outputs (dftr_mean=dftr_zero and dfcv_mean=dfcv_zero as these values aren't changed by sorting nans)
ytr = dftr_mean['survived'].values
ycv = dfcv_mean['survived'].values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [7]:
# Creating models
model_rf = RandomForestClassifier()
model_gb = GradientBoostingClassifier()
model_lr = LogisticRegression(max_iter=1e5)

# Models dictionary
models = {
    'Random Forest': model_rf,
    'Gradient Boosting': model_gb,
    'Logistic Regression': model_lr}

In [8]:
def results(models, X_modes, ytr, ycv):
    
    # Best f1 score
    f1_best=0
    
    # Looping through each model
    for m in models.keys():
        model=models[m]
        print(f'{model}:')
        
        # Looping through each set of X data
        for x in X_modes.keys():
            X = X_modes[x][0]
            y = ytr
            model.fit(X,y)
            
            # Evaluation metrics
            y_pred = model.predict(X_modes[x][1])
            acc    = accuracy_score(y_pred,ycv)
            f1     = f1_score(y_pred,ycv)
            if f1 > f1_best:
                f1_best   = f1
                model_idx = m
                X_idx     = x
        
            print(f'Using NaN={x}:\n accuracy:{acc:.6}\n F1 score:{f1:.6}')
        print('\n')
    
    return f1_best, model_idx, X_idx

f1_best, model_idx, X_idx = results(models, X_modes, ytr, ycv)
print(f'Mode: {X_idx}\nModel: {model_idx}\nBest f1 score: {f1_best:.6}')

RandomForestClassifier():
Using NaN=Zero:
 accuracy:0.807175
 F1 score:0.739394
Using NaN=Mean:
 accuracy:0.834081
 F1 score:0.775758


GradientBoostingClassifier():
Using NaN=Zero:
 accuracy:0.856502
 F1 score:0.789474
Using NaN=Mean:
 accuracy:0.852018
 F1 score:0.792453


LogisticRegression(max_iter=100000.0):
Using NaN=Zero:
 accuracy:0.807175
 F1 score:0.715232
Using NaN=Mean:
 accuracy:0.829596
 F1 score:0.75


Mode: Mean
Model: Gradient Boosting
Best f1 score: 0.792453


# Trial Code

# Next Step and Ideas: Implementing feature 'Cabin'

I have researched cabins (A-F) and ranked them in order of price and status. Then depending on the pclass (1st,2nd,3rd) and fare (£(continuous)) I will allocate each of the cabin=NaN values to one of the available cabins.

In [9]:
df2 = load()

# with pd.option_context('display.max_rows', None, 'display.max_columns', None):
#     print(df2[(~df2['cabin'].isnull()) & (df2['cabin'].str.contains(''))][['pclass','fare','cabin']])

# with pd.option_context('display.max_rows', None, 'display.max_columns', None): 
#     print(df2[(df2['cabin'].isnull().values.any() & (df2['cabin'].str.contains('A')))]['fare'])

# df2[(df2['cabin'].isnull().values.any()) \
#     & (df2['cabin'].str.contains('A'))]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [10]:
# cabin_letters=['cabin_A', 'cabin_B', 'cabin_C', 'cabin_D', 'cabin_E', 'cabin_F', 'cabin_G']
# with pd.option_context('display.max_rows', None, 'display.max_columns', None):
#     df3 = df2[(~df2['cabin'].isnull()) & (df2['pclass']==1)][['cabin_A', 'cabin_B', 'cabin_C', 'cabin_D', 'cabin_E', 'cabin_F', 'cabin_G','pclass','fare']].sort_values('fare')
#     #print(df3.sum())
#     #print(df3[df3.fare > 20].sum())
#     print(df2[df2.pclass ==1].sum())

In [11]:
# Separating cabin column into cabin_letter and cabin_number
cabin_letters=['A','B','C','D','E','F','G']
for letter in cabin_letters:
    df2[f'cabin_{letter}'] = df2['cabin'].apply(lambda x: type(x)==type('') and letter in x) #1
    df2[f'cabin_{letter}'] = df2[f'cabin_{letter}'].astype(int)
df2['cabin_number'] = df2['cabin'].apply(lambda x: re.findall(r'[0-9]+', x) if type(x)==type('') else [np.nan]) #2
df2['cabin_number'] = df2['cabin_number'].apply(lambda x: x[0] if len(x)>0 else np.nan) #3

#1: loop is creating new colums cabin_A,...,cabin_F
# - lambda x: type(x)==type(''): sets the entries of df2['cabin'] to be strings
# - and letter in x: sets the letter in x to a 1 in the corresponding column_letter column

#2: creating new column cabin_number
# - lambda x: re.findall(r'[0-9]+', x) creates a list of the numbers separated by spaces
# - if type(x)==type(''): restricts lambda function to only the non-NaN values, else keep it as NaN

#3: sets the values in cabin_number that have multiple entries e.g. [23, 25, 27] to the first value

In [12]:
df5 = df2[~df2['cabin'].isnull()]['cabin'][5:8]
print(df5)
print(df5.apply(lambda x: re.findall(r'[0-9]+', x) if type(x)==type('') else [np.nan,]))
print(df5)

21            D56
23             A6
27    C23 C25 C27
Name: cabin, dtype: object
21            [56]
23             [6]
27    [23, 25, 27]
Name: cabin, dtype: object
21            D56
23             A6
27    C23 C25 C27
Name: cabin, dtype: object


In [13]:
# histogram of fares for the entries cabin=NaN 
# df2[pd.isnull(df2['cabin'])]['fare'].hist(bins=np.arange(0,175,5))

In [14]:
# histogram of fares for the entries with cabin != NaN and cabin
# df2[(df2['cabin'].isnull().values.any()) \
#     & (df2['cabin'].str.contains('A'))]['fare'].hist(bins=np.arange(0,200,5))