In [1]:
import os 
import json
import numpy as np
import pandas as pd
import dill as pickle
from sklearn.externals import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import make_pipeline

import warnings
warnings.filterwarnings("ignore")

# Relative data directory. No cell visible here reads `path` before it is reassigned further down.
path = "../data/titanic/"

#data_path="https://raw.githubusercontent.com/ahmedbesbes/How-to-score-0.8134-in-Titanic-Kaggle-Challenge/master/data/"
# Directory holding train.csv. The default is a machine-specific checkout; set the
# TITANIC_DATA_DIR environment variable to point somewhere else without editing the notebook.
# os.path.join(..., "") guarantees a trailing separator because later cells build file
# names with plain string concatenation (data_path + 'train.csv').
data_path = os.path.join(
    os.environ.get(
        "TITANIC_DATA_DIR",
        "/home/pedro/repos/ml_web_api/How-to-score-0.8134-in-Titanic-Kaggle-Challenge/data/",
    ),
    "",
)

In [2]:
# Load the labelled training data and hold out 25% of the rows for testing.
# `y` is the NAME of the target column and `X` the list of all other column names
# (both are labels, not arrays). random_state=42 makes the split repeatable.
train = pd.read_csv(data_path+'train.csv')
y = "Survived"
X = [x for x in train.columns if x != y]
X_train, X_test, y_train, y_test = train_test_split(train[X], train[y], test_size=0.25, random_state=42)
print("Shape: ", X_train.shape, X_test.shape, y_train.shape, y_test.shape)

Shape:  (668, 11) (223, 11) (668,) (223,)


In [3]:
# Transposed preview: one column per passenger, so every field of the first two rows is visible.
X_train.head(2).transpose()

Unnamed: 0,298,884
PassengerId,299,885
Pclass,1,3
Name,"Saalfeld, Mr. Adolphe","Sutehall, Mr. Henry Jr"
Sex,male,male
Age,,25
SibSp,0,0
Parch,0,0
Ticket,19988,SOTON/OQ 392076
Fare,30.5,7.05
Cabin,C106,


In [39]:
def status(feature):
    """Print a one-line progress message for a finished processing step.

    Parameters
    ----------
    feature : object
        Label of the step that just completed; it is rendered with ``str()``.
    """
    message = ' '.join(['Processing', str(feature), ': ok'])
    print(message)

# Maps the raw honorific extracted from a passenger's name to one of six
# aggregated title groups: Officer, Royalty, Mr, Mrs, Miss, Master.
# get_titles() applies it with Series.map, so any honorific that is missing
# from this dict ends up as NaN in the Title column.
Title_Dictionary = {
        "Capt": "Officer",
        "Col": "Officer",
        "Major": "Officer",
        "Jonkheer": "Royalty",
        "Don": "Royalty",
        "Sir" : "Royalty",
        "Dr": "Officer",
        "Rev": "Officer",
        "the Countess":"Royalty",
        "Mme": "Mrs",
        "Mlle": "Miss",
        "Ms": "Mrs",
        "Mr" : "Mr",
        "Mrs" : "Mrs",
        "Miss" : "Miss",
        "Master" : "Master",
        "Lady" : "Royalty"}


def get_titles(df):
    """Add an aggregated ``Title`` column derived from the ``Name`` column.

    Each name is expected to look like ``"Surname, Title. Given names"``: the
    text between the first comma and the following period is taken as the raw
    title and then translated through ``Title_Dictionary``. Raw titles that
    are not keys of that dictionary become NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``Name`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying the ``Title`` column.

    Raises
    ------
    IndexError
        If a name contains no comma.
    """
    def _raw_title(full_name):
        # "Saalfeld, Mr. Adolphe" -> " Mr. Adolphe" -> " Mr" -> "Mr"
        after_surname = full_name.split(',')[1]
        return after_surname.split('.')[0].strip()

    raw_titles = df['Name'].map(_raw_title)
    # collapse the raw titles into the aggregated groups
    df['Title'] = raw_titles.map(Title_Dictionary)
    status('Title')
    return df
    
def fill_age(row):
    """Return the median age of the group that ``row`` belongs to.

    The lookup table is the module-level ``grouped_median_train`` frame, which
    must already exist and expose ``Sex``, ``Title``, ``Pclass`` and ``Age``
    columns. The first row matching all three keys supplies the result.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by ``'Sex'``, ``'Title'`` and ``'Pclass'``.

    Returns
    -------
    The ``Age`` value of the first matching row of ``grouped_median_train``.

    Raises
    ------
    IndexError
        If no row of ``grouped_median_train`` matches all three keys.
    """
    medians = grouped_median_train
    same_sex = medians['Sex'] == row['Sex']
    same_title = medians['Title'] == row['Title']
    same_class = medians['Pclass'] == row['Pclass']
    matching_ages = medians.loc[same_sex & same_title & same_class, 'Age']
    return matching_ages.values[0]

def process_age(df):
    """Replace missing ``Age`` values with the value returned by ``fill_age``.

    Rows whose ``Age`` is already known are left untouched; rows whose ``Age``
    is NaN are passed to ``fill_age`` and receive its result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``Age`` column plus whatever columns ``fill_age``
        reads from a row. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``Age`` rewritten.
    """
    def _age_or_group_value(row):
        known_age = row['Age']
        if np.isnan(known_age):
            return fill_age(row)
        return known_age

    df['Age'] = df.apply(_age_or_group_value, axis=1)
    status('age')
    return df

def process_names(df):
    """Drop ``Name`` and one-hot encode ``Title`` into ``Title_*`` columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``Name`` and ``Title`` columns. ``Name`` is removed from
        this object in place; ``Title`` is left on it.

    Returns
    -------
    pandas.DataFrame
        A new frame: the input columns (without ``Name`` and ``Title``)
        followed by one indicator column per distinct title.
    """
    # The raw name is no longer needed; removing it here mutates the caller's frame.
    del df['Name']
    title_flags = pd.get_dummies(df['Title'], prefix='Title')
    encoded = pd.concat([df, title_flags], axis=1)
    # The categorical column is redundant once its indicators are attached.
    encoded = encoded.drop('Title', axis=1)
    status('names')
    return encoded

def process_fares(df, n_train_rows=891):
    """Fill missing ``Fare`` values with the mean fare of the leading rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``Fare`` column. The column is rewritten on this
        object.
    n_train_rows : int, default 891
        Number of leading rows used to compute the mean. The default keeps the
        previously hard-coded value; when ``df`` has fewer rows, all of them
        are used.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with no missing fares (unless every fare in the
        leading rows is missing, in which case the mean itself is NaN).
    """
    mean_fare = df['Fare'].iloc[:n_train_rows].mean()
    # Assign the filled column back instead of calling
    # df.Fare.fillna(..., inplace=True): that chained in-place call operates on
    # a temporary Series and leaves df unchanged under pandas Copy-on-Write.
    df['Fare'] = df['Fare'].fillna(mean_fare)
    status('fare')
    return df

def process_embarked(df):
    """Fill missing ``Embarked`` values with ``'S'`` and one-hot encode the column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``Embarked`` column. The filled column is written back to
        this object.

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``Embarked`` is replaced by one ``Embarked_*``
        indicator column per value present in the data.
    """
    # Missing ports are filled with 'S' (described in the original notebook as
    # the most frequent value in the training set).
    # Assign back rather than df.Embarked.fillna(..., inplace=True): the chained
    # in-place form is a no-op under pandas Copy-on-Write.
    df['Embarked'] = df['Embarked'].fillna('S')
    # dummy encoding
    embarked_dummies = pd.get_dummies(df['Embarked'], prefix='Embarked')
    df = pd.concat([df, embarked_dummies], axis=1)
    df = df.drop('Embarked', axis=1)
    status('embarked')
    return df

def process_cabin(df):
    """Reduce ``Cabin`` to its first letter and one-hot encode it.

    Missing cabins are labelled ``'U'`` (unknown) before the first character
    is taken.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``Cabin`` column. The filled, shortened column is
        written back to this object.

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``Cabin`` is replaced by one ``Cabin_*`` indicator
        column per letter present in the data.
    """
    # Assign back rather than df.Cabin.fillna(..., inplace=True): under pandas
    # Copy-on-Write the chained in-place form leaves the NaNs in place, and
    # taking the first letter of a NaN then fails.
    cabins = df['Cabin'].fillna('U')
    # keep only the leading letter of each cabin value
    df['Cabin'] = cabins.str[0]
    # dummy encoding
    cabin_dummies = pd.get_dummies(df['Cabin'], prefix='Cabin')
    df = pd.concat([df, cabin_dummies], axis=1)

    df = df.drop('Cabin', axis=1)
    status('cabin')
    return df

def process_sex(df):
    """Encode the ``Sex`` column numerically: ``'male'`` -> 1, ``'female'`` -> 0.

    Any other value (including an already-encoded 0/1) becomes NaN, so the
    function should be applied only once to a given frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``Sex`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``Sex`` rewritten.
    """
    sex_codes = {'male':1, 'female':0}
    encoded = df['Sex'].map(sex_codes)
    df['Sex'] = encoded
    status('Sex')
    return df

def process_pclass(df):
    """One-hot encode ``Pclass`` into ``Pclass_*`` indicator columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Pclass`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame: the input columns without ``Pclass``, followed by one
        indicator column per class value present in the data.
    """
    class_flags = pd.get_dummies(df['Pclass'], prefix="Pclass")
    widened = pd.concat([df, class_flags], axis=1)
    # the original column is redundant once its indicators are attached
    del widened['Pclass']
    status('Pclass')
    return widened

def process_ticket(df):
    """Replace ``Ticket`` with one-hot columns for the ticket's textual prefix.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``Ticket`` column. The cleaned prefix is written
        back to this object's ``Ticket`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``Ticket`` is replaced by one ``Ticket_*``
        indicator column per prefix present in the data.
    """
    def cleanTicket(ticket):
        """Return the first non-numeric token of ``ticket``, or 'XXX' if there is none."""
        tokens = ticket.replace('.', '').replace('/', '').split()
        # Build a real list: in Python 3 map()/filter() return lazy iterators,
        # which support neither len() nor indexing.
        prefixes = [token.strip() for token in tokens if not token.strip().isdigit()]
        if prefixes:
            return prefixes[0]
        return 'XXX'

    # Extracting dummy variables from tickets:
    df['Ticket'] = df['Ticket'].map(cleanTicket)
    tickets_dummies = pd.get_dummies(df['Ticket'], prefix='Ticket')
    df = pd.concat([df, tickets_dummies], axis=1)
    df = df.drop('Ticket', axis=1)
    status('Ticket')
    return df

def process_family(df):
    """Add family-size features computed from ``Parch`` and ``SibSp``.

    New columns:

    * ``FamilySize``  - ``Parch + SibSp + 1`` (the passenger is counted too)
    * ``Singleton``   - 1 when ``FamilySize == 1``, else 0
    * ``SmallFamily`` - 1 when ``2 <= FamilySize <= 4``, else 0
    * ``LargeFamily`` - 1 when ``FamilySize >= 5``, else 0

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with numeric ``Parch`` and ``SibSp`` columns. It is modified in
        place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the four columns added.
    """
    df['FamilySize'] = df['Parch'] + df['SibSp'] + 1

    # (column name, indicator rule) pairs, applied in this order
    size_flags = (
        ('Singleton', lambda size: 1 if size == 1 else 0),
        ('SmallFamily', lambda size: 1 if 2 <= size <= 4 else 0),
        ('LargeFamily', lambda size: 1 if 5 <= size else 0),
    )
    for column, rule in size_flags:
        df[column] = df['FamilySize'].map(rule)

    status('family')
    return df

In [40]:
# Work on a copy: several process_* helpers modify the frame they are given in place,
# and X_train should stay untouched.
df = X_train.copy()

In [41]:
# Preview the feature frame before any processing step has run.
df.head(2)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
298,299,1,"Saalfeld, Mr. Adolphe",male,,0,0,19988,30.5,C106,S
884,885,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.05,,S


In [42]:
# Add the aggregated Title column derived from Name.
df = get_titles(df)
df.head(2)

Processing Title : ok


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
298,299,1,"Saalfeld, Mr. Adolphe",male,,0,0,19988,30.5,C106,S,Mr
884,885,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.05,,S,Mr


In [43]:
# Median Age per (Sex, Pclass, Title) group; fill_age() reads this table.
# Age is selected BEFORE aggregating: calling .median() on the whole group object also
# tries to aggregate the string columns (Name, Ticket, Cabin, Embarked), which raises
# TypeError on pandas >= 2.0. The result has the same columns as before:
# Sex, Pclass, Title, Age.
grouped_train = df.groupby(['Sex','Pclass','Title'])
grouped_median_train = grouped_train['Age'].median().reset_index()

In [44]:
# Drop Name and replace Title with one-hot Title_* columns.
df = process_names(df)
df.head(2)

Processing names : ok


Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_Royalty
298,299,1,male,,0,0,19988,30.5,C106,S,0,0,1,0,0,0
884,885,3,male,25.0,0,0,SOTON/OQ 392076,7.05,,S,0,0,1,0,0,0


In [45]:
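# process_age is left disabled: at this point in the cell order it would fail, because
# fill_age() looks up row['Title'] and the process_names() call above has already
# removed the Title column.
#df = process_age(df)
#df.head(2)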

In [46]:
# Fill any missing Fare with the mean fare.
df = process_fares(df)
df.head(2)

Processing fare : ok


Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_Royalty
298,299,1,male,,0,0,19988,30.5,C106,S,0,0,1,0,0,0
884,885,3,male,25.0,0,0,SOTON/OQ 392076,7.05,,S,0,0,1,0,0,0


In [47]:
# Fill missing Embarked with 'S', then replace the column with one-hot Embarked_* columns.
df = process_embarked(df)
df.head(2)

Processing embarked : ok


Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_Royalty,Embarked_C,Embarked_Q,Embarked_S
298,299,1,male,,0,0,19988,30.5,C106,0,0,1,0,0,0,0,0,1
884,885,3,male,25.0,0,0,SOTON/OQ 392076,7.05,,0,0,1,0,0,0,0,0,1


In [48]:
# Reduce Cabin to its first letter ('U' when missing) and one-hot encode it as Cabin_* columns.
df = process_cabin(df)
df.head(2)

Processing cabin : ok


Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Title_Master,Title_Miss,...,Embarked_S,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Cabin_U
298,299,1,male,,0,0,19988,30.5,0,0,...,1,0,0,1,0,0,0,0,0,0
884,885,3,male,25.0,0,0,SOTON/OQ 392076,7.05,0,0,...,1,0,0,0,0,0,0,0,0,1


In [49]:
# Encode Sex numerically: male -> 1, female -> 0.
df = process_sex(df)
df.head(2)

Processing Sex : ok


Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Title_Master,Title_Miss,...,Embarked_S,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Cabin_U
298,299,1,1,,0,0,19988,30.5,0,0,...,1,0,0,1,0,0,0,0,0,0
884,885,3,1,25.0,0,0,SOTON/OQ 392076,7.05,0,0,...,1,0,0,0,0,0,0,0,0,1


In [50]:
# Replace Pclass with one-hot Pclass_* columns.
df = process_pclass(df)
df.head(2)

Processing Pclass : ok


Unnamed: 0,PassengerId,Sex,Age,SibSp,Parch,Ticket,Fare,Title_Master,Title_Miss,Title_Mr,...,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Cabin_U,Pclass_1,Pclass_2,Pclass_3
298,299,1,,0,0,19988,30.5,0,0,1,...,1,0,0,0,0,0,0,1,0,0
884,885,1,25.0,0,0,SOTON/OQ 392076,7.05,0,0,1,...,0,0,0,0,0,0,1,0,0,1


In [55]:
#df = process_ticket(df)
#df.head(2)

In [52]:
# Add FamilySize (Parch + SibSp + 1) and the Singleton / SmallFamily / LargeFamily flags.
df = process_family(df)
df.head(2)

Processing family : ok


Unnamed: 0,PassengerId,Sex,Age,SibSp,Parch,Ticket,Fare,Title_Master,Title_Miss,Title_Mr,...,Cabin_G,Cabin_T,Cabin_U,Pclass_1,Pclass_2,Pclass_3,FamilySize,Singleton,SmallFamily,LargeFamily
298,299,1,,0,0,19988,30.5,0,0,1,...,0,0,0,1,0,0,1,1,0,0
884,885,1,25.0,0,0,SOTON/OQ 392076,7.05,0,0,1,...,0,0,1,0,0,1,1,1,0,0


In [None]:
#df = get_titles(df)
#df = process_names(df)
###df = process_age(df)
#df = process_fares(df)
#df = process_embarked(df)
#df = process_cabin(df)
#df = process_sex(df)
#df = process_pclass(df)
####df = process_ticket(df)
#df = process_family(df)
#print(df.info)
#return df.as_matrix()

In [56]:
# DataFrame.info() prints its summary and returns None, so it is called on its own;
# wrapping it in pd.DataFrame(...) only appended an empty frame to the cell output.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 668 entries, 298 to 102
Data columns (total 32 columns):
PassengerId      668 non-null int64
Sex              668 non-null int64
Age              536 non-null float64
SibSp            668 non-null int64
Parch            668 non-null int64
Ticket           668 non-null object
Fare             668 non-null float64
Title_Master     668 non-null uint8
Title_Miss       668 non-null uint8
Title_Mr         668 non-null uint8
Title_Mrs        668 non-null uint8
Title_Officer    668 non-null uint8
Title_Royalty    668 non-null uint8
Embarked_C       668 non-null uint8
Embarked_Q       668 non-null uint8
Embarked_S       668 non-null uint8
Cabin_A          668 non-null uint8
Cabin_B          668 non-null uint8
Cabin_C          668 non-null uint8
Cabin_D          668 non-null uint8
Cabin_E          668 non-null uint8
Cabin_F          668 non-null uint8
Cabin_G          668 non-null uint8
Cabin_T          668 non-null uint8
Cabin_U          668 non

In [58]:
# Count the numeric columns. Compared with df.shape in the next cell, one column is still
# non-numeric: Ticket stays dtype object because process_ticket was not applied.
df.select_dtypes(include=[np.number]).shape

(668, 31)

In [59]:
# Full shape of the processed frame, for comparison with the numeric-only count above.
df.shape

(668, 32)

In [8]:
import pandas as pd

In [12]:
# Directory holding prediction_results.csv and the train.csv used for evaluation.
# This rebinds `path` from the first cell. The default is a machine-specific checkout;
# set the GRIDCV_DATA_DIR environment variable to point somewhere else.
# os.path.join(..., "") guarantees a trailing separator because the cells below build
# file names with plain string concatenation (path + "...").
path = os.path.join(
    os.environ.get("GRIDCV_DATA_DIR", "/home/pedro/repos/ml_web_api/ml-app-model/data/gridCV/"),
    "",
)

In [23]:
# Load the stored predictions; the cells below use its ID and y_hat columns.
y_hat = pd.read_csv(path+"prediction_results.csv")
y_hat.shape

(891, 2)

In [24]:
# Ground-truth labels for the evaluation: the row position of each record in train.csv
# (exposed as an "index" column by reset_index) paired with its Survived value.
# NOTE: this rebinds y_test, which the train/test split cell also assigns.
y_test = (
    pd.read_csv(path + "train.csv")
    .reset_index(drop=False)
    .loc[:, ["index", "Survived"]]
    .copy()
)
y_test.shape

(891, 2)

In [38]:
# Pair each true label with its prediction by matching the positional "index" of
# train.csv against the prediction file's ID column; the inner join keeps only rows
# present on both sides.
res = y_test.merge(y_hat, left_on="index", right_on="ID", how="inner")

In [39]:
# Keep only the join key, the true label and the predicted label.
res = res[["ID","Survived", "y_hat"]]

In [40]:
# Spot-check the merged labels and predictions.
res.head(2)

Unnamed: 0,ID,Survived,y_hat
0,0,0,0
1,1,1,1


In [41]:
from sklearn.metrics import confusion_matrix

In [42]:
# Rows are the true Survived class, columns the predicted y_hat class (0 first, then 1).
confusion_matrix(y_true=res.Survived, y_pred=res.y_hat)

array([[490,  59],
       [193, 149]])

In [50]:
# Long-form version of the confusion matrix: one row per (Survived, y_hat) pair with its
# row count (kept under the ID column name) and that count's percentage of all rows.
res_table = (
    res.groupby(["Survived", "y_hat"])["ID"]
    .count()
    .reset_index(drop=False)
)
res_table["perc"] = np.around(res_table["ID"] / res_table["ID"].sum() * 100, 1)
res_table

Unnamed: 0,Survived,y_hat,ID,perc
0,0,0,490,55.0
1,0,1,59,6.6
2,1,0,193,21.7
3,1,1,149,16.7


In [48]:
#Survived	y_hat	ID	perc
#0	0	0	490	55.0
#1	0	1	59	6.6
#2	1	0	193	21.7
#3	1	1	149	16.7