# Titanic - Machine Learning from Disaster

Variable Description

| Variable  | Definition                           | Key                                   |
|-----------|-------------------------------------|---------------------------------------|
| survival  | Survival                            | 0 = No, 1 = Yes                       |
| pclass    | Ticket class                        | 1 = 1st, 2 = 2nd, 3 = 3rd            |
| sex       | Sex                                 |                                       |
| Age       | Age in years                        |                                       |
| sibsp     | # of siblings / spouses aboard the Titanic |                                   |
| parch     | # of parents / children aboard the Titanic |                                   |
| ticket    | Ticket number                       |                                       |
| fare      | Passenger fare                      |                                       |
| cabin     | Cabin number                        |                                       |
| embarked  | Port of Embarkation                 | C = Cherbourg, Q = Queenstown, S = Southampton |


In [8]:
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from sklearn.feature_extraction import DictVectorizer
import pickle

In [9]:
dftrain = pd.read_csv("titanic/train.csv")
dftest = pd.read_csv("titanic/test.csv")

In [10]:
dftrain.columns = [col.lower() for col in dftrain.columns]

In [11]:
cols_ind_vars = ['pclass','sex','age','sibsp','parch','fare','cabin','embarked'] #independent variables.
col_ids = ['passengerid','name'] #ignore from model but keep in data
col_ignore = ['cabin','ticket'] #ignore from model and data.

col_numeric = ['age','sibsp','fare','parch'] #numeric variables.
col_categorical = ['pclass','sex','embarked'] #categorical variables.

In [12]:
def prepare_data(data):
    
    df = data.copy()
    df.columns = [col.lower() for col in df.columns]
    df.drop(columns = col_ids, inplace =True)
    df.drop(columns = col_ignore, inplace = True, errors = 'ignore')
    df = df.ffill(axis = 1)
    df_dict = df.to_dict(orient = 'records')
    return df_dict

In [13]:
# save id cols
dftrain_id_cols = dftrain[col_ids].copy()
dftrain.drop(columns = col_ids, inplace = True)
# remove columns to ignore.
dftrain.drop(columns = col_ignore, inplace = True, errors = 'ignore')

dftrain = dftrain.ffill(axis = 1)
dftrain = dftrain.bfill(axis = 1)

In [14]:
dftrain_dict = dftrain.to_dict(orient = 'records')

In [15]:
dv = DictVectorizer(sparse = False)

In [16]:
xtrain = dv.fit_transform(dftrain_dict)

In [17]:
ytrain = dftrain.survived.values

In [18]:
model =RandomForestClassifier(n_estimators = 10)

In [19]:
model.fit(xtrain, ytrain.astype(str))

In [20]:
dftest_dict = prepare_data(dftest)
xtest = dv.transform(dftest_dict)

In [21]:
with open('saved_model.pkl','wb') as fout:
    pickle.dump((model, dv), fout)

In [32]:
dftest_dict[0]

{'pclass': 3,
 'sex': 'male',
 'age': 34.5,
 'sibsp': 0,
 'parch': 0,
 'fare': 7.8292,
 'embarked': 'Q'}

In [30]:
model.predict(xtest[[1]])

array(['0'], dtype='<U21')

In [33]:
row = dv.transform(dftest_dict[0])

In [43]:
model.predict_proba(row)[0][1]

0.0