# Titanic Challenge

In [14]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

kaggle/input/titanic/test.csv
kaggle/input/titanic/titanic.zip
kaggle/input/titanic/train.csv
kaggle/input/titanic/gender_submission.csv


### Look for columns with missing values

In [15]:
t_train_orig = pd.read_csv("kaggle/input/titanic/train.csv")
t_train_orig.name = "Titanic Training Set"

t_test_orig = pd.read_csv("kaggle/input/titanic/test.csv")
t_test_orig.name = "Titanic Test Set"

def display_missing(df):    
    for col in df.columns.tolist(): 
        if df[col].isnull().sum() > 0:
            print('{} column missing values: {}'.format(col, df[col].isnull().sum()))
    print('\n')

dfs = [t_train_orig, t_test_orig]

display_missing(dfs[0])

for df in dfs:
    print('{}'.format(df.name))
    display_missing(df)

Age column missing values: 177
Cabin column missing values: 687
Embarked column missing values: 2


Titanic Training Set
Age column missing values: 177
Cabin column missing values: 687
Embarked column missing values: 2


Titanic Test Set
Age column missing values: 86
Fare column missing values: 1
Cabin column missing values: 327




### See what  Age is correlated with best
so we can fill in nulls with more representative values

In [16]:
t_train_corr= t_train_orig.corr().abs().unstack().sort_values(kind="quicksort", ascending=False).reset_index()
t_train_corr.rename(columns={"level_0": "Feature 1", "level_1": "Feature 2", 0: 'Correlation Coefficient'}, inplace=True)
t_train_corr[t_train_corr["Feature 1"] == "Age"]

t_test_corr = t_train_orig.corr().abs().unstack().sort_values(kind="quicksort", ascending=False).reset_index()
t_test_corr.rename(columns={"level_0": "Feature 1", "level_1": "Feature 2", 0: 'Correlation Coefficient'}, inplace=True)
t_test_corr[t_test_corr["Feature 1"] == "Age"]



Unnamed: 0,Feature 1,Feature 2,Correlation Coefficient
5,Age,Age,1.0
12,Age,Pclass,0.369226
16,Age,SibSp,0.308247
21,Age,Parch,0.189119
26,Age,Fare,0.096067
31,Age,Survived,0.077221
36,Age,PassengerId,0.036847


### Sample training data

In [17]:
t_train_orig.sample(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0,,S
399,400,1,2,"Trout, Mrs. William H (Jessie L)",female,28.0,0,0,240929,12.65,,S
750,751,1,2,"Wells, Miss. Joan",female,4.0,1,1,29103,23.0,,S
775,776,0,3,"Myhrman, Mr. Pehr Fabian Oliver Malkolm",male,18.0,0,0,347078,7.75,,S
144,145,0,2,"Andrew, Mr. Edgardo Samuel",male,18.0,0,0,231945,11.5,,S
756,757,0,3,"Carlsson, Mr. August Sigfrid",male,28.0,0,0,350042,7.7958,,S
396,397,0,3,"Olsson, Miss. Elina",female,31.0,0,0,350407,7.8542,,S
477,478,0,3,"Braund, Mr. Lewis Richard",male,29.0,1,0,3460,7.0458,,S
670,671,1,2,"Brown, Mrs. Thomas William Solomon (Elizabeth ...",female,40.0,1,1,29750,39.0,,S
457,458,1,1,"Kenyon, Mrs. Frederick R (Marion)",female,,1,0,17464,51.8625,D21,S


## Feature exploration

Let's try a simple decision-tree based solution that gives weight to certain columns:
* Pclass (1 > 2 > 3)
* Fare ($$ > $)
* Sex (female > male)
* CabinPresent (1 > 0)
* Is child over 12 (1 > 0)


In [25]:
# let's clean-up the data for the decision tree classifier

import math

t_train = t_train_orig

# sex to numeric
t_train['SexNumeric'] = np.where(t_train['Sex'] == "male", 0, 1)

# passenger has a cabin (if NaN, no cabin)
t_train['CabinPresent'] = t_train['Cabin'].fillna(0)
t_train['CabinPresent'] = np.where(t_train['CabinPresent'] != 0, 1, 0)

# location of cabin.. naive mapping based on the first letter of the Cabin location 
import string
t_train['CabinLetter'] = \
    t_train['Cabin'].map(lambda cabin: string.ascii_uppercase.index(cabin[0]) if (type(cabin) == str) else -1)

# thought large number of siblings might help (it didn't)
t_train['SibSpLarge'] = t_train['SibSp'].map(lambda ct: 1 if ct > 3 else 0)
#t_train['SibSpLarge'].value_counts()

# people with the same last name may be a family; add feature that represent this count to each person
# doesn't seem to help
t_train['LastName'] = t_train['Name'].map(lambda name: name.split(',')[0])
familyCount = t_train.groupby('LastName')['LastName'].count()
t_train['FamilyCount'] = t_train['LastName'].map(lambda lastName: familyCount[lastName])
t_train['FamilyScore'] = t_train.FamilyCount + t_train.SibSp + t_train.Parch
# next, try using the SibSp to correlate against this count..
# but do people with siblings or spouses have a higher chance of surviving? 

#t_train['Fare'].map(lambda amt: int(amt / 10) * 10).value_counts()
ave_fare = t_train['Fare'].median()
t_train['FareWithDefault'] = t_train['Fare'].map(lambda fare: fare if fare > 0 else ave_fare)

fare_p1 = t_train[t_train['Pclass'] == 1]['Fare'].median()
fare_p2 = t_train[t_train['Pclass'] == 2]['Fare'].median()
fare_p3 = t_train[t_train['Pclass'] == 3]['Fare'].median()
median_fares = [0, fare_p1, fare_p2, fare_p3]

def fare_for_class(row):
    pclass = row['Pclass']
    fare = row['Fare']
    fare_for_class = median_fares[pclass]
    adjusted_fare = fare_for_class if fare == 0 else fare
    return adjusted_fare

t_train['FareAdjusted'] = t_train.apply(fare_for_class,axis=1)

# get average age for different classes
age_p1 = t_train[t_train['Pclass'] == 1]['Age'].median()
age_p2 = t_train[t_train['Pclass'] == 2]['Age'].median()
age_p3 = t_train[t_train['Pclass'] == 3]['Age'].median()
median_ages = [0, age_p1, age_p2, age_p3]

def age_for_class(row):
    pclass = row['Pclass']
    age = row['Age']
    age_for_class = median_ages[pclass]
    adjusted_age = age_for_class if np.isnan(age) else age
    return adjusted_age

t_train['AgeAdjusted'] = t_train.apply(age_for_class,axis=1)

# this helps a bit
t_train['IsChild'] = t_train['AgeAdjusted'].map(lambda age: 1 if (age <= 12.0) else 0)

In [29]:
#t_train[t_train['Fare'] != t_train['FareAdjusted']]

In [30]:
import pandas as pd

from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation


feature_cols = ['FareAdjusted','SexNumeric', 'AgeAdjusted', 'IsChild', 'Pclass']
#              [ 0.3309298  0.2972776 0.2367167  0.10798761 ]
feature_cols = ['SexNumeric', 'Fare','IsChild',  'Pclass', 'AgeAdjusted']
#Accuracy: 0.8395522388059702

feature_cols = ['FareAdjusted', 'Pclass', 'SexNumeric', 'IsChild', ]
#Accuracy: 0.8246268656716418
#Accuracy: 0.8283582089552238 sometimes.. why?

#feature_cols = ['Fare', 'Pclass', 'SexNumeric', 'CabinPresent']
#Accuracy: 0.8208955223880597

#feature_cols = ['Pclass', 'SexNumeric', 'Fare']
# Accuracy: 0.8134328358208955

#feature_cols = ['Fare', 'Pclass']
#Accuracy: 0.6865671641791045

X = t_train[feature_cols]
y = t_train.Survived

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) # 70% training and 30% test

clf = DecisionTreeClassifier()
clf = clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)

# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.8395522388059702


In [31]:
#let's visualize the results

from sklearn.tree import export_graphviz
from sklearn.externals.six import StringIO  
from IPython.display import Image  

!pip install pydotplus
import pydotplus

from sklearn.tree import export_graphviz
from sklearn.externals.six import StringIO  
from IPython.display import Image  
import pydotplus

dot_data = StringIO()
export_graphviz(clf, out_file=dot_data,  
                filled=True, rounded=True,
                special_characters=True,feature_names = feature_cols,class_names=['0','1'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
graph.write_png('titanic.png')
Image(graph.create_png())




InvocationException: GraphViz's executables not found

## Try a random forrest classifier

In [22]:
from sklearn.ensemble import RandomForestClassifier


clf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=100, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=5,
            min_weight_fraction_leaf=0.0, n_estimators=250, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

clf.fit(X_train,y_train) 


print(f'feature importance: {clf.feature_importances_}')
print(f'score: {clf.score(X_test, y_test)}')
# 20 samples
#[0.27465804 0.22880346 0.45345524 0.04308326]
# 100 samples
#[0.14283224 0.18341415 0.60423145 0.06952216]

# ['Fare', 'Pclass', 'SexNumeric', 'CabinPresent', 'IsChild']
# [0.40740532 0.10936106 0.37225673 0.05263696 0.05833993]

feature importance: [0.40988466 0.1319164  0.39231598 0.06588296]
score: 0.8283582089552238


## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state=0, solver='lbfgs',
                         multi_class='multinomial').fit(X_train, y_train)

prob = clf.predict_proba(X)

print(f'probabilty (count = {len(prob)}\n{prob}\n')
print(f'score: {clf.score(X, y)}')
