Import Libraries & Configure Visualizations

In [None]:
#Ignore Warnings
import warnings
warnings.filterwarnings('ignore')

#Handle tabular data & matrices
import numpy as np
import pandas as pd

#Modelling Algorithms
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

#Helpers for Modelling
from sklearn.preprocessing import Imputer, Normalizer, scale
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_selection import RFECV

#Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
#import statsmodels as sns
import seaborn as sns

#Configure Visualizations
# IPython magic: draw matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply matplotlib's 'ggplot' style sheet first, then seaborn's 'white' axes style.
mpl.style.use('ggplot')
sns.set_style('white')
# Default figure size as a (width, height) tuple.
pylab.rcParams[ 'figure.figsize'] = 8 , 6

Define Helper Functions

In [None]:
#Helper functions for making good looking plots
def plot_histograms(df, variables, n_rows, n_cols):
    """Draw one 10-bin histogram per variable on a single n_rows x n_cols grid.

    Parameters
    ----------
    df : DataFrame holding the columns to plot.
    variables : iterable of column names in ``df``; the i-th name is drawn in
        the (i+1)-th subplot, so it should not contain more than
        ``n_rows * n_cols`` names.
    n_rows, n_cols : subplot grid dimensions.

    Each subplot is titled with the column's skew (rounded to an integer, as
    before) and has its tick labels hidden.
    """
    fig = plt.figure(figsize=(16, 12))
    for i, var_name in enumerate(variables):
        ax = fig.add_subplot(n_rows, n_cols, i + 1)
        df[var_name].hist(bins=10, ax=ax)
        ax.set_title('Skew: ' + str(round(float(df[var_name].skew()))))
        ax.set_xticklabels([], visible=False)
        ax.set_yticklabels([], visible=False)
    # Lay out and show the figure once, after every subplot has been added.
    # Doing this inside the loop displayed the figure after the first
    # histogram only.
    fig.tight_layout()
    plt.show()

def plot_distribution( df , var , target , **kwargs ):
    """Plot shaded KDE curves of ``var``, one curve per value of ``target``.

    Optional keyword arguments ``row`` and ``col`` name columns used to facet
    the grid; both default to None. The x axis runs from 0 to the maximum of
    ``df[var]``.
    """
    grid = sns.FacetGrid(
        df,
        hue=target,
        aspect=4,
        row=kwargs.get('row'),
        col=kwargs.get('col'),
    )
    grid.map(sns.kdeplot, var, shade=True)
    upper_bound = df[var].max()
    grid.set(xlim=(0, upper_bound))
    grid.add_legend()

def plot_categories(df, cat, target, **kwargs):
    """Draw a seaborn bar plot of ``target`` against the categories of ``cat``.

    Optional keyword arguments ``row`` and ``col`` name columns used to facet
    the grid; both default to None.
    """
    grid = sns.FacetGrid(df, row=kwargs.get('row'), col=kwargs.get('col'))
    grid.map(sns.barplot, cat, target)
    grid.add_legend()

def plot_correlation_map(df):
    """Draw an annotated heat map of the pairwise correlations in ``df``.

    Parameters
    ----------
    df : DataFrame whose numeric (and boolean) columns are correlated.
    """
    # Correlate the frame that was passed in; the previous version ignored
    # ``df`` and always read the global ``titanic``. Non-numeric columns are
    # dropped explicitly because recent pandas versions raise on them
    # instead of silently skipping them.
    corr = df.select_dtypes(include=['number', 'bool']).corr()
    _, ax = plt.subplots(figsize=(12, 10))
    cmap = sns.diverging_palette(220, 10, as_cmap=True)
    # NOTE(review): 'shrink': .0 is kept from the original; it looks like a
    # typo for a non-zero colour-bar scale -- confirm the intended value.
    _ = sns.heatmap(
        corr,
        cmap=cmap,
        square=True,
        cbar_kws={'shrink': .0},
        ax=ax,
        annot=True,
        annot_kws={'fontsize': 12},
    )

def describe_more(df):
    """Summarise every column of ``df`` by distinct-value count and dtype.

    Parameters
    ----------
    df : DataFrame to inspect.

    Returns
    -------
    DataFrame with one row per column of ``df`` and the columns
    'Variable' (column name), 'Levels' (number of distinct non-null values)
    and 'Datatype' (the column dtype), sorted by 'Levels' ascending.
    """
    columns = list(df.columns)
    levels = pd.DataFrame({
        'Variable': columns,
        # nunique() counts distinct non-null values, which is what
        # len(value_counts()) measured.
        'Levels': [df[name].nunique() for name in columns],
        # The dtypes were previously appended to the 'Levels' list, leaving
        # 'Datatype' empty and the three lists with mismatched lengths.
        'Datatype': [df[name].dtype for name in columns],
    })
    levels.sort_values(by='Levels', inplace=True)
    return levels

def plot_variable_importance(X, y):
    """Fit a decision tree (random_state=99) on X, y and plot its importances.

    The fitted tree is handed to ``plot_model_var_imp`` together with the
    same X and y.
    """
    fitted_tree = DecisionTreeClassifier(random_state=99).fit(X, y)
    plot_model_var_imp(fitted_tree, X, y)

def plot_model_var_imp(model, X, y):
    """Plot a fitted model's largest feature importances and print its score.

    Parameters
    ----------
    model : fitted estimator exposing ``feature_importances_`` and ``score``.
    X : DataFrame of features; its column names label the bars.
    y : target values passed to ``model.score``.

    At most ten features are drawn as a horizontal bar chart, then
    ``model.score(X, y)`` is printed.
    """
    imp = pd.DataFrame(
        model.feature_importances_,
        columns=['Importance'],
        index=X.columns,
    )
    imp = imp.sort_values(['Importance'], ascending=True)
    # The sort is ascending, so the ten most important features are the last
    # ten rows. Slicing the first ten rows kept the least important ones
    # whenever X had more than ten columns.
    imp.tail(10).plot(kind='barh')
    print(model.score(X, y))

#def determine_Rich(money):
 #   if(money['Title'] == "Royalty"):
  #      return True
   # elif(money['Pclass'] == 1):
    #    return True
    #elif(money['Fare'] > money['Fare'].quantile(0.75)):
     #   return True
    #else:
     #   return False

#def determine_Middle(money):
    #if(money['Pclass'] == 2):
     #   return True
    #elif(money['Fare'].quantile(0.25)<= money['Fare'] <= money['Fare'].quantile(0.75)):
     #   return True
    #else:
     #   return False

#def determine_Poor(money):
   # if(money['Pclass'] == 3):
    #    return True
    #elif(money['Fare'] < money['Fare'].quantile(0.25)):
    #    return True
    #else:
      #  return False

Import the data and peek at it

In [None]:
# Read the labelled training set and the unlabelled test set.
train = pd.read_csv("../input/train.csv") #change filepath later
test = pd.read_csv('../input/test.csv') #change filepath later

# Stack both sets so feature engineering is applied to them together.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
full = pd.concat([train, test], ignore_index = True)
# The first 891 rows of the stacked frame are the training rows.
titanic = full[ :891 ]

del train, test
print('Datasets:', 'full:', full.shape, 'titanic:', titanic.shape)

Peek at the data

In [None]:
# Display the first rows of the training slice.
titanic.head()

Describe the training data

In [None]:
# Display summary statistics for the training slice.
titanic.describe()

Run the exploratory plots

In [None]:
#Plot correlation heat map
plot_correlation_map(titanic)

#Plot distribution of Age of passengers
#plot_distribution(titanic, var = 'Age', target = 'Survived', row = 'Sex')

#Plot distribution of Fare of passengers
#plot_distribution(titanic, var = 'Fare', target = 'Survived', row = 'Pclass')

#Plot survival rate by Embarked, Sex, Pclass, SibSp and Parch, in that order
for category in ('Embarked', 'Sex', 'Pclass', 'SibSp', 'Parch'):
    plot_categories(titanic, cat=category, target='Survived')

In [None]:
#Encode sex as a number: 1 where Sex is 'male', 0 for every other value
is_male = full['Sex'] == 'male'
sex = pd.Series(np.where(is_male, 1, 0), name='Sex')

#One indicator column per distinct Embarked value, prefixed 'Embarked'
embarked = pd.get_dummies(full['Embarked'], prefix='Embarked')
embarked.head()

In [None]:
#One indicator column per distinct passenger class, prefixed 'Pclass'
passenger_class = full['Pclass']
pclass = pd.get_dummies(passenger_class, prefix='Pclass')
pclass.head()

In [None]:
#Replace missing fares with the median fare of the combined data
median_fare = full['Fare'].median()
Fare = pd.DataFrame({'Fare': full['Fare'].fillna(median_fare)})
Fare.describe()
#Sanity check: prints False when no missing fare is left
print(Fare.isnull().any().any())

In [None]:
#Extracting title
#The title is the text between the first comma and the following period of Name
raw_titles = full['Name'].map(lambda name: name.split(',')[1].split('.')[0].strip())
title = pd.DataFrame({'Title': raw_titles})

#Raw titles listed under the coarser group they are mapped to
#NOTE(review): "Master" is grouped under "Royalty" here - confirm this is intended
title_groups = {
    "Officer": ["Capt", "Col", "Major", "Dr", "Rev"],
    "Royalty": ["Jonkheer", "Don", "Sir", "the Countess", "Dona", "Master", "Lady"],
    "Mrs": ["Mme", "Ms", "Mrs"],
    "Miss": ["Mlle", "Miss"],
    "Mr": ["Mr"],
}
Title_Dictionary = {
    raw: group
    for group, raws in title_groups.items()
    for raw in raws
}

#Keep the raw (unmapped) titles in `stuff` for the age imputation cell
stuff = pd.DataFrame({'Title': raw_titles})
#One indicator column per title group
title = pd.get_dummies(title['Title'].map(Title_Dictionary))
#title pd.concat([title, titles_dummies], axis = 1)
title.head()

In [None]:
#Option 3: fill missing ages with medians that are separated by group
for column in ('Sex', 'Pclass', 'Age'):
    stuff[column] = full[column]

def _fill_with_own_median(ages):
    #Replace the missing values of one group with that group's median
    return ages.fillna(ages.median())

grouped_ages = stuff.groupby(['Sex', 'Pclass', 'Title'])['Age']
stuff['Age'] = grouped_ages.transform(_fill_with_own_median)
#Anything still missing falls back to the median of the group-filled ages
overall_median = stuff['Age'].median()
Age = pd.DataFrame({'Age': stuff['Age'].fillna(overall_median)})

del stuff
Age.describe()

In [None]:
#Create family size variable: parents/children + siblings/spouses + the passenger
family_size = full['Parch'] + full['SibSp'] + 1
family = pd.DataFrame({'FamilySize': family_size})

#Single (1), small (2 to 4) or large (5 or more) family, as 0/1 flags
family['Family_Single'] = (family_size == 1).astype('int64')
family['Family_Small'] = family_size.between(2, 4).astype('int64')
family['Family_Large'] = (family_size >= 5).astype('int64')

family.head()

In [None]:
#Create a wealth variable
#NOTE: this rebinds the global `title` to a frame of raw, unmapped titles
title = pd.DataFrame({
    'Title': full['Name'].map(lambda name: name.split(',')[1].split('.')[0].strip())
})
money = pd.DataFrame({
    'Pclass': full.Pclass,
    'Fare': full.Fare,
    'Title': title.Title,
})
#Wealth is read straight off the passenger class: 1 = Rich, 2 = Middle, 3 = Poor
for label, passenger_class in (('Rich', 1), ('Middle', 2), ('Poor', 3)):
    money[label] = (money['Pclass'] == passenger_class).astype('int64')
money.head()

wealth = money[['Rich', 'Middle', 'Poor']].copy()

In [None]:

#Create all datasets necessary to test models
Pclass = full.Pclass
feature_blocks = [Age, sex, Pclass, family, Fare]
full_X = pd.concat(feature_blocks, axis=1)
full_X.head()

#The first 891 rows carry labels; the remaining rows are the test set
train_valid_X, test_X = full_X[0:891], full_X[891:]
train_valid_Y = titanic.Survived

#Hold out 30% of the labelled rows for validation
train_X, valid_X, train_Y, valid_Y = train_test_split(
    train_valid_X,
    train_valid_Y,
    train_size=0.7,
)
print (full_X.shape, train_X.shape, valid_X.shape, train_Y.shape, valid_Y.shape, test_X.shape)

plot_variable_importance(train_X, train_Y)

In [None]:
#Run several different models
model1 = RandomForestClassifier(n_estimators = 100)
model2 = KNeighborsClassifier(n_neighbors = 3)
model3 = GradientBoostingClassifier()
model4 = GaussianNB()
model5 = LogisticRegression()
model6 = SVC()
models = [model1, model2, model3, model4, model5, model6]

#Fit every model on the training split
for model in models:
    model.fit(train_X, train_Y)

#Score every model on the training split, then on the validation split.
#The numbered names are kept so other cells can still refer to them.
train_scores = [model.score(train_X, train_Y) for model in models]
(train_score1, train_score2, train_score3,
 train_score4, train_score5, train_score6) = train_scores

valid_scores = [model.score(valid_X, valid_Y) for model in models]
(valid_score1, valid_score2, valid_score3,
 valid_score4, valid_score5, valid_score6) = valid_scores

#Print out score comparisons, one row per model.
#Pairing the two lists fixes the last row, which used to print model 5's
#validation score next to model 6's training score.
print("Train Data Score: Validation Data Score:")
for train_score, valid_score in zip(train_scores, valid_scores):
    print(train_score, valid_score)

#Hopefully find the Optimal Features for the model
plot_model_var_imp(model1, train_X, train_Y)
#rfecv = RFECV(estimator = model5, step = 1, cv = StratifiedKFold(train_Y, 2), scoring = 'accuracy')
#rfecv.fit(train_X, train_Y)
#print(rfecv.score(train_X, train_Y), rfecv.score(valid_X, Valid_Y))
#print("Optimal number of features: %d" % refecv.n_features_)

#Plot number of features vs. cross Validation Scores
#plt.figure()
#plt.xlabel("Number of features selected")
#plt.ylabel("Cross validation score (nb of correct classification")
#plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores)
#plt.show()

In [None]:
#Model 1: predict the test rows and pair each prediction with its PassengerId
test_Y1 = model1.predict(test_X)
passenger_id = full['PassengerId'].iloc[891:]
test1 = pd.DataFrame({'PassengerId': passenger_id}).assign(Survived=test_Y1)
test1.head()

In [None]:
#Model 2: predict the test rows and pair each prediction with its PassengerId
test_Y2 = model2.predict(test_X)
passenger_id = full['PassengerId'].iloc[891:]
test2 = pd.DataFrame({'PassengerId': passenger_id}).assign(Survived=test_Y2)
test2.head()

In [None]:
#Model 3: predict the test rows and pair each prediction with its PassengerId
test_Y3 = model3.predict(test_X)
passenger_id = full['PassengerId'].iloc[891:]
test3 = pd.DataFrame({'PassengerId': passenger_id}).assign(Survived=test_Y3)
test3.head()

In [None]:
#Model 4: predict the test rows and pair each prediction with its PassengerId
test_Y4 = model4.predict(test_X)
passenger_id = full['PassengerId'].iloc[891:]
test4 = pd.DataFrame({'PassengerId': passenger_id}).assign(Survived=test_Y4)
test4.head()

In [None]:
#Model 5: predict the test rows and pair each prediction with its PassengerId
test_Y5 = model5.predict(test_X)
passenger_id = full['PassengerId'].iloc[891:]
test5 = pd.DataFrame({'PassengerId': passenger_id}).assign(Survived=test_Y5)
test5.head()

In [None]:
#Model 6: predict the test rows and pair each prediction with its PassengerId
test_Y6 = model6.predict(test_X)
passenger_id = full['PassengerId'].iloc[891:]
test6 = pd.DataFrame({'PassengerId': passenger_id}).assign(Survived=test_Y6)
test6.head()

In [None]:
#Average all the different test data
#Both columns are summed over the six frames and divided by 6, so
#PassengerId comes back unchanged and Survived becomes the share of models
#that predicted survival.
test_total = test1.add(test2, fill_value=0).add(test5, fill_value=0)
test_total2 = test3.add(test4, fill_value=0).add(test6, fill_value=0)
test_total = test_total.add(test_total2, fill_value=0).divide(6)

#Back to integer ids, and round the share to a 0/1 prediction
test_total['PassengerId'] = test_total['PassengerId'].astype(int)
test_total['Survived'] = test_total['Survived'].round(0).astype(int)
test_total.head()

In [None]:
# Write the combined predictions to titanic_pred.csv without the index column.
test_total.to_csv( 'titanic_pred.csv' , index = False )