# 1. Data Understanding

In [1]:
# import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# 1. collect the initial data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [2]:
ship = train.append(test, ignore_index = True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [3]:
# 2.0 describe the data
ship.describe()

Unnamed: 0,Age,Fare,Parch,PassengerId,Pclass,SibSp,Survived
count,1046.0,1308.0,1309.0,1309.0,1309.0,1309.0,891.0
mean,29.881138,33.295479,0.385027,655.0,2.294882,0.498854,0.383838
std,14.413493,51.758668,0.86556,378.020061,0.837836,1.041658,0.486592
min,0.17,0.0,0.0,1.0,1.0,0.0,0.0
25%,21.0,7.8958,0.0,328.0,2.0,0.0,0.0
50%,28.0,14.4542,0.0,655.0,3.0,0.0,0.0
75%,39.0,31.275,0.0,982.0,3.0,1.0,1.0
max,80.0,512.3292,9.0,1309.0,3.0,8.0,1.0


In [4]:
# explore the data
# Need to target 'Survived'
ship.sample(10)

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
745,70.0,B22,S,71.0,"Crosby, Capt. Edward Gifford",1,746,1,male,1,0.0,WE/P 5735
1216,23.0,,S,7.05,"Assam, Mr. Ali",0,1217,3,male,0,,SOTON/O.Q. 3101309
1180,,,S,8.05,"Ford, Mr. Arthur",0,1181,3,male,0,,A/5 1478
921,50.0,,S,26.0,"Louch, Mr. Charles Alexander",0,922,2,male,1,,SC/AH 3085
904,63.0,,S,26.0,"Howard, Mr. Benjamin",0,905,2,male,1,,24065
702,18.0,,C,14.4542,"Barbara, Miss. Saiide",1,703,3,female,0,0.0,2691
271,25.0,,S,0.0,"Tornquist, Mr. William Henry",0,272,3,male,0,1.0,LINE
372,19.0,,S,8.05,"Beavan, Mr. William Thomas",0,373,3,male,0,0.0,323951
519,32.0,,S,7.8958,"Pavlovic, Mr. Stefo",0,520,3,male,0,0.0,349242
351,,C128,S,35.0,"Williams-Lambert, Mr. Fletcher Fellows",0,352,1,male,0,0.0,113510


In [5]:
ship.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 12 columns):
Age            1046 non-null float64
Cabin          295 non-null object
Embarked       1307 non-null object
Fare           1308 non-null float64
Name           1309 non-null object
Parch          1309 non-null int64
PassengerId    1309 non-null int64
Pclass         1309 non-null int64
Sex            1309 non-null object
SibSp          1309 non-null int64
Survived       891 non-null float64
Ticket         1309 non-null object
dtypes: float64(3), int64(4), object(5)
memory usage: 122.8+ KB


In [6]:
ship.shape

(1309, 12)

# 2. Data Preperation

In [7]:
# define the training set
ship['Title'] = ship.Name.apply(lambda name: name.split(',')[1].split('.')[0].strip())

In [8]:
# normalize the titles
normalized_titles = {
    "Capt":       "Officer",
    "Col":        "Officer",
    "Major":      "Officer",
    "Jonkheer":   "Royalty",
    "Don":        "Royalty",
    "Sir" :       "Royalty",
    "Dr":         "Officer",
    "Rev":        "Officer",
    "the Countess":"Royalty",
    "Dona":       "Royalty",
    "Mme":        "Mrs",
    "Mlle":       "Miss",
    "Ms":         "Mrs",
    "Mr" :        "Mr",
    "Mrs" :       "Mrs",
    "Miss" :      "Miss",
    "Master" :    "Master",
    "Lady" :      "Royalty"
}

In [9]:
# map the normalized titles to the current titles 
ship.Title = ship.Title.map(normalized_titles)

In [10]:
# view value counts for the normalized titles
print(ship.Title.value_counts())

Mr         757
Miss       262
Mrs        200
Master      61
Officer     23
Royalty      6
Name: Title, dtype: int64


In [11]:
# group by Sex, Pclass, and Title 
grouped = ship.groupby(['Sex','Pclass', 'Title']) 

In [12]:
# view the median Age by the grouped features 
grouped.Age.median()

Sex     Pclass  Title  
female  1       Miss       30.0
                Mrs        45.0
                Officer    49.0
                Royalty    39.0
        2       Miss       20.0
                Mrs        30.0
        3       Miss       18.0
                Mrs        31.0
male    1       Master      6.0
                Mr         41.5
                Officer    52.0
                Royalty    40.0
        2       Master      2.0
                Mr         30.0
                Officer    41.5
        3       Master      6.0
                Mr         26.0
Name: Age, dtype: float64

In [13]:
# apply the grouped median value on the Age NaN
ship.Age = grouped.Age.apply(lambda x: x.fillna(x.median()))

In [14]:
# fill Cabin NaN with U for unknown
ship.Cabin = ship.Cabin.fillna('U')

In [15]:
# find most frequent Embarked value and store in variable
most_embarked = ship.Embarked.value_counts().index[0]

In [16]:
# fill NaN with most_embarked value
ship.Embarked = ship.Embarked.fillna(most_embarked)

In [17]:
# fill NaN with median fare
ship.Fare = ship.Fare.fillna(ship.Fare.median())

In [18]:
# view changes
ship.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 13 columns):
Age            1309 non-null float64
Cabin          1309 non-null object
Embarked       1309 non-null object
Fare           1309 non-null float64
Name           1309 non-null object
Parch          1309 non-null int64
PassengerId    1309 non-null int64
Pclass         1309 non-null int64
Sex            1309 non-null object
SibSp          1309 non-null int64
Survived       891 non-null float64
Ticket         1309 non-null object
Title          1309 non-null object
dtypes: float64(3), int64(4), object(6)
memory usage: 133.0+ KB


In [19]:
# size of families (including the passenger)
ship['FamilySize'] = ship.Parch + ship.SibSp + 1

In [20]:
# map first letter of cabin to itself
ship.Cabin = ship.Cabin.map(lambda x: x[0])

In [21]:
# Convert the male and female groups to integer form
ship.Sex = ship.Sex.map({"male": 0, "female":1})

In [22]:
# create dummy variables for categorical features
pclass_dummies = pd.get_dummies(ship.Pclass, prefix="Pclass")
title_dummies = pd.get_dummies(ship.Title, prefix="Title")
cabin_dummies = pd.get_dummies(ship.Cabin, prefix="Cabin")
embarked_dummies = pd.get_dummies(ship.Embarked, prefix="Embarked")

In [23]:
# concatenate dummy columns with main dataset
ship_dummies = pd.concat([ship, pclass_dummies, title_dummies, cabin_dummies, embarked_dummies], axis=1)

In [24]:
# drop categorical fields
ship_dummies.drop(['Pclass', 'Title', 'Cabin', 'Embarked', 'Name', 'Ticket'], axis=1, inplace=True)

In [25]:
ship_dummies.head()

Unnamed: 0,Age,Fare,Parch,PassengerId,Sex,SibSp,Survived,FamilySize,Pclass_1,Pclass_2,...,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Cabin_U,Embarked_C,Embarked_Q,Embarked_S
0,22.0,7.25,0,1,0,1,0.0,2,0,0,...,0,0,0,0,0,0,1,0,0,1
1,38.0,71.2833,0,2,1,1,1.0,2,1,0,...,1,0,0,0,0,0,0,1,0,0
2,26.0,7.925,0,3,1,0,1.0,1,0,0,...,0,0,0,0,0,0,1,0,0,1
3,35.0,53.1,0,4,1,1,1.0,2,1,0,...,1,0,0,0,0,0,0,0,0,1
4,35.0,8.05,0,5,0,0,0.0,1,0,0,...,0,0,0,0,0,0,1,0,0,1


In [26]:
# split the data back up 
editTrain = ship_dummies[:891]
print(len(editTrain))
editTest = ship_dummies[891:]
print(len(editTest))

891
418


# 3. Modeling

## Linear Regression

In [27]:
target = editTrain['Survived']
features = editTrain.copy()
del features['Survived']

In [28]:
# Import Algorithms
# Set Parameters
from sklearn.linear_model import LinearRegression
from scipy import stats
from sklearn.metrics import r2_score, mean_squared_error
L = LinearRegression()

In [29]:
# Fit the data
L.fit(features, target)


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [30]:
# Assess the metrics
r2_score(target, L.predict(features))

0.45610103857979156

In [31]:
mean_squared_error(target, L.predict(features))

0.12863562825956973

## Ridge

In [32]:
from sklearn.linear_model import Ridge
R = Ridge()

In [33]:
# Fit the data
R.fit(features, target)


Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [34]:
# asses the metrics
r2_score(target, R.predict(features))

0.4555636610429954

In [35]:
mean_squared_error(target, R.predict(features))

0.12876272152865387

## Lasso

In [36]:
from sklearn.linear_model import Lasso
La = Lasso()

In [37]:
# fit the data
La.fit(features, target)


Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [38]:
# assess the metrics
r2_score(target, La.predict(features))

0.06449250471851964

In [39]:
mean_squared_error(target, La.predict(features))

0.22125358372232132

## Elastic Net

In [40]:
from sklearn.linear_model import ElasticNet
E = ElasticNet()

In [41]:
# fit the data
E.fit(features, target)

ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [42]:
# assess the metrics
r2_score(target, E.predict(features))

0.06811460503558342

In [43]:
mean_squared_error(target, E.predict(features))

0.2203969335299987

## Naive Bayes

In [44]:
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
g = GaussianNB()
b = BernoulliNB()

In [45]:
# fit the data
g.fit(features, target)
b.fit(features, target)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [46]:
# Assess the metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [47]:
accuracy = []
precision = []
recall = []
f1 = []

In [48]:
algorithms = [g,b]
names = ['GaussianNB', 'BernoulliNB']

In [49]:
for i in range(len(algorithms)):
    accuracy.append(accuracy_score(target, algorithms[i].predict(features)))
    precision.append(precision_score(target, algorithms[i].predict(features)))
    recall.append(recall_score(target, algorithms[i].predict(features)))
    f1.append(f1_score(target, algorithms[i].predict(features)))

In [50]:
metrics = pd.DataFrame(columns = ['Accuracy', 'Precision', 'Recall', 'F1'],
                      index = names)
metrics['Accuracy'] = accuracy
metrics['Precision'] = precision
metrics['Recall'] = recall
metrics['F1'] = f1
metrics

Unnamed: 0,Accuracy,Precision,Recall,F1
GaussianNB,0.799102,0.698297,0.839181,0.762284
BernoulliNB,0.791246,0.703125,0.789474,0.743802


## K-Nearest Neighbors

In [51]:
from sklearn.neighbors import KNeighborsClassifier
k = KNeighborsClassifier()

In [52]:
# fit the data
k.fit(features, target)


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [53]:
# assess the metrics
def TITANICasses():
    accuracy = []
    precision = []
    recall = []
    f1 = []
    for i in range(len(algorithms)):
        accuracy.append(accuracy_score(target, algorithms[i].predict(features)))
        precision.append(precision_score(target, algorithms[i].predict(features)))
        recall.append(recall_score(target, algorithms[i].predict(features)))
        f1.append(f1_score(target, algorithms[i].predict(features)))
    metrics = pd.DataFrame(columns = ['Accuracy', 'Precision', 'Recall', 'F1'], index = names)
    metrics['Accuracy'] = accuracy
    metrics['Precision'] = precision
    metrics['Recall'] = recall
    metrics['F1'] = f1
    return metrics

In [54]:
algorithms = [g,b,k]
names = ['GaussianNB', 'BernoulliNB', 'KNeighborsClassifier']
TITANICasses()

Unnamed: 0,Accuracy,Precision,Recall,F1
GaussianNB,0.799102,0.698297,0.839181,0.762284
BernoulliNB,0.791246,0.703125,0.789474,0.743802
KNeighborsClassifier,0.762065,0.751938,0.567251,0.646667


## Logistic Regression

In [55]:
from sklearn.linear_model import LogisticRegression
log  = LogisticRegression()

In [56]:
def TITANICasses(X, y, algorithms, names):
    # auto-fit the data set
    for i in range(len(algorithms)):
        algorithms[i] = algorithms[i].fit(X, y)
    # print metrics
    accuracy = []
    precision = []
    recall = []
    f1 = []
    for i in range(len(algorithms)):
        accuracy.append(accuracy_score(y, algorithms[i].predict(X)))
        precision.append(precision_score(y, algorithms[i].predict(X)))
        recall.append(recall_score(y, algorithms[i].predict(X)))
        f1.append(f1_score(y, algorithms[i].predict(X)))
    metrics = pd.DataFrame(columns = ['Accuracy', 'Precision', 'Recall', 'F1'], index = names)
    metrics['Accuracy'] = accuracy
    metrics['Precision'] = precision
    metrics['Recall'] = recall
    metrics['F1'] = f1
    return metrics

In [57]:
algorithms = [g, b, k, log]
names = ['GaussianNB', 'BernoulliNB', 'KNeighborsClassifier', 'LogisticRegression']
TITANICasses(features, target, algorithms, names)

Unnamed: 0,Accuracy,Precision,Recall,F1
GaussianNB,0.799102,0.698297,0.839181,0.762284
BernoulliNB,0.791246,0.703125,0.789474,0.743802
KNeighborsClassifier,0.762065,0.751938,0.567251,0.646667
LogisticRegression,0.840629,0.804878,0.77193,0.78806


## Decision Tree

In [58]:
from sklearn.tree import DecisionTreeClassifier
d = DecisionTreeClassifier()

In [59]:
# fit the data
def TITANICasses(X = features, y = target, algorithms = algorithms, names = names):
    # auto-fit the data set
    for i in range(len(algorithms)):
        algorithms[i] = algorithms[i].fit(X, y)
    # print metrics
    accuracy = []
    precision = []
    recall = []
    f1 = []
    for i in range(len(algorithms)):
        accuracy.append(accuracy_score(y, algorithms[i].predict(X)))
        precision.append(precision_score(y, algorithms[i].predict(X)))
        recall.append(recall_score(y, algorithms[i].predict(X)))
        f1.append(f1_score(y, algorithms[i].predict(X)))
    metrics = pd.DataFrame(columns = ['Accuracy', 'Precision', 'Recall', 'F1'], index = names)
    metrics['Accuracy'] = accuracy
    metrics['Precision'] = precision
    metrics['Recall'] = recall
    metrics['F1'] = f1
    return metrics

In [60]:
algorithms.append(d)
names.append('Single Tree')

In [61]:
TITANICasses()

Unnamed: 0,Accuracy,Precision,Recall,F1
GaussianNB,0.799102,0.698297,0.839181,0.762284
BernoulliNB,0.791246,0.703125,0.789474,0.743802
KNeighborsClassifier,0.762065,0.751938,0.567251,0.646667
LogisticRegression,0.840629,0.804878,0.77193,0.78806
Single Tree,1.0,1.0,1.0,1.0


### Vizualize the Tree

In [62]:
!pip install pydotplus

[31mkeyring 13.2.1 requires secretstorage<3, which is not installed.[0m
[31mgrin 1.2.1 requires argparse>=1.1, which is not installed.[0m
[33mYou are using pip version 10.0.1, however version 18.0 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [63]:
from sklearn.externals.six import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus
dot_data = StringIO()
export_graphviz(d, out_file=dot_data,
                filled=True, rounded=True,
                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('survived.png')

True

## Random Forest Classifier

In [64]:
from sklearn.ensemble import RandomForestClassifier
r = RandomForestClassifier()

  from numpy.core.umath_tests import inner1d


In [65]:
def TITANICasses(X = features, y = target, algorithms = algorithms, names = names):
    # auto-fit the data set
    for i in range(len(algorithms)):
        algorithms[i] = algorithms[i].fit(X, y)
    # print metrics
    accuracy = []
    precision = []
    recall = []
    f1 = []
    for i in range(len(algorithms)):
        accuracy.append(accuracy_score(y, algorithms[i].predict(X)))
        precision.append(precision_score(y, algorithms[i].predict(X)))
        recall.append(recall_score(y, algorithms[i].predict(X)))
        f1.append(f1_score(y, algorithms[i].predict(X)))
    metrics = pd.DataFrame(columns = ['Accuracy', 'Precision', 'Recall', 'F1'], index = names)
    metrics['Accuracy'] = accuracy
    metrics['Precision'] = precision
    metrics['Recall'] = recall
    metrics['F1'] = f1
    return metrics.sort_values('F1', ascending = False)

In [66]:
algorithms.append(r)
names.append('Random Forest')
TITANICasses()

Unnamed: 0,Accuracy,Precision,Recall,F1
Single Tree,1.0,1.0,1.0,1.0
Random Forest,0.993266,1.0,0.982456,0.99115
LogisticRegression,0.840629,0.804878,0.77193,0.78806
GaussianNB,0.799102,0.698297,0.839181,0.762284
BernoulliNB,0.791246,0.703125,0.789474,0.743802
KNeighborsClassifier,0.762065,0.751938,0.567251,0.646667


## Gradient Boost

In [67]:
from sklearn.ensemble import GradientBoostingClassifier

In [68]:
gbc = GradientBoostingClassifier(n_estimators=5000)

In [69]:
def TDMasses(X=features,y=target,algorithms=algorithms,names=names):
   #fit the data
   for i in range(len(algorithms)):
       algorithms[i] = algorithms[i].fit(X,y)
   #print metrics
   accuracy = []
   precision = []
   recall = []
   f1 = []
   for i in range(len(algorithms)):
       if accuracy_score(y,algorithms[i].predict(X))==1:
           accuracy.append(0)
       else:
           accuracy.append(accuracy_score(y,algorithms[i].predict(X)))
       if precision_score(y,algorithms[i].predict(X))==1:
           precision.append(0)
       else:
           precision.append(accuracy_score(y,algorithms[i].predict(X)))
       if recall_score(y,algorithms[i].predict(X))==1:
           recall.append(0)
       else:
           recall.append(accuracy_score(y,algorithms[i].predict(X)))
       if f1_score(y,algorithms[i].predict(X))==1:
           f1.append(0)
       else:
           f1.append(accuracy_score(y,algorithms[i].predict(X)))

   metrics = pd.DataFrame(columns = ['Accuracy','Precision', 'Recall', 'F1'],index = names)
   metrics['Accuracy'] = accuracy
   metrics['Precision'] = precision
   metrics['Recall'] = recall
   metrics['F1'] = f1
   return metrics.sort_values('F1',ascending = False)



In [70]:
algorithms.append(gbc)
names.append('GradientBoostingClassifier')
TDMasses()

Unnamed: 0,Accuracy,Precision,Recall,F1
Random Forest,0.986532,0.986532,0.986532,0.986532
LogisticRegression,0.840629,0.840629,0.840629,0.840629
GaussianNB,0.799102,0.799102,0.799102,0.799102
BernoulliNB,0.791246,0.791246,0.791246,0.791246
KNeighborsClassifier,0.762065,0.762065,0.762065,0.762065
Single Tree,0.0,0.0,0.0,0.0
GradientBoostingClassifier,0.0,0.0,0.0,0.0


# 4. Evaluation

In [71]:
# define implement set
test.head()


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [72]:
features.head()

Unnamed: 0,Age,Fare,Parch,PassengerId,Sex,SibSp,FamilySize,Pclass_1,Pclass_2,Pclass_3,...,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Cabin_U,Embarked_C,Embarked_Q,Embarked_S
0,22.0,7.25,0,1,0,1,2,0,0,1,...,0,0,0,0,0,0,1,0,0,1
1,38.0,71.2833,0,2,1,1,2,1,0,0,...,1,0,0,0,0,0,0,1,0,0
2,26.0,7.925,0,3,1,0,1,0,0,1,...,0,0,0,0,0,0,1,0,0,1
3,35.0,53.1,0,4,1,1,2,1,0,0,...,1,0,0,0,0,0,0,0,0,1
4,35.0,8.05,0,5,0,0,1,0,0,1,...,0,0,0,0,0,0,1,0,0,1


In [73]:
X = editTest
del X['Survived']

In [74]:
predictions = r.predict(X)

In [75]:
predictions.sum(), predictions.min(), predictions.max()

(150.0, 0.0, 1.0)

In [76]:
result = pd.DataFrame(columns= ['PassengerID', 'Survived'])
result['PassengerID'] = test['PassengerId']
result['Survived'] = predictions.astype('int')
result.head()

Unnamed: 0,PassengerID,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [77]:
result.to_csv('msgTitanic3.csv', index=False)