In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

# Preparing the Data

In [2]:
# reading the data
train_data = pd.read_csv('./data/train.csv',header=0)
test_data  = pd.read_csv('./data/test.csv',header=0)

In [3]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
# Add a dummy column named Survived so that test and train will have name headers
test_data["Survived"] = np.nan

# Add another column so that we can distinguish test and train data
test_data['Train'] = False
train_data['Train'] = True

train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Train
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,True
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,True
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,True
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,True
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,True


In [6]:
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived,Train
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,,False
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,,False
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,,False
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,,False
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,,False


In [7]:
# merge both test and train data so that we can clean them together
full_data = pd.concat([train_data,test_data])

full_data.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,Train
0,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,0.0,A/5 21171,True
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1.0,PC 17599,True
2,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,1.0,STON/O2. 3101282,True
3,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1.0,113803,True
4,35.0,,S,8.05,"Allen, Mr. William Henry",0,5,3,male,0,0.0,373450,True


## Cleaning the Data

In [8]:
# looking for missing data
full_data.apply(lambda x: np.sum(pd.isnull(x)))

Age             263
Cabin          1014
Embarked          2
Fare              1
Name              0
Parch             0
PassengerId       0
Pclass            0
Sex               0
SibSp             0
Survived        418
Ticket            0
Train             0
dtype: int64

Seems like a lot of **Age** values are missing.

Let's fix **Embarked** and **Fare** columns first.

Ignore the missing values in **Survived** as we added them.

In [9]:
def show_distribution(df,col_name):
    # get the count of each value
    counts = df[col_name].value_counts()

    total = df[col_name].shape[0]

    for index,val in counts.iteritems():
        print("{}: {:.2f}%".format(index,val*100/total))
        
show_distribution(full_data,"Embarked")

S: 69.82%
C: 20.63%
Q: 9.40%


**'S'** is the most common(~70%) occurence therefore we can try replacing the missing values with it.

In [10]:
# create a mask to identify the missing values
mask = pd.isnull(full_data["Embarked"])

# replace the missing values with S
full_data["Embarked"] = full_data["Embarked"].where(~mask,other='S')

# see if there are anymore missing values
np.sum(pd.isnull(full_data["Embarked"]))

0

With that we're done with the **Embarked** column.

Let's move on to **Fare** column.

There is only one missing value so maybe the mean value will work.

In [11]:
mask = pd.isnull(full_data["Fare"])

fare_mean = full_data["Fare"].mean()

full_data["Fare"] = full_data["Fare"].where(~mask,other=fare_mean)

np.sum(pd.isnull(full_data["Fare"]))

0

In [12]:
full_data.apply(lambda x: np.sum(pd.isnull(x)))

Age             263
Cabin          1014
Embarked          0
Fare              0
Name              0
Parch             0
PassengerId       0
Pclass            0
Sex               0
SibSp             0
Survived        418
Ticket            0
Train             0
dtype: int64

We are left with **Age** and **Cabin** columns.

Instead of using the mean value for **Age** we can try using a **Linear Regression** trained on other features to predict the missing values

### Linear Regression on Age

Features that we can use to predict Age
- Sex
- Number of parents/children
- Number of siblings/spouses
- Ticket class
- Ticket Fare

In [13]:
# select features for the Linear Regression
X = full_data[["Sex","Parch","SibSp","Pclass","Fare"]].copy()
Y = full_data[["Age"]].copy()

# change string to int(categorical)
X["Sex"] = X["Sex"].apply(lambda x: int(x=="male"))

# create a mask where Age is null
mask = pd.isnull(Y)["Age"]

X_pred = X[mask]
X = X[~mask]
Y = Y[~mask]

print("X: {} Y: {} X-pred: {}".format(X.shape[0],Y.shape[0],X_pred.shape[0]))

X: 1046 Y: 1046 X-pred: 263


In [14]:
from sklearn.linear_model import LinearRegression

# creating the model
lr = LinearRegression(normalize=True)

# fit the model
lr.fit(X,Y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=True)

In [15]:
# predict the age values for the missing entries
Y_pred = lr.predict(X_pred)

# replace values less than 1 with mean value
Y_pred[Y_pred < 1] = Y_pred.mean()


Age = full_data["Age"].copy()

Age[mask] = Y_pred

# assign the new Age column to our dataframe
full_data.loc[:,"Age"] = Age

In [16]:
full_data.apply(lambda x: np.sum(pd.isnull(x)))

Age               0
Cabin          1014
Embarked          0
Fare              0
Name              0
Parch             0
PassengerId       0
Pclass            0
Sex               0
SibSp             0
Survived        418
Ticket            0
Train             0
dtype: int64

Moving on to **Cabin** column.

In [17]:
p = np.sum(pd.isnull(full_data["Cabin"]))/full_data["Cabin"].shape[0]

print("{:.1f}% of data is missing".format(p*100))

77.5% of data is missing


As more than 75% of the entries are missing, it is better to avoid that column.

Features in the cleaned dataset
- Age
- Embarked
- Fare
- Name
- Parch
- PassengerId
- Pclass
- Sex
- SibSp
- Survived
- Ticket

In [18]:
full_data = full_data[["PassengerId","Age","Embarked","Fare","Name","Parch",
                       "Pclass","Sex","SibSp","Ticket","Survived","Train"]]
full_data.head()

Unnamed: 0,PassengerId,Age,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Ticket,Survived,Train
0,1,22.0,S,7.25,"Braund, Mr. Owen Harris",0,3,male,1,A/5 21171,0.0,True
1,2,38.0,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,1,female,1,PC 17599,1.0,True
2,3,26.0,S,7.925,"Heikkinen, Miss. Laina",0,3,female,0,STON/O2. 3101282,1.0,True
3,4,35.0,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,1,female,1,113803,1.0,True
4,5,35.0,S,8.05,"Allen, Mr. William Henry",0,3,male,0,373450,0.0,True


# Feature Extraction

One new feature that can be extracted from the name is the person's personal title.

In [19]:
import re

# list of personal title we want to look for
personal_titles = [
    "Mr",
    "Mrs",
    "Miss",
    "Prof",
    "Dr",
    "Capt",
    "Major",
    "Gen"
]

# mappings (0 -> no title)
ptitle_mapping = { 
                title.lower() : index+1
                for index,title in enumerate(personal_titles)
        }

# regex pattern to look for
pattern = r"({})\.".format('|'.join(personal_titles).lower())

pattern = re.compile(pattern)


In [20]:
def match_ptitle(pattern,string):
    m = pattern.search(string.lower())
    
    if m: 
        return m.group(1)
    
    return ''

personalTitle = full_data["Name"].apply(lambda x: ptitle_mapping.get(match_ptitle(pattern,x),0))

for index,val in personalTitle.value_counts().iteritems():
    cname = personal_titles[index-1] if index > 0 else "No Title"
    print("{} : {}".format(cname,val))

Mr : 757
Miss : 260
Mrs : 197
No Title : 84
Dr : 8
Major : 2
Capt : 1


In [21]:
full_data["personal_title"] = personalTitle

full_data.head()

Unnamed: 0,PassengerId,Age,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Ticket,Survived,Train,personal_title
0,1,22.0,S,7.25,"Braund, Mr. Owen Harris",0,3,male,1,A/5 21171,0.0,True,1
1,2,38.0,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,1,female,1,PC 17599,1.0,True,2
2,3,26.0,S,7.925,"Heikkinen, Miss. Laina",0,3,female,0,STON/O2. 3101282,1.0,True,3
3,4,35.0,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,1,female,1,113803,1.0,True,2
4,5,35.0,S,8.05,"Allen, Mr. William Henry",0,3,male,0,373450,0.0,True,1


#### Investigating the Ticket Numbers

In [22]:
Tnum = full_data["Ticket"]

# pattern to extract ticket prefixes
pattern = re.compile(r"([A-Za-z]*)[\/|\s]")

def extract_prefix(pattern,string):
    
    m = pattern.search(string)
    m = m.group(1) if m else 'NA'
    
    return m if m else 'NA'

Tprefix = Tnum.apply(lambda x: extract_prefix(pattern,x))

show_distribution(pd.DataFrame({"prefix":Tprefix}),"prefix")

NA: 82.28%
PC: 7.03%
A: 2.44%
SOTON: 2.06%
SC: 1.91%
STON: 1.68%
CA: 0.76%
C: 0.61%
PP: 0.31%
AQ: 0.15%
WE: 0.15%
P: 0.15%
SO: 0.08%
Fa: 0.08%
LP: 0.08%
SCO: 0.08%
SW: 0.08%
W: 0.08%


In [23]:
# enumerate the ticket prefixes
for index,val in enumerate(Tprefix.unique()):
    Tprefix[Tprefix == val] = index
    
Tprefix = Tprefix.astype(int)

Tprefix.unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17])

In [24]:
# add this new column to out dataframe
full_data["Tprefix"] = Tprefix

full_data.head()

Unnamed: 0,PassengerId,Age,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Ticket,Survived,Train,personal_title,Tprefix
0,1,22.0,S,7.25,"Braund, Mr. Owen Harris",0,3,male,1,A/5 21171,0.0,True,1,0
1,2,38.0,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,1,female,1,PC 17599,1.0,True,2,1
2,3,26.0,S,7.925,"Heikkinen, Miss. Laina",0,3,female,0,STON/O2. 3101282,1.0,True,3,2
3,4,35.0,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,1,female,1,113803,1.0,True,2,3
4,5,35.0,S,8.05,"Allen, Mr. William Henry",0,3,male,0,373450,0.0,True,1,3


## Enumerate Categorical Values

In [25]:
def enumerate_cat_values(df,col_name):
    col = df[col_name].copy()
    
    for index,val in enumerate(col.unique()):
        col[col == val] = index
    return col

# categorical col name
cols = ["Embarked","Sex"]

for col in cols:
    full_data[col] = enumerate_cat_values(full_data,col)
    
full_data.head()

Unnamed: 0,PassengerId,Age,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Ticket,Survived,Train,personal_title,Tprefix
0,1,22.0,0,7.25,"Braund, Mr. Owen Harris",0,3,0,1,A/5 21171,0.0,True,1,0
1,2,38.0,1,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,1,1,1,PC 17599,1.0,True,2,1
2,3,26.0,0,7.925,"Heikkinen, Miss. Laina",0,3,1,0,STON/O2. 3101282,1.0,True,3,2
3,4,35.0,0,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,1,1,1,113803,1.0,True,2,3
4,5,35.0,0,8.05,"Allen, Mr. William Henry",0,3,0,0,373450,0.0,True,1,3


#### Removing unnecessary columns

In [26]:
full_data = full_data[["PassengerId","Age","Embarked","Fare","Parch",
                       "SibSp","Sex","Pclass","personal_title","Tprefix",
                       "Survived","Train"]]
full_data.head()

Unnamed: 0,PassengerId,Age,Embarked,Fare,Parch,SibSp,Sex,Pclass,personal_title,Tprefix,Survived,Train
0,1,22.0,0,7.25,0,1,0,3,1,0,0.0,True
1,2,38.0,1,71.2833,0,1,1,1,2,1,1.0,True
2,3,26.0,0,7.925,0,0,1,3,3,2,1.0,True
3,4,35.0,0,53.1,0,1,1,1,2,3,1.0,True
4,5,35.0,0,8.05,0,0,0,3,1,3,0.0,True


Split the data back into **test** and **train** sets 

In [27]:
train_data = full_data[full_data["Train"]]
test_data  = full_data[~full_data["Train"]]

# remove Train column as we don't need it anymore
train_data = train_data.drop(['Train'],axis=1)
# remove Survived column from the test set 
test_data = test_data.drop(['Train','Survived'],axis=1)




print("Test: {} \nTrain: {}".format(train_data.shape,test_data.shape))

Test: (891, 11) 
Train: (418, 10)


# Building the Predictive Models

In [28]:
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

# initialize our Random Forest classifier
rf = RandomForestClassifier(
            n_estimators = 10 # number of trees
        )

# initialize our Gradient Boosting classifier
gb = GradientBoostingClassifier(
            learning_rate = 0.1
        )

# initialize our Naive Bayes classifier
nb = GaussianNB()

# initialize our SVC classifier

svc = SVC()

classifiers = {
    "RandomForest":rf,
    "GradientBoostedTree":gb,
    "GaussianNaiveBayes":nb,
    "SVC":svc
}

## Training the model

In [29]:
# get everything except passengerId and Survived
X = train_data.drop(['PassengerId','Survived'],axis=1)
Y = train_data['Survived'].astype(int)

# to get reproducible results for debugging
np.random.seed(1)


CROSS_VALIDATION_SPLIT = 0.8

mask = np.random.rand(X.shape[0]) < CROSS_VALIDATION_SPLIT

# train set
X_train = X[mask]
Y_train = Y[mask]

# test set for cross-validation
if CROSS_VALIDATION_SPLIT < 1:
    X_test = X[~mask]
    Y_test = Y[~mask]

In [30]:
for name,clf in classifiers.items():
    clf.fit(X_train,Y_train)
    
    if CROSS_VALIDATION_SPLIT < 1:
        acc = clf.score(X_test,Y_test)
    
        print("{} - Accuracy: {:.2f}%".format(name,acc*100))

RandomForest - Accuracy: 83.52%
GradientBoostedTree - Accuracy: 83.52%
GaussianNaiveBayes - Accuracy: 77.84%
SVC - Accuracy: 71.59%


# Make Predictions

In [31]:
X_pred = test_data.drop(['PassengerId'],axis=1)

passengerId = test_data['PassengerId']

predictions = {}

for name,clf in classifiers.items():
    pred = clf.predict(X_pred).astype(int)
    predictions[name] = pred
    
    percent = np.sum(pred)/pred.shape[0]
    
    print("{} - {:.2f}% Survived".format(name,percent*100))

RandomForest - 36.84% Survived
GradientBoostedTree - 33.49% Survived
GaussianNaiveBayes - 39.47% Survived
SVC - 27.03% Survived


In [32]:
predictions_df = pd.DataFrame({**{"PassengerId":passengerId}, **predictions})
predictions_df.head()

Unnamed: 0,GaussianNaiveBayes,GradientBoostedTree,PassengerId,RandomForest,SVC
0,0,0,892,0,0
1,1,0,893,0,0
2,0,0,894,0,1
3,0,0,895,1,0
4,1,1,896,1,0


In [33]:
# write the results to files
for name in predictions.keys():
    predictions_df[['PassengerId',name]].to_csv(
        '{}_pred.csv'.format(name.lower()),
        index=False,
        header=['PassengerId','Survived']
    )

In [34]:
# using all the four models to vote for the final prediction
cols = list(predictions_df.columns)

cols.remove('PassengerId')

predictions_df["Voted"] = predictions_df[cols].sum(axis=1)/(predictions_df.shape[1]-1)

predictions_df["Voted"] = predictions_df["Voted"] > 0.5

predictions_df["Voted"] = predictions_df["Voted"].astype(int)

predictions_df.head()

Unnamed: 0,GaussianNaiveBayes,GradientBoostedTree,PassengerId,RandomForest,SVC,Voted
0,0,0,892,0,0,0
1,1,0,893,0,0,0
2,0,0,894,0,1,0
3,0,0,895,1,0,0
4,1,1,896,1,0,1


In [35]:
predictions_df[["PassengerId","Voted"]].to_csv("voted_pred.csv",header=["PassengerId","Survived"],index=False)

# An Attempt to Improve the Voting System

Warning! Logistic Regression Ahead

In [36]:
# predictions by our classifiers on the training data
train_predictions = {}

for name,clf in classifiers.items():
    train_predictions[name] = clf.predict(X) 
    
train_predictions["target"] = Y.copy()

# our new dataframe 
df = pd.DataFrame(train_predictions)
df.head()

Unnamed: 0,GaussianNaiveBayes,GradientBoostedTree,RandomForest,SVC,target
0,0,0,0,0,0
1,1,1,1,1,1
2,1,1,1,1,1
3,1,1,1,1,1
4,0,0,0,0,0


In [37]:
from sklearn.linear_model import SGDClassifier

# features and target for our new model
X_log = df.drop(["target"],axis=1)
Y_log = df["target"]

# CROSS_VALIDATION_SPLIT = 0.8

np.random.seed(2)

mask = np.random.rand(X_log.shape[0]) < CROSS_VALIDATION_SPLIT

# training set
X_log_train = X_log[mask]
Y_log_train = Y_log[mask]

# testing set
if CROSS_VALIDATION_SPLIT < 1:
    X_log_test = X_log[~mask]
    Y_log_test = Y_log[~mask]
    

# initialize our model
sgd = SGDClassifier(
            loss="log",
            max_iter=100 
        )

sgd.fit(X_log_train,Y_log_train)

if CROSS_VALIDATION_SPLIT < 1:
    acc = sgd.score(X_log_test,Y_log_test)
    
    print("Accuracy - {:.2f}%".format(acc*100))

Accuracy - 95.24%


## Let's try our new voting system

In [38]:
X_log_pred = predictions_df.drop(["PassengerId","Voted"],axis=1)
X_log_pred = X_log_pred.as_matrix()

Y_log_pred = sgd.predict(X_log_pred)
predictions_df["sgd"] = Y_log_pred
predictions_df.head()

Unnamed: 0,GaussianNaiveBayes,GradientBoostedTree,PassengerId,RandomForest,SVC,Voted,sgd
0,0,0,892,0,0,0,0
1,1,0,893,0,0,0,0
2,0,0,894,0,1,0,0
3,0,0,895,1,0,0,1
4,1,1,896,1,0,1,1


In [39]:
predictions_df[["PassengerId","sgd"]].to_csv(
                        "improved_voting_pred.csv",
                        header=["PassengerId","Survived"],
                        index=False
                        )