# Steps

1. Import libraries

2. Read file
    Merge Train(0-891) and Test(0-413) files together
    Keep track of the indecies
    
3. Look at Data

4. Clean

5. Feature Engineering

6. Select X and Y
    Train + test split
    
7. Split train + test

8. Define Alg

9. Fit Alg

10. Predict

11. Calculate Error

12. Predict(test)

13. Save as .csv with no index

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model
from sklearn.linear_model import Lasso
from sklearn.cross_validation import train_test_split
from scipy import stats
from sklearn.metrics import mean_squared_error, r2_score
import math

%matplotlib inline



In [2]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [3]:
#Appending the data
data = train_data.append(test_data)

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 12 columns):
Age            1046 non-null float64
Cabin          295 non-null object
Embarked       1307 non-null object
Fare           1308 non-null float64
Name           1309 non-null object
Parch          1309 non-null int64
PassengerId    1309 non-null int64
Pclass         1309 non-null int64
Sex            1309 non-null object
SibSp          1309 non-null int64
Survived       891 non-null float64
Ticket         1309 non-null object
dtypes: float64(3), int64(4), object(5)
memory usage: 132.9+ KB


### Cleaning the Data

In [5]:
#Replacing 2 NaN values in embarked
data.Embarked.fillna(value='S', inplace=True)

In [6]:
#Filling in NaN cabin variables
data_cabins = data['Cabin']
data_cabins.fillna(value='U', inplace=True)

#Editing the Cabin column so that only the prefix is there
data_cabins = data['Cabin'].astype(str).str[0]
data['Cabin'] = data_cabins

In [7]:
#Taking the prefix of tickets
def pref_extract(x):
    try:
        int(x)
        return 'numb_only'
    except:
        return x.split(' ', 1)[0]

data['Tix_Pre'] = data.Ticket.apply(lambda x: pref_extract(x))

In [8]:
#Taking the numbers of the tickets
#import re

#data['Tix_Num'] = data.Ticket.apply(lambda x: ''.join(ch for ch in x if ch.isdigit()))


In [9]:
#We have one index that does not have a fare, let's fill it in with just an average age probably won't skew data too much
avg_fare = int(np.mean(data.Fare))
data.Fare.fillna(value=avg_fare, inplace=True)

In [10]:
# Creating a Prefix column (which I will use to determine missing age values)
def strParse(x):
    if 'Mr.' in x:
        return 'Mr.'
    elif 'Mrs.' in x:
        return 'Mrs.'
    elif 'Master.' in x:
        return 'Master.'
    elif 'Miss.' in x:
        return 'Miss.'
    elif 'Rev.' in x:
        return 'Rev.'
    else:
        return 'None'

data['Prefix'] = data.Name.apply(lambda x: strParse(x))

In [11]:
data.groupby('Prefix')[['Age']].mean()

Unnamed: 0_level_0,Age
Prefix,Unnamed: 1_level_1
Master.,5.482642
Miss.,21.774238
Mr.,32.252151
Mrs.,36.994118
,43.125
Rev.,41.25


In [12]:
#Using the Prefix column to estimate the age
df = []

for index, row in data.iterrows():
    if math.isnan(row['Age']):
        if row['Prefix'] == 'Master.':
            df.append(6)
        elif row['Prefix'] == 'Miss.':
            df.append(22)
        elif row['Prefix'] == 'Mr.':
            df.append(32)
        elif row['Prefix'] == 'Mrs.':
            df.append(36)
        elif row['Prefix'] == 'Rev.':
            df.append(41)
        else:
            df.append(29)
    else:
        df.append(row['Age'])
        
data['Age'] = df

In [13]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 14 columns):
Age            1309 non-null float64
Cabin          1309 non-null object
Embarked       1309 non-null object
Fare           1309 non-null float64
Name           1309 non-null object
Parch          1309 non-null int64
PassengerId    1309 non-null int64
Pclass         1309 non-null int64
Sex            1309 non-null object
SibSp          1309 non-null int64
Survived       891 non-null float64
Ticket         1309 non-null object
Tix_Pre        1309 non-null object
Prefix         1309 non-null object
dtypes: float64(3), int64(4), object(7)
memory usage: 153.4+ KB


### Feature Engineering

In [14]:
#Adding dummy columns for Embarked
data_embarked = pd.get_dummies(data['Embarked'], drop_first=True)
data = pd.concat([data, data_embarked], axis=1)

In [15]:
#Using Dummy Variables with Cabin
dummy_cabins = pd.get_dummies(data['Cabin'], drop_first=True)
data = pd.concat([data, dummy_cabins], axis=1)

In [16]:
#Using Dummy Variables with Name Prefix
dummy_prefix = pd.get_dummies(data['Prefix'], drop_first=True)
data = pd.concat([data, dummy_prefix], axis=1)

In [17]:
#Using Dummy Variables with Ticket Prefixes
Tix_pre_dummy = pd.get_dummies(data['Tix_Pre'], drop_first=True)
data = pd.concat([data, Tix_pre_dummy], axis=1)

In [18]:
#Using Dummy Variables with Sex
dummy_sex = pd.get_dummies(data['Sex'], drop_first=True)
data = pd.concat([data, dummy_sex], axis=1)

In [19]:
#Creating a new column based on whether the individual paid more than a $50 fare
#data['Fare>50'] = data['Fare'].apply(lambda x: 1 if x > 50 else 0)
data['Fare_rank'] = data.Fare.apply(lambda x: 1 if x <= 50 else 2 if x <= 100 else 3 if x <= 200 else 4)

In [20]:
#Was the individual travelling with a family?

#Parch_SibSp = data['Parch'] + data['SibSp']
#data['Family'] = Parch_SibSp.apply(lambda x: 1 if x > 0 else 0)

In [21]:
#Parch > 2 means you must have been travelling with a child
#data['w/child'] = data.Parch.apply(lambda x: 1 if x > 2 else 0)

In [22]:
#Does age over 18 matter?
data['>=18'] = data.Age.apply(lambda x: 0 if x >= 18 else 1)

In [23]:
#Changing Pclass order
data['Pclass1'] = data.Pclass.apply(lambda x: 'Upper' if x==1 else 'Middle' if x==2 else 'Lower')
Pclass_dummy = pd.get_dummies(data['Pclass1'], drop_first=True)
data = pd.concat([data, Pclass_dummy], axis=1)

In [24]:
data.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,...,W.E.P.,W/C,WE/P,numb_only,male,Fare_rank,>=18,Pclass1,Middle,Upper
0,22.0,U,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,...,0,0,0,0,1,1,0,Lower,0,0
1,38.0,C,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,...,0,0,0,0,0,2,0,Upper,0,1
2,26.0,U,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,...,0,0,0,0,0,1,0,Lower,0,0
3,35.0,C,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,...,0,0,0,1,0,2,0,Upper,0,1
4,35.0,U,S,8.05,"Allen, Mr. William Henry",0,5,3,male,0,...,0,0,0,1,1,1,0,Lower,0,0


In [25]:
data.corr()

Unnamed: 0,Age,Fare,Parch,PassengerId,Pclass,SibSp,Survived,Q,S,B,...,W./C.,W.E.P.,W/C,WE/P,numb_only,male,Fare_rank,>=18,Middle,Upper
Age,1.000000,0.170563,-0.138284,0.025520,-0.377460,-0.219653,-0.085876,-0.048220,-0.040700,0.113911,...,-0.036589,0.049525,0.000328,0.068852,-0.024331,0.081941,0.189021,-0.596894,-0.009882,0.371316
Fare,0.170563,1.000000,0.221524,0.031411,-0.558481,0.160226,0.257307,-0.130053,-0.169897,0.393744,...,-0.021670,0.021087,-0.012187,0.028518,-0.174692,-0.185487,0.916443,-0.015156,-0.121370,0.599958
Parch,-0.138284,0.221524,1.000000,0.008942,0.018322,0.373587,0.081629,-0.100943,0.071881,0.073051,...,0.125403,-0.017408,-0.012304,0.050409,-0.084573,-0.213125,0.165032,0.293928,-0.010057,-0.013033
PassengerId,0.025520,0.031411,0.008942,1.000000,-0.038354,-0.055224,-0.005007,0.011585,-0.049836,0.015895,...,0.006348,-0.016098,-0.031830,-0.001190,0.003538,0.013406,0.033394,-0.036970,0.022714,0.026495
Pclass,-0.377460,-0.558481,0.018322,-0.038354,1.000000,0.060832,-0.338481,0.230491,0.091320,-0.353414,...,0.052068,-0.060480,-0.009735,-0.060480,0.088045,0.124617,-0.572886,0.141901,-0.182413,-0.884911
SibSp,-0.219653,0.160226,0.373587,-0.055224,0.060832,1.000000,-0.035322,-0.048678,0.073709,-0.011569,...,0.035777,0.018827,-0.013247,0.000043,-0.078437,-0.109609,0.136906,0.332375,-0.052419,-0.034256
Survived,-0.085876,0.257307,0.081629,-0.005007,-0.338481,-0.035322,1.000000,0.003650,-0.149683,0.175095,...,-0.056649,-0.026456,-0.026456,0.011329,0.001492,-0.543351,0.264172,0.123616,0.093349,0.285904
Q,-0.048220,-0.130053,-0.100943,0.011585,0.230491,-0.048678,0.003650,1.000000,-0.491656,-0.073613,...,-0.033484,-0.012598,-0.008904,-0.012598,0.177596,-0.088651,-0.121795,-0.049472,-0.121973,-0.166101
S,-0.040700,-0.169897,0.071881,-0.049836,0.091320,0.073709,-0.149683,-0.491656,1.000000,-0.095790,...,0.068105,0.025623,0.018111,0.025623,-0.021353,0.115193,-0.180778,0.023470,0.196532,-0.181800
B,0.113911,0.393744,0.073051,0.015895,-0.353414,-0.011569,0.175095,-0.073613,-0.095790,1.000000,...,-0.023767,-0.008942,-0.006320,0.171132,-0.059652,-0.094453,0.374619,-0.011152,-0.118426,0.399378


### Splitting back to original files

In [26]:
# Splitting the data back to train and test
train_data = data.iloc[0:891, :]
test_data = data.iloc[891:, :]

### Splitting X and Y

In [27]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 85 columns):
Age            891 non-null float64
Cabin          891 non-null object
Embarked       891 non-null object
Fare           891 non-null float64
Name           891 non-null object
Parch          891 non-null int64
PassengerId    891 non-null int64
Pclass         891 non-null int64
Sex            891 non-null object
SibSp          891 non-null int64
Survived       891 non-null float64
Ticket         891 non-null object
Tix_Pre        891 non-null object
Prefix         891 non-null object
Q              891 non-null uint8
S              891 non-null uint8
B              891 non-null uint8
C              891 non-null uint8
D              891 non-null uint8
E              891 non-null uint8
F              891 non-null uint8
G              891 non-null uint8
T              891 non-null uint8
U              891 non-null uint8
Miss.          891 non-null uint8
Mr.            891 non-null uint

In [28]:
x = train_data.drop(['Name', 'PassengerId', 'Ticket', 'Prefix', 'Cabin', 'Sex', 'Embarked', 'Survived', 'Pclass', 'Pclass1', 'Tix_Pre'], axis=1)
y = train_data['Survived']

### Splitting Training Data

In [29]:
#Train and test split

X_train, X_test, y_train, y_test = train_test_split(x, y, train_size=0.8, random_state=0)
print X_train.shape, X_test.shape, y_train.shape, y_test.shape

(712, 74) (179, 74) (712L,) (179L,)


### Regression techniques

In [30]:
#from sklearn.ensemble import RandomForestClassifier
#clf = RandomForestClassifier()
#clf = clf.fit(X_train,y_train)

from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
logmodel = lr.fit(X_train, y_train)

#from sklearn.ensemble import GradientBoostingClassifier
#clf = GradientBoostingClassifier()
#clf = clf.fit(X_train, y_train)

In [31]:
#prediction = clf.predict(X_test)
prediction = logmodel.predict(X_test)

In [32]:
from sklearn.metrics import accuracy_score, recall_score, f1_score

In [33]:
accuracy_score(y_test, prediction)
#0.81564245810055869

0.81564245810055869

In [34]:
recall_score(y_test, prediction)
#0.78260869565217395

0.78260869565217395

In [35]:
f1_score(y_test, prediction)
#0.76595744680851074

0.76595744680851074

### Submission

In [36]:
x1 = test_data.drop(['Name', 'PassengerId', 'Ticket', 'Prefix', 'Cabin', 'Sex', 'Embarked', 'Survived', 'Pclass', 'Pclass1', 'Tix_Pre'], axis=1)
#submission = clf.predict(x1)
submission = logmodel.predict(x1)

In [37]:
submit = pd.DataFrame()
submit['PassengerId'] = test_data.PassengerId
submit['Survived'] = submission
submit = submit.astype(int)

submit.shape

(418, 2)

In [38]:
submit.to_csv('Submission.csv', index=False)