##Import and check the data

In [100]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

In [90]:
#read in the raw data
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S


In [91]:
df.info()
#df.Embarked.value_counts()
#df.Cabin.value_counts() 
df.Sex.value_counts()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 90.5+ KB


male      577
female    314
dtype: int64

##Convert categorical features

In [92]:
#embarked has only 2 missing values so just replace those with the most common value 
df.Embarked.fillna('S', inplace=True)

#doesn't seem to have that much useful information so let's drop it
df.drop('Cabin',1,inplace=True)
#ticket looks very difficult to use also so let's drop that for now
df.drop('Ticket',1, inplace=True)

In [93]:
#convert male and female to boolean

df['gender_num'] = df['Sex'].map({'male':1, 'female':0})

#convert embarked to 3 dummy variables 

embarked_categories = pd.get_dummies(df.Embarked,prefix='Embarked')
df[embarked_categories.columns] = embarked_categories

#convert Pclass to 3 dummy variables 

pclass_categories = pd.get_dummies(df.Pclass,prefix='pclass')
df[pclass_categories.columns] = pclass_categories

df.head()



Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,gender_num,Embarked_C,Embarked_Q,Embarked_S,pclass_1,pclass_2,pclass_3
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,7.25,S,1,0,0,1,0,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,71.2833,C,0,1,0,0,1,0,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,7.925,S,0,0,0,1,0,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,53.1,S,0,0,0,1,1,0,0
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,8.05,S,1,0,0,1,0,0,1


In [94]:
#extract the person's title from the name (Mr., Master, Miss etc) and convert to dummy variables 

df['title'] = df['Name'].apply(lambda x: x.split(' ')[1].strip())

#print df.title.value_counts()

title_categories = pd.get_dummies(df.title,prefix='title')
#keep only the top 4 titles
title_categories = title_categories[['title_Mr.','title_Miss.','title_Mrs.','title_Master.']]
df[title_categories.columns] = title_categories
df.head()



Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,...,Embarked_Q,Embarked_S,pclass_1,pclass_2,pclass_3,title,title_Mr.,title_Miss.,title_Mrs.,title_Master.
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,7.25,S,...,0,1,0,0,1,Mr.,1,0,0,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,71.2833,C,...,0,0,1,0,0,Mrs.,0,0,1,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,7.925,S,...,0,1,0,0,1,Miss.,0,1,0,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,53.1,S,...,0,1,1,0,0,Mrs.,0,0,1,0
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,8.05,S,...,0,1,0,0,1,Mr.,1,0,0,0


In [95]:
#drop all of columns we no longer need

df.drop(['PassengerId','Pclass','Name','Sex','Embarked','title'], axis=1, inplace=1)


##Fill in the missing values for Age

In [96]:
#fill in missing values for age 
#first understand if the data is MCAR, MAR, or MNAR

#1.  Check if the age being missing is correlated to survival 

df_age = df[['Age','Survived']]

df_age = df_age.fillna(value=0)#.astype(int)
df_age.info()
df_age['is_missing'] = df_age.Age != 0

print pd.crosstab(df_age.is_missing, df_age.Survived)

print 'Survival rate for rows with age: ', float(52)/(125+52)
print 'Survival rate for rows missing age: ', float(290)/(424+290)

#rows missing age are more likely to survive
#the rows are not missing at random 


<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 2 columns):
Age         891 non-null float64
Survived    891 non-null int64
dtypes: float64(1), int64(1)
memory usage: 20.9 KB
Survived      0    1
is_missing          
False       125   52
True        424  290
Survival rate for rows with age:  0.293785310734
Survival rate for rows missing age:  0.406162464986


In [139]:
#try to use regression imputation to fill in the missing values for Age
#training data should be everything that isn't missing
#then use the model to predict age on the values that are missing 

nonmissing = df.dropna()
missing = df[pd.isnull(df.Age)]

#print missing.info()
#print nonmissing.info()

nonmissing_X = sm.add_constant(nonmissing.drop(['Age', 'Survived'], axis=1).values, prepend=True)
nonmissing_y = nonmissing['Age'].values


#run ols
results = sm.OLS(nonmissing_y, nonmissing_X).fit()
bhat = results.params
print bhat
results.summary()



[  1.84319058e+01  -1.96971209e+00  -4.09140125e-01  -1.48664763e-02
   1.09905340e+01   2.64302352e+00   1.05696093e+01   5.21927302e+00
   1.42526914e+01   3.91734943e+00   2.61865003e-01  -5.09987982e+00
  -4.85236164e+00   9.24101409e+00  -2.73768976e+01]


0,1,2,3
Dep. Variable:,y,R-squared:,0.424
Model:,OLS,Adj. R-squared:,0.414
Method:,Least Squares,F-statistic:,42.93
Date:,"Fri, 04 Sep 2015",Prob (F-statistic):,8.8e-76
Time:,10:31:42,Log-Likelihood:,-2726.6
No. Observations:,714,AIC:,5479.0
Df Residuals:,701,BIC:,5539.0
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,18.4319,1.855,9.937,0.000,14.790 22.074
x1,-1.9697,0.529,-3.723,0.000,-3.008 -0.931
x2,-0.4091,0.572,-0.716,0.474,-1.532 0.713
x3,-0.0149,0.011,-1.414,0.158,-0.036 0.006
x4,10.9905,3.577,3.073,0.002,3.968 18.013
x5,2.6430,1.110,2.381,0.018,0.464 4.822
x6,10.5696,1.735,6.091,0.000,7.163 13.976
x7,5.2193,0.947,5.514,0.000,3.361 7.078
x8,14.2527,1.040,13.702,0.000,12.210 16.295

0,1,2,3
Omnibus:,54.688,Durbin-Watson:,1.923
Prob(Omnibus):,0.0,Jarque-Bera (JB):,66.149
Skew:,0.689,Prob(JB):,4.32e-15
Kurtosis:,3.572,Cond. No.,4.15e+17


In [153]:
#use the coefficients to predict age on the missing value dataset 

missing_X = sm.add_constant(missing.drop(['Age', 'Survived'], axis=1).values, prepend=True)

print missing_X.shape
print bhat.shape

#get the dot product of these two matricies to get a vector of predicted ages 

pred = np.dot(missing_X, bhat)
print len(pred)

#now put the missing predicted age values back into hte missing dataframe
pred_series = pd.Series(pred, index=missing.index, name='age_imputed')
missing_p = pd.concat([missing, pred_series], axis=1)


#replace the missing age with the predicted ages
missing_p['Age'] = missing_p['age_imputed']
missing_p.drop('age_imputed', axis=1, inplace=True)

#join non missing and missing back together

df_imp = pd.concat([nonmissing, missing_p])


#pred_series.head()

df_imp.to_csv('df_imp.csv')
missing_p.head(10)

(177, 15)
(15,)
177


Unnamed: 0,Survived,Age,SibSp,Parch,Fare,gender_num,Embarked_C,Embarked_Q,Embarked_S,pclass_1,pclass_2,pclass_3,title_Mr.,title_Miss.,title_Mrs.,title_Master.
5,0,35.028289,0,0,8.4583,1,0,1,0,0,0,1,1,0,0,0
17,1,33.265918,0,0,13.0,1,0,0,1,0,1,0,1,0,0,0
19,1,30.470398,0,0,7.225,0,1,0,0,0,0,1,0,0,1,0
26,0,27.120038,0,0,7.225,1,1,0,0,0,0,1,1,0,0,0
28,1,24.293883,0,0,7.8792,0,0,1,0,0,0,1,0,1,0,0
29,0,29.686315,0,0,7.8958,1,0,0,1,0,0,1,1,0,0,0
31,1,40.420675,1,0,146.5208,0,1,0,0,1,0,0,0,0,1,0
32,1,24.295803,0,0,7.75,0,0,1,0,0,0,1,0,1,0,0
36,1,27.119976,0,0,7.2292,1,1,0,0,0,0,1,1,0,0,0
42,0,27.110066,0,0,7.8958,1,1,0,0,0,0,1,1,0,0,0
