In [3]:
#import packages 

import pandas as pd
import numpy as np

from bokeh.plotting import figure,show,output_notebook
from bokeh.models import Range1d

from sklearn import datasets
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

pd.set_option('display.max_rows',100)
pd.set_option('display.max_columns',60)

In [4]:
#read in data 

data = pd.read_csv('/Users/maxcameron/Desktop/General Assembly/DAT_SF_19/data/titanic.csv')
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S


In [5]:
#investigate data
# print data.head()
# print data.describe()
print data.info()

# Munging Tasks: Drop unecessary columns, Get gender-specific averages for Age, Sex 
# should be boolean instead of a string, Pclass should be dummy variable. 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 90.5+ KB
None


In [6]:
#drop unecessary columns

df = data.drop(['Name','Ticket','Fare','Cabin','Embarked'], axis=1)

df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch
0,1,0,3,male,22,1,0
1,2,1,1,female,38,1,0
2,3,1,3,female,26,0,0
3,4,1,1,female,35,1,0
4,5,0,3,male,35,0,0


In [7]:
# Convert sex to boolean
df.Sex = df.Sex.replace(['female','male'],[0,1])
print data[['Name','Sex']].head()

                                                Name     Sex
0                            Braund, Mr. Owen Harris    male
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female
2                             Heikkinen, Miss. Laina  female
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female
4                           Allen, Mr. William Henry    male


In [8]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch
0,1,0,3,1,22,1,0
1,2,1,1,0,38,1,0
2,3,1,3,0,26,0,0
3,4,1,1,0,35,1,0
4,5,0,3,1,35,0,0


In [9]:
#Impute avg age

avg_age = df.Age.mean()
df.Age = df.Age.fillna(avg_age)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 7 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Sex            891 non-null int64
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
dtypes: float64(1), int64(6)
memory usage: 55.7 KB


In [10]:
# put Pclass into dummy variables 

pclass = pd.get_dummies(df.Pclass, prefix = 'Pclass')
print pclass.head()

   Pclass_1  Pclass_2  Pclass_3
0         0         0         1
1         1         0         0
2         0         0         1
3         1         0         0
4         0         0         1


In [11]:
#merge into DF

df = pd.merge(df,pclass,left_index=True, right_index=True)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Pclass_1,Pclass_2,Pclass_3
0,1,0,3,1,22,1,0,0,0,1
1,2,1,1,0,38,1,0,1,0,0
2,3,1,3,0,26,0,0,0,0,1
3,4,1,1,0,35,1,0,1,0,0
4,5,0,3,1,35,0,0,0,0,1


In [12]:
#Drop Pclass

df.drop('Pclass', axis=1, inplace = True)
df.head()

Unnamed: 0,PassengerId,Survived,Sex,Age,SibSp,Parch,Pclass_1,Pclass_2,Pclass_3
0,1,0,1,22,1,0,0,0,1
1,2,1,0,38,1,0,1,0,0
2,3,1,0,26,0,0,0,0,1
3,4,1,0,35,1,0,1,0,0
4,5,0,1,35,0,0,0,0,1


In [13]:
#make sure everything looks ok 
print df.describe()

       PassengerId    Survived         Sex         Age       SibSp  \
count   891.000000  891.000000  891.000000  891.000000  891.000000   
mean    446.000000    0.383838    0.647587   29.699118    0.523008   
std     257.353842    0.486592    0.477990   13.002015    1.102743   
min       1.000000    0.000000    0.000000    0.420000    0.000000   
25%     223.500000    0.000000    0.000000   22.000000    0.000000   
50%     446.000000    0.000000    1.000000   29.699118    0.000000   
75%     668.500000    1.000000    1.000000   35.000000    1.000000   
max     891.000000    1.000000    1.000000   80.000000    8.000000   

            Parch    Pclass_1    Pclass_2    Pclass_3  
count  891.000000  891.000000  891.000000  891.000000  
mean     0.381594    0.242424    0.206510    0.551066  
std      0.806057    0.428790    0.405028    0.497665  
min      0.000000    0.000000    0.000000    0.000000  
25%      0.000000    0.000000    0.000000    0.000000  
50%      0.000000    0.000000    

In [14]:
#Create an instance of Logistic Regression

model_lr = LogisticRegression(C=1)
features = df.drop(['Survived','PassengerId'], axis=1)
target = df.Survived

#print features.head()
print target.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64


In [15]:
print features.head()
print target.head()
print target.shape
print features.shape

   Sex  Age  SibSp  Parch  Pclass_1  Pclass_2  Pclass_3
0    1   22      1      0         0         0         1
1    0   38      1      0         1         0         0
2    0   26      0      0         0         0         1
3    0   35      1      0         1         0         0
4    1   35      0      0         0         0         1
0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64
(891,)
(891, 7)


In [16]:
## Fitting a logistic regression 
lr_fit = LogisticRegression(C=1).fit(features, target)
feat_coef =  lr_fit.coef_
names = np.array(features.columns.values)
feat_coef = pd.DataFrame(feat_coef, columns = names)
print feat_coef

        Sex       Age     SibSp    Parch  Pclass_1  Pclass_2  Pclass_3
0 -2.594755 -0.035003 -0.300274 -0.05462  1.738592  0.646305 -0.448613


# Question 2

In [18]:
import math as m
int_lr_fit = lr_fit.intercept_
print m.exp(int_lr_fit) ##odds of surving as male according to my logistic model

6.93293965465


In [26]:
print m.exp(feat_coef['Pclass_1'])
print m.exp(feat_coef['Pclass_2'])
print m.exp(feat_coef['Pclass_3'])

print m.exp(feat_coef['Sex'])

print df.groupby('Survived')['Sex'].value_counts()

5.68932503323
1.90847573284
0.638513423818
0.0746641631604
Survived  Sex
0         1      468
          0       81
1         0      233
          1      109
dtype: int64


# Question 3

In [27]:
from sklearn.cross_validation import train_test_split

In [31]:
#instantiate train-test-split

X_train, X_test, y_train, y_test = train_test_split( features, 
target, test_size=0.20, random_state=0)

In [32]:
lr_fit_testtrain = LogisticRegression(C=1).fit(X_train, y_train)

In [33]:
predictions = lr_fit_testtrain.predict(X_test)

In [34]:
predictions

array([0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0])

In [35]:
lr_fit.score(features, target)

0.79012345679012341

In [37]:
from sklearn.cross_validation import cross_val_score

In [45]:
print cross_val_score(model_lr,features,target,cv=2).mean()
print cross_val_score(model_lr,features,target,cv=4).mean()
print cross_val_score(model_lr,features,target,cv=6).mean()

0.783387917569
0.793497568477
0.786746478022


In [46]:
reg_features = pd.DataFrame(StandardScaler().fit_transform(features), columns = features.columns.values)
print reg_features.head()

        Sex       Age     SibSp     Parch  Pclass_1  Pclass_2  Pclass_3
0  0.737695 -0.592481  0.432793 -0.473674 -0.565685 -0.510152  0.902587
1 -1.355574  0.638789  0.432793 -0.473674  1.767767 -0.510152 -1.107926
2 -1.355574 -0.284663 -0.474545 -0.473674 -0.565685 -0.510152  0.902587
3 -1.355574  0.407926  0.432793 -0.473674  1.767767 -0.510152 -1.107926
4  0.737695  0.407926 -0.474545 -0.473674 -0.565685 -0.510152  0.902587


In [47]:
print cross_val_score(model_lr,reg_features,target,cv=4).mean()

0.795744770836
