## Data Munging

In [8]:
import pandas as pd
import numpy as np

1) Load data, assign it to an object and get a general idea of what we are looking it

In [9]:
titanic_raw = pd.read_csv('train.csv')
titanic_raw.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S


In [10]:
titanic_raw.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [11]:
titanic_raw.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 90.5+ KB


2) Convert data into numerical form and add new features

2a) Find all missing values in Age and fill in the value using the average of the surround values

In [12]:
# values of all observations which have a missing Age value
missing_values_index = titanic_raw['Age'].index[titanic_raw['Age'].apply(np.isnan)]
# function to help determine what age's to average. If a surrounding value is
# also NaN, use 0 as the age.
age_filler = lambda a: 0 if np.isnan(a) == True else a
for x in missing_values_index:
    s = [age_filler(titanic_raw['Age'][x-1]),age_filler(titanic_raw['Age'][x+1])]
    age = np.average(s)
    titanic_raw['Age'].iloc[x] = age
# check if all Age values have a value, an empty array is desired
missing_values_index = titanic_raw['Age'].index[titanic_raw['Age'].apply(np.isnan)]
print missing_values_index

Int64Index([], dtype='int64')


2b) Convert the value found in the Sex column to binary value

In [13]:
titanic_raw['Sex'] = titanic_raw['Sex'].apply(lambda x: 1 if x == 'male' else 0)
# old code below
#for x in xrange(titanic_raw['Sex'].count()):
#    if titanic_raw['Sex'][x] == 'male':
#        titanic_raw['Sex'][x] = 1
#    else:
#        titanic_raw['Sex'][x] = 0

2c) Determine who the children are. I believe this will affect survival rates

In [14]:
titanic_raw['is_child'] = titanic_raw['Age'].apply(lambda x: 1 if x < 13 else 0)
# old code below
#for x in xrange(1,titanic_raw['Age'].count()):
#    if titanic_raw['Age'][x] < 13:
#        titanic_raw['is_child'][x] = 1
#    else:
#        titanic_raw['is_child'][x] = 0

# Check that returns were returned correctly
titanic_raw[(titanic_raw.Age < 13)].head(),titanic_raw[(titanic_raw.Age > 13)].head()


(    PassengerId  Survived  Pclass                             Name  Sex  Age  \
 7             8         0       3   Palsson, Master. Gosta Leonard    1  2.0   
 10           11         1       3  Sandstrom, Miss. Marguerite Rut    0  4.0   
 16           17         0       3             Rice, Master. Eugene    1  2.0   
 24           25         0       3    Palsson, Miss. Torborg Danira    0  8.0   
 28           29         1       3    O'Dwyer, Miss. Ellen "Nellie"    0  9.5   
 
     SibSp  Parch   Ticket     Fare Cabin Embarked  is_child  
 7       3      1   349909  21.0750   NaN        S         1  
 10      1      1  PP 9549  16.7000    G6        S         1  
 16      4      1   382652  29.1250   NaN        Q         1  
 24      3      1   349909  21.0750   NaN        S         1  
 28      0      0   330959   7.8792   NaN        Q         1  ,
    PassengerId  Survived  Pclass  \
 0            1         0       3   
 1            2         1       1   
 2            3       

3) Create a my observations and responses dataframe

In [15]:
titanic_feat = titanic_raw[['Pclass','Sex','Age','is_child']]
titanic_resp = titanic_raw['Survived']

In [16]:
# Confirm shape of dataframe are of equal length and suitable for SciPy use
titanic_feat.shape, titanic_resp.shape

((891, 4), (891,))

In [17]:
# load test data and add feature columns return only relevant features
test_data = pd.read_csv('test.csv')
# values of all observations which have a missing Age value
missing_values_index = test_data['Age'].index[test_data['Age'].apply(np.isnan)]
print missing_values_index
# function to help determine what age's to average. If a surrounding value is
# also NaN, use 0 as the age.
age_filler = lambda a: 0 if np.isnan(a) == True else a
for x in missing_values_index:
    position = x
    if x + 1 >= test_data['Sex'].count():
        x = x/2
    s = [age_filler(test_data['Age'][x-1]),age_filler(test_data['Age'][x+1])]
    age = np.average(s)
    test_data['Age'][position] = age
# check if all Age values have a value, an empty array is desired
missing_values_index = test_data['Age'].index[test_data['Age'].apply(np.isnan)]
print missing_values_index
test_data['is_child'] = test_data['Age'].apply(lambda x: 1 if x < 13 else 0)
test_data['Sex'] = test_data['Sex'].apply(lambda x: 1 if x == 'male' else 0)
test_data = test_data[['Pclass','Sex','Age','is_child']]
# old code below
#test_data['is_child'] = 0
#for x in xrange(titanic_raw['Age'].count()):
#    if test_data['Age'][x] < 13:
#        test_data['is_child'] = 1
#    else:
#        test_data['is_child'] = 0
test_data.describe()

Int64Index([ 10,  22,  29,  33,  36,  39,  41,  47,  54,  58,  65,  76,  83,
             84,  85,  88,  91,  93, 102, 107, 108, 111, 116, 121, 124, 127,
            132, 133, 146, 148, 151, 160, 163, 168, 170, 173, 183, 188, 191,
            199, 200, 205, 211, 216, 219, 225, 227, 233, 243, 244, 249, 255,
            256, 265, 266, 267, 268, 271, 273, 274, 282, 286, 288, 289, 290,
            292, 297, 301, 304, 312, 332, 339, 342, 344, 357, 358, 365, 366,
            380, 382, 384, 408, 410, 413, 416, 417],
           dtype='int64')
Int64Index([], dtype='int64')


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,Pclass,Sex,Age,is_child
count,418.0,418.0,418.0,418.0
mean,2.26555,0.636364,29.170156,0.090909
std,0.841838,0.481622,13.749192,0.287824
min,1.0,0.0,0.17,0.0
25%,1.0,0.0,21.0,0.0
50%,3.0,1.0,27.0,0.0
75%,3.0,1.0,37.375,0.0
max,3.0,1.0,76.0,1.0


## Machine Learning

In [18]:
# load SciPy KNeighbors test
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 3)

In [19]:
# fit observation features to responses
knn.fit(titanic_feat,titanic_resp)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')

In [20]:
# run a random prediction. Observation Pclass = 3, Sex = 1, Age = 9, is_child = 1
# return 0 means did  not survive, 1 means survived
knn.predict([3,1,9,1])



array([1])

In [21]:
test_data.head()

Unnamed: 0,Pclass,Sex,Age,is_child
0,3,1,34.5,0
1,3,0,47.0,0
2,2,1,62.0,0
3,3,1,27.0,0
4,3,0,22.0,0


In [22]:
# run prediction analysis against the test_data
knnresults = knn.predict(test_data)

In [23]:
test_data.head()

Unnamed: 0,Pclass,Sex,Age,is_child
0,3,1,34.5,0
1,3,0,47.0,0
2,2,1,62.0,0
3,3,1,27.0,0
4,3,0,22.0,0


In [24]:
input_output_combined = test_data[['Pclass','Sex','Age','is_child']]
input_output_combined['prediction'] = 0 
for x in xrange(input_output_combined['Pclass'].count()):
    input_output_combined['prediction'].iloc[x] = knnresults[x]

In [25]:
input_output_combined.head()

Unnamed: 0,Pclass,Sex,Age,is_child,prediction
0,3,1,34.5,0,0
1,3,0,47.0,0,0
2,2,1,62.0,0,0
3,3,1,27.0,0,0
4,3,0,22.0,0,1


## Logistic Regression

In [27]:
from sklearn.linear_model import LogisticRegression

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 13 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null int64
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
is_child       891 non-null int64
dtypes: float64(2), int64(7), object(4)
memory usage: 97.5+ KB


(None,    PassengerId  Survived  Pclass  \
 0            1         0       3   
 1            2         1       1   
 2            3         1       3   
 3            4         1       1   
 4            5         0       3   
 
                                                 Name  Sex  Age  SibSp  Parch  \
 0                            Braund, Mr. Owen Harris    1   22      1      0   
 1  Cumings, Mrs. John Bradley (Florence Briggs Th...    0   38      1      0   
 2                             Heikkinen, Miss. Laina    0   26      0      0   
 3       Futrelle, Mrs. Jacques Heath (Lily May Peel)    0   35      1      0   
 4                           Allen, Mr. William Henry    1   35      0      0   
 
              Ticket     Fare Cabin Embarked  is_child  
 0         A/5 21171   7.2500   NaN        S         0  
 1          PC 17599  71.2833   C85        C         0  
 2  STON/O2. 3101282   7.9250   NaN        S         0  
 3            113803  53.1000  C123        S         0

In [None]:
titanic_log = pd.read_csv('train.csv')
x = titanic_raw[['Pclass','Sex','Age','is_child']]
y = titanic_raw[['Survived']]

In [None]:
x.head(),x.describe(),y.head(),y.describe()

### Import training module

In [None]:
from sklearn.cross_validation import train_test_split

In [None]:

demo_train, demo_test, life_train, life_test = train_test_split(x,y,test_size = .3)
logreg = LogisticRegression()
logreg.fit(demo_train,life_train)
logreg.predict(demo_test)

In [None]:
demo_train.shape, demo_test.shape, life_train.shape, life_test.shape

In [None]:
logreg.fit(demo_train,life_train.values)

In [None]:
demo_train.shape, demo_test.shape

In [None]:
life_train.Survived.shape

In [None]:
logreg.fit(demo_train,life_train.Survived)

In [64]:
life_pred = logreg.predict(demo_test)

In [None]:
from sklearn import metrics
print metrics.accuracy_score(life_test,life_pred)

In [None]:
logreg.intercept_
logreg.coef_

In [None]:
feat_col = ['Pclass','Sex','Age','is_child']
zip(feat_col,logreg.coef_)

In [None]:
np.count_nonzero(titanic_log(titanic_log.Sex == 0))

In [None]:
logreg.intercept_

In [None]:
demo_train.head()

In [83]:
knn_return = knn.predict(test)
len(knn_return)
count = 0
for x in xrange(len(knn_return)):
    if x == 0:
        count += 1
    else: pass
np.count_nonzero(knn_return)

152

In [84]:
titanic_log = pd.read_csv('train.csv')

#fix missing age values
missing_index = titanic_log['Age'].index[titanic_log['Age'].apply(np.isnan)]
age_filler = lambda a: 0 if np.isnan(a) == True else a
for x in missing_index:
    s = [age_filler(titanic_log['Age'][x-1]),age_filler(titanic_log['Age'][x+1])]
    age = np.average(s)
    titanic_log['Age'][x] = age


#convert sex
for x in xrange(titanic_log['Sex'].count()):
    if titanic_log['Sex'][x] == 'male':
        titanic_log['Sex'][x] = 1
    else:
        titanic_log['Sex'][x] = 0


#create is_child feature and fill data
titanic_log['is_child'] = 0
for x in xrange(titanic_log['Age'].count()):
    if titanic_log['Age'][x] < 13:
        titanic_log['is_child'][x] = 1
    else:
        titanic_log['is_child'][x] = 0


x = titanic_log[['Pclass','Sex','Age','is_child']]
y = titanic_log[['Survived']]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [85]:
x.head(),x.describe(),y.head(),y.describe()

(   Pclass Sex  Age  is_child
 0       3   1   22         0
 1       1   0   38         0
 2       3   0   26         0
 3       1   0   35         0
 4       3   1   35         0,            Pclass         Age    is_child
 count  891.000000  891.000000  891.000000
 mean     2.308642   29.101552    0.106622
 std      0.836071   14.153845    0.308805
 min      1.000000    0.420000    0.000000
 25%      2.000000   20.000000    0.000000
 50%      3.000000   28.000000    0.000000
 75%      3.000000   37.250000    0.000000
 max      3.000000   80.000000    1.000000,    Survived
 0         0
 1         1
 2         1
 3         1
 4         0,          Survived
 count  891.000000
 mean     0.383838
 std      0.486592
 min      0.000000
 25%      0.000000
 50%      0.000000
 75%      1.000000
 max      1.000000)

In [86]:
from sklearn.cross_validation import train_test_split
demo_train, demo_test, life_train, life_test = train_test_split(x,y,test_size = .3)

In [87]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(demo_train,life_train)
logreg.predict(demo_test)

  y = column_or_1d(y, warn=True)


array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1,
       0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1])

In [114]:
demo_train.shape, demo_test.shape, life_train.shape, life_test.shape

((623, 4), (268, 4), (623, 1), (268, 1))

In [89]:
logreg.fit(demo_train,life_train.values)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [90]:
demo_train.shape, demo_test.shape

((623, 4), (268, 4))

In [91]:
life_train.Survived.shape

(623,)

In [92]:
logreg.fit(demo_train,life_train.Survived)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [93]:
life_pred = logreg.predict(demo_test)

In [95]:
from sklearn import metrics
print metrics.accuracy_score(life_test,life_pred)

0.817164179104


In [96]:
logreg.intercept_
logreg.coef_

array([[-0.96422375, -2.30044287, -0.01412233,  0.71915524]])

In [97]:
feat_col = ['Pclass','Sex','Age','is_child']
zip(feat_col,logreg.coef_)

[('Pclass', array([-0.96422375, -2.30044287, -0.01412233,  0.71915524]))]

In [110]:
np.count_nonzero(titanic_log(titanic_log.Sex == 0))

TypeError: 'DataFrame' object is not callable

In [111]:
np.count_nonzero(titanic_log(titanic_log.Sex == 0))

TypeError: 'DataFrame' object is not callable

In [112]:
logreg.intercept_

array([ 3.42122022])

In [113]:
demo_train.head()

Unnamed: 0,Pclass,Sex,Age,is_child
180,3,0,18.0,0
678,3,0,43.0,0
704,3,1,26.0,0
401,3,1,26.0,0
589,3,1,28.5,0
