This notebook follows the Kaggle-recommended Python tutorial by [Dataquest](https://www.dataquest.io/mission/74/getting-started-with-kaggle/).

# Looking at the Data

In [91]:
import pandas
# Read the training CSV into a DataFrame and preview its first ten rows.
train_csv_path = 'data_sets/train.csv'
titanic = pandas.read_csv(train_csv_path)
print(titanic.iloc[:10])

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   
5            6         0       3   
6            7         0       1   
7            8         0       3   
8            9         1       3   
9           10         1       2   

                                                Name     Sex  Age  SibSp  \
0                            Braund, Mr. Owen Harris    male   22      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female   38      1   
2                             Heikkinen, Miss. Laina  female   26      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female   35      1   
4                           Allen, Mr. William Henry    male   35      0   
5                                   Moran, Mr. James    male  NaN      0   
6                            McCarthy, Mr. Timothy J    male   54      

In [92]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
summary_stats = titanic.describe()
print(summary_stats)

       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.523008   
std     257.353842    0.486592    0.836071   14.526497    1.102743   
min       1.000000    0.000000    1.000000    0.420000    0.000000   
25%     223.500000    0.000000    2.000000   20.125000    0.000000   
50%     446.000000    0.000000    3.000000   28.000000    0.000000   
75%     668.500000    1.000000    3.000000   38.000000    1.000000   
max     891.000000    1.000000    3.000000   80.000000    8.000000   

            Parch        Fare  
count  891.000000  891.000000  
mean     0.381594   32.204208  
std      0.806057   49.693429  
min      0.000000    0.000000  
25%      0.000000    7.910400  
50%      0.000000   14.454200  
75%      0.000000   31.000000  
max      6.000000  512.329200  


In [93]:
# DataFrame.info() writes its report (column names, non-null counts, dtypes,
# memory usage) to stdout itself and returns None, so wrapping it in print()
# only appended a stray "None" line after the report.
titanic.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 90.5+ KB
None


# Missing Data

In [94]:
# Fill the missing ages with the median of the ages that are present.
median_age = titanic['Age'].median()
titanic['Age'] = titanic['Age'].fillna(median_age)

# Converting Non-Numeric Data

First check which labels the person in charge of the data set used for sex:

In [95]:
# List the distinct labels that appear in the Sex column.
sex_labels = titanic['Sex'].unique()
print(sex_labels)

['male' 'female']


In [96]:
# Encode the two Sex labels as integers in place: male -> 0, female -> 1.
for label, code in (('male', 0), ('female', 1)):
    titanic.loc[titanic['Sex'] == label, 'Sex'] = code

In [97]:
embarked = titanic['Embarked']
# Distinct embarkation labels, including the missing-value marker if present.
print(embarked.unique())
# Frequency of each label; left as the cell's last expression so the
# notebook displays it.
embarked.value_counts()

['S' 'C' 'Q' nan]


S    644
C    168
Q     77
dtype: int64

### Replacing the missing values with 'S', since that is the mode of the 'Embarked' distribution.

In [98]:
# 'S' is the most frequent embarkation label, so use it for the missing entries.
most_common_port = 'S'
titanic['Embarked'] = titanic['Embarked'].fillna(most_common_port)

In [99]:
# Pair every distinct embarkation label with an integer code (0, 1, 2, ...),
# following the order in which unique() returns the labels.
keys = titanic['Embarked'].unique()
values = range(len(keys))
embarkedDict = {label: code for label, code in zip(keys, values)}
# Overwrite the labels with their codes, one label at a time.
for label, code in embarkedDict.items():
    titanic.loc[titanic['Embarked'] == label, 'Embarked'] = code

# Machine Learning

In [103]:
from sklearn.linear_model import LinearRegression
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; KFold now lives in sklearn.model_selection.
from sklearn.model_selection import KFold

# Columns fed to the model; "Survived" is the target.
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
alg = LinearRegression()
# Three consecutive (unshuffled) folds over the rows of `titanic`.
# The model_selection KFold is a splitter rather than an iterable, so the
# (train_indices, test_indices) pairs are materialised into a list: code that
# does `for train, test in kf` keeps working and can be re-run.
# random_state is dropped because it only has an effect with shuffle=True
# (current scikit-learn raises an error when it is set without shuffling).
kf = list(KFold(n_splits=3).split(titanic))

In [104]:
# Out-of-fold predictions: for each fold, fit on the training rows and predict
# the held-out rows, collecting one array of predictions per fold.
features = titanic[predictors]
target = titanic["Survived"]
predictions = []
for train, test in kf:
    # Fit using only this fold's training rows.
    alg.fit(features.iloc[train], target.iloc[train])
    # Predict the rows that were held out of the fit.
    predictions.append(alg.predict(features.iloc[test]))

# Evaluating Error

For this case, the metric we'll use is accuracy: the fraction of passengers for whom the predicted outcome (survived or not) agrees with the actual value in the `Survived` column.

In [115]:
import numpy as np
# Stitch the per-fold prediction arrays into a single 1-D array.
predictions = np.concatenate(predictions, axis=0)
# Turn the regression outputs into the two possible outcomes:
# above 0.5 -> 1 (survived), 0.5 or below -> 0 (did not survive).
is_survivor = predictions > .5
is_casualty = predictions <= .5
predictions[is_survivor] = 1
predictions[is_casualty] = 0

In [127]:
# Accuracy = number of predictions equal to the true label / number of rows.
# float() keeps this a true division even where int / int would truncate.
is_correct = predictions == titanic['Survived']
accuracy = is_correct.sum() / float(len(titanic))
print(accuracy)

0.783389450056
