In [51]:
import pandas
# Load the training passengers and turn the predictors used later into numbers.
titanic = pandas.read_csv('data_sets/train.csv')

# Missing ages are imputed with the median age of the training set.
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].median())

# Encode sex as male -> 0, female -> 1.
titanic.loc[titanic['Sex']=='male', 'Sex'] = 0
titanic.loc[titanic['Sex']=='female', 'Sex'] = 1
# The .loc writes above leave the column with object dtype. Cast it so the
# column is genuinely numeric and any label that was not encoded raises here
# instead of reaching the model as a string.
titanic['Sex'] = titanic['Sex'].astype(int)

# Missing embarkation ports are filled with 'S' before encoding.
titanic['Embarked'] = titanic['Embarked'].fillna('S')

# Port letter -> integer code. The same dict is reused for the test set.
embarkedDict = {'Q': 2, 'C': 1, 'S': 0}
for key in embarkedDict:
    titanic.loc[titanic['Embarked'] == key, 'Embarked'] = embarkedDict[key]
# Same reasoning as for 'Sex': make the encoded column an integer column.
titanic['Embarked'] = titanic['Embarked'].astype(int)

{'Q': 2, 'C': 1, 'S': 0}


In [52]:
from sklearn.linear_model import LogisticRegression
try:
    # scikit-learn >= 0.18 ships cross_val_score in sklearn.model_selection;
    # the legacy sklearn.cross_validation module was removed in 0.20.
    # Binding the new module to the old name keeps
    # `cross_validation.cross_val_score` working.
    from sklearn import model_selection as cross_validation
except ImportError:
    # Older scikit-learn releases only provide the legacy module.
    from sklearn import cross_validation

# Columns of the training frame that are fed to the model.
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
alg = LogisticRegression(random_state=1)
# Score the model with 3-fold cross-validation on the training data and
# report the mean of the per-fold scores.
scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=3)
print(scores.mean())

0.787878787879


# On to the Test Set! 

In [53]:
# Read the test passengers and print summary statistics of the frame.
titanic_test = pandas.read_csv(filepath_or_buffer="data_sets/test.csv")
print(titanic_test.describe())

       PassengerId      Pclass         Age       SibSp       Parch        Fare
count   418.000000  418.000000  332.000000  418.000000  418.000000  417.000000
mean   1100.500000    2.265550   30.272590    0.447368    0.392344   35.627188
std     120.810458    0.841838   14.181209    0.896760    0.981429   55.907576
min     892.000000    1.000000    0.170000    0.000000    0.000000    0.000000
25%     996.250000    1.000000   21.000000    0.000000    0.000000    7.895800
50%    1100.500000    3.000000   27.000000    0.000000    0.000000   14.454200
75%    1204.750000    3.000000   39.000000    1.000000    0.000000   31.500000
max    1309.000000    3.000000   76.000000    8.000000    9.000000  512.329200


In [54]:
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line after the report.
titanic_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 39.2+ KB
None


In [55]:
# Missing test-set ages are filled with the median age of the training frame.
titanic_test['Age'] = titanic_test['Age'].fillna(value=titanic['Age'].median())

In [56]:
# Missing test-set fares are filled with the median fare of the training frame.
titanic_test['Fare'] = titanic_test['Fare'].fillna(value=titanic['Fare'].median())

In [57]:
# Show which distinct embarkation values occur in the test set.
titanic_test.loc[:, 'Embarked'].unique()

array(['Q', 'S', 'C'], dtype=object)

In [58]:
# Encode the test set with the same codes as the training data:
# male -> 0, female -> 1.
titanic_test.loc[titanic_test['Sex']=='male', 'Sex'] = 0
titanic_test.loc[titanic_test['Sex']=='female', 'Sex'] = 1
# The .loc writes above leave the column with object dtype. Cast it so the
# column is genuinely numeric and any label that was not encoded raises here
# instead of reaching the model as a string.
titanic_test['Sex'] = titanic_test['Sex'].astype(int)

# Reuse the port -> code mapping built for the training data.
print (embarkedDict)
for key in embarkedDict:
    titanic_test.loc[titanic_test['Embarked'] == key, 'Embarked'] = embarkedDict[key]
# Same reasoning as for 'Sex': make the encoded column an integer column.
titanic_test['Embarked'] = titanic_test['Embarked'].astype(int)

{'Q': 2, 'C': 1, 'S': 0}


In [59]:
# Print the predictor columns of the prepared test frame.
print(titanic_test.loc[:, predictors])

     Pclass Sex   Age  SibSp  Parch      Fare Embarked
0         3   0  34.5      0      0    7.8292        2
1         3   1  47.0      1      0    7.0000        0
2         2   0  62.0      0      0    9.6875        2
3         3   0  27.0      0      0    8.6625        0
4         3   1  22.0      1      1   12.2875        0
5         3   0  14.0      0      0    9.2250        0
6         3   1  30.0      0      0    7.6292        2
7         2   0  26.0      1      1   29.0000        0
8         3   1  18.0      0      0    7.2292        1
9         3   0  21.0      2      0   24.1500        0
10        3   0  28.0      0      0    7.8958        0
11        1   0  46.0      0      0   26.0000        0
12        1   1  23.0      1      0   82.2667        0
13        2   0  63.0      1      0   26.0000        0
14        1   1  47.0      1      0   61.1750        0
15        2   1  24.0      1      0   27.7208        1
16        2   0  35.0      0      0   12.3500        2
17        

# Generating a Submission File

In [60]:
# Fit the model on every training row.
alg.fit(titanic[predictors], titanic["Survived"])

# Predict survival for each passenger in the test set.
predictions = alg.predict(titanic_test[predictors])

# Keep only the passenger id and the predicted label, in that column order.
submission = pandas.DataFrame(
    {"PassengerId": titanic_test["PassengerId"], "Survived": predictions},
    columns=["PassengerId", "Survived"],
)

In [62]:
# Print the submission frame (passenger id and predicted survival).
print(submission)

     PassengerId  Survived
0            892         0
1            893         0
2            894         0
3            895         0
4            896         1
5            897         0
6            898         1
7            899         0
8            900         1
9            901         0
10           902         0
11           903         0
12           904         1
13           905         0
14           906         1
15           907         1
16           908         0
17           909         0
18           910         1
19           911         1
20           912         0
21           913         0
22           914         1
23           915         1
24           916         1
25           917         0
26           918         1
27           919         0
28           920         0
29           921         0
..           ...       ...
388         1280         0
389         1281         0
390         1282         1
391         1283         1
392         1284         0
3