<b>Import libraries, read in data, and inspect.</b>

In [87]:
import pandas as pd
import numpy as np
# Load the training and test splits from CSV files in the working directory.
titanic_train = pd.read_csv("train.csv")
titanic_test = pd.read_csv("test.csv")

# Show the first five training rows, then the summary statistics produced by
# describe(), each starting on its own line.
print(titanic_train.head(5), titanic_train.describe(), sep="\n")

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
  

<br>
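<b>Fixing up the data:</b>

<b>First, fill in every missing age value with the median age (the test set's missing fare is filled with the median fare as well).</b>

<b>Second, encode the gender values as integers.</b>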

In [88]:
# Replace missing ages in the training set with that set's median age.
train_age_median = titanic_train["Age"].median()
titanic_train["Age"] = titanic_train["Age"].fillna(train_age_median)

# The test set is imputed with its own medians; "Fare" is only filled for the
# test set.
for column in ("Age", "Fare"):
    column_median = titanic_test[column].median()
    titanic_test[column] = titanic_test[column].fillna(column_median)

# Encode "Sex" in both frames: "male" becomes 0 and "female" becomes 1.
for frame in (titanic_train, titanic_test):
    for label, code in (("male", 0), ("female", 1)):
        frame.loc[frame["Sex"] == label, "Sex"] = code

Check the values of other non-numeric features; encode them accordingly.

In [59]:
# List the distinct values stored in "Embarked" so they can be encoded next.
embarked_values = titanic_train["Embarked"].unique()
print(embarked_values)

['S' 'C' 'Q' nan]


In [89]:
# Integer code assigned to each embarkation port letter.
EMBARKED_CODES = {"S": 0, "C": 1, "Q": 2}

for frame in (titanic_train, titanic_test):
    # Missing ports are real NaN values, not the string "nan", so an equality
    # test against "nan" never matches; select them with isna() and default
    # them to "S" before encoding.
    frame.loc[frame["Embarked"].isna(), "Embarked"] = "S"

    # Replace each port letter with its integer code. Rows that already hold
    # a code are untouched, so re-running this cell is harmless.
    for port, code in EMBARKED_CODES.items():
        frame.loc[frame["Embarked"] == port, "Embarked"] = code

In [102]:
# Keep only the columns used as model inputs; every other column is left out
# of the feature matrix.
features = ["Pclass", "Sex", "Age", "SibSp", "Fare"]

# Target vector: the "Survived" column of the training frame.
titanic_train_labels = titanic_train["Survived"]

# Feature matrix: the selected columns, in the order listed above.
titanic_train_features = titanic_train.loc[:, features]


<br>
<b>Create a k-fold cross-validator, then begin training and testing.</b>

In [113]:
#from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.model_selection import KFold, GridSearchCV

# Hyperparameter grid searched for the decision tree on every outer fold.
parameters = {'min_samples_split':[2,6,7,8], 'max_depth':[3,6,7,8], 'min_samples_leaf':[1,2,3]}
scorer = make_scorer(accuracy_score)
# `scoring` must be given by keyword: current scikit-learn releases accept
# only the estimator and the parameter grid positionally.
grid_obj = GridSearchCV(DecisionTreeClassifier(random_state=32), parameters, scoring=scorer)

# Unshuffled folds are contiguous, in-order slices of the rows, which is what
# lets the per-fold predictions be concatenated and compared against the
# "Survived" column below. `random_state` is omitted because it has no effect
# without shuffling and current scikit-learn rejects that combination.
kfold = KFold(n_splits=3)

predictions = []

for train, test in kfold.split(titanic_train_features):
    train_features = titanic_train_features.iloc[train, :]
    train_target = titanic_train_labels.iloc[train]

    # With the default refit=True, GridSearchCV refits the best parameter
    # combination on everything passed to fit(), so best_estimator_ is
    # already trained on this fold's training rows.
    grid_fit = grid_obj.fit(train_features, train_target)
    best_clf = grid_fit.best_estimator_
    test_predictions = best_clf.predict(titanic_train_features.iloc[test, :])
    predictions.append(test_predictions)

# Out-of-fold accuracy: every row is predicted by a model that did not see it.
predictions = np.concatenate(predictions, axis=0)
score = accuracy_score(titanic_train["Survived"], predictions)
print(score)

# NOTE(review): best_clf is the estimator selected on the LAST fold only; its
# hyperparameters are reused here and the model is refit on all training rows.
# The accuracy printed next is measured on those same rows (training accuracy).
best_clf.fit(titanic_train_features, titanic_train_labels)
predictions = best_clf.predict(titanic_train_features)
score = accuracy_score(titanic_train["Survived"], predictions)
print(score)



0.813692480359
0.827160493827


In [130]:
# Predict survival for the test passengers from the same feature columns the
# model was trained on.
titanic_test_features = titanic_test[features]
test_pred = best_clf.predict(titanic_test_features)

# Pair each passenger id with its predicted label.
d = {'PassengerId': titanic_test["PassengerId"], 'Survived': test_pred}
df = pd.DataFrame(data=d)

# index=False keeps the file to exactly the two named columns; without it the
# DataFrame's row index is written as an extra, unnamed first column.
df.to_csv("predictions.csv", index=False)