In [24]:
import pandas as pd
import numpy as np
import matplotlib as plt

#Split values based on the two CSVs
training_csv = pd.read_csv("http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv")
testing_csv = pd.read_csv("http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv")

data = training_csv
target = data['Survived'] 

%matplotlib inline

In [25]:
# Show percentage survivors by sex and by class
pvt_train = pd.pivot_table(data=training, index=['Survived'], columns=['Sex', 'Embarked'], values =['PassengerId'], aggfunc='count')
pvt_train.apply(lambda x: x / (x.max() + x.min()))

Unnamed: 0_level_0,PassengerId,PassengerId,PassengerId,PassengerId,PassengerId,PassengerId
Sex,female,female,female,male,male,male
Embarked,C,Q,S,C,Q,S
Survived,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3
0,0.123288,0.25,0.310345,0.694737,0.926829,0.825397
1,0.876712,0.75,0.689655,0.305263,0.073171,0.174603


In [26]:
# Split values into numerical and categorical features
numerical_features = data[['Fare','Pclass','Age']]
# Need to handle values where Age is missing

In [41]:
features = pd.concat([data[['Fare', 'Pclass', 'Age']],
                           pd.get_dummies(data['Sex'], prefix='Sex'),
                           pd.get_dummies(data['Embarked'], prefix='Embarked')],
                          axis=1)
features.head(5)

Unnamed: 0,Fare,Pclass,Age,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,7.25,3,22,0,1,0,0,1
1,71.2833,1,38,1,0,1,0,0
2,7.925,3,26,1,0,0,0,1
3,53.1,1,35,1,0,0,0,1
4,8.05,3,35,0,1,0,0,1


In [42]:
# Because sex can only be male or female, we can drop the Sex_male column
features = features.drop('Sex_male', 1)
print '\n', features.head(5), '\n\n------------------------------\n'
# Looking at the feature values we see that there are some ages missing
print features.count(), '\n'

      Fare  Pclass  Age  Sex_female  Embarked_C  Embarked_Q  Embarked_S
0   7.2500       3   22           0           0           0           1
1  71.2833       1   38           1           1           0           0
2   7.9250       3   26           1           0           0           1
3  53.1000       1   35           1           0           0           1
4   8.0500       3   35           0           0           0           1 

------------------------------

Fare          891
Pclass        891
Age           714
Sex_female    891
Embarked_C    891
Embarked_Q    891
Embarked_S    891
dtype: int64


In [43]:
# features.dropna().median()

In [91]:
# We will try to loop through the different classes and sex to apply the median age
for sex in xrange(0,2):
    for cabin_class in xrange(0,4):
        median_age_group = features[(features.Pclass == cabin_class) & (features.Sex_female == sex)].dropna().median()
        features[(features.Pclass == cabin_class) 
                 & (features.Sex_female == sex)] = features[(features.Pclass == cabin_class) 
                                                                     & (features.Sex_female == sex)
                                                                    ].fillna(median_age_group)
# adding validation for if the person is a child
features['child'] = (features.Age <= 18)*1

In [92]:
features.head()

Unnamed: 0,Fare,Pclass,Age,Sex_female,Embarked_C,Embarked_Q,Embarked_S,child
0,7.25,3,22,0,0,0,1,0
1,71.2833,1,38,1,1,0,0,0
2,7.925,3,26,1,0,0,1,0
3,53.1,1,35,1,0,0,1,0
4,8.05,3,35,0,0,0,1,0


In [93]:
from sklearn.grid_search import GridSearchCV
from sklearn.naive_bayes import BernoulliNB

mnb = BernoulliNB()

params = {
    'alpha': [0.005, 0.0051, 0.00059],
}
mnbgs = GridSearchCV(mnb, params, cv=5, scoring='roc_auc')
mnbgs.fit(features, target)

print mnbgs.best_score_
print mnbgs.best_params_

0.799451201536
{'alpha': 0.005}


In [94]:
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier

rf = RandomForestClassifier(n_estimators=100)

params = {
    'max_features': [0.75, 0.8, 0.9],
    'max_depth': [3.75, 3.9, 4, 4.1],
}
rgs = GridSearchCV(rf, params, cv=5, scoring='roc_auc')
rgs.fit(features, target)
print rgs.best_score_
print rgs.best_params_

0.868623852281
{'max_features': 0.9, 'max_depth': 4}


In [79]:
print rgs.best_score_
print rgs.best_params_

0.866916607304
{'max_features': 0.75, 'max_depth': 4.1}


In [96]:
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier

gb = GradientBoostingClassifier(n_estimators=100, subsample=.8)

params = {
    'learning_rate': [0.05, 0.1, 0.2, 0.5],
    'max_features': [0.7, 0.75, 0.8],
    'max_depth': [ 3.5, 3.6, 3.75, 3.9, 4],
}
gs = GridSearchCV(gb, params, cv=5, scoring='roc_auc')
gs.fit(features, target)
print gs.best_score_
print gs.best_params_

0.877983240881
{'max_features': 0.8, 'learning_rate': 0.2, 'max_depth': 3.75}


In [61]:
print gs.best_score_
print gs.best_params_

0.878693523426
{'max_features': 0.75, 'learning_rate': 0.1, 'max_depth': 3.5}


In [24]:
survived_prediction = gs.predict(rich_features_final)

In [40]:
submdf = pd.concat( [testing['PassengerId'], pd.DataFrame(survived_prediction)], axis=1)
submdf.columns = ["PassengerId", "Survived"]
submdf.to_csv("Titanic_Graident_Boosted_GridSearch_20150616.csv", index=False)

In [203]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report

# [['Pclass','SibSp','Parch','child','Survived','sex_bool']]
data = pd.read_csv("http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv")
data['embarked_val'] = data['Embarked'].dropna().map({'S': 1, 'C': 2, 'Q': 3}).astype(int)
data['child'] = (data.Age.dropna() <= 18)*1
data['sex_bool'] = (data.Sex.dropna() == 'male')*1
cleaned_data = data[['Pclass','child','Survived','sex_bool','embarked_val']].dropna() 
X = cleaned_data[['Pclass','child','sex_bool','embarked_val']]
y = cleaned_data[['Survived']]

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [147]:
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
print 'Decision Tree Classifier'
print classification_report(y_test, predictions)

Decision Tree Classifier
             precision    recall  f1-score   support

          0       0.76      0.94      0.84       103
          1       0.88      0.60      0.71        75

avg / total       0.81      0.80      0.79       178



In [148]:
clf = RandomForestClassifier(n_estimators=10)
clf.fit(X_train, y_train.ravel())
predictions = clf.predict(X_test)
print 'RandomForest Classifier'
print classification_report(y_test, predictions)

RandomForest Classifier
             precision    recall  f1-score   support

          0       0.76      0.94      0.84       103
          1       0.88      0.59      0.70        75

avg / total       0.81      0.79      0.78       178



In [149]:
clf = GradientBoostingClassifier()
clf.fit(X_train, y_train.ravel())
predictions = clf.predict(X_test)
print 'Gradient Boosting Classifier'
print classification_report(y_test, predictions)

Gradient Boosting Classifier
             precision    recall  f1-score   support

          0       0.76      0.94      0.84       103
          1       0.88      0.60      0.71        75

avg / total       0.81      0.80      0.79       178



In [150]:
clf = ExtraTreesClassifier()
clf.fit(X_train, y_train.ravel())
predictions = clf.predict(X_test)
print 'Extra Trees Classifier'
print classification_report(y_test, predictions)

Extra Trees Classifier
             precision    recall  f1-score   support

          0       0.76      0.94      0.84       103
          1       0.88      0.60      0.71        75

avg / total       0.81      0.80      0.79       178



In [151]:
clf = LogisticRegression()
clf.fit(X_train, y_train.ravel())
predictions = clf.predict(X_test)
print 'Logistic Regression Classifier'
print classification_report(y_test, predictions)

Logistic Regression Classifier
             precision    recall  f1-score   support

          0       0.78      0.94      0.85       103
          1       0.89      0.63      0.73        75

avg / total       0.82      0.81      0.80       178



In [190]:
# instantiate the model (using the value K=5)
knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(X_train, y_train.ravel())
predictions = knn.predict(X_test)
print 'K Nearest Neighbor Regression Classifier'
print classification_report(y_test, predictions)

K Nearest Neighbor Regression Classifier
             precision    recall  f1-score   support

          0       0.76      0.97      0.85       103
          1       0.93      0.57      0.71        75

avg / total       0.83      0.80      0.79       178



In [202]:
clf = SVC()
clf.fit(X_train, y_train.ravel())
predictions = clf.predict(X_test)
print 'Support Vector Machines Classifier'
print classification_report(y_test, predictions)

Support Vector Machines Classifier
             precision    recall  f1-score   support

          0       0.77      0.95      0.85       103
          1       0.90      0.61      0.73        75

avg / total       0.83      0.81      0.80       178



In [204]:
clf = LinearSVC()
clf.fit(X_train, y_train.ravel())
predictions = clf.predict(X_test)
print 'Support Vector Machines Classifier'
print classification_report(y_test, predictions)

Support Vector Machines Classifier
             precision    recall  f1-score   support

          0       0.83      0.87      0.85       106
          1       0.79      0.74      0.76        72

avg / total       0.81      0.81      0.81       178



In [211]:
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
bagging = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5, max_features=0.5)
predictions = clf.predict(X_test)
print classification_report(y_test, predictions)

             precision    recall  f1-score   support

          0       0.83      0.87      0.85       106
          1       0.79      0.74      0.76        72

avg / total       0.81      0.81      0.81       178



In [185]:
def get_classification(classification_report_string):
    str_cl = [x.split() for x in classification_report(y_test, predictions).splitlines()][-1][3:-1]
    cl_rep = np.array([float(x) for x in str_cl]).mean()
    return cl_rep

In [213]:
max_val = {'m': [0, 0]}
for x in range(1,300):
    knn = KNeighborsClassifier(n_neighbors=x)
    knn.fit(X_train, y_train.ravel())
    predictions = knn.predict(X_test)
    t = get_classification(classification_report(y_test, predictions))
    if t > max_val['m'][1]:
        max_val['m'] = [x, t]
max_val

{'m': [188, 0.78333333333333333]}