In [156]:
#import the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import collections
from biokit.viz import corrplot
from sklearn import preprocessing
from sklearn import cross_validation
from sklearn import linear_model
from sklearn import ensemble
from sklearn import svm
from sklearn import tree
from sklearn import neighbors 
from sklearn import naive_bayes
from sklearn import metrics
from sklearn.feature_selection import SelectKBest
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from time import clock
import fancyimpute
from statsmodels.graphics.mosaicplot import mosaic
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

## Data

In [157]:
#import our data
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")
data = pd.read_csv("data/processed_data.csv")
train = data[:len(train)]
test = data[len(train):]
X, Y = train.drop("Survived", axis=1).as_matrix(), train.Survived

## Utils

In [238]:
def report(clf, X, Y):
    start=clock()
    predicted = cross_validation.cross_val_predict(clf, X, Y, cv=10)
    end = clock()
    print("Accuracy: ", metrics.accuracy_score(Y, predicted))
    print("Recall: ", metrics.recall_score(Y, predicted))
    print("Precision: ", metrics.precision_score(Y, predicted))
    print("F1: ", metrics.f1_score(Y, predicted))
    print("Time elapsed: ", end-start)
    return predicted

def train_test_model(model, hyperparameters, X_train, X_test, y_train, y_test,
                    folds = 10, score='accuracy'):
    """
    Given a [model] and a set of possible [hyperparameters], along with 
    matricies corresponding to hold-out cross-validation, returns a model w/ 
    optimized hyperparameters, and prints out model evaluation metrics.
    """
    optimized_model = GridSearchCV(model, hyperparameters, cv = folds, n_jobs = -1, scoring=score)
    optimized_model.fit(X_train, y_train)
    predicted = optimized_model.predict(X_test)
    print ('Optimized parameters:', optimized_model.best_params_)
    print ('Model accuracy (hold-out):', optimized_model.score(X_test, y_test))
    kfold_score = np.mean(cross_validation.cross_val_score(
            optimized_model.best_estimator_, np.append(X_train, X_test, axis = 0), 
            np.append(y_train, y_test), cv = folds, n_jobs = -1))
    print ('Model accuracy ({0}-fold):'.format(str(folds)), kfold_score, '\n')
    return optimized_model

def make_submission_file(filename, predictions):
    results = pd.DataFrame()
    results['Survived'] = [int(i) for i in predictions]
    results['PassengerId'] = np.array(test.axes[0])+1
    results.to_csv(filename,index=False)

## Men

In [159]:
male_train = train[train.Sex==0]
X_male, Y_male = male_train.drop("Survived", axis=1).as_matrix(), np.array(male_train.Survived)
X_train_men, X_test_men, y_train_men, y_test_men = cross_validation.train_test_split(X_male, Y_male, test_size = 0.2)

In [160]:
#Logistic Regression
lr_men = train_test_model(
    linear_model.LogisticRegression(), {
        'C': [0.0001,0.001, 0.01, 0.1, 1, 10, 100, 1000],
        'class_weight': [None, 'balanced', {0:1, 1:2}, {0:1, 1:10}, {0:1, 1:100}, {0:1, 1:1000}]}, 
    X_train_men, X_test_men, y_train_men, y_test_men)
lr_men = lr_men.best_estimator_
report(lr_men, X_male, Y_male)

Optimized parameters: {'class_weight': None, 'C': 10}
Model accuracy (hold-out): 0.887931034483
Model accuracy (10-fold): 0.830240471869 

Accuracy:  0.830155979203
Recall:  0.275229357798
Precision:  0.612244897959
F1:  0.379746835443
Time elapsed:  0.26143500000011954


In [161]:
# K Nearest Neighbors
knn_men = train_test_model(
    neighbors.KNeighborsClassifier(), {
        'n_neighbors': np.array([num + 1 for num in range(0, 20) if num % 2 == 0])}, 
    X_train_men, X_test_men, y_train_men, y_test_men)
knn_men = knn_men.best_estimator_
report(knn_men, X_male, Y_male)

Optimized parameters: {'n_neighbors': 9}
Model accuracy (hold-out): 0.870689655172
Model accuracy (10-fold): 0.83362609109 

Accuracy:  0.82842287695
Recall:  0.256880733945
Precision:  0.608695652174
F1:  0.361290322581
Time elapsed:  0.7719740000000002


In [162]:
# SVM
svm_men = train_test_model(
    svm.SVC(probability = True, random_state = 25), {
        'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
        'gamma': np.logspace(-9, 3, 13)}, 
    X_train_men, X_test_men, y_train_men, y_test_men)
svm_men = svm_men.best_estimator_
report(svm_men, X_male, Y_male)

Optimized parameters: {'gamma': 0.01, 'C': 10}
Model accuracy (hold-out): 0.922413793103
Model accuracy (10-fold): 0.849328061533 

Accuracy:  0.852686308492
Recall:  0.220183486239
Precision:  1.0
F1:  0.360902255639
Time elapsed:  12.39700599999992


In [163]:
# Random Forest
rf_men = train_test_model(
    ensemble.RandomForestClassifier(random_state = 25), {
    "warm_start":[True, False],
    "n_estimators":[50, 400, 800],
    "max_depth":[None, 5, 10, 20],
    "max_features":[None, 100, 300, 500]
}, X_train_men, X_test_men, y_train_men, y_test_men)
rf_men = rf_men.best_estimator_

report(rf_men, X_male, Y_male)

Optimized parameters: {'warm_start': True, 'max_depth': None, 'max_features': 100, 'n_estimators': 400}
Model accuracy (hold-out): 0.913793103448
Model accuracy (10-fold): 0.833779491833 

Accuracy:  0.847487001733
Recall:  0.238532110092
Precision:  0.838709677419
F1:  0.371428571429
Time elapsed:  11.885559999999941


In [235]:
ridge_men = linear_model.RidgeClassifier(alpha=10)
report(ridge_men, X_male, Y_male)

Accuracy:  0.847487001733
Recall:  0.229357798165
Precision:  0.862068965517
F1:  0.36231884058
Time elapsed:  0.540903999999955


## Women

In [165]:
female_train = train[train.Sex==1]
X_female, Y_female = female_train.drop("Survived", axis=1).as_matrix(), np.array(female_train.Survived)
X_train_women, X_test_women, y_train_women, y_test_women = cross_validation.train_test_split(X_female, 
                                                                                             Y_female, test_size = 0.2)

In [166]:
#Logistic Regression
lr_women = train_test_model(
    linear_model.LogisticRegression(), {
        'C': [0.0001,0.001, 0.01, 0.1, 1, 10, 100, 1000],
        'class_weight': [None, 'balanced', {0:1, 1:2}, {0:1, 1:10}, {0:1, 1:100}, {0:1, 1:1000}]}, 
    X_train_women, X_test_women, y_train_women, y_test_women)
lr_women = lr_women.best_estimator_
report(lr_women, X_female, Y_female)

Optimized parameters: {'class_weight': {0: 1, 1: 10}, 'C': 10}
Model accuracy (hold-out): 0.809523809524
Model accuracy (10-fold): 0.856717375367 

Accuracy:  0.856687898089
Recall:  0.987124463519
Precision:  0.845588235294
F1:  0.910891089109
Time elapsed:  0.1478349999999864


In [167]:
# K Nearest Neighbors
knn_women = train_test_model(
    neighbors.KNeighborsClassifier(), {
        'n_neighbors': np.array([num + 1 for num in range(0, 20) if num % 2 == 0])}, 
    X_train_women, X_test_women, y_train_women, y_test_women)
knn_women = knn_women.best_estimator_
report(knn_women, X_female, Y_female)

Optimized parameters: {'n_neighbors': 11}
Model accuracy (hold-out): 0.761904761905
Model accuracy (10-fold): 0.802880620723 

Accuracy:  0.796178343949
Recall:  0.909871244635
Precision:  0.83137254902
F1:  0.868852459016
Time elapsed:  0.19590100000004895


In [168]:
# SVM
svm_women = train_test_model(
    svm.SVC(probability = True, random_state = 25), {
        'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
        'gamma': np.logspace(-9, 3, 13)}, 
    X_train_women, X_test_women, y_train_women, y_test_women)
svm_women = svm_women.best_estimator_
report(svm_women, X_female, Y_female)

Optimized parameters: {'gamma': 0.10000000000000001, 'C': 10}
Model accuracy (hold-out): 0.777777777778
Model accuracy (10-fold): 0.834243646139 

Accuracy:  0.837579617834
Recall:  0.905579399142
Precision:  0.879166666667
F1:  0.892177589852
Time elapsed:  3.7402669999999034


In [169]:
# Random Forest
rf_women = train_test_model(
    ensemble.RandomForestClassifier(random_state = 25), {
    "warm_start":[True, False],
    "n_estimators":[50, 400, 800],
    "max_depth":[None, 5, 10, 20],
    "max_features":[None, 100, 300, 500]
}, X_train_women, X_test_women, y_train_women, y_test_women)
rf_women = rf_women.best_estimator_

report(rf_women, X_female, Y_female)

Optimized parameters: {'warm_start': True, 'max_depth': None, 'max_features': 100, 'n_estimators': 50}
Model accuracy (hold-out): 0.761904761905
Model accuracy (10-fold): 0.828289956012 

Accuracy:  0.815286624204
Recall:  0.914163090129
Precision:  0.848605577689
F1:  0.880165289256
Time elapsed:  0.9390450000000783


In [170]:
combined_w = ensemble.VotingClassifier([('2', lr_women), ('4', svm_women)],voting='soft')
report(combined_w, X_female, Y_female)

Accuracy:  0.859872611465
Recall:  0.974248927039
Precision:  0.856603773585
F1:  0.911646586345
Time elapsed:  6.430282999999918


In [None]:
# if you combine svm_men and combined_w you get 80.8 on the leaderboard

## Submission

In [207]:
X_test = test.drop("Survived", axis=1).as_matrix()
sex = np.array(test.Sex)

In [17]:
svm_men.fit(X_male, Y_male)
combined_w.fit(X_female, Y_female)

VotingClassifier(estimators=[('2', LogisticRegression(C=1000, class_weight={0: 1, 1: 2}, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)), ('4', ...bf',
  max_iter=-1, probability=True, random_state=25, shrinking=True,
  tol=0.001, verbose=False))],
         voting='soft', weights=[3, 2])

In [22]:
preds = []
for i in range(len(sex)):
    if sex[i]==0:
        preds.append(svm_men.predict(X_test[i]))
    else:
        preds.append(combined_w.predict(X_test[i]))

In [222]:
combo.fit(X, Y)
preds = combo.predict(X_test)

In [223]:
#make submission
filename = "predictions/new_combo.csv"
make_submission_file(filename, preds)