In [3]:
#import the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import collections
from biokit.viz import corrplot
from sklearn import preprocessing
from sklearn import cross_validation
from sklearn import linear_model
from sklearn import ensemble
from sklearn import svm
from sklearn import tree
from sklearn import neighbors 
from sklearn import naive_bayes
from sklearn import metrics
from sklearn.feature_selection import SelectKBest
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from time import clock
import fancyimpute
from statsmodels.graphics.mosaicplot import mosaic
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

## Data

In [10]:
# Load the data. train.csv / test.csv are read first, but only len(train) is
# used afterwards: both names are rebound to consecutive row slices of the
# pre-processed table (first len(train) rows, then the remainder).
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")
data = pd.read_csv("data/processed_data.csv")
train, test = data.iloc[:len(train)], data.iloc[len(train):]

## Utils

In [120]:
def report(clf, X, Y):
    """
    Print cross-validated classification metrics for [clf].

    Out-of-fold predictions are obtained with 10-fold cross_val_predict on
    (X, Y); accuracy, recall, precision and F1 are then computed against Y.
    The time spent inside cross_val_predict (as measured by clock()) is
    printed last. Nothing is returned.
    """
    started_at = clock()
    cv_predictions = cross_validation.cross_val_predict(clf, X, Y, cv=10)
    elapsed = clock() - started_at

    # (label, scoring function) pairs, printed in this order.
    scorers = (
        ("Accuracy: ", metrics.accuracy_score),
        ("Recall: ", metrics.recall_score),
        ("Precision: ", metrics.precision_score),
        ("F1: ", metrics.f1_score),
    )
    for label, scorer in scorers:
        print(label, scorer(Y, cv_predictions))
    print("Time elapsed: ", elapsed)

def train_test_model(model, hyperparameters, X_train, X_test, y_train, y_test,
                    folds = 10, score='accuracy'):
    """
    Given a [model] and a set of possible [hyperparameters], along with
    matrices corresponding to hold-out cross-validation, returns a model w/
    optimized hyperparameters, and prints out model evaluation metrics.

    Parameters:
        model           -- unfitted estimator handed to GridSearchCV.
        hyperparameters -- parameter grid handed to GridSearchCV.
        X_train/y_train -- data the grid search is fitted on.
        X_test/y_test   -- hold-out data used for the first printed score.
        folds           -- number of CV folds for both the grid search and
                           the final k-fold estimate.
        score           -- scoring passed to GridSearchCV and to the final
                           k-fold evaluation, so every printed number uses
                           the same metric.

    Returns the fitted GridSearchCV object (use .best_estimator_ to get the
    tuned model).

    Note: the k-fold estimate is computed on train + hold-out rows stacked
    together, i.e. it includes rows the hyperparameters were tuned on, so it
    is an optimistic figure rather than an independent one.
    """
    optimized_model = GridSearchCV(model, hyperparameters, cv = folds, n_jobs = -1, scoring=score)
    optimized_model.fit(X_train, y_train)

    # Label the printed scores with the metric actually in use; with the
    # default score='accuracy' the output text is unchanged.
    metric_name = score if isinstance(score, str) else getattr(score, '__name__', 'score')

    print ('Optimized parameters:', optimized_model.best_params_)
    print ('Model {0} (hold-out):'.format(metric_name), optimized_model.score(X_test, y_test))
    # scoring=score keeps this estimate consistent with the grid search;
    # without it the estimator's default scorer would be used regardless of
    # the metric the caller asked for.
    kfold_score = np.mean(cross_validation.cross_val_score(
            optimized_model.best_estimator_, np.append(X_train, X_test, axis = 0),
            np.append(y_train, y_test), cv = folds, n_jobs = -1, scoring = score))
    print ('Model {0} ({1}-fold):'.format(metric_name, folds), kfold_score, '\n')
    return optimized_model

def make_submission_file(filename, predictions):
    """
    Write [predictions] to [filename] as a two-column CSV without an index.

    The 'Survived' column holds each prediction converted with int(); the
    'PassengerId' column is the row index of the notebook-level `test` frame
    plus one, so `predictions` is expected to line up with `test` row by row.
    """
    submission = pd.DataFrame()
    submission['Survived'] = list(map(int, predictions))
    # Ids come from the global `test` frame's index labels, shifted by one.
    submission['PassengerId'] = np.array(test.index) + 1
    submission.to_csv(filename, index=False)

## Men

In [121]:
# Training rows with Sex == 0, split into a feature matrix and a label vector.
male_train = train[train.Sex==0]
X_male = male_train.drop("Survived", axis=1).values  # .values replaces the deprecated as_matrix()
Y_male = np.array(male_train.Survived)
# 80/20 hold-out split. The seed (same value the estimators in this notebook
# use) makes the split, and every score derived from it, reproducible.
X_train_men, X_test_men, y_train_men, y_test_men = cross_validation.train_test_split(
    X_male, Y_male, test_size = 0.2, random_state = 25)

In [122]:
# Logistic Regression: grid-search C and the class weighting, keep the best
# estimator found, then print its cross-validated metrics on all male rows.
lr_men = train_test_model(
    linear_model.LogisticRegression(),
    dict(
        C=[0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
        class_weight=[None, 'balanced', {0: 1, 1: 2}, {0: 1, 1: 10},
                      {0: 1, 1: 100}, {0: 1, 1: 1000}],
    ),
    X_train_men, X_test_men, y_train_men, y_test_men,
).best_estimator_
report(lr_men, X_male, Y_male)

Optimized parameters: {'class_weight': None, 'C': 10}
Model accuracy (hold-out): 0.818965517241
Model accuracy (10-fold): 0.83713810388 

Accuracy:  0.830155979203
Recall:  0.275229357798
Precision:  0.612244897959
F1:  0.379746835443
Time elapsed:  0.12097400000004654


In [123]:
# K Nearest Neighbors: try every odd neighbourhood size from 1 to 19, keep the
# best estimator found, then print its cross-validated metrics.
knn_men = train_test_model(
    neighbors.KNeighborsClassifier(),
    dict(n_neighbors=np.arange(1, 20, 2)),
    X_train_men, X_test_men, y_train_men, y_test_men,
).best_estimator_
report(knn_men, X_male, Y_male)

Optimized parameters: {'n_neighbors': 5}
Model accuracy (hold-out): 0.818965517241
Model accuracy (10-fold): 0.83011731916 

Accuracy:  0.82842287695
Recall:  0.284403669725
Precision:  0.596153846154
F1:  0.385093167702
Time elapsed:  0.7843369999999368


In [None]:
# SVM: grid-search C and gamma (13 log-spaced values from 1e-9 to 1e3).
# probability=True is set on the estimator; the soft-voting ensemble further
# down uses this model.
svm_men = train_test_model(
    svm.SVC(probability=True, random_state=25),
    dict(
        C=[0.001, 0.01, 0.1, 1, 10, 100, 1000],
        gamma=np.logspace(-9, 3, 13),
    ),
    X_train_men, X_test_men, y_train_men, y_test_men,
).best_estimator_
report(svm_men, X_male, Y_male)

Optimized parameters: {'gamma': 0.01, 'C': 10}
Model accuracy (hold-out): 0.818965517241
Model accuracy (10-fold): 0.852714761041 

Accuracy:  0.852686308492
Recall:  0.220183486239
Precision:  1.0
F1:  0.360902255639
Time elapsed:  10.243816000000038


In [None]:
# Random Forest
# warm_start is intentionally not part of the grid: the grid search fits a
# fresh clone of the estimator for every candidate, so the flag cannot change
# the fitted forest and only doubled the number of (expensive) fits.
rf_men = train_test_model(
    ensemble.RandomForestClassifier(random_state = 25), {
    "n_estimators":[50, 400, 800],
    "max_depth":[None, 5, 10, 20],
    # NOTE(review): an integer max_features must not exceed the number of
    # columns in X_male -- confirm the processed data has >= 500 features.
    "max_features":[None, 100, 300, 500]
}, X_train_men, X_test_men, y_train_men, y_test_men)
rf_men = rf_men.best_estimator_

report(rf_men, X_male, Y_male)

In [None]:
# Soft-voting ensemble of the tuned random forest and SVM for the male rows;
# the SVM's predicted probabilities count twice as much as the forest's.
# (Fixes the unterminated 'soft string literal, which was a SyntaxError.)
combined = ensemble.VotingClassifier([('2', rf_men), ('4', svm_men)],
                                     weights=[1,2],voting='soft')
report(combined, X_male, Y_male)

## Women

In [None]:
# Training rows with Sex == 1, split into a feature matrix and a label vector.
female_train = train[train.Sex==1]
X_female = female_train.drop("Survived", axis=1).values  # .values replaces the deprecated as_matrix()
Y_female = np.array(female_train.Survived)
# 80/20 hold-out split. The seed (same value the estimators in this notebook
# use) makes the split, and every score derived from it, reproducible.
X_train_women, X_test_women, y_train_women, y_test_women = cross_validation.train_test_split(
    X_female, Y_female, test_size = 0.2, random_state = 25)

In [None]:
# Logistic Regression: grid-search C and the class weighting, keep the best
# estimator found, then print its cross-validated metrics on all female rows.
lr_women = train_test_model(
    linear_model.LogisticRegression(),
    dict(
        C=[0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
        class_weight=[None, 'balanced', {0: 1, 1: 2}, {0: 1, 1: 10},
                      {0: 1, 1: 100}, {0: 1, 1: 1000}],
    ),
    X_train_women, X_test_women, y_train_women, y_test_women,
).best_estimator_
report(lr_women, X_female, Y_female)

In [None]:
# K Nearest Neighbors: try every odd neighbourhood size from 1 to 19, keep the
# best estimator found, then print its cross-validated metrics.
knn_women = train_test_model(
    neighbors.KNeighborsClassifier(),
    dict(n_neighbors=np.arange(1, 20, 2)),
    X_train_women, X_test_women, y_train_women, y_test_women,
).best_estimator_
report(knn_women, X_female, Y_female)

In [None]:
# SVM: grid-search C and gamma (13 log-spaced values from 1e-9 to 1e3).
# probability=True is set on the estimator; the soft-voting ensemble further
# down uses this model.
svm_women = train_test_model(
    svm.SVC(probability=True, random_state=25),
    dict(
        C=[0.001, 0.01, 0.1, 1, 10, 100, 1000],
        gamma=np.logspace(-9, 3, 13),
    ),
    X_train_women, X_test_women, y_train_women, y_test_women,
).best_estimator_
report(svm_women, X_female, Y_female)

In [None]:
# Random Forest
# warm_start is intentionally not part of the grid: the grid search fits a
# fresh clone of the estimator for every candidate, so the flag cannot change
# the fitted forest and only doubled the number of (expensive) fits.
rf_women = train_test_model(
    ensemble.RandomForestClassifier(random_state = 25), {
    "n_estimators":[50, 400, 800],
    "max_depth":[None, 5, 10, 20],
    # NOTE(review): an integer max_features must not exceed the number of
    # columns in X_female -- confirm the processed data has >= 500 features.
    "max_features":[None, 100, 300, 500]
}, X_train_women, X_test_women, y_train_women, y_test_women)
rf_women = rf_women.best_estimator_

report(rf_women, X_female, Y_female)

In [None]:
# Soft-voting ensemble of the tuned logistic regression and SVM for the
# female rows, weighted 3:2 in favour of the logistic regression.
combined_w = ensemble.VotingClassifier(
    estimators=[('2', lr_women), ('4', svm_women)],
    voting='soft',
    weights=[3, 2],
)
report(combined_w, X_female, Y_female)

## Submission

In [None]:
# Sex value of every submission row (used later to pick which ensemble
# predicts it) and the matching feature matrix without the label column.
sex = np.array(test["Sex"])
X_test = test.drop("Survived", axis=1).values

In [None]:
# Fit each voting ensemble on all training rows of its own group (not just the
# 80% split used during tuning) before predicting the submission rows.
combined.fit(X_male, Y_male)
combined_w.fit(X_female, Y_female)

In [102]:
# 80.03 % on leaderboard
# Route every submission row to the ensemble trained for its Sex value.
# Rows are predicted in two batches (2-D input) instead of one 1-D sample at a
# time: predict() expects a 2-D array, and batching avoids one call per row.
male_rows = sex == 0
preds = np.zeros(len(sex), dtype=int)
# Skip an empty group so predict() is never called on zero rows.
if male_rows.any():
    preds[male_rows] = combined.predict(X_test[male_rows])
if (~male_rows).any():
    preds[~male_rows] = combined_w.predict(X_test[~male_rows])
# Keep `preds` a plain list, in the original row order, for the submission step.
preds = preds.tolist()

In [103]:
# Write the per-row predictions to the submission CSV.
filename = "predictions/women_men.csv"
make_submission_file(filename=filename, predictions=preds)