In [973]:
#import the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import collections
from biokit.viz import corrplot
from sklearn import preprocessing
from sklearn import cross_validation
from sklearn import linear_model
from sklearn import ensemble
from sklearn import svm
from sklearn import tree
from sklearn import neighbors 
from sklearn import naive_bayes
from sklearn import metrics
from sklearn.feature_selection import SelectKBest
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from time import clock
import fancyimpute
import xgboost as xgb
from statsmodels.graphics.mosaicplot import mosaic
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

## Data

In [7]:
# Read the three CSV files under data/ into DataFrames, in the same order as
# the names on the left-hand side.
train, test, data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/train.csv",
        "data/test.csv",
        "data/processed_data.csv",
    )
)

## Utils

In [9]:
def report(clf, X, Y):
    """
    Print 10-fold cross-validated classification metrics for [clf].

    Out-of-fold predictions for [X] are obtained with cross_val_predict
    (cv=10) and compared against [Y]. Accuracy, recall, precision and F1 are
    printed, followed by the elapsed wall-clock time in seconds.

    Recall, precision and F1 are called with scikit-learn's default settings;
    presumably [Y] holds binary labels -- confirm against callers.

    Returns None; the results are only printed.
    """
    # Imported locally so this helper does not rely on time.clock, which
    # measured CPU time on Unix but wall time on Windows, was deprecated in
    # Python 3.3 and removed in Python 3.8. perf_counter is a monotonic
    # wall-clock timer on every platform.
    from time import perf_counter

    start = perf_counter()
    # NOTE(review): sklearn.cross_validation was replaced by
    # sklearn.model_selection in later scikit-learn releases.
    predicted = cross_validation.cross_val_predict(clf, X, Y, cv=10)
    end = perf_counter()
    print("Accuracy: ", metrics.accuracy_score(Y, predicted))
    print("Recall: ", metrics.recall_score(Y, predicted))
    print("Precision: ", metrics.precision_score(Y, predicted))
    print("F1: ", metrics.f1_score(Y, predicted))
    print("Time elapsed: ", end - start)

def train_test_model(model, hyperparameters, X_train, X_test, y_train, y_test,
                    folds = 5):
    """
    Given a [model] and a set of possible [hyperparameters], along with
    matrices corresponding to hold-out cross-validation, returns a model w/
    optimized hyperparameters, and prints out model evaluation metrics.

    The grid search is fitted on ([X_train], [y_train]) with [folds]-fold
    cross-validation. Three things are printed: the selected parameters, the
    search's score on ([X_test], [y_test]), and the mean [folds]-fold
    cross-validation score of the best estimator over the train and test
    rows stacked together.

    Returns the fitted GridSearchCV object, not just the best estimator.
    """
    optimized_model = GridSearchCV(model, hyperparameters, cv = folds, n_jobs = -1)
    optimized_model.fit(X_train, y_train)
    print ('Optimized parameters:', optimized_model.best_params_)
    # score() predicts on X_test internally, so no separate predict() call is
    # needed for the hold-out figure.
    print ('Model accuracy (hold-out):', optimized_model.score(X_test, y_test))
    # NOTE: the k-fold figure below reuses rows that took part in choosing the
    # hyperparameters, so it is an optimistic estimate.
    X_all = np.append(X_train, X_test, axis = 0)
    y_all = np.append(y_train, y_test)
    kfold_score = np.mean(cross_validation.cross_val_score(
            optimized_model.best_estimator_, X_all, y_all,
            cv = folds, n_jobs = -1))
    print ('Model accuracy ({0}-fold):'.format(str(folds)), kfold_score, '\n')
    return optimized_model

def make_submission_file(filename, predictions, passenger_ids=None):
    """
    Write [predictions] to [filename] as a two-column CSV.

    The file has a 'Survived' column (each prediction cast to int) followed
    by a 'PassengerId' column, and is written without the DataFrame index.

    [passenger_ids] optionally supplies the ids explicitly, one per
    prediction. When omitted, the ids are derived from the notebook-level
    `test` DataFrame as its row index + 1, which is the original behaviour.
    NOTE(review): that default is only right if the index of `test` is
    exactly id - 1; confirm against the data, or pass the ids explicitly.
    """
    survived = [int(i) for i in predictions]
    if passenger_ids is None:
        # Legacy default: depends on the global `test` frame being in scope.
        passenger_ids = np.array(test.axes[0]) + 1

    results = pd.DataFrame()
    results['Survived'] = survived
    # np.asarray drops any pandas index on the ids, so they are assigned by
    # position rather than aligned by label.
    results['PassengerId'] = np.asarray(passenger_ids)
    results.to_csv(filename, index=False)

## Men

## Women

## Submission

In [969]:
# Fit `combined_hard` on X / Y (all three are defined in cells not shown in
# this part of the notebook).
combined_hard.fit(X, Y)

# Predict on the test frame with its 'Survived' column removed.
submission_features = test.drop(labels='Survived', axis=1)
combined_hard_preds = combined_hard.predict(submission_features)

In [971]:
# Write the predictions from `combined_hard` to the submission CSV.
filename = "predictions/test.csv"
make_submission_file(filename=filename, predictions=combined_hard_preds)