In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 5
%autosave 15

import pandas as pd
import scipy as sp
import numpy as np
import matplotlib.pyplot as plt
import math

import sklearn
from sklearn import datasets
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import SVC, SVR
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.grid_search import GridSearchCV

Autosaving every 15 seconds


In [2]:
# Read the training table and discard its "id" column in a single chained step.
train_data = pd.read_csv('Data/train.csv').drop("id", axis = 1)
def convert_target(x):
    """Return the integer encoded in ``x["target"]``.

    The label is split on underscores and the second piece (index 1) is
    converted with ``int``.  ``x`` only needs to support ``x["target"]``
    and yield a string there.
    """
    label = x["target"]
    pieces = label.split('_')
    return int(pieces[1])
# Vectorised form of the parsing done by convert_target: split every label on
# "_", keep the second piece and store it as a 64-bit integer.  Working on the
# single "target" column avoids building a Python row object for every record,
# which is what DataFrame.apply(..., axis=1) does.
train_data["target"] = train_data["target"].str.split('_').str[1].astype('int64')

In [3]:
from sklearn.metrics import log_loss, accuracy_score
import datetime, time
def test_method(method, a_train, a_test, b_train, b_test):
    print method.__class__
    method.fit(a_train, b_train)
    print log_loss(b_train, method.predict_proba(a_train))

def test_methods(methods, a_train, a_test, b_train, b_test):
    for method in methods:
        test_method(method, a_train, a_test, b_train, b_test)

def test_models(data):
    target = data.target
    train = data.drop('target', axis = 1)
    X_train, X_test, y_train, y_test = train_test_split(train, target, test_size = 0.2, random_state = 42)
    methods = [RandomForestClassifier(n_estimators = 100)]
    test_methods(methods, X_train, X_test, y_train, y_test)

def get_best_params(method, params, data):
    """Sweep ``params`` for ``method`` on a fixed 80/20 split of ``data`` and plot the scores.

    ``data`` must expose a ``target`` column; all remaining columns are the
    features.  The sweep is delegated to ``aggregate_train_test_errors`` (with
    ``log_loss`` printed for every fitted value) and the collected scores are
    handed to ``plot_train_test_errors``.  Nothing is returned.
    """
    labels = data.target
    features = data.drop('target', axis = 1)
    split = train_test_split(features, labels, test_size = 0.2, random_state = 42)
    X_train, X_test, y_train, y_test = split
    train_curve, test_curve = aggregate_train_test_errors(
        method, params, X_train, y_train, X_test, y_test, [log_loss], print_flag=True)
    plot_train_test_errors(method, params, train_curve, test_curve)
    
def print_score(model, metric, X_train, y_train, X_test, y_test):
    """Print one line with the model class name, the metric name and the metric on the test split.

    The metric is evaluated as ``metric(y_test, model.predict_proba(X_test))``.
    ``X_train`` and ``y_train`` are accepted but not used.
    """
    model_name = model.__class__.__name__
    metric_name = metric.__name__
    test_value = metric(y_test, model.predict_proba(X_test))
    report = "Model: %s Metric: %s On test: %f" % (model_name, metric_name, test_value)
    print(report)
        
def print_scores(model, metrics, X_train, y_train, X_test, y_test, parameter = None):
    """Print the value of every metric in ``metrics`` for ``model`` via ``print_score``.

    Parameters
    ----------
    model, X_train, y_train, X_test, y_test : forwarded unchanged to ``print_score``.
    metrics : iterable of metric callables.
    parameter : optional pair; when given, ``"<parameter[0]> <parameter[1]>"``
        is printed as a header line before each metric line.
    """
    for metric in metrics:
        # Identity check: "!= None" would defer to the object's own comparison.
        if parameter is not None:
            print("%s %s" % (parameter[0], parameter[1]))
        print_score(model, metric, X_train, y_train, X_test, y_test)
    
def _format_elapsed(seconds):
    """Render a duration in seconds as ``HH:MM:SS`` (negative values clamp to zero, hours do not wrap at 24)."""
    total = max(int(seconds), 0)
    hours, remainder = divmod(total, 3600)
    minutes, secs = divmod(remainder, 60)
    return "%02d:%02d:%02d" % (hours, minutes, secs)


def aggregate_train_test_errors(model, params, X_train, y_train, X_test, y_test, metrics = None, print_flag = False):
    """Vary each parameter on its own and record ``model.score`` on both splits.

    For every ``key`` in ``params`` the model is refitted once per listed
    value with only that parameter changed; after a key is finished the model
    is reset to the parameters it had on entry (also when a fit is
    interrupted or fails).

    Parameters
    ----------
    model : estimator exposing ``get_params``, ``set_params``, ``fit`` and ``score``.
    params : dict mapping a parameter name to a ``list`` of values to try.
    X_train, y_train, X_test, y_test : the two data splits.
    metrics : optional sequence of metric callables passed to ``print_scores``.
    print_flag : when true and ``metrics`` is non-empty, ``print_scores`` is
        called after every fit.

    Returns
    -------
    (errors_train, errors_test) : two dicts keyed by parameter name, each
        entry being ``{'params': <the value list>, 'values': <scores>}``.
        Despite the names, the stored numbers are whatever ``model.score``
        returns.

    Raises
    ------
    TypeError : if any entry of ``params`` is not a ``list``.  The check runs
        before the first fit so a malformed grid cannot waste earlier fits.

    After each fit a line ``"<since start> (<since previous fit>)"`` is printed.
    """
    for values in params.values():
        if not isinstance(values, list):
            raise TypeError("Take only list of parameters!")

    errors_train = {key: {'params': values, 'values': []} for key, values in params.items()}
    errors_test = {key: {'params': values, 'values': []} for key, values in params.items()}
    base_parameters = model.get_params()
    start_time = time.time()
    prev_time = start_time
    # .items() (rather than .iteritems()) works on both Python 2 and 3.
    for key, values in params.items():
        try:
            for value in values:
                model.set_params(**{key: value})
                model.fit(X_train, y_train)
                cur_time = time.time()
                # Plain arithmetic on the elapsed seconds: no dependence on the
                # local timezone, and no wrap-around after 24 hours.
                print(_format_elapsed(cur_time - start_time) + " (" + _format_elapsed(cur_time - prev_time) + ")")
                prev_time = cur_time
                errors_train[key]['values'].append(model.score(X_train, y_train))
                errors_test[key]['values'].append(model.score(X_test, y_test))
                if print_flag and metrics:
                    print_scores(model, metrics, X_train, y_train, X_test, y_test, (key, value))
        finally:
            # Leave the caller's estimator as it was, even on KeyboardInterrupt.
            model.set_params(**base_parameters)
    return errors_train, errors_test

def plot_train_test_errors(model, model_params, errors_train, errors_test, save = None):
    """Draw one subplot per swept parameter with its train and test scores.

    Parameters
    ----------
    model : not used by this function.
    model_params : dict whose keys select (and order) the parameters to plot.
    errors_train, errors_test : dicts keyed by parameter name with entries
        ``{'params': <values tried>, 'values': <scores>}``.
    save : currently unused.

    When the first tried value of a parameter is a string, the values are
    plotted against their positions and used as tick labels; otherwise they
    are used directly as x coordinates and tick positions.
    """
    # squeeze=False keeps `axes` two-dimensional, so a single parameter
    # (one column) is handled the same way as several.
    fig, axes = plt.subplots(ncols=len(model_params), squeeze=False)
    fig.set_size_inches((15,9))
    for ax, param in zip(axes[0], model_params.keys()):
        grid = errors_train[param]['params']
        train_scores = errors_train[param]['values']
        test_scores = errors_test[param]['values']
        # type(u"") is `unicode` on Python 2 and `str` on Python 3.
        if len(grid) > 0 and isinstance(grid[0], (str, type(u""))):
            positions = list(range(len(grid)))
            ax.plot(positions, train_scores, label = 'on train')
            ax.plot(positions, test_scores, label = 'on test')
            ax.set_xticks(positions)
            ax.set_xticklabels(grid)
        else:
            ax.plot(grid, train_scores, label = 'on train')
            ax.plot(grid, test_scores, label = 'on test')
            # Ticks sit on the values that were actually tried.
            ax.set_xticks(grid)
        ax.set_title(str(param))
        # Per-axis legend; plt.legend() would only annotate the last subplot.
        ax.legend()
    plt.tight_layout()

In [6]:
# Candidate values for each gradient-boosting parameter. The sweep varies one
# parameter at a time while the others keep the estimator's own settings.
# Every entry is an explicit list: aggregate_train_test_errors rejects
# anything else, and a bare range() is a list only on Python 2.
gb_params = {"n_estimators": [10, 50, 150, 300, 450, 600], 
             "max_depth": list(range(1, 26, 4)), 
             "min_samples_split": list(range(1, 26, 4)), 
             "min_samples_leaf": list(range(1, 26, 4)), 
             "max_features": list(range(10, 100, 10))}

# Long-running: the recorded run below was interrupted after more than two hours.
get_best_params(GradientBoostingClassifier(), gb_params, train_data)

00:00:39 (00:00:39)
n_estimators 10
Model: GradientBoostingClassifier Metric: log_loss On test: 1.067229
00:03:58 (00:03:19)
n_estimators 50
Model: GradientBoostingClassifier Metric: log_loss On test: 0.665423
00:13:48 (00:09:50)
n_estimators 150
Model: GradientBoostingClassifier Metric: log_loss On test: 0.572966
00:33:17 (00:19:28)
n_estimators 300
Model: GradientBoostingClassifier Metric: log_loss On test: 0.534792
01:02:10 (00:28:52)
n_estimators 450
Model: GradientBoostingClassifier Metric: log_loss On test: 0.519510
01:40:33 (00:38:23)
n_estimators 600
Model: GradientBoostingClassifier Metric: log_loss On test: 0.510731
01:47:26 (00:06:53)
min_samples_split 1
Model: GradientBoostingClassifier Metric: log_loss On test: 0.599448
01:54:02 (00:06:35)
min_samples_split 5
Model: GradientBoostingClassifier Metric: log_loss On test: 0.595559
02:00:38 (00:06:36)
min_samples_split 9
Model: GradientBoostingClassifier Metric: log_loss On test: 0.595549
02:07:14 (00:06:35)
min_samples_split 1

KeyboardInterrupt: 

In [7]:
# Selected gradient-boosting settings (presumably read off the sweep output
# above -- TODO confirm where max_depth=9 comes from).
gb_best_params = dict(n_estimators=600, max_depth=9)

In [None]:
# Sweep only these two parameters, each varied on its own.  The grids are
# explicit lists because aggregate_train_test_errors accepts nothing else and
# a bare range() is a list only on Python 2.
gb_params = {"min_samples_leaf": list(range(1, 26, 4)), 
             "max_features": list(range(10, 100, 10))}

get_best_params(GradientBoostingClassifier(), gb_params, train_data)