In [1]:
# Loading dataset
import pandas as pd


def load(dataset):
    """Read the training features and labels of one dataset.

    Both files are read as space-separated text without a header row:

    * ``../data/<dataset>/<dataset>_train.data``     -> features
    * ``../data/<dataset>/<dataset>_train.solution`` -> labels

    :param dataset: dataset name; used both as directory name and file prefix.
    :return: tuple ``(X, Y)`` where ``X`` is the feature DataFrame and ``Y``
             is the first column of the solution file.
    """
    base = '../data/%s' % dataset
    X = pd.read_csv(base + '/%s_train.data' % dataset, header=None, sep=' ')
    # The data file is observed to produce a last column filled with NaN --
    # presumably because every row ends with a trailing separator, which
    # sep=' ' parses as one more (empty) field.  Drop that column only when it
    # really is empty, so that a file without the trailing separator keeps its
    # last feature instead of silently losing it.
    if X.shape[1] > 0 and X[X.columns[-1]].isnull().all():
        X = X.drop(X.columns[[-1]], axis=1)
    Y = pd.read_csv(base + '/%s_train.solution' % dataset, header=None, sep=' ')[0]
    return X, Y

In [15]:
from math import sqrt
import sklearn
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectPercentile, VarianceThreshold
from libscores import auc_cv
from libscores import bac_cv
from classifier import eliminate_features


def rf_model(x, y, p, e):
    """Fit a feature-selection + random-forest pipeline.

    The pipeline chains a ``VarianceThreshold`` (default settings), a
    ``SelectPercentile`` using ``f_classif`` scores, and a
    ``RandomForestClassifier`` with a fixed ``random_state`` of 1.

    :param x: training features.
    :param y: training labels.
    :param p: percentile passed to ``SelectPercentile``.
    :param e: number of trees (``n_estimators``).
    :return: tuple ``(fitted_pipeline, label)``; ``label`` is a human-readable
             description of the configuration.
    """
    selector = SelectPercentile(percentile=p, score_func=sklearn.feature_selection.f_classif)
    forest = RandomForestClassifier(n_estimators=e, random_state=1, n_jobs=-1)
    pipeline = Pipeline([
        ('variance', VarianceThreshold()),
        ('feature_selection', selector),
        ('classification', forest),
    ])
    fitted = pipeline.fit(x, y)
    description = "SELECT+RF percentile=%d n_estimators=%d" % (p, e)
    return fitted, description


def et_model(x, y, p, e):
    """Fit a feature-selection + extra-trees pipeline.

    The pipeline chains a ``VarianceThreshold`` (default settings), a
    ``SelectPercentile`` using ``f_classif`` scores, and an
    ``ExtraTreesClassifier`` with unlimited depth and a fixed
    ``random_state`` of 1.

    :param x: training features.
    :param y: training labels.
    :param p: percentile passed to ``SelectPercentile``.
    :param e: number of trees (``n_estimators``).
    :return: tuple ``(fitted_pipeline, label)``; ``label`` is a human-readable
             description of the configuration.
    """
    selector = SelectPercentile(percentile=p, score_func=sklearn.feature_selection.f_classif)
    trees = ExtraTreesClassifier(n_estimators=e, n_jobs=-1, max_depth=None, random_state=1)
    pipeline = Pipeline([
        ('variance', VarianceThreshold()),
        ('feature_selection', selector),
        ('classification', trees),
    ])
    fitted = pipeline.fit(x, y)
    description = "SELECT+ET percentile=%d n_estimators=%d" % (p, e)
    return fitted, description


def optimize(X, Y, dataset_name=None):
    """Search RF and ET pipeline settings for one dataset and keep the best.

    Features are first reduced with ``eliminate_features``; then every
    combination of model family (``rf_model``, ``et_model``), ``n_estimators``
    and selection percentile is fitted and scored.  Progress and the winning
    configuration are printed.

    :param X: feature table; indexed as ``X[0]`` and ``X[indexes]`` below.
    :param Y: labels.
    :param dataset_name: name of the dataset, used only to choose the scoring
        function (``bac_cv`` for the listed datasets, ``auc_cv`` otherwise).
        When omitted, the notebook-global ``name`` is used, which is what this
        function relied on before the parameter existed.
    :return: tuple ``(best_model, Xi, indexes)`` -- the best fitted pipeline
        (``None`` if no score exceeded 0), the reduced feature table and the
        value returned by ``eliminate_features``.
    """
    if dataset_name is None:
        # Backward compatibility: callers that still use optimize(X, Y) depend
        # on the loop variable `name` leaking in from the calling cell.
        dataset_name = name

    if dataset_name in ["christine", "jasmine", "madeline", "philippine", "sylvine"]:
        metrics_function = bac_cv
    else:
        metrics_function = auc_cv

    # Eliminate features to sqrt(len(X)), or number of observations, in case of small datasets
    # NOTE(review): when X is the DataFrame returned by load(), X[0] is column
    # 0, so len(X[0]) == len(X) and min() always yields sqrt(len(X)).  The
    # second bound may have been meant to be something else -- confirm before
    # changing; behaviour is left as it was.
    indexes = eliminate_features(X, Y, min(sqrt(len(X)), 2 * len(X[0])))
    Xi = X[indexes]

    best_model = None
    best_label = None
    best_metrics = 0

    # Same grid for both model families, random forest first.
    for model_function in (rf_model, et_model):
        for e in [100, 200, 300]:
            for p in [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]:
                m, label = model_function(Xi, Y, p, e)
                r = metrics_function(m, Xi, Y)
                print("Processed: %s score: %f" % (label, r))
                # Strict '>' keeps the earliest configuration on ties.
                if r > best_metrics:
                    best_metrics = r
                    best_label = label
                    best_model = m

    print("Best model: %s metrics: %f" % (best_label, best_metrics))
    return best_model, Xi, indexes

In [None]:
%%time
# Run the model search on each dataset in turn.
# NOTE: the loop variable has to stay called `name` -- optimize() reads it as
# a global to decide which scoring function to use.
for name in ("jasmine", "madeline", "philippine", "sylvine"):
    print("PROCESSING %s" % name)
    X, Y = load(name)
    optimize(X, Y)

PROCESSING jasmine
Processed: SELECT+RF percentile=5 n_estimators=100 score: 0.473190
Processed: SELECT+RF percentile=10 n_estimators=100 score: 0.554290
Processed: SELECT+RF percentile=15 n_estimators=100 score: 0.575737
Processed: SELECT+RF percentile=20 n_estimators=100 score: 0.558981
Processed: SELECT+RF percentile=25 n_estimators=100 score: 0.628686
Processed: SELECT+RF percentile=30 n_estimators=100 score: 0.623995
Processed: SELECT+RF percentile=35 n_estimators=100 score: 0.640751
Processed: SELECT+RF percentile=40 n_estimators=100 score: 0.634718
Processed: SELECT+RF percentile=45 n_estimators=100 score: 0.640751
Processed: SELECT+RF percentile=50 n_estimators=100 score: 0.646783
Processed: SELECT+RF percentile=5 n_estimators=200 score: 0.473190
Processed: SELECT+RF percentile=10 n_estimators=200 score: 0.554290
Processed: SELECT+RF percentile=15 n_estimators=200 score: 0.577078
Processed: SELECT+RF percentile=20 n_estimators=200 score: 0.557641
Processed: SELECT+RF percentile