In [1]:
# Loading dataset
import pandas as pd


def load(dataset):
    """Read the training features and labels of one dataset.

    Files are looked up relative to the current working directory:
    ``../data/<dataset>/<dataset>_train.data`` (space-separated features,
    no header) and ``../data/<dataset>/<dataset>_train.solution`` (labels).

    Parameters
    ----------
    dataset : str
        Name of the dataset directory / file prefix.

    Returns
    -------
    (X, Y) : tuple
        ``X`` is a DataFrame of features, ``Y`` is the first column of the
        solution file as a Series.
    """
    prefix = '../data/%s/%s_train' % (dataset, dataset)

    X = pd.read_csv(prefix + '.data', header=None, sep=' ')
    # When every row ends with a separator, read_csv sees an extra empty
    # field and produces a trailing all-NaN column.  Drop it only in that
    # case, so a file without the trailing separator keeps its real last
    # feature.
    if X.shape[1] > 0 and X[X.columns[-1]].isnull().all():
        X = X.drop(X.columns[-1], axis=1)

    Y = pd.read_csv(prefix + '.solution', header=None, sep=' ')[0]
    return X, Y

In [2]:
import sklearn
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectPercentile
from libscores import bac_cv


def _select_then(percentile, classifier):
    """Build an unfitted pipeline: f_classif percentile selection, then `classifier`."""
    return Pipeline([
        ('feature_selection',
         SelectPercentile(percentile=percentile, score_func=sklearn.feature_selection.f_classif)),
        ('classification', classifier),
    ])


def _candidate_pipelines():
    """Yield ``(label, unfitted_pipeline)`` for every grid configuration, in evaluation order."""
    percentiles = [5, 10, 15, 20, 25, 30, 35, 50]
    tree_counts = [100, 200, 300]

    # Random forest on the selected features.
    for p in percentiles:
        for e in tree_counts:
            yield ("SELECT+RF percentile=%d n_estimators=%d" % (p, e),
                   _select_then(p, RandomForestClassifier(n_estimators=e, random_state=1, n_jobs=-1)))

    # Extra-trees with default max_features.  min_samples_split is 2 rather
    # than 1: a single-sample node cannot be split anyway, and newer
    # scikit-learn releases reject integer values below 2.
    for p in percentiles:
        for e in tree_counts:
            yield ("SELECT+ET percentile=%d n_estimators=%d" % (p, e),
                   _select_then(p, ExtraTreesClassifier(n_estimators=e, n_jobs=-1, max_depth=None,
                                                        min_samples_split=2, random_state=1)))

    # Extra-trees with an explicit max_features.
    # NOTE(review): a small percentile may keep fewer than `f` features;
    # confirm the estimator accepts max_features above the selected count.
    for p in percentiles:
        for e in tree_counts:
            for f in [5, 10, 15, 20]:
                yield ("SELECT+ET percentile=%d n_estimators=%d max_features %d" % (p, e, f),
                       _select_then(p, ExtraTreesClassifier(n_estimators=e, n_jobs=-1, max_depth=None,
                                                            min_samples_split=2, random_state=1,
                                                            max_features=f)))


def optimize(X, Y, dataset_name=None):
    """Grid-search feature-selection + tree-ensemble pipelines for one dataset.

    Every candidate pipeline is fitted on ``(X, Y)`` and scored with the
    dataset's metric function; the highest-scoring fitted pipeline is kept.
    The best score and the label of the best configuration are printed.

    Parameters
    ----------
    X, Y :
        Training features and labels, passed unchanged to ``Pipeline.fit``
        and to the metric function.
    dataset_name : str, optional
        Dataset name used to choose the metric.  When omitted, the
        notebook-level variable ``name`` is used, which keeps the older
        ``optimize(X, Y)`` call style working.

    Returns
    -------
    The fitted pipeline with the highest score, or ``None`` if no score
    compared greater than ``-inf`` (e.g. all scores were NaN).

    Raises
    ------
    NameError
        If no dataset name is passed and no global ``name`` is defined.
    """
    if dataset_name is None:
        # Legacy behaviour: the function used to read the global `name`.
        dataset_name = globals().get('name')
        if dataset_name is None:
            raise NameError("optimize() needs a dataset name: pass dataset_name "
                            "or define a notebook-level `name`")

    if dataset_name in ("christine", "jasmine", "madeline", "philippine", "sylvine"):
        metrics_function = bac_cv
    else:
        # NOTE(review): `auc_cv` is not imported in the visible cells; this
        # branch raises NameError unless it is defined elsewhere.
        metrics_function = auc_cv

    best_model = None
    best_label = None
    # Start below any real score so a best model is still returned when all
    # scores are zero or negative.
    max_metrics = float('-inf')

    for label, pipeline in _candidate_pipelines():
        model = pipeline.fit(X, Y)
        score = metrics_function(model, X, Y)
        # Strict comparison: on ties the earliest configuration wins, and
        # the printed label always matches the returned model.
        if score > max_metrics:
            max_metrics = score
            best_model = model
            best_label = label

    print("Best metrics %f" % max_metrics)
    print("BEST model %s" % best_label)
    return best_model


In [9]:
%%time
for name in ["jasmine", "madeline", "philippine", "sylvine"]:
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print("PROCESSING %s" % name)
    X, Y = load(name)
    # The loop variable must stay called `name`: optimize(X, Y) picks its
    # metric from this notebook-level variable.
    optimize(X, Y)