In [5]:
# Loading dataset
import pandas as pd


def load(dataset, data_dir='../data'):
    """Read the training features and labels of one dataset.

    Reads ``<data_dir>/<dataset>/<dataset>_train.data`` and
    ``<data_dir>/<dataset>/<dataset>_train.solution`` as space-separated
    tables without a header row.

    Parameters
    ----------
    dataset : str
        Dataset name; used both as the sub-directory name and as the
        file-name prefix.
    data_dir : str, optional
        Directory holding one sub-directory per dataset. Defaults to
        ``'../data'``, the location that used to be hard-coded here.

    Returns
    -------
    X : pandas.DataFrame
        Feature table, without the empty trailing column when there is one.
    Y : pandas.Series
        Column 0 of the solution file.
    """
    prefix = '%s/%s/%s_train' % (data_dir, dataset, dataset)

    X = pd.read_csv(prefix + '.data', header=None, sep=' ')
    # The feature files come back with an extra, all-NaN last column -- most
    # likely because every row ends with a separator (TODO confirm against the
    # data files). Drop it only when it really is empty, so a file without the
    # trailing separator does not silently lose its last feature.
    if X.shape[1] > 0 and X.iloc[:, -1].isnull().all():
        X = X.drop(X.columns[-1], axis=1)

    Y = pd.read_csv(prefix + '.solution', header=None, sep=' ')[0]
    return X, Y

In [7]:
import sklearn
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectPercentile

from libscores import bac_cv
from classifier import eliminate_features


def _random_forest(n_estimators):
    """Build the random-forest classifier used by the search."""
    return RandomForestClassifier(n_estimators=n_estimators, random_state=1, n_jobs=-1)


def _extra_trees(n_estimators):
    """Build the extra-trees classifier used by the search."""
    # An integer min_samples_split must be at least 2 in current scikit-learn
    # (the former value of 1 is rejected there); 2 is the smallest legal value.
    return ExtraTreesClassifier(n_estimators=n_estimators, n_jobs=-1, max_depth=None,
                                min_samples_split=2, random_state=1)


def _candidates(X, Y):
    """Yield ``(label, steps, features)`` for every configuration, in search order."""
    estimator_counts = [100, 200, 300]
    classifiers = [("RF", _random_forest), ("ET", _extra_trees)]

    # Family 1: univariate feature selection inside the pipeline.
    for tag, make_classifier in classifiers:
        for p in [5, 10, 25]:
            for e in estimator_counts:
                label = "SELECT+%s percentile=%d n_estimators=%d" % (tag, p, e)
                steps = [
                    ('feature_selection', SelectPercentile(percentile=p, score_func=sklearn.feature_selection.f_classif)),
                    ('classification', make_classifier(e)),
                ]
                yield label, steps, X

    # Family 2: X is reduced up front with eliminate_features.
    # The selection is computed once per f and reused for every classifier and
    # n_estimators value -- assumes eliminate_features depends only on its
    # arguments (TODO confirm it is deterministic).
    selections = {}
    for tag, make_classifier in classifiers:
        for f in [5, 10, 20]:
            if f not in selections:
                selections[f] = eliminate_features(X, Y, f)
            X_new = X[selections[f]]
            for e in estimator_counts:
                label = "ELIMINATE+%s f=%d n_estimators=%d" % (tag, f, e)
                yield label, [('classification', make_classifier(e))], X_new


def bruteforce(X, Y):
    """Perform a brute-force search for the given dataset.

    Tries random-forest and extra-trees classifiers with 100/200/300
    estimators, combined either with a ``SelectPercentile`` step
    (percentile 5/10/25) or with the frame reduced by
    ``eliminate_features`` (f = 5/10/20). Every candidate is fitted and then
    scored with ``bac_cv``; the best score and the label of the winning
    configuration are printed.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table.
    Y : array-like
        Target labels.

    Returns
    -------
    The fitted pipeline with the highest score, or ``None`` when no candidate
    scores above 0. A winning ``ELIMINATE+...`` pipeline was fitted on the
    reduced frame, so it expects input reduced the same way.
    """
    best_model = None
    best_label = None
    max_bac = 0

    for label, steps, features in _candidates(X, Y):
        model = Pipeline(steps).fit(features, Y)
        bac = bac_cv(model, features, Y)
        # Strict comparison: on ties the earliest candidate is kept.
        if bac > max_bac:
            max_bac = bac
            best_model = model
            best_label = label

    # Single-argument print(...) emits the same text on Python 2 and 3.
    print("Best BAC %f" % max_bac)
    print("BEST model %s" % best_label)
    return best_model



In [None]:
%%time
# Run the brute-force search on every dataset in turn; bruteforce() prints
# the best score and configuration for each one.
dataset_names = ("christine", "jasmine", "madeline", "philippine", "sylvine")
for name in dataset_names:
    print("Loading %s" % name)
    # load() returns the feature table and the label column of the dataset.
    X, Y = load(name)
    bruteforce(X, Y)