In [1]:
# Loading dataset
import pandas as pd


def load(dataset):
    """Load the training features and labels of one dataset.

    Reads ``../data/<dataset>/<dataset>_train.data`` (features) and
    ``../data/<dataset>/<dataset>_train.solution`` (labels). Both files are
    parsed as header-less, single-space-separated text.

    Parameters
    ----------
    dataset : str
        Dataset name; used both as the directory name and as the file prefix.

    Returns
    -------
    X : pandas.DataFrame
        Feature matrix, without the spurious trailing all-NaN column.
    Y : pandas.Series
        First column of the solution file.
    """
    prefix = '../data/%s' % dataset + '/%s_train' % dataset
    X = pd.read_csv(prefix + '.data', header=None, sep=' ')
    # With sep=' ', a separator at the end of every row makes pandas emit one
    # extra column that is entirely NaN (presumably the .data files are written
    # with a trailing space -- TODO confirm). Drop the last column only when it
    # really is that artefact, so a file without the trailing separator does
    # not silently lose its last real feature.
    if X.shape[1] > 0 and X.iloc[:, -1].isnull().all():
        X = X.drop(X.columns[-1], axis=1)
    Y = pd.read_csv(prefix + '.solution', header=None, sep=' ')[0]
    return X, Y

In [3]:
from math import sqrt
import sklearn
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectPercentile, VarianceThreshold
from libscores import auc_cv
from libscores import bac_cv

def rf_model(x, y, p, e):
    """Fit a feature-selection + random-forest pipeline.

    The pipeline keeps the top ``p`` percent of features ranked by
    ``f_classif`` and then trains a ``RandomForestClassifier`` with ``e``
    trees (fixed ``random_state=1``, all cores).

    Returns a ``(fitted_pipeline, label)`` tuple, where ``label`` is a short
    text description of the configuration.
    """
    selector = SelectPercentile(
        score_func=sklearn.feature_selection.f_classif,
        percentile=p,
    )
    forest = RandomForestClassifier(n_estimators=e, random_state=1, n_jobs=-1)
    fitted = Pipeline([
        ('feature_selection', selector),
        ('classification', forest),
    ]).fit(x, y)
    # The label is formatted after fitting, in two pieces, exactly as before.
    description = "SELECT+RF percentile=%d" % p
    description += " n_estimators=%d" % e
    return fitted, description


def et_model(x, y, p, e):
    """Fit a feature-selection + extra-trees pipeline.

    Features are first reduced to the top ``p`` percent according to
    ``f_classif``; an ``ExtraTreesClassifier`` with ``e`` estimators
    (``random_state=1``, ``n_jobs=-1``) is then trained on what remains.

    Returns a ``(fitted_pipeline, label)`` tuple; ``label`` describes the
    configuration in text form.
    """
    steps = []
    steps.append((
        'feature_selection',
        SelectPercentile(percentile=p, score_func=sklearn.feature_selection.f_classif),
    ))
    steps.append((
        'classification',
        ExtraTreesClassifier(n_estimators=e, n_jobs=-1, random_state=1),
    ))
    pipeline = Pipeline(steps)
    pipeline.fit(x, y)
    # Build the label only once the fit has succeeded (same order as before).
    parts = ["SELECT+ET percentile=%d" % p, " n_estimators=%d" % e]
    return pipeline, parts[0] + parts[1]


In [26]:
def process(X, Y, model_function, metrics_function, best_model, best_metrics, best_label):
    """Search the selection percentile of a model family for several ensemble sizes.

    For each ensemble size in ``[10, 50, 100, 200, 300]`` a bisection-like
    search over the percentile is run: both ends of the current window are
    scored, then the midpoint is scored repeatedly and replaces the right end
    when the left end currently scores higher, otherwise the left end. The
    first window is ``[1, 50]``; later windows are centred on the last
    midpoint of the previous ensemble size (+/- 20, clamped to ``[1, 100]``).

    Parameters
    ----------
    X, Y
        Training data, passed through unchanged to both callbacks.
    model_function : callable
        ``model_function(X, Y, percentile, n_estimators) -> (model, label)``.
    metrics_function : callable
        ``metrics_function(model, X, Y) -> score``; higher is better.
    best_model, best_metrics, best_label
        Best configuration found so far (e.g. by an earlier call); it is only
        replaced by a strictly higher score.

    Returns
    -------
    tuple
        ``(best_model, best_metrics, best_label)`` after this search.
    """
    # Python 2 has no ``nonlocal``; keep the running best in a mutable list so
    # the nested helper can update it.
    best = [best_model, best_metrics, best_label]

    def evaluate(percentile, n_estimators):
        """Fit and score one configuration, log it, and update the running best."""
        model, label = model_function(X, Y, percentile, n_estimators)
        score = metrics_function(model, X, Y)
        improved = score > best[1]
        if improved:
            best[:] = [model, score, label]
        # Single-argument print(...) behaves identically on Python 2 and 3.
        print("Processed: %s" % label + " score: %f" % score)
        return score, improved

    p = -1
    for e in [10, 50, 100, 200, 300]:
        # Start optimization
        if p > 0:
            l = max(1, p - 20)
            # Clamp to the largest valid percentile. The previous code clamped
            # to the already-narrowed right bound of the preceding search, so
            # the window could never extend to the right again.
            r = min(100, p + 20)
        else:
            l = 1
            r = 50

        # Step left
        metrics_l, _ = evaluate(l, e)
        # Step right
        metrics_r, _ = evaluate(r, e)

        step = 0
        while step < 10:
            step += 1
            # Floor division keeps the percentile an integer on Python 3 too.
            p = (l + r) // 2
            if p == l:
                # The window cannot shrink any further and this percentile has
                # already been scored for this ensemble size; refitting it
                # would only repeat the same expensive evaluation.
                break
            metrics_p, improved = evaluate(p, e)

            # Keep the half of the window adjacent to the better-scoring end.
            if metrics_l > metrics_r:
                r, metrics_r = p, metrics_p
            else:
                l, metrics_l = p, metrics_p
            # Give up on this ensemble size after at least 5 steps once a
            # midpoint fails to improve on the global best.
            if not improved and step >= 5:
                break

    return best[0], best[1], best[2]

def optimize(X, Y, dataset=None):
    """Search for the best feature-selection + tree-ensemble model for a dataset.

    Low-variance features are removed first, then ``process`` is run for the
    random-forest family and for the extra-trees family, carrying the best
    result from one search into the next.

    Parameters
    ----------
    X, Y
        Training features and labels.
    dataset : str, optional
        Dataset name, used only to choose the scoring function: ``bac_cv``
        for the listed datasets, ``auc_cv`` otherwise (both come from
        ``libscores``; presumably cross-validated balanced accuracy and AUC
        -- confirm against that module). When omitted, the notebook-level
        variable ``name`` is used, which is what this function relied on
        before the parameter existed.

    Returns
    -------
    tuple
        ``(model, X)``: the best fitted pipeline (``None`` if no score
        exceeded 0) and the variance-filtered feature matrix it was fitted on.
    """
    if dataset is None:
        # Backward compatibility with callers that set a global ``name``
        # (raises NameError if it is not defined, as before).
        dataset = name

    if dataset in ["christine", "jasmine", "madeline", "philippine", "sylvine"]:
        metrics_function = bac_cv
    else:
        metrics_function = auc_cv

    model = None
    metrics = 0
    label = None

    # Remove low-variance features: everything with variance below 0.1, which
    # includes (but is not limited to) constant columns.
    X = VarianceThreshold(.1).fit_transform(X)

    # Each search starts from the best result of the previous one.
    for model_function in (rf_model, et_model):
        model, metrics, label = process(X, Y, model_function, metrics_function, model, metrics, label)

    # Single-argument print(...) behaves identically on Python 2 and 3.
    print("Best model: %s" % label + " metrics: %f" % metrics)
    return model, X

In [None]:
%%time
# Run the model search on every dataset in turn.
# NOTE: the loop variable must stay named `name` -- optimize() reads that
# notebook-level variable to decide which scoring function to use.
for name in ["jasmine", "madeline", "philippine", "sylvine"]:
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print("PROCESSING %s" % name)
    X, Y = load(name)
    optimize(X, Y)

PROCESSING jasmine
Processed: SELECT+RF percentile=1 n_estimators=10 score: 0.469839
Processed: SELECT+RF percentile=50 n_estimators=10 score: 0.585791
Processed: SELECT+RF percentile=25 n_estimators=10 score: 0.575067
