In [1]:
# Loading dataset
import pandas as pd


def load(dataset):
    """Read the training features and labels of one dataset.

    Files are looked up relative to the working directory as
    ``../data/<dataset>/<dataset>_train.data`` and
    ``../data/<dataset>/<dataset>_train.solution``; both are parsed as
    header-less, space-separated tables.

    Parameters
    ----------
    dataset : str
        Dataset name, used for both the directory and the file prefix.

    Returns
    -------
    (X, Y)
        ``X`` is the feature DataFrame, ``Y`` is the first column of the
        solution file as a Series.
    """
    prefix = '../data/%s' % dataset + '/%s_train' % dataset

    X = pd.read_csv(prefix + '.data', header=None, sep=' ')
    # The data files come back with an extra last column filled with NaN
    # (presumably every row ends with a trailing space, which sep=' ' parses
    # as one more empty field -- TODO confirm against the raw files).
    # Drop that column only when it really is empty, so a file without the
    # trailing separator does not silently lose its last real feature.
    if X.shape[1] > 0 and X.iloc[:, -1].isnull().all():
        X = X.drop(X.columns[-1], axis=1)

    Y = pd.read_csv(prefix + '.solution', header=None, sep=' ')[0]
    return X, Y

In [2]:
from math import sqrt
import sklearn
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectPercentile, VarianceThreshold
from libscores import auc_cv
from libscores import bac_cv

def rf_model(x, y, p, e):
    """Fit a variance-filter -> percentile-selection -> random-forest pipeline.

    Parameters
    ----------
    x, y
        Training features and labels, handed unchanged to ``Pipeline.fit``.
    p
        Percentile of features kept by ``SelectPercentile`` (scored with
        ``f_classif``).
    e
        Number of trees in the forest.

    Returns
    -------
    (fitted Pipeline, str)
        The fitted pipeline and a label describing the configuration.
    """
    pipeline = Pipeline([
        # Low-variance filter with a threshold of 0.1.
        ('variation_zero', VarianceThreshold(.1)),
        ('feature_selection', SelectPercentile(percentile=p, score_func=sklearn.feature_selection.f_classif)),
        # A node needs at least two samples to be split, so 2 is the smallest
        # meaningful value; recent scikit-learn releases reject 1 outright.
        ('classification', RandomForestClassifier(n_estimators=e, random_state=1, n_jobs=-1, min_samples_split=2))
    ])
    fitted = pipeline.fit(x, y)
    label = "SELECT+RF percentile=%d" % p + " n_estimators=%d" % e
    return fitted, label

def rf_no_var_model(x, y, p, e):
    """Fit a percentile-selection -> random-forest pipeline (no variance filter).

    Parameters
    ----------
    x, y
        Training features and labels, handed unchanged to ``Pipeline.fit``.
    p
        Percentile of features kept by ``SelectPercentile`` (scored with
        ``f_classif``).
    e
        Number of trees in the forest.

    Returns
    -------
    (fitted Pipeline, str)
        The fitted pipeline and a label describing the configuration.
    """
    pipeline = Pipeline([
        ('feature_selection', SelectPercentile(percentile=p, score_func=sklearn.feature_selection.f_classif)),
        # A node needs at least two samples to be split, so 2 is the smallest
        # meaningful value; recent scikit-learn releases reject 1 outright.
        ('classification', RandomForestClassifier(n_estimators=e, random_state=1, n_jobs=-1, min_samples_split=2))
    ])
    fitted = pipeline.fit(x, y)
    label = "SELECT+RF_NO_VAR percentile=%d" % p + " n_estimators=%d" % e
    return fitted, label

def et_model(x, y, p, e):
    """Fit a variance-filter -> percentile-selection -> extra-trees pipeline.

    Parameters
    ----------
    x, y
        Training features and labels, handed unchanged to ``Pipeline.fit``.
    p
        Percentile of features kept by ``SelectPercentile`` (scored with
        ``f_classif``).
    e
        Number of trees in the ensemble.

    Returns
    -------
    (fitted Pipeline, str)
        The fitted pipeline and a label describing the configuration.
    """
    pipeline = Pipeline([
        # Low-variance filter with a threshold of 0.1.
        ('variation_zero', VarianceThreshold(.1)),
        ('feature_selection', SelectPercentile(percentile=p, score_func=sklearn.feature_selection.f_classif)),
        # A node needs at least two samples to be split, so 2 is the smallest
        # meaningful value; recent scikit-learn releases reject 1 outright.
        ('classification', ExtraTreesClassifier(n_estimators=e, n_jobs=-1, random_state=1, min_samples_split=2))
    ])
    fitted = pipeline.fit(x, y)
    label = "SELECT+ET percentile=%d" % p + " n_estimators=%d" % e
    return fitted, label

# def bagg_model(x, y, p, e):
#     return Pipeline([
#         ('variation_zero', VarianceThreshold(.1)),
#         ('feature_selection', SelectPercentile(percentile=p, score_func=sklearn.feature_selection.f_classif)),
#         ('classification', BaggingClassifier(base_estimator=None, n_estimators=e, max_samples=1.0, 
#                                              max_features=1.0, bootstrap=True, 
#                                              bootstrap_features=False, oob_score=False, n_jobs=-1, 
#                                              random_state=1, verbose=0)),
#         ]).fit(x, y), "SELECT+BAGGING percentile=%d" % p + " n_estimators=%d" % e

# def ada_model(x, y, p, e):
#     return Pipeline([
#         ('variation_zero', VarianceThreshold(.1)),
#         ('feature_selection', SelectPercentile(percentile=p, score_func=sklearn.feature_selection.f_classif)),
#         ('classification', AdaBoostClassifier(base_estimator=None, n_estimators=e, 
#                                               learning_rate=1.0, algorithm='SAMME.R', random_state=1))
#     ]).fit(x, y), "SELECT+ADA_BOOST percentile=%d" % p + " n_estimators=%d" % e

In [None]:
def process(X, Y, model_function, metrics_function, best_model, best_metrics, best_label):
    """Search the feature-selection percentile for several ensemble sizes.

    For each ensemble size in (100, 200, 270) both ends of a percentile
    interval are fitted and scored, then the interval is repeatedly halved:
    the midpoint is scored and replaces the lower-scoring end (the left end
    on ties). Halving stops after two consecutive midpoints that fail to beat
    the best score seen so far, or once the interval cannot be split further.
    The next ensemble size restarts in a +/-25 window around the last
    midpoint, clipped to the global range.

    Parameters
    ----------
    X, Y
        Training data; only ``len(X)`` is inspected here, everything else is
        forwarded to the two callbacks.
    model_function
        ``model_function(X, Y, percentile, n_estimators)`` returning
        ``(fitted_model, label)``.
    metrics_function
        ``metrics_function(model, X, Y)`` returning a score where larger is
        better.
    best_model, best_metrics, best_label
        Best candidate found so far (e.g. by a previous call); a candidate
        replaces it only when its score is strictly greater.

    Returns
    -------
    (best_model, best_metrics, best_label)
        The possibly updated best candidate.
    """
    def evaluate(percentile, n_estimators):
        # Fit one candidate and score it.
        model, label = model_function(X, Y, percentile, n_estimators)
        return model, metrics_function(model, X, Y), label

    p = -1
    # Choose global ranges by data
    if len(X) < 200:
        L, R = 50, 99
    elif len(X) > 1000:
        L, R = 1, 40
    else:
        L, R = 1, 90

    for e in [100, 200, 270]:
        # Start optimization from previous point
        if p > 0:
            l, r = max(L, p - 25), min(R, p + 25)
        else:
            l, r = L, R

        # Left
        model_l, metrics_l, label_l = evaluate(l, e)
        if metrics_l > best_metrics:
            best_model, best_metrics, best_label = model_l, metrics_l, label_l
        print("Processed: %s" % label_l + " score: %f" % metrics_l)

        # Right
        model_r, metrics_r, label_r = evaluate(r, e)
        if metrics_r > best_metrics:
            best_model, best_metrics, best_label = model_r, metrics_r, label_r
        print("Processed: %s" % label_r + " score: %f" % metrics_r)

        no_progress = 0
        while True:
            # Median point. Floor division keeps the percentile an integer on
            # both Python 2 and Python 3.
            p = (l + r) // 2
            if p == l or p == r:
                # The interval is at most one wide: the midpoint is an end
                # point that has already been fitted and scored for this e,
                # so refitting it would only repeat work.
                break

            model_p, metrics_p, label_p = evaluate(p, e)
            if metrics_p > best_metrics:
                best_model, best_metrics, best_label = model_p, metrics_p, label_p
                no_progress = 0
            else:
                no_progress += 1
            print("Processed: %s" % label_p + " score: %f" % metrics_p + ". No progress %d steps" % no_progress)

            # The midpoint takes the place of the lower-scoring end.
            if metrics_l > metrics_r:
                r, metrics_r = p, metrics_p
            else:
                l, metrics_l = p, metrics_p
            if no_progress >= 2:
                break

    return best_model, best_metrics, best_label

def optimize(name, X, Y):
    """Performs optimization for given dataset.

    Picks the scoring function by dataset name (``bac_cv`` for the five
    listed datasets, ``auc_cv`` otherwise), runs ``process`` over the model
    families chosen by dataset size and prints the best configuration found.

    Parameters
    ----------
    name : str
        Dataset name; selects the metric and is echoed in the summary line.
    X, Y
        Training features and labels, forwarded to ``process``.

    Returns
    -------
    (model, X)
        The best model found (``None`` if no candidate scored above the
        starting value 0) and the unchanged ``X``.
    """
    if name in ["christine", "jasmine", "madeline", "philippine", "sylvine"]:
        metrics_function = bac_cv
    else:
        metrics_function = auc_cv

    # Starting point
    model, metrics, label = None, 0, None

    # Author's note: the extra-trees classifier works poorly on huge datasets,
    # so it is only tried when there are fewer than 1000 rows.
    if len(X) < 1000:
        model, metrics, label = process(X, Y, rf_model, metrics_function, model, metrics, label)
        model, metrics, label = process(X, Y, et_model, metrics_function, model, metrics, label)
    else:
        model, metrics, label = process(X, Y, rf_no_var_model, metrics_function, model, metrics, label)

#     model, metrics, label = process(X, Y, bagg_model, metrics_function, model, metrics, label)
#     model, metrics, label = process(X, Y, ada_model, metrics_function, model, metrics, label)

    # Single-argument print call: same output on Python 2 and Python 3.
    print("%s " % name + " best model: %s" % label + " metrics: %f" % metrics)
    return model, X

In [None]:
%%time
# Run the search on every listed dataset in turn; optimize() prints the best
# configuration itself, so its return value is not kept here.
for name in ["christine", "jasmine", "madeline", "philippine", "sylvine"]:
    # Single-argument print call: same output on Python 2 and Python 3.
    print("PROCESSING %s" % name)
    X, Y = load(name)
    optimize(name, X, Y)

PROCESSING christine


 1009 1186 1228 1238 1239 1255 1259 1355 1486 1585] are constant.


Processed: SELECT+RF_NO_VAR percentile=1 n_estimators=100 score: 0.414544


 1004 1009 1047 1186 1228 1238 1239 1255 1259 1355 1486 1585] are constant.
  934 1004 1009 1186 1228 1238 1239 1255 1259 1355 1486 1585] are constant.
 1009 1186 1228 1238 1239 1255 1259 1300 1355 1486 1585] are constant.


Processed: SELECT+RF_NO_VAR percentile=40 n_estimators=100 score: 0.472130


 1004 1009 1047 1186 1228 1238 1239 1255 1259 1355 1486 1585] are constant.
  934 1004 1009 1186 1228 1238 1239 1255 1259 1355 1486 1585] are constant.
 1009 1186 1228 1238 1239 1255 1259 1300 1355 1486 1585] are constant.


Processed: SELECT+RF_NO_VAR percentile=20 n_estimators=100 score: 0.452196. No progress 1 steps


 1004 1009 1047 1186 1228 1238 1239 1255 1259 1355 1486 1585] are constant.
  934 1004 1009 1186 1228 1238 1239 1255 1259 1355 1486 1585] are constant.
 1009 1186 1228 1238 1239 1255 1259 1300 1355 1486 1585] are constant.


Processed: SELECT+RF_NO_VAR percentile=30 n_estimators=100 score: 0.461794. No progress 2 steps


 1004 1009 1047 1186 1228 1238 1239 1255 1259 1355 1486 1585] are constant.
  934 1004 1009 1186 1228 1238 1239 1255 1259 1355 1486 1585] are constant.
 1009 1186 1228 1238 1239 1255 1259 1300 1355 1486 1585] are constant.


Processed: SELECT+RF_NO_VAR percentile=5 n_estimators=200 score: 0.451458


 1004 1009 1047 1186 1228 1238 1239 1255 1259 1355 1486 1585] are constant.
  934 1004 1009 1186 1228 1238 1239 1255 1259 1355 1486 1585] are constant.
 1009 1186 1228 1238 1239 1255 1259 1300 1355 1486 1585] are constant.


Processed: SELECT+RF_NO_VAR percentile=55 n_estimators=200 score: 0.470653


 1004 1009 1047 1186 1228 1238 1239 1255 1259 1355 1486 1585] are constant.
  934 1004 1009 1186 1228 1238 1239 1255 1259 1355 1486 1585] are constant.
 1009 1186 1228 1238 1239 1255 1259 1300 1355 1486 1585] are constant.


Processed: SELECT+RF_NO_VAR percentile=30 n_estimators=200 score: 0.474714. No progress 0 steps


 1004 1009 1047 1186 1228 1238 1239 1255 1259 1355 1486 1585] are constant.
  934 1004 1009 1186 1228 1238 1239 1255 1259 1355 1486 1585] are constant.
 1009 1186 1228 1238 1239 1255 1259 1300 1355 1486 1585] are constant.


Processed: SELECT+RF_NO_VAR percentile=42 n_estimators=200 score: 0.472499. No progress 1 steps


 1004 1009 1047 1186 1228 1238 1239 1255 1259 1355 1486 1585] are constant.
  934 1004 1009 1186 1228 1238 1239 1255 1259 1355 1486 1585] are constant.
 1009 1186 1228 1238 1239 1255 1259 1300 1355 1486 1585] are constant.


Processed: SELECT+RF_NO_VAR percentile=36 n_estimators=200 score: 0.470653. No progress 2 steps


 1004 1009 1047 1186 1228 1238 1239 1255 1259 1355 1486 1585] are constant.
  934 1004 1009 1186 1228 1238 1239 1255 1259 1355 1486 1585] are constant.
 1009 1186 1228 1238 1239 1255 1259 1300 1355 1486 1585] are constant.


Processed: SELECT+RF_NO_VAR percentile=11 n_estimators=270 score: 0.464378


 1004 1009 1047 1186 1228 1238 1239 1255 1259 1355 1486 1585] are constant.
  934 1004 1009 1186 1228 1238 1239 1255 1259 1355 1486 1585] are constant.
 1009 1186 1228 1238 1239 1255 1259 1300 1355 1486 1585] are constant.


Processed: SELECT+RF_NO_VAR percentile=61 n_estimators=270 score: 0.468808


 1004 1009 1047 1186 1228 1238 1239 1255 1259 1355 1486 1585] are constant.
  934 1004 1009 1186 1228 1238 1239 1255 1259 1355 1486 1585] are constant.
 1009 1186 1228 1238 1239 1255 1259 1300 1355 1486 1585] are constant.


Processed: SELECT+RF_NO_VAR percentile=36 n_estimators=270 score: 0.471023. No progress 1 steps


 1004 1009 1047 1186 1228 1238 1239 1255 1259 1355 1486 1585] are constant.
  934 1004 1009 1186 1228 1238 1239 1255 1259 1355 1486 1585] are constant.
 1009 1186 1228 1238 1239 1255 1259 1300 1355 1486 1585] are constant.
