In [6]:
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

from sklearn.cross_validation import train_test_split, StratifiedKFold
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import pickle   

In [7]:
def create_dataset_right(rs_tag, mins_diff=5, data_dir='a13', jam_quantile=.25):
    """Build a look-ahead "jam" classification dataset from the flow/speed CSVs.

    Reads ``Rechts_Flow_2011.csv`` and ``Rechts_Speed_2011.csv`` from
    ``data_dir``, concatenates them column-wise into the feature frame, and
    labels a row as a jam when the speed in column ``rs_tag`` is below that
    column's ``jam_quantile`` quantile.  Features and labels are paired
    *positionally*: feature row ``i`` goes with the label of row
    ``i + mins_diff``.

    Args:
        rs_tag: Name of the speed-file column used to derive the label.
        mins_diff: Number of rows the label is shifted into the future.
            Must be >= 0; ``0`` pairs each row with its own label.
        data_dir: Directory containing the two CSV files.
        jam_quantile: Quantile of the ``rs_tag`` speed column below which a
            row is labelled as a jam.

    Returns:
        ``(df, y)`` where ``df`` is the feature DataFrame with the last
        ``mins_diff`` rows dropped and ``y`` is a boolean Series with the first
        ``mins_diff`` rows dropped.  ``y`` keeps its original row labels, so
        its index is offset by ``mins_diff`` relative to ``df``; consume the
        pair positionally (``.iloc`` / ``.values``), not by index alignment.

    Raises:
        ValueError: If ``mins_diff`` is negative.
    """
    import os  # local import so the function does not depend on the import cell

    if mins_diff < 0:
        raise ValueError('mins_diff must be non-negative, got {!r}'.format(mins_diff))

    rf = pd.read_csv(os.path.join(data_dir, 'Rechts_Flow_2011.csv'))
    rs = pd.read_csv(os.path.join(data_dir, 'Rechts_Speed_2011.csv'))

    jam_threshold = rs[rs_tag].quantile(jam_quantile)

    # Labels: drop the first `mins_diff` rows (explicitly positional).
    y = (rs[rs_tag] < jam_threshold).iloc[mins_diff:]

    df = pd.concat([rf, rs], axis=1)

    # Features: drop the last `mins_diff` rows.  An absolute stop index is used
    # because `iloc[:-0]` would return an empty frame when mins_diff == 0.
    df = df.iloc[:max(len(df) - mins_diff, 0), :]

    return df, y

def train_val_test_split(df, y, test_size=0.15, val_size=0.15):
    """Split features and labels into contiguous train / validation / test parts.

    The split is purely positional and keeps row order: the leading rows form
    the training set, followed by the validation rows, and the trailing rows
    form the test set.  No shuffling is performed.

    Args:
        df: Feature table; anything supporting ``.iloc`` or plain slicing.
        y: Labels, paired positionally with ``df``.
        test_size: Fraction of ``len(y)`` placed in the trailing test part.
        val_size: Fraction of ``len(y)`` placed in the validation part that
            sits directly before the test part.

    Returns:
        ``(X_train, X_val, X_test, y_train, y_val, y_test)``.  A part whose
        fraction is 0 is returned empty.
    """
    cut1 = int(len(y) * test_size)
    cut2 = int(len(y) * (test_size + val_size))

    def _split(obj):
        # Translate "count from the end" into absolute offsets.  Negative
        # slicing breaks when a count is 0: `x[-0:]` is the whole sequence and
        # `x[:-0]` is empty, which would swap an empty part for the full data.
        n = len(obj)
        val_start = max(n - cut2, 0)
        test_start = max(n - cut1, 0)
        # Prefer .iloc so pandas objects are always sliced by position.
        rows = obj.iloc if hasattr(obj, 'iloc') else obj
        return rows[:val_start], rows[val_start:test_start], rows[test_start:]

    X_train, X_val, X_test = _split(df)
    y_train, y_val, y_test = _split(y)

    return X_train, X_val, X_test, y_train, y_val, y_test

In [8]:
# Features plus jam labels for speed column '118.4'; labels are taken 5 rows ahead.
df, y = create_dataset_right(
    mins_diff=5,
    rs_tag='118.4',
)

In [9]:
# Hold out the trailing 30% of rows for testing; val_size=0 makes the
# validation parts empty, so they are discarded.
X_train, _, X_test, y_train, _, y_test = train_val_test_split(
    df,
    y,
    val_size=0,
    test_size=0.3,
)

In [10]:
def train_eval_classifier(X_train, X_test, y_train, y_test, random_state=42):
    """Grid-search a random forest on the training data and return the fitted search.

    Runs a 3-fold stratified cross-validated grid search (scored by F1) over a
    ``RandomForestClassifier`` and refits the best configuration on all of the
    training data.

    Args:
        X_train: Training features.
        X_test: Unused; accepted only to keep the call signature stable.
        y_train: Training labels.
        y_test: Unused; accepted only to keep the call signature stable.
        random_state: Seed passed to the random forest so repeated runs give
            the same model.  Pass ``None`` to get unseeded behavior.

    Returns:
        The fitted ``GridSearchCV`` object.  No evaluation on the test data is
        performed here; call ``predict`` on the returned object for that.
    """
    # Single-point grid for now; add more values to actually tune.
    params = dict(
        max_depth=[30],
        n_estimators=[100],
    )

    score = 'f1'

    # NOTE(review): `StratifiedKFold(y, n_folds=...)` and `sklearn.grid_search`
    # are the legacy scikit-learn API (removed in 0.20).  Migrating to
    # `sklearn.model_selection` also requires changing the import cell.
    clf = GridSearchCV(
        RandomForestClassifier(random_state=random_state),  # seeded for reproducibility
        param_grid=params,  # parameters to tune via cross validation
        refit=True,  # fit using all data, on the best detected classifier
        n_jobs=-1,  # number of cores to use for parallelization; -1 for "all cores"
        scoring=score,  # what score are we optimizing?
        cv=StratifiedKFold(y_train, n_folds=3),  # what type of cross validation to use
    )

    clf.fit(X_train, y_train)

    return clf


In [11]:
# Fit the cross-validated random-forest search on the training rows.
clf = train_eval_classifier(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

In [12]:
# Compare predictions on the held-out rows against their true labels.
y_true = y_test
y_pred = clf.predict(X_test)
report = classification_report(y_true, y_pred)
print(report)


             precision    recall  f1-score   support

      False       0.94      0.98      0.96    117547
       True       0.92      0.81      0.86     40131

avg / total       0.93      0.93      0.93    157678

