Based on the analysis made here by Sergio Rossell, we will try to train some models that predict traffic jams during weekdays.

https://github.com/pablocelayes/transportlab16/blob/master/jamsOnDaysAndHours.png

We will experiment with 5, 15 or 30 minutes of anticipation. For now we will focus on data from Q2; later we will see how these results extend to other quarters.

In [24]:
%matplotlib inline

from sklearn.cross_validation import StratifiedKFold
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from datetime import datetime

In [100]:
def create_dataset(mins_diff):
    """Build features and jam labels for predicting a jam ``mins_diff`` rows ahead.

    Reads ``a13/Rechts_Flow_2011.csv`` and ``a13/Rechts_Speed_2011.csv``,
    indexes both by a one-minute timestamp grid covering 2011, keeps the
    weekday rows from March through June, and pairs every feature row with
    the jam label found ``mins_diff`` rows later.

    Args:
        mins_diff: Non-negative anticipation, in rows (one row per minute).

    Returns:
        A ``(df, y)`` tuple. ``df`` is the feature DataFrame (flow columns
        followed by speed columns for the neighbourhood hectometers). ``y``
        is a boolean NumPy array of the same length, True where the mean
        speed over the target hectometers is below the jam threshold.

    Raises:
        ValueError: If ``mins_diff`` is negative.
    """
    if mins_diff < 0:
        raise ValueError("mins_diff must be non-negative, got %r" % (mins_diff,))

    # Hectometers between 10.087 and 12.110, as used in Sergio's analysis
    # (a previous analysis used range(53, 70)).
    n470_hects = range(47, 68)
    # NOTE(review): range() excludes its stop, so this window reaches 15
    # columns below the first target column but only 14 above the last one.
    # Confirm whether the asymmetry is intended.
    ngbr_hects = range(n470_hects[0] - 15, n470_hects[-1] + 15)
    jam_threshold = 30
    # 'min' is the non-deprecated spelling of the minutely frequency alias.
    times = pd.date_range(
        start=datetime(2011, 1, 1),
        end=datetime(2011, 12, 31, 23, 59),
        freq='min',
    )

    rf = pd.read_csv('a13/Rechts_Flow_2011.csv', header=None)
    rs = pd.read_csv('a13/Rechts_Speed_2011.csv', header=None)
    df = pd.concat([rf.iloc[:, ngbr_hects], rs.iloc[:, ngbr_hects]], axis=1)

    rs.index = times
    df.index = times

    # Keep only weekdays from March through June.
    # NOTE(review): months 3-6 are one month wider than calendar Q2
    # (April-June); confirm which period is intended.
    keep = (times.dayofweek < 5) & (times.month >= 3) & (times.month <= 6)

    inters_means = rs.iloc[:, n470_hects][keep].mean(axis=1)
    df = df[keep]

    # Pair row i of the features with the label of row i + mins_diff.
    # Slicing by explicit length (instead of ``[:-mins_diff]``) keeps every
    # row when mins_diff == 0 rather than returning an empty frame.
    # NOTE(review): the shift is positional on the already-filtered rows, so
    # the last mins_diff rows before a weekend gap are paired with labels
    # from the first rows after it.
    n_rows = max(len(df) - mins_diff, 0)
    df = df.iloc[:n_rows, :]
    y = (inters_means < jam_threshold).iloc[mins_diff:]

    return df, y.values


def train_test_split(df, y, test_size=0.3):
    """Split features and labels by row order into a train and a test part.

    The last ``int(len(df) * test_size)`` rows form the test set and all
    preceding rows form the training set. Rows are not shuffled. The input
    DataFrame is left unmodified.

    Args:
        df: Feature DataFrame.
        y: Labels, one per row of ``df``.
        test_size: Fraction of rows (taken from the end) used for testing.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` where the ``X_*`` values are
        DataFrames and the ``y_*`` values are NumPy arrays.
    """
    n_rows = df.shape[0]
    cut = int(n_rows * test_size)
    # Split on a non-negative position: with cut == 0, ``iloc[:-cut]`` would
    # yield an empty training set and ``iloc[-cut:]`` the whole frame.
    split = n_rows - cut

    # Align the labels with the rows without adding a column to the caller's
    # DataFrame.
    labels = pd.Series(y, index=df.index).values

    X_train = df.iloc[:split, :]
    X_test = df.iloc[split:, :]
    y_train = labels[:split]
    y_test = labels[split:]

    return X_train, X_test, y_train, y_test


We now use the GridSearchCV class from sklearn to find the best combination of hyper-parameters for our problem, scoring each candidate combination with stratified 3-fold cross-validation.

In [102]:
# Hyper-parameter grid: every combination below is evaluated by the search.
params = {
    "max_depth": [5, 10, None],
    "max_features": [25, 50],
    "class_weight": ["balanced", "balanced_subsample"],
    "min_samples_split": [10, 20],
    "min_samples_leaf": [3],
    "criterion": ["gini", "entropy"],
    "n_estimators": [10, 20],
}

for mins in [5, 15, 30]:
    print("Searching best model for %d mins of anticipation" % mins)
    df, y = create_dataset(mins)
    X_train, X_test, y_train, y_test = train_test_split(df, y)

    # Stratified 3-fold splitter built from the training labels.
    cv_folds = StratifiedKFold(y_train, n_folds=3)
    clf = GridSearchCV(
        RandomForestClassifier(),
        param_grid=params,
        scoring='f1',  # metric the search optimizes
        cv=cv_folds,
        n_jobs=-1,  # -1 means "use all cores"
        refit=True,  # refit the best candidate on all the training data
    )
    clf.fit(X_train, y_train)

    print(clf.best_params_)

    # Report the refitted best model on the held-out rows.
    y_true = y_test
    y_pred = clf.predict(X_test)
    print(classification_report(y_true, y_pred))

Searching best model for 5 mins of anticipation
{'criterion': 'entropy', 'max_depth': None, 'class_weight': 'balanced', 'min_samples_leaf': 3, 'max_features': 25, 'min_samples_split': 10, 'n_estimators': 20}
             precision    recall  f1-score   support

      False       1.00      0.99      0.99     36876
       True       0.82      0.86      0.84      1138

avg / total       0.99      0.99      0.99     38014

Searching best model for 15 mins of anticipation
{'criterion': 'gini', 'max_depth': None, 'class_weight': 'balanced_subsample', 'min_samples_leaf': 3, 'max_features': 25, 'min_samples_split': 20, 'n_estimators': 10}
             precision    recall  f1-score   support

      False       0.98      0.98      0.98     36873
       True       0.37      0.38      0.37      1138

avg / total       0.96      0.96      0.96     38011

Searching best model for 30 mins of anticipation
{'criterion': 'gini', 'max_depth': 10, 'class_weight': 'balanced_subsample', 'min_samples_leaf': 

Performance is very good for 5 minutes of anticipation (F1 of 0.84 on the jam class), but it degrades sharply at 15 minutes (F1 of 0.37).
Let's see how far we can extend the anticipation before performance starts to degrade.

In [None]:
# Same hyper-parameter grid as before, searched for each anticipation value.
params = {
    "max_depth": [5, 10, None],
    "max_features": [25, 50],
    "class_weight": ["balanced", "balanced_subsample"],
    "min_samples_split": [10, 20],
    "min_samples_leaf": [3],
    "criterion": ["gini", "entropy"],
    "n_estimators": [10, 20],
}

# Anticipation values from 6 to 14 minutes (the range stop is exclusive).
for mins in range(6, 15):
    print("Searching best model for %d mins of anticipation" % mins)
    df, y = create_dataset(mins)
    X_train, X_test, y_train, y_test = train_test_split(df, y)

    # Stratified 3-fold splitter built from the training labels.
    cv_folds = StratifiedKFold(y_train, n_folds=3)
    clf = GridSearchCV(
        RandomForestClassifier(),
        param_grid=params,
        scoring='f1',  # metric the search optimizes
        cv=cv_folds,
        n_jobs=-1,  # -1 means "use all cores"
        refit=True,  # refit the best candidate on all the training data
    )
    clf.fit(X_train, y_train)

    print(clf.best_params_)

    # Report the refitted best model on the held-out rows.
    y_true = y_test
    y_pred = clf.predict(X_test)
    print(classification_report(y_true, y_pred))

Searching best model for 6 mins of anticipation
{'criterion': 'entropy', 'max_depth': None, 'class_weight': 'balanced_subsample', 'min_samples_leaf': 3, 'max_features': 25, 'min_samples_split': 20, 'n_estimators': 10}
             precision    recall  f1-score   support

      False       1.00      0.99      0.99     36876
       True       0.76      0.85      0.80      1138

avg / total       0.99      0.99      0.99     38014

Searching best model for 7 mins of anticipation
{'criterion': 'entropy', 'max_depth': None, 'class_weight': 'balanced', 'min_samples_leaf': 3, 'max_features': 25, 'min_samples_split': 10, 'n_estimators': 20}
             precision    recall  f1-score   support

      False       0.99      0.99      0.99     36875
       True       0.77      0.79      0.78      1138

avg / total       0.99      0.99      0.99     38013

Searching best model for 8 mins of anticipation
{'criterion': 'entropy', 'max_depth': None, 'class_weight': 'balanced_subsample', 'min_samples_l