## Imports

In [31]:
import data_functions as da
import ml_functions as ml

In [32]:
# from simpleloop
from __future__ import division
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import pylab as pl
from datetime import timedelta
import random
from scipy import optimize
import time
import seaborn as sns
from sklearn.metrics import roc_curve, auc, classification_report, confusion_matrix
from sklearn import preprocessing, cross_validation, svm, metrics, tree, decomposition, svm
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier, OrthogonalMatchingPursuit, RandomizedLogisticRegression
from sklearn.neighbors.nearest_centroid import NearestCentroid
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import ParameterGrid
from sklearn.metrics import *
from sklearn.preprocessing import StandardScaler
import itertools
from datetime import datetime

## Functions

#### See ml_functions.py for additional helper functions

In [5]:
# Magicloop code adapted from Rayid Ghani: https://github.com/rayidghani/magicloops/blob/master/magicloops.py
def LR():
    return LogisticRegression(penalty = 'l1', C = 1e5)

def KNN():
    return KNeighborsClassifier(n_neighbors = 3)

def DT():
    return DecisionTreeClassifier()

def SVM():
    return svm.SVC(kernel = 'linear', probability = True, random_state = 3)

def RF():
    return RandomForestClassifier(n_estimators = 50, n_jobs = -1)

def AB():
    return AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                                                    algorithm="SAMME",
                                                    n_estimators=200)
def GB():
    return GradientBoostingClassifier(learning_rate = 0.05,
                                    	subsample = 0.5,
                                    	max_depth = 6,
                                    	n_estimators = 10)
def NB():
    return GaussianNB()

In [34]:
all_models = models_to_run=['RF','DT','KNN', 'ET', 'AB', 'GB', 'LR', 'NB']

def classifier_loop(df, features, start, grid_size='test', models_to_run = all_models):
    
    # define grid to use: test, small, large
    clfs, grid = define_clfs_params(grid_size)
    
    X_test, X_train, y_test, y_train = train_test_over_time(df, features, start=start)
    # call clf_loop and store results in results_df
    results_df = ml.my_loop(models_to_run, clfs,grid, X_test, X_train, y_test, y_train)
    
    return results_df

In [36]:
def define_clfs_params(grid_size):
    """Define defaults for different classifiers.
    Define three types of grids:
    Test: for testing your code
    Small: small grid
    Large: Larger grid that has a lot more parameter sweeps
    """

    clfs = {'RF': RandomForestClassifier(n_estimators=50, n_jobs=-1),
        'ET': ExtraTreesClassifier(n_estimators=10, n_jobs=-1, criterion='entropy'),
        'AB': AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME", n_estimators=200),
        'LR': LogisticRegression(penalty='l1', C=1e5),
        'SVM': svm.SVC(kernel='linear', probability=True, random_state=0),
        'GB': GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=10),
        'NB': GaussianNB(),
        'DT': DecisionTreeClassifier(),
        'SGD': SGDClassifier(loss="hinge", penalty="l2"),
        'KNN': KNeighborsClassifier(n_neighbors=3) 
            }

    large_grid = { 
    'RF':{'n_estimators': [1,10,100,1000,10000], 'max_depth': [1,5,10,20,50,100], 'max_features': ['sqrt','log2'],'min_samples_split': [2,5,10], 'n_jobs': [-1]},
    'LR': { 'penalty': ['l1','l2'], 'C': [0.00001,0.0001,0.001,0.01,0.1,1,10]},
    'SGD': { 'loss': ['hinge','log','perceptron'], 'penalty': ['l2','l1','elasticnet']},
    'ET': { 'n_estimators': [1,10,100,1000,10000], 'criterion' : ['gini', 'entropy'] ,'max_depth': [1,5,10,20,50,100], 'max_features': ['sqrt','log2'],'min_samples_split': [2,5,10], 'n_jobs': [-1]},
    'AB': { 'algorithm': ['SAMME', 'SAMME.R'], 'n_estimators': [1,10,100,1000,10000]},
    'GB': {'n_estimators': [1,10,100,1000,10000], 'learning_rate' : [0.001,0.01,0.05,0.1,0.5],'subsample' : [0.1,0.5,1.0], 'max_depth': [1,3,5,10,20,50,100]},
    'NB' : {},
    'DT': {'criterion': ['gini', 'entropy'], 'max_depth': [1,5,10,20,50,100],'min_samples_split': [2,5,10]},
    'SVM' :{'C' :[0.00001,0.0001,0.001,0.01,0.1,1,10],'kernel':['linear']},
    'KNN' :{'n_neighbors': [1,5,10,25,50,100],'weights': ['uniform','distance'],'algorithm': ['auto','ball_tree','kd_tree']}
           }
    
    small_grid = { 
    'RF':{'n_estimators': [10,100], 'max_depth': [5,50], 'max_features': ['sqrt','log2'],'min_samples_split': [2,10], 'n_jobs': [-1]},
    'LR': { 'penalty': ['l1','l2'], 'C': [0.00001,0.001,0.1,1,10]},
    'SGD': { 'loss': ['hinge','log','perceptron'], 'penalty': ['l2','l1','elasticnet']},
    'ET': { 'n_estimators': [10,100], 'criterion' : ['gini', 'entropy'] ,'max_depth': [5,50], 'max_features': ['sqrt','log2'],'min_samples_split': [2,10], 'n_jobs': [-1]},
    'AB': { 'algorithm': ['SAMME', 'SAMME.R'], 'n_estimators': [1,10,100,1000,10000]},
    'GB': {'n_estimators': [10,100], 'learning_rate' : [0.001,0.1,0.5],'subsample' : [0.1,0.5,1.0], 'max_depth': [5,50]},
    'NB' : {},
    'DT': {'criterion': ['gini', 'entropy'], 'max_depth': [1,5,10,20,50,100],'min_samples_split': [2,5,10]},
    'SVM' :{'C' :[0.00001,0.0001,0.001,0.01,0.1,1,10],'kernel':['linear']},
    'KNN' :{'n_neighbors': [1,5,10,25,50,100],'weights': ['uniform','distance'],'algorithm': ['auto','ball_tree','kd_tree']}
           }
    
    test_grid = { 
    'RF':{'n_estimators': [1], 'max_depth': [1], 'max_features': ['sqrt'],'min_samples_split': [10]},
    'LR': { 'penalty': ['l1'], 'C': [0.01]},
    'SGD': { 'loss': ['perceptron'], 'penalty': ['l2']},
    'ET': { 'n_estimators': [1], 'criterion' : ['gini'] ,'max_depth': [1], 'max_features': ['sqrt'],'min_samples_split': [10]},
    'AB': { 'algorithm': ['SAMME'], 'n_estimators': [1]},
    'GB': {'n_estimators': [1], 'learning_rate' : [0.1],'subsample' : [0.5], 'max_depth': [1]},
    'NB' : {},
    'DT': {'criterion': ['gini'], 'max_depth': [1],'min_samples_split': [10]},
    'SVM' :{'C' :[0.01],'kernel':['linear']},
    'KNN' :{'n_neighbors': [5],'weights': ['uniform'],'algorithm': ['auto']}
           }
    
    if (grid_size == 'large'):
        return clfs, large_grid
    elif (grid_size == 'small'):
        return clfs, small_grid
    elif (grid_size == 'test'):
        return clfs, test_grid
    else:
        return 0, 0

In [37]:
def train_test_over_time(df, features, target='fully_funded', start='Jan 2011'):
    dates = {}
    dates['2011-01-01'] = ['2011-12-31', '2012-06-31']
    dates['2011-07-01'] = ['2012-07-31', '2012-12-31']
    dates['2012-01-01'] = ['2012-12-31', '2013-06-31']

    if start == 'Jan 2011':
        start = '2011-01-01'
        end_train = dates[start][0]
        end_test = dates[start][1]
    elif start == 'Jul 2011':
        start = '2011-07-01'
        end_train = dates[start][0]
        end_test = dates[start][1]
    elif start == 'Jan 2012':
        start = '2012-01-01'
        end_train = dates[start][0]
        end_test = dates[start][1]
    
        
    x_test = da.specify_range(df, 'date_posted', start, end_train)
    x_train = da.specify_range(df, 'date_posted', end_train, end_test)
    x_test, x_train = x_test[features], x_train[features]

    y_test = da.specify_range(df[['date_posted', target]], 'date_posted', start, end_train)
    y_train = da.specify_range(df[['date_posted', target]], 'date_posted', end_train, end_test)
    y_test, y_train = y_test[target], y_train[target]
                         
    return x_test, x_train, y_test, y_train

## Formatting for classification

In [215]:
# to load
all_projects = da.read_data('projects.csv')
outcomes = da.read_data('outcomes.csv')

projects = da.specify_range(all_projects, 'date_posted', '2011-01-01', '2013-12-31')
combined = pd.merge(projects, outcomes, on='projectid')
funded = combined.loc[combined.fully_funded=='t']
unfunded = combined.loc[combined.fully_funded=='f']

In [216]:
da.replace_na(combined, 'students_reached')
inf = ['total_price_excluding_optional_support', 'total_price_including_optional_support', 'students_reached']
for i in inf:
    combined = da.remove_outliers(combined, i)

In [217]:
projects.columns

Index(['projectid', 'teacher_acctid', 'schoolid', 'school_ncesid',
       'school_latitude', 'school_longitude', 'school_city', 'school_state',
       'school_zip', 'school_metro', 'school_district', 'school_county',
       'school_charter', 'school_magnet', 'school_year_round', 'school_nlns',
       'school_kipp', 'school_charter_ready_promise', 'teacher_prefix',
       'teacher_teach_for_america', 'teacher_ny_teaching_fellow',
       'primary_focus_subject', 'primary_focus_area',
       'secondary_focus_subject', 'secondary_focus_area', 'resource_type',
       'poverty_level', 'grade_level', 'fulfillment_labor_materials',
       'total_price_excluding_optional_support',
       'total_price_including_optional_support', 'students_reached',
       'eligible_double_your_impact_match', 'eligible_almost_home_match',
       'date_posted'],
      dtype='object')

In [218]:
classify = combined[['projectid', 'teacher_acctid', 'schoolid', 'school_ncesid',
       'school_latitude', 'school_longitude', 'school_city', 'school_state',
       'school_zip', 'school_metro', 'school_district', 'school_county',
       'school_charter', 'school_magnet', 'school_year_round', 'school_nlns',
       'school_kipp', 'school_charter_ready_promise', 'teacher_prefix',
       'teacher_teach_for_america', 'teacher_ny_teaching_fellow',
       'primary_focus_subject', 'primary_focus_area',
       'secondary_focus_subject', 'secondary_focus_area', 'resource_type',
       'poverty_level', 'grade_level', 'fulfillment_labor_materials',
       'total_price_excluding_optional_support',
       'total_price_including_optional_support', 'students_reached',
       'eligible_double_your_impact_match', 'eligible_almost_home_match',
       'date_posted','fully_funded']]
# projects + fully funded 

y = combined[['date_posted', 'fully_funded']]

In [219]:
b = ml.find_binary_cols(classify)

In [220]:
#b.remove('eligible_double_your_impact_match')
ml.turn_to_1_0(classify, b)    
classify['eligible_double_your_impact_match'] = classify['eligible_double_your_impact_match'].apply(lambda x: 1 if x=='t' else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  for col in b:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [221]:
cats = [x for x in classify.columns if x not in b]   
cats = ['school_metro', 'teacher_prefix', 'primary_focus_area', 'resource_type', 'poverty_level', 'grade_level']

In [222]:
classify = ml.category_cols(classify, cats)

In [223]:
classify.fulfillment_labor_materials.unique()

array([0])

In [224]:
ml.add_discrete_variable(classify, 'students_reached', 'disc_students_reached', 5)
ml.add_discrete_variable(classify, 'total_price_excluding_optional_support', 'disc_price_excluding', 10)
ml.add_discrete_variable(classify, 'total_price_including_optional_support', 'disc_price_including', 10)
classify.columns

Index(['projectid', 'teacher_acctid', 'schoolid', 'school_ncesid',
       'school_latitude', 'school_longitude', 'school_city', 'school_state',
       'school_zip', 'school_district', 'school_county', 'school_charter',
       'school_magnet', 'school_year_round', 'school_nlns', 'school_kipp',
       'school_charter_ready_promise', 'teacher_teach_for_america',
       'teacher_ny_teaching_fellow', 'primary_focus_subject',
       'secondary_focus_subject', 'secondary_focus_area',
       'fulfillment_labor_materials', 'total_price_excluding_optional_support',
       'total_price_including_optional_support', 'students_reached',
       'eligible_double_your_impact_match', 'eligible_almost_home_match',
       'date_posted', 'fully_funded', 'rural', 'suburban', 'urban', 'Dr.',
       'Mr.', 'Mrs.', 'Ms.', 'Applied Learning', 'Health & Sports',
       'History & Civics', 'Literacy & Language', 'Math & Science',
       'Music & The Arts', 'Special Needs', 'Books', 'Other', 'Supplies',
       'Te

In [227]:
features = [ 'school_charter',
       'school_magnet', 'school_year_round', 'school_nlns', 'school_kipp',
       'school_charter_ready_promise', 'teacher_teach_for_america',
       'teacher_ny_teaching_fellow', 
       'fulfillment_labor_materials', 'total_price_excluding_optional_support',
       'total_price_including_optional_support', 'students_reached',
       'eligible_double_your_impact_match', 'eligible_almost_home_match',
         'rural', 'suburban', 'urban', 'Dr.',
       'Mr.', 'Mrs.', 'Ms.', 'Applied Learning', 'Health & Sports',
       'History & Civics', 'Literacy & Language', 'Math & Science',
       'Music & The Arts', 'Special Needs', 'Books', 'Other', 'Supplies',
       'Technology', 'Trips', 'Visitors', 'high poverty', 'highest poverty',
       'low poverty', 'moderate poverty', 'Grades 3-5', 'Grades 6-8',
       'Grades 9-12', 'Grades PreK-2']

### Dates Used

#### dates[start] = [end_train, end_test]
#### dates['2011-01-01'] = ['2011-12-31', '2012-06-31'] - Jan 2011
#### dates['2011-07-01'] = ['2012-07-31', '2012-12-31'] - Jul 2011
#### dates['2012-01-01'] = ['2012-12-31', '2013-06-31'] - Jan 2012

In [228]:
classifier_loop(classify, features, 'Jan 2011')

RF
DT
KNN
ET
AB
GB
LR
NB


Unnamed: 0,model_type,clf,parameters,auc-roc,baseline,p_at_1,p_at_2,p_at_5,p_at_10,p_at_20,p_at_30,p_at_50,r_at_1,r_at_2,r_at_5,r_at_10,r_at_20,r_at_30,r_at_50
0,RF,"(DecisionTreeClassifier(class_weight=None, cri...","{'max_depth': 1, 'max_features': 'sqrt', 'min_...",0.509881,0.692669,1.0,0.603811,0.833066,0.91655,0.958279,0.972185,0.983311,0.014421,0.017433,0.060116,0.132308,0.276692,0.421047,0.709785
1,DT,"DecisionTreeClassifier(class_weight=None, crit...","{'criterion': 'gini', 'max_depth': 1, 'min_sam...",0.588031,0.692669,1.0,1.0,1.0,0.905115,0.631331,0.754212,0.852529,0.014421,0.028871,0.072163,0.130658,0.182289,0.326644,0.615382
2,KNN,"KNeighborsClassifier(algorithm='auto', leaf_si...","{'algorithm': 'auto', 'n_neighbors': 5, 'weigh...",0.562579,0.692669,1.0,1.0,1.0,0.53681,0.752683,0.767585,0.768324,0.014421,0.028871,0.072163,0.077491,0.217328,0.332436,0.5546
3,ET,"(ExtraTreeClassifier(class_weight=None, criter...","{'criterion': 'gini', 'max_depth': 1, 'max_fea...",0.524557,0.692669,0.997992,0.998997,0.999599,0.999799,0.667235,0.667358,0.800377,0.014392,0.028842,0.072134,0.144326,0.192656,0.289028,0.577737
4,AB,"(DecisionTreeClassifier(class_weight=None, cri...","{'algorithm': 'SAMME', 'n_estimators': 1}",0.588031,0.692669,1.0,1.0,1.0,0.905115,0.631331,0.754212,0.852529,0.014421,0.028871,0.072163,0.130658,0.182289,0.326644,0.615382
5,GB,([DecisionTreeRegressor(criterion='friedman_ms...,"{'learning_rate': 0.1, 'max_depth': 1, 'n_esti...",0.588031,0.692669,1.0,1.0,1.0,0.905115,0.631331,0.754212,0.852529,0.014421,0.028871,0.072163,0.130658,0.182289,0.326644,0.615382
6,LR,"LogisticRegression(C=0.01, class_weight=None, ...","{'C': 0.01, 'penalty': 'l1'}",0.645114,0.692669,0.835341,0.819458,0.829454,0.854764,0.846054,0.826758,0.782766,0.012046,0.023659,0.059856,0.123389,0.244288,0.358063,0.565025
7,NB,GaussianNB(priors=None),{},0.612108,0.692669,0.841365,0.820461,0.808587,0.821264,0.788587,0.77835,0.763229,0.012133,0.023687,0.05835,0.118553,0.227695,0.337098,0.550922


In [38]:
classifier_loop(classify, features, 'Jul 2011')

RF
DT
KNN
ET
AB
GB
LR
NB


Unnamed: 0,model_type,clf,parameters,auc-roc,baseline,p_at_1,p_at_2,p_at_5,p_at_10,p_at_20,p_at_30,p_at_50,r_at_1,r_at_2,r_at_5,r_at_10,r_at_20,r_at_30,r_at_50
0,RF,"(DecisionTreeClassifier(class_weight=None, cri...","{'max_depth': 1, 'max_features': 'sqrt', 'min_...",0.501993,0.746965,0.92555,0.962775,0.98512,0.99256,0.99628,0.99752,0.998478,0.012381,0.025758,0.065935,0.132866,0.26675,0.400611,0.668357
1,DT,"DecisionTreeClassifier(class_weight=None, crit...","{'criterion': 'gini', 'max_depth': 1, 'min_sam...",0.602844,0.746965,1.0,1.0,1.0,0.852215,0.763294,0.842191,0.905285,0.013377,0.026754,0.066931,0.114079,0.204368,0.33823,0.605976
2,KNN,"KNeighborsClassifier(algorithm='auto', leaf_si...","{'algorithm': 'auto', 'n_neighbors': 5, 'weigh...",0.580896,0.746965,1.0,1.0,1.0,0.669598,0.816806,0.823311,0.843404,0.013377,0.026754,0.066931,0.089633,0.218696,0.330647,0.564554
3,ET,"(ExtraTreeClassifier(class_weight=None, criter...","{'criterion': 'gini', 'max_depth': 1, 'max_fea...",0.50814,0.746965,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.013377,0.026754,0.066931,0.133861,0.267746,0.401607,0.669375
4,AB,"(DecisionTreeClassifier(class_weight=None, cri...","{'algorithm': 'SAMME', 'n_estimators': 1}",0.602844,0.746965,1.0,1.0,1.0,0.852215,0.763294,0.842191,0.905285,0.013377,0.026754,0.066931,0.114079,0.204368,0.33823,0.605976
5,GB,([DecisionTreeRegressor(criterion='friedman_ms...,"{'learning_rate': 0.1, 'max_depth': 1, 'n_esti...",0.602959,0.746965,1.0,1.0,1.0,0.853061,0.763125,0.842079,0.905218,0.013377,0.026754,0.066931,0.114192,0.204323,0.338185,0.60593
6,LR,"LogisticRegression(C=0.01, class_weight=None, ...","{'C': 0.01, 'penalty': 'l1'}",0.653625,0.746965,0.978003,0.962775,0.943862,0.922895,0.892721,0.870428,0.830724,0.013083,0.025758,0.063173,0.12354,0.239022,0.34957,0.556066
7,NB,GaussianNB(priors=None),{},0.605342,0.746965,0.878173,0.86802,0.876902,0.863037,0.837433,0.824438,0.80729,0.011747,0.023223,0.058692,0.115527,0.224219,0.3311,0.54038


In [39]:
classifier_loop(classify, features, 'Jan 2012')

RF
DT
KNN
ET
AB
GB
LR
NB


Unnamed: 0,model_type,clf,parameters,auc-roc,baseline,p_at_1,p_at_2,p_at_5,p_at_10,p_at_20,p_at_30,p_at_50,r_at_1,r_at_2,r_at_5,r_at_10,r_at_20,r_at_30,r_at_50
0,RF,"(DecisionTreeClassifier(class_weight=None, cri...","{'max_depth': 1, 'max_features': 'sqrt', 'min_...",0.503013,0.684614,0.849885,0.450346,0.78024,0.890145,0.945073,0.963382,0.978029,0.012405,0.013147,0.056969,0.130019,0.276083,0.422147,0.714276
1,DT,"DecisionTreeClassifier(class_weight=None, crit...","{'criterion': 'gini', 'max_depth': 1, 'min_sam...",0.60931,0.684614,1.0,1.0,1.0,0.901685,0.667667,0.778444,0.867067,0.014596,0.029193,0.073015,0.131704,0.195045,0.341109,0.633238
2,KNN,"KNeighborsClassifier(algorithm='auto', leaf_si...","{'algorithm': 'auto', 'n_neighbors': 5, 'weigh...",0.587878,0.684614,1.0,1.0,1.0,0.625894,0.737711,0.82514,0.833695,0.014596,0.029193,0.073015,0.091421,0.215506,0.361571,0.608866
3,ET,"(ExtraTreeClassifier(class_weight=None, criter...","{'criterion': 'gini', 'max_depth': 1, 'max_fea...",0.550736,0.684614,0.997691,0.998845,0.999538,0.999769,0.838334,0.892146,0.935241,0.014563,0.029159,0.072982,0.146031,0.244901,0.390932,0.683027
4,AB,"(DecisionTreeClassifier(class_weight=None, cri...","{'algorithm': 'SAMME', 'n_estimators': 1}",0.60931,0.684614,1.0,1.0,1.0,0.901685,0.667667,0.778444,0.867067,0.014596,0.029193,0.073015,0.131704,0.195045,0.341109,0.633238
5,GB,([DecisionTreeRegressor(criterion='friedman_ms...,"{'learning_rate': 0.1, 'max_depth': 1, 'n_esti...",0.609491,0.684614,1.0,1.0,1.0,0.904454,0.666167,0.777444,0.866467,0.014596,0.029193,0.073015,0.132109,0.194606,0.340671,0.6328
6,LR,"LogisticRegression(C=0.01, class_weight=None, ...","{'C': 0.01, 'penalty': 'l1'}",0.632213,0.684614,0.856813,0.808314,0.821791,0.830602,0.827371,0.808524,0.767459,0.012506,0.023597,0.060003,0.121321,0.241699,0.35429,0.560492
7,NB,GaussianNB(priors=None),{},0.587471,0.684614,0.801386,0.80485,0.78024,0.775213,0.769675,0.769367,0.742165,0.011697,0.023496,0.056969,0.113231,0.224844,0.337131,0.542019


### Additional

In [None]:
'''
def main(df, features=features, outcome='6M', 
         models='all', grid_size='test', prediction_time='date_posted', 
         outcomes=['6M'], validate_end_dates=['2013-05-31']):
    
    validation_dates = []
    for date in validate_end_dates:
        temp = datetime.strptime(date, '%Y-%m-%d')


    # models_to_run=['RF','DT','KNN', 'ET', 'AB', 'GB', 'LR', 'NB']
    if (models == 'all'):
        models_to_run=['RF','LR','DT','ET','AB']
    else:
        models_to_run = []
        models_to_run.append(model)

    clfs, grid = define_clfs_params(grid_size)
    print("defined clfs params")                                                                                                      

    all_predictors=features

    # define dataframe to write results to
    results_df =  pd.DataFrame(columns=('model_type','clf', 'parameters', 'outcome', 'validation_date', 'group',
                                        'train_set_size', 'validation_set_size','predictors',
                                        'baseline','precision_at_5','precision_at_10','precision_at_20','precision_at_30','precision_at_40',
                                        'precision_at_50','recall_at_5','recall_at_10','recall_at_20','recall_at_30','recall_at_40',
                                        'recall_at_50','auc-roc'))

    
    print("made results df")
    # the magic loop starts here
    # we will loop over models, parameters, outcomes, validation_Dates
    # and store several evaluation metrics

    for index,clf in enumerate([clfs[x] for x in models_to_run]):
        parameter_values = grid[models_to_run[index]]
        for p in ParameterGrid(parameter_values):
            for current_outcome in outcomes:
                for predictor in all_predictors:
                    for validation_date in validation_dates:
                        print(p, current_outcome, predictor, validation_date)
                                                                                                           
                        try:
                            print(models_to_run[index])
                            clf.set_params(**p)
                            if (outcome == '6M'):
                                delta = 180
                            else:
                                raise ValueError('value of outcome is unknown')                 
                        
                            train_set = df[df[prediction_time] <= datetime.strptime(validation_date, '%Y-%m-%d') - timedelta(days=delta)]
                            # fill in missing values for train set using just the train set
                            # we'll do it a very naive way here but you should think more carefully about this first
                            train_set.fillna(train_set.mean(), inplace=True)
                            train_set.dropna(axis=1, how='any', inplace=True)
                            
                            validation_set = df[df[prediction_time] > datetime.strptime(validation_date, '%Y-%m-%d') - timedelta(days=0)]
                            # fill in missing values for validation set using all the data
                            # we'll do it a very naive way here but you should think more carefully about this first
                            validation_set.fillna(df.mean(), inplace=True)
                            validation_set.dropna(axis=1, how='any', inplace=True)

                            print(predictor)
                            # get predictors by removing those dropped by dropna
                            predictors_to_use = list(set(predictor).intersection(train_set.columns))

                            model = clf.fit(train_set[predictor], train_set[current_outcome]) 
                            pred_probs = clf.predict_proba(validation_set[predictor])[::,1]
                            print(len(train_set))
                            print(len(validation_set))
                            #pred_probs_sorted, true_outcome_sorted = zip(*sorted(zip(pred_probs, validation_set[current_outcome]), reverse=True))
                            results_df.loc[len(results_df)] = [models_to_run[index],clf, p, current_outcome, validation_date, group,
                                                               len(train_set),len(validation_set), 
                                                               predictor, 
                                                                precision_at_k(validation_set[current_outcome],pred_probs, 100),
                                                                precision_at_k(validation_set[current_outcome],pred_probs, 5),
                                                                precision_at_k(validation_set[current_outcome],pred_probs, 10),
                                                                precision_at_k(validation_set[current_outcome],pred_probs, 20),
                                                                precision_at_k(validation_set[current_outcome],pred_probs, 30),
                                                                precision_at_k(validation_set[current_outcome],pred_probs, 40),
                                                                precision_at_k(validation_set[current_outcome],pred_probs, 50),
                                                                recall_at_k(validation_set[current_outcome],pred_probs, 5),
                                                                recall_at_k(validation_set[current_outcome],pred_probs, 10),
                                                                recall_at_k(validation_set[current_outcome],pred_probs, 20),
                                                                recall_at_k(validation_set[current_outcome],pred_probs, 30),
                                                                recall_at_k(validation_set[current_outcome],pred_probs, 40),
                                                                recall_at_k(validation_set[current_outcome],pred_probs, 50),
                                                                roc_auc_score(validation_set[current_outcome], pred_probs)]

                            # plot precision recall graph
                            # we'll show them here but you can also save them to disk
                            plot_precision_recall_n(validation_set[current_outcome], pred_probs, clf, 'show')
                            # write results to csv as they come in so we always have something to see even if models runs for days
                           
                        except IndexError:
                            continue
    

    return results_df
'''

In [None]:
main(classify, features=features, outcome='6M', models='all', grid_size='test')

In [None]:
# sample config file to run temporal validation

# start time of our data
start_time = '2011-01-01'

#last date of data including labels and outcomes that we have
end_time = '2013-12-31'

#how far out do we want to predict (let's say in months for now)
prediction_windows = [6, 12]
prediction_window = 6

#how often is this prediction being made? every day? every month? once a year?
update_window = 12

from datetime import date, datetime, timedelta
from dateutil.relativedelta import relativedelta

start_dt = datetime.strptime(start_time, '%Y-%m-%d')
end_dt = datetime.strptime(end_time, '%Y-%m-%d')

dates_dict = {}
last_end_test= start_dt + relativedelta(months=+prediction_window) + relativedelta(months=+update_window)
print(last_end_test)
end_test = end_dt - relativedelta(months=+prediction_window)

start = (last_end_test  - relativedelta(months=+update_window)) + relativedelta(days=+2)
#print("start", start)
end_train = end_test - relativedelta(months=+prediction_window)
#print("end_train", end_train)
end_test = end_dt - relativedelta(months=+prediction_window)
#print("end_test",end_test)

dates_dict[start] = [end_train, end_test]
print(dates_dict)