In [1]:
%load_ext autoreload
%autoreload 2
import warnings
warnings.filterwarnings('ignore')
from pipeline import loader as lo
from pipeline import classifier as clas
from pipeline import explorer as ex
from pipeline import preprocessor as pro
from pipeline import _util as ut
from pipeline import evaluator as ev
from pipeline import features_generator as fe
from datetime import timedelta
import pandas as pd

  from numpy.core.umath_tests import inner1d


# Models evaluation ( pipeline skeleton)

## 1. Load Data
#### - Load data from csv file
#### - Check the data type of each column

In [2]:
df = lo.load('projects_2012_2013.csv')

In [3]:
df.dtypes

projectid                                  object
teacher_acctid                             object
schoolid                                   object
school_ncesid                             float64
school_latitude                           float64
school_longitude                          float64
school_city                                object
school_state                               object
school_metro                               object
school_district                            object
school_county                              object
school_charter                             object
school_magnet                              object
teacher_prefix                             object
primary_focus_subject                      object
primary_focus_area                         object
secondary_focus_subject                    object
secondary_focus_area                       object
resource_type                              object
poverty_level                              object


## 2. Explore Data

### show the missing rate of each column

In [4]:
df.isnull().sum(axis=0)/ df.shape[0]

projectid                                 0.000000
teacher_acctid                            0.000000
schoolid                                  0.000000
school_ncesid                             0.073878
school_latitude                           0.000000
school_longitude                          0.000000
school_city                               0.000000
school_state                              0.000000
school_metro                              0.121815
school_district                           0.001376
school_county                             0.000000
school_charter                            0.000000
school_magnet                             0.000000
teacher_prefix                            0.000000
primary_focus_subject                     0.000120
primary_focus_area                        0.000120
secondary_focus_subject                   0.324510
secondary_focus_area                      0.324510
resource_type                             0.000136
poverty_level                  

## 3 label the data's outcome

In [5]:
df['date_posted'] = pd.to_datetime(df['date_posted'])
df['datefullyfunded'] = pd.to_datetime(df['datefullyfunded'])
df['not_funded_in_60_days'] = \
    (df['datefullyfunded'] - df['date_posted'] >= pd.to_timedelta(60, unit='days')).astype('int')

Notice we care about the project that is not fully funded in 60 days, so we label the outcome of not get funded in 60 days as 1 else as 0.

## 4 Run the main function

In [13]:
xs_lst = [item for item in df.columns if item not in {'date_posted',
                                                      'datefullyfunded',
                                                      'not_funded_in_60_days',
                                                      'projectid',
                                                      'teacher_acctid',
                                                      'schoolid',
                                                      'school_ncesid'}]
grid_size = 'test'
clfs, grid = clas.define_clfs_params(grid_size)
models_to_run=['DT','LR','RF','AB', 'GB','ET','BG']
res = main.run_time_validation(models_to_run, clfs, grid, grid_size,
    df, '2012-01-01' ,'2014-1-1' , 6, 6, 'date_posted', xs_lst, ['not_funded_in_60_days'])

2012-01-01 00:00:00 2013-05-02 00:00:00 2013-07-01 00:00:00 2014-01-01 00:00:00


MemoryError: 

## 4 Build all the models

In [59]:
grid_size = 'test'
clfs, grid = clas.define_clfs_params(grid_size)
models_to_run=['RF','DT','KNN', 'ET', 'AB', 'GB', 'LR', 'NB','BG']
res = main.run_time_validation(models_to_run, clfs, grid, grid_size,
    df, '2012-01-01' ,'2014-1-1' , 6, 6, 'date_posted', xs_lst, ['fully_funded_in_60_days'])

2012-01-01 00:00:00 2013-05-02 00:00:00 2013-07-01 00:00:00 2014-01-01 00:00:00
RF
DT
KNN
ET
AB
GB
LR
NB
BG
2012-01-01 00:00:00 2012-11-02 00:00:00 2013-01-01 00:00:00 2013-07-01 00:00:00
RF
DT
KNN
ET
AB
GB
LR
NB
BG
2012-01-01 00:00:00 2012-05-02 00:00:00 2012-07-01 00:00:00 2013-01-01 00:00:00
RF
DT
KNN
ET
AB
GB
LR
NB
BG


In [28]:
res.columns

Index(['train_start', 'train_end', 'test_start', 'test_end', 'model_type',
       'clf', 'parameters', 'auc-roc', 'baseline_at_1', 'baseline_at_2',
       'baseline_at_5', 'baseline_at_10', 'baseline_at_20', 'baseline_at_30',
       'baseline_at_50', 'accuracy_at_1', 'accuracy_at_2', 'accuracy_at_5',
       'accuracy_at_10', 'accuracy_at_20', 'accuracy_at_30', 'accuracy_at_50',
       'precision_at_1', 'precision_at_2', 'precision_at_5', 'precision_at_10',
       'precision_at_20', 'precision_at_30', 'precision_at_50', 'recall_at_1',
       'recall_at_2', 'recall_at_5', 'recall_at_10', 'recall_at_20',
       'recall_at_30', 'recall_at_50'],
      dtype='object')

## 5. Compare all the models

In [48]:
lst = ['auc-roc','accuracy_at_1', 'accuracy_at_2', 'accuracy_at_5',
       'accuracy_at_10', 'accuracy_at_20', 'accuracy_at_30', 'accuracy_at_50',
       'precision_at_1', 'precision_at_2', 'precision_at_5', 'precision_at_10',
       'precision_at_20', 'precision_at_30', 'precision_at_50', 'recall_at_1',
       'recall_at_2', 'recall_at_5', 'recall_at_10', 'recall_at_20',
       'recall_at_30', 'recall_at_50']
res_lst =[]
for item in lst:
    temp = res.sort_values(by = item,ascending = False).head(1)[['model_type','parameters','clf','train_start','train_end',item]]
    temp['metrics']  = item
    temp.rename(columns = {item:'score'},inplace =True)
    temp_1 = temp[['model_type','metrics', 'train_end','score']]
    res_lst.append(temp_1)
best_models = pd.concat(res_lst)

model_type                                                    RF
parameters     {'max_features': 'sqrt', 'min_samples_split': ...
clf            (DecisionTreeClassifier(class_weight=None, cri...
train_start                                  2012-01-01 00:00:00
train_end                                    2013-05-02 00:00:00
score                                                   0.692914
metrics                                                  auc-roc
Name: 0, dtype: object
model_type                                                    RF
parameters     {'max_features': 'sqrt', 'min_samples_split': ...
clf            (DecisionTreeClassifier(class_weight=None, cri...
train_start                                  2012-01-01 00:00:00
train_end                                    2012-05-02 00:00:00
score                                                   0.743598
metrics                                            accuracy_at_1
Name: 0, dtype: object
model_type                                  

model_type                                                    RF
parameters     {'max_features': 'sqrt', 'min_samples_split': ...
clf            (DecisionTreeClassifier(class_weight=None, cri...
train_start                                  2012-01-01 00:00:00
train_end                                    2013-05-02 00:00:00
score                                                   0.327315
metrics                                             recall_at_20
Name: 0, dtype: object
model_type                                                    RF
parameters     {'max_features': 'sqrt', 'min_samples_split': ...
clf            (DecisionTreeClassifier(class_weight=None, cri...
train_start                                  2012-01-01 00:00:00
train_end                                    2013-05-02 00:00:00
score                                                   0.467626
metrics                                             recall_at_30
Name: 0, dtype: object
model_type                                  

In [49]:
best_models

Unnamed: 0,model_type,metrics,train_end,score
0,RF,auc-roc,2013-05-02,0.692914
0,RF,accuracy_at_1,2012-05-02,0.743598
0,RF,accuracy_at_2,2012-05-02,0.742021
0,RF,accuracy_at_5,2012-05-02,0.7375
0,RF,accuracy_at_10,2012-05-02,0.7304
0,RF,accuracy_at_20,2012-05-02,0.706189
0,RF,accuracy_at_30,2013-05-02,0.681572
2,KNN,accuracy_at_50,2012-11-02,0.618372
5,GB,precision_at_1,2012-11-02,0.631336
5,GB,precision_at_2,2012-11-02,0.608295


In [14]:
res.sort_values(by = 'precision_at_5',ascending = False).head(10)[['model_type','parameters','clf','train_start','train_end','precision_at_5']]

Unnamed: 0,model_type,parameters,clf,train_start,train_end,precision_at_5
4,AB,"{'algorithm': 'SAMME', 'n_estimators': 100}","(DecisionTreeClassifier(class_weight=None, cri...",2012-01-01,2012-11-02,0.560369
0,RF,"{'max_features': 'sqrt', 'min_samples_split': ...","(DecisionTreeClassifier(class_weight=None, cri...",2012-01-01,2012-11-02,0.55023
5,GB,"{'learning_rate': 0.1, 'subsample': 0.5, 'max_...",([DecisionTreeRegressor(criterion='friedman_ms...,2012-01-01,2012-11-02,0.548387
6,LR,"{'C': 0.01, 'penalty': 'l1'}","LogisticRegression(C=0.01, class_weight=None, ...",2012-01-01,2012-11-02,0.539171
0,RF,"{'max_features': 'sqrt', 'min_samples_split': ...","(DecisionTreeClassifier(class_weight=None, cri...",2012-01-01,2013-05-02,0.533062
5,GB,"{'learning_rate': 0.1, 'subsample': 0.5, 'max_...",([DecisionTreeRegressor(criterion='friedman_ms...,2012-01-01,2013-05-02,0.52038
3,ET,"{'max_features': 'sqrt', 'min_samples_split': ...","(ExtraTreeClassifier(class_weight=None, criter...",2012-01-01,2012-11-02,0.481106
3,ET,"{'max_features': 'sqrt', 'min_samples_split': ...","(ExtraTreeClassifier(class_weight=None, criter...",2012-01-01,2013-05-02,0.47962
4,AB,"{'algorithm': 'SAMME', 'n_estimators': 100}","(DecisionTreeClassifier(class_weight=None, cri...",2012-01-01,2013-05-02,0.474185
6,LR,"{'C': 0.01, 'penalty': 'l1'}","LogisticRegression(C=0.01, class_weight=None, ...",2012-01-01,2013-05-02,0.47192


In [16]:
res.sort_values(by = 'precision_at_1',ascending = False).head(10)[['model_type','parameters','clf','train_start','train_end','precision_at_1']]

Unnamed: 0,model_type,parameters,clf,train_start,train_end,precision_at_1
0,RF,"{'max_features': 'sqrt', 'min_samples_split': ...","(DecisionTreeClassifier(class_weight=None, cri...",2012-01-01,2012-11-02,0.617512
5,GB,"{'learning_rate': 0.1, 'subsample': 0.5, 'max_...",([DecisionTreeRegressor(criterion='friedman_ms...,2012-01-01,2012-11-02,0.617512
4,AB,"{'algorithm': 'SAMME', 'n_estimators': 100}","(DecisionTreeClassifier(class_weight=None, cri...",2012-01-01,2012-11-02,0.599078
0,RF,"{'max_features': 'sqrt', 'min_samples_split': ...","(DecisionTreeClassifier(class_weight=None, cri...",2012-01-01,2013-05-02,0.589569
3,ET,"{'max_features': 'sqrt', 'min_samples_split': ...","(ExtraTreeClassifier(class_weight=None, criter...",2012-01-01,2012-11-02,0.557604
6,LR,"{'C': 0.01, 'penalty': 'l1'}","LogisticRegression(C=0.01, class_weight=None, ...",2012-01-01,2012-11-02,0.543779
3,ET,"{'max_features': 'sqrt', 'min_samples_split': ...","(ExtraTreeClassifier(class_weight=None, criter...",2012-01-01,2013-05-02,0.53288
5,GB,"{'learning_rate': 0.1, 'subsample': 0.5, 'max_...",([DecisionTreeRegressor(criterion='friedman_ms...,2012-01-01,2013-05-02,0.521542
0,RF,"{'max_features': 'sqrt', 'min_samples_split': ...","(DecisionTreeClassifier(class_weight=None, cri...",2012-01-01,2012-05-02,0.486322
3,ET,"{'max_features': 'sqrt', 'min_samples_split': ...","(ExtraTreeClassifier(class_weight=None, criter...",2012-01-01,2012-05-02,0.480243


In [17]:
res.sort_values(by = 'precision_at_2',ascending = False).head(10)[['model_type','parameters','clf','train_start','train_end','precision_at_2']]

Unnamed: 0,model_type,parameters,clf,train_start,train_end,precision_at_2
5,GB,"{'learning_rate': 0.1, 'subsample': 0.5, 'max_...",([DecisionTreeRegressor(criterion='friedman_ms...,2012-01-01,2012-11-02,0.615207
0,RF,"{'max_features': 'sqrt', 'min_samples_split': ...","(DecisionTreeClassifier(class_weight=None, cri...",2012-01-01,2012-11-02,0.573733
0,RF,"{'max_features': 'sqrt', 'min_samples_split': ...","(DecisionTreeClassifier(class_weight=None, cri...",2012-01-01,2013-05-02,0.571914
6,LR,"{'C': 0.01, 'penalty': 'l1'}","LogisticRegression(C=0.01, class_weight=None, ...",2012-01-01,2012-11-02,0.557604
4,AB,"{'algorithm': 'SAMME', 'n_estimators': 100}","(DecisionTreeClassifier(class_weight=None, cri...",2012-01-01,2012-11-02,0.550691
5,GB,"{'learning_rate': 0.1, 'subsample': 0.5, 'max_...",([DecisionTreeRegressor(criterion='friedman_ms...,2012-01-01,2013-05-02,0.539071
3,ET,"{'max_features': 'sqrt', 'min_samples_split': ...","(ExtraTreeClassifier(class_weight=None, criter...",2012-01-01,2013-05-02,0.516421
3,ET,"{'max_features': 'sqrt', 'min_samples_split': ...","(ExtraTreeClassifier(class_weight=None, criter...",2012-01-01,2012-11-02,0.511521
0,RF,"{'max_features': 'sqrt', 'min_samples_split': ...","(DecisionTreeClassifier(class_weight=None, cri...",2012-01-01,2012-05-02,0.473445
6,LR,"{'C': 0.01, 'penalty': 'l1'}","LogisticRegression(C=0.01, class_weight=None, ...",2012-01-01,2013-05-02,0.458664


In [25]:
from IPython.display import HTML

HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')