# Run Models

Neural networks

## 0.0 Libraries

In [1]:
%load_ext autoreload
%autoreload 2

#system
import os
import sys
from os.path import join as pj
module_path = os.path.abspath(pj('..','..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
# utils
from src.d00_utils import print_helper_functions as phf

# ipython
import warnings
warnings.simplefilter('ignore')

# executing code
import click
import logging

# type annotations
from typing import List, Set, Dict, Tuple, Optional
from collections.abc import Iterable

# configuring
from pathlib import Path
from dotenv import find_dotenv, load_dotenv
import configparser

# data
import numpy as np
import re

# stats
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV

# viz
from matplotlib import pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
sns.set(font_scale=1.5)
plt.style.use('bmh')
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

# file handling
import tempfile
import joblib
import botocore
import boto3
from os.path import join as pj
import pickle as pkl
sys.path.append('../..')

# data extraction/transformation
from src.d01_data.build_features import FeatureExtractorText, FeatureExtractorOHE, FeatureExtractorNumber, CustomImputer
from sklearn.pipeline import Pipeline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 1.0 Load feature union and data

In [2]:
# not used in this stub but often useful for finding various files
project_dir = Path().resolve().parents[1]
print(project_dir)

# find .env automagically by walking up directories until it's found, then
# load up the .env entries as environment variables
_ = load_dotenv(find_dotenv())

raw_dir = pj(project_dir, 'data', os.environ.get('RAW_DIR'))
interim_dir = pj(project_dir, 'data', os.environ.get('INTERIM_DIR'))
processed_dir = pj(project_dir, 'data', os.environ.get('PROCESSED_DIR'))

/Users/marclawson/repositories/grantnav_10k_predictor


In [3]:
# load features
filename = 'feature_union.jlib'
_file = open(pj(processed_dir, filename), 'rb')
feature_union = joblib.load(_file)

In [4]:
data = pd.read_csv(pj(interim_dir,'grantnav_data_post2015.csv'), index_col=0)
data = data[data['amount_awarded']<=10000]
data.head()

Unnamed: 0,identifier,currency,title,description,amount_awarded,award_date,recipient_org_identifier,recipient_org_name,funding_org_identifier,funding_org_name,planned_dates_duration_months,year,days_since_award,funding_org_identifier_revised,funding_org_name_revised
0,360G-cabinetoffice-GA-060628,GBP,Bus Service Operator Grants (outside London) 2...,Placeholder,569.0,2016-04-01,360G-cabinetoffice-DfT_BS82,The Village Bus Committee of Amberley & Slindon,GB-GOR-D9,Department for Transport,12.0,2016,1894,GB-GOR-D9,Department for Transport
1,360G-cabinetoffice-GA-060629,GBP,Bus Service Operator Grants (outside London) 2...,Placeholder,1089.0,2016-04-01,GB-CHC-1111971,Totnes and Rural Community Bus Association,GB-GOR-D9,Department for Transport,12.0,2016,1894,GB-GOR-D9,Department for Transport
8,360G-cabinetoffice-GA-060013,GBP,Bus Service Operator Grants (outside London) 2...,Placeholder,2409.0,2016-04-01,360G-cabinetoffice-GB105582583,BELLE COACHES,GB-GOR-D9,Department for Transport,12.0,2016,1894,GB-GOR-D9,Department for Transport
9,360G-cabinetoffice-GA-059649,GBP,English Community Transport 2016/17,Placeholder,1658.0,2016-04-01,360G-cabinetoffice-GB676859169,BUNGAY AREA COMMUNITY TRANSPORT,GB-GOR-D9,Department for Transport,12.0,2016,1894,GB-GOR-D9,Department for Transport
10,360G-cabinetoffice-GA-059562,GBP,English Community Transport 2016/17,Placeholder,624.0,2016-04-01,360G-cabinetoffice-GB600027706,TRANSPORT 17 LTD,GB-GOR-D9,Department for Transport,12.0,2016,1894,GB-GOR-D9,Department for Transport


## 2.0 Simple Linear Regression

In [5]:
job_params = {
        'cv': 5,                                                       
        'n_jobs': -1,                                         
        'verbose': 10,
        'scoring': 'r2'
}

# set up fixed arguments for gridsearch
fixed_params = {
    'featureunion__pipeline-1__tfidfvectorizer__lowercase': [True],
    'featureunion__pipeline-1__tfidfvectorizer__strip_accents': 'unicode',
    'featureunion__pipeline-1__tfidfvectorizer__stop_words': ['English'],
    'featureunion__pipeline-1__tfidfvectorizer__token_pattern': '\w+',
    'featureunion__pipeline-2__tfidfvectorizer__lowercase': [True],
    'featureunion__pipeline-2__tfidfvectorizer__strip_accents': 'unicode',
    'featureunion__pipeline-2__tfidfvectorizer__stop_words': ['English'],
    'featureunion__pipeline-2__tfidfvectorizer__token_pattern': '\w+',
}
    
param_grid = {
    'featureunion__pipeline-1__tfidfvectorizer__max_features': [5000],
    'featureunion__pipeline-1__tfidfvectorizer__ngram_range': [(1,1)],
    'featureunion__pipeline-2__tfidfvectorizer__max_features': [5000],
    'featureunion__pipeline-2__tfidfvectorizer__ngram_range': [(1,3)],
    'featureunion__pipeline-1__standardscaler__with_mean': [False],
    'featureunion__pipeline-2__standardscaler__with_mean': [False],
    'featureunion__pipeline-3__standardscaler__with_mean': [True],
    'featureunion__pipeline-4__standardscaler__with_mean': [True],
    'featureunion__pipeline-5__standardscaler__with_mean': [False],
    'clf__alpha':  [1e-15, 1e-10, 1e-8, 1e-4, 1e-3,1e-2, 1, 5, 10, 20],
    'clf__max_iter': [10000],
    'clf__penalty': ['l1', 'l2']
}

In [6]:
from sklearn.linear_model import Ridge, Lasso, SGDRegressor

In [7]:
def train_test_setup(data, target, test_size=0.2):
    data_copy = data.copy()
    y = data_copy.pop(target)
    X = data_copy
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=test_size,
                                                    random_state=2
                                                       )
    
    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = train_test_setup(data, 'amount_awarded')

In [8]:
# create model pipeline
pipe = Pipeline([
        ('featureunion', feature_union),
        ('clf', Ridge())
])

In [9]:
funding_diffs = list(set(X_train.funding_org_name_revised.unique()).symmetric_difference(set(X_test.funding_org_name_revised.unique())))
X_train[X_train['funding_org_name_revised'].isin(funding_diffs)]['funding_org_name_revised'].value_counts()

Series([], Name: funding_org_name_revised, dtype: int64)

In [None]:
# set model parameters and fit
pipe.set_params(**fixed_params)
searchcv = GridSearchCV(pipe, param_grid, **job_params)
try:
    if __name__ == '__main__':
        searchcv.fit(X_train, y_train)
except Exception as e:
    raise
    print(e)

Pipeline(steps=[('featureunion',
                 FeatureUnion(transformer_list=[('pipeline-1',
                                                 Pipeline(steps=[('featureextractortext',
                                                                  FeatureExtractorText(columns='description')),
                                                                 ('tfidfvectorizer',
                                                                  TfidfVectorizer(lowercase=[True],
                                                                                  stop_words=['English'],
                                                                                  strip_accents='unicode',
                                                                                  token_pattern='\\w+')),
                                                                 ('standardscaler',
                                                                  StandardScaler())])),
                          

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [None]:
best_model = searchcv.best_estimator_.named_steps['clf']

In [None]:
predicted = searchcv.predict(X_test)
observed = y_test.values

In [None]:
preds_v_obs = pd.DataFrame([pd.Series(predicted), pd.Series(observed)]).T
preds_v_obs.columns=['predicted', 'observed']
preds_v_obs['residuals'] = preds_v_obs['observed'] - preds_v_obs['predicted']
baseline = preds_v_obs['observed'].mean()
preds_v_obs['baseline_residuals'] = preds_v_obs['observed'] - baseline
print(f"Baseline: {baseline}")

Get baseline stats

In [None]:
print('Baseline:')
print(f"MSE: {preds_v_obs.residuals.map(lambda x: x**2).mean():,.0f}")
print(f"RMSE: {preds_v_obs.residuals.map(lambda x: x**2).mean()**0.5:,.0f}")
print('='*15)
print()
print('Our model:')
print(f"MSE: {preds_v_obs.baseline_residuals.map(lambda x: x**2).mean():,.0f}")
print(f"RMSE: {preds_v_obs.baseline_residuals.map(lambda x: x**2).mean()**0.5:,.0f}")
print('='*15)

In [None]:
from sklearn.metrics import mean_absolute_error

print('Baseline:')
print(f"MAE: {mean_absolute_error(preds_v_obs['observed'], pd.Series([baseline for o in preds_v_obs['observed']])):,.0f}")
print('='*15)
print()
print('Our model:')
print(f"MAE: {mean_absolute_error(preds_v_obs['observed'], preds_v_obs['predicted']):,.0f}")
print('='*15)

In [None]:
preds_v_obs

In [None]:
test_df = X_test[X_test.index.isin(y_test.index)]
test_df['predicted_grant'] = predicted
test_df['observed_grant'] = observed

In [None]:
fig, ax = plt.subplots(figsize=(20, 8))
preds_v_obs.plot(x='predicted', y='observed', kind='scatter', color='b', ax=ax)
ax.plot([preds_v_obs['observed'].min(), preds_v_obs['observed'].max()], [
        preds_v_obs['observed'].min(), preds_v_obs['observed'].max()], lw=2, c='r')
plt.show();

In [None]:
searchcv.best_score_

In [None]:
sorted(searchcv.cv_results_.keys())

In [None]:
pd.DataFrame(searchcv.cv_results_)

In [None]:
searchcv.cv_results_

In [None]:
from statsmodels.graphics.gofplots import qqplot

In [None]:
qqplot(preds_v_obs.residuals, line='s')
plt.show();

## 3.0 Neural Network

In [9]:
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor

In [50]:
def baseline_model():
    # create model
    model = Sequential()
    model.add(Dense(13, input_dim=13, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    # Compile model
    model.compile(loss='Huber', optimizer='adam')
    return model

In [47]:
# create model pipeline
pipe = Pipeline([
        ('featureunion', feature_union),
        ('clf', KerasRegressor(build_fn=baseline_model))
])

In [48]:
job_params = {
        'cv': 5,                                                       
        'n_jobs': -1,                                         
        'verbose': 10,
}

# set up fixed arguments for gridsearch
fixed_params = {
    'featureunion__pipeline-1__tfidfvectorizer__lowercase': [True],
    'featureunion__pipeline-1__tfidfvectorizer__strip_accents': 'unicode',
    'featureunion__pipeline-1__tfidfvectorizer__stop_words': ['English'],
    'featureunion__pipeline-1__tfidfvectorizer__token_pattern': '\w+',
    'featureunion__pipeline-2__tfidfvectorizer__lowercase': [True],
    'featureunion__pipeline-2__tfidfvectorizer__strip_accents': 'unicode',
    'featureunion__pipeline-2__tfidfvectorizer__stop_words': ['English'],
    'featureunion__pipeline-2__tfidfvectorizer__token_pattern': '\w+',
}
    
param_grid = {
    'featureunion__pipeline-1__tfidfvectorizer__max_features': [5000],
    'featureunion__pipeline-1__tfidfvectorizer__ngram_range': [(1,1)],
    'featureunion__pipeline-2__tfidfvectorizer__max_features': [5000],
    'featureunion__pipeline-2__tfidfvectorizer__ngram_range': [(1,3)],
    'featureunion__pipeline-1__standardscaler__with_mean': [False],
    'featureunion__pipeline-2__standardscaler__with_mean': [False],
    'featureunion__pipeline-3__standardscaler__with_mean': [True],
    'featureunion__pipeline-4__standardscaler__with_mean': [True],
    'featureunion__pipeline-5__standardscaler__with_mean': [False],
    'clf__epochs': [10000],
    'clf__batch_size': [5]
}

In [49]:
# set model parameters and fit
pipe.set_params(**fixed_params)
searchcv = GridSearchCV(pipe, param_grid, **job_params)
try:
    if __name__ == '__main__':
        searchcv.fit(X_train, y_train)
except Exception as e:
    raise
    print(e)

Pipeline(steps=[('featureunion',
                 FeatureUnion(transformer_list=[('pipeline-1',
                                                 Pipeline(steps=[('featureextractortext',
                                                                  FeatureExtractorText(columns='description')),
                                                                 ('tfidfvectorizer',
                                                                  TfidfVectorizer(lowercase=[True],
                                                                                  stop_words=['English'],
                                                                                  strip_accents='unicode',
                                                                                  token_pattern='\\w+')),
                                                                 ('standardscaler',
                                                                  StandardScaler())])),
                          

Fitting 5 folds for each of 1 candidates, totalling 5 fits


TypeError: baseline_model() missing 1 required positional argument: 'inputs'

In [None]:
predicted = searchcv.predict(X_test)
observed = y_test.values

In [62]:
preds_v_obs = pd.DataFrame([pd.Series(predicted), pd.Series(observed)]).T
preds_v_obs.columns=['predicted', 'observed']
preds_v_obs['residuals'] = preds_v_obs['observed'] - preds_v_obs['predicted']
baseline = preds_v_obs['observed'].mean()
preds_v_obs['baseline_residuals'] = preds_v_obs['observed'] - baseline
print(f"Baseline: {baseline}")

Baseline: 5592.059470117068
