In [395]:
import pandas as pd

In [396]:
data = pd.read_csv('cleaned_data.csv').drop(['Unnamed: 0', 'Unnamed: 0.1'], axis=1)


In [397]:
data0 = data[data['Impressions'] != 0].dropna()
data1 = data[data['Impressions'] == 0]
print('data used to build the model:', len(data0), 'rows')
print('data used to predict:', len(data1), 'rows')

data used to build the model: 55383 rows
data used to predict: 448456 rows


## Data preparation 

In [398]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPRegressor
#from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn import metrics
import datetime

In [399]:
data0.columns

Index(['Access', 'DMA', 'Date Aired', 'Estimate', 'GRP', 'Impressions',
       'Length', 'Market', 'Media', 'Spot Cost', 'Spot Type', 'Station ID',
       'Time Aired', 'DOW', 'daypart'],
      dtype='object')

In [400]:
# identify outlier stations
outlier_stations = pd.read_pickle('Code/outlier_stations.pickle')
Outlier = list(outlier_stations[outlier_stations>=5].index)
Outlier

[216, 184, 352, 311, 200]

In [418]:
feature = ['Access','Estimate', 'Market', 'Length', 'Spot Cost', 'Station ID','DOW', 'Date Aired']
x = data0.loc[:,feature]
x.index = range(len(x))
y = data0.loc[:,'Impressions']

Access = pd.get_dummies(x['Access'])
Estimate = pd.get_dummies(x['Estimate'])
Market = pd.get_dummies(x['Market'])
Station = pd.DataFrame({'Outlier_station': [(i in Outlier)+0 for i in x['Station ID']]})
Month = pd.DataFrame({'Month': [datetime.datetime.strptime(i, "%Y-%m-%d").month for i in x['Date Aired']]})
DOW = pd.get_dummies(x['DOW'])
#daypart = pd.get_dummies(x['daypart'])
Hour = pd.get_dummies([i[0:2] for i in data0['Time Aired']])
x = pd.concat([x, Access, Estimate, Market, Station, DOW, Month, Hour], axis = 1).drop(['Access', 'Estimate', 'Market','Station ID', 'DOW', 'Date Aired', 'Length'], axis=1)



In [419]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=100)

In [420]:
X_train.columns

Index(['Spot Cost', '30BP', '30DP', '30GD', '30PM', '30PT', '30SR', '30TN',
       '30VE', 'SV15', 'Q119', 'Q219', 'Q319', 'Q419',
       'Cable                         ', 'DirecTV                       ',
       'Dish Network                  ', 'National Network              ',
       'Outlier_station', 'Friday', 'Monday', 'Saturday', 'Sunday', 'Thursday',
       'Tuesday', 'Wednesday', 'Month', '00', '01', '02', '03', '04', '05',
       '06', '07', '08', '09', '10', '11', '12', '13', '14', '15', '16', '17',
       '18', '19', '20', '21', '22', '23'],
      dtype='object')

## Basic NN model

In [421]:
mod = MLPRegressor(
    hidden_layer_sizes = (3,),
    activation = 'relu',
    solver="adam",
    alpha=0.0001,
    learning_rate_init=0.01,
    max_iter=1000,
    verbose = True,
    random_state=1234
)
mod.fit(X_train, y_train)

Iteration 1, loss = 833.95553722
Iteration 2, loss = 782.83349743
Iteration 3, loss = 732.64085086
Iteration 4, loss = 683.51103664
Iteration 5, loss = 648.01608067
Iteration 6, loss = 631.74586572
Iteration 7, loss = 628.67985307
Iteration 8, loss = 631.26124732
Iteration 9, loss = 618.95113637
Iteration 10, loss = 617.06907454
Iteration 11, loss = 615.69371873
Iteration 12, loss = 611.76726131
Iteration 13, loss = 611.88080163
Iteration 14, loss = 603.96450894
Iteration 15, loss = 596.68260589
Iteration 16, loss = 602.85113190
Iteration 17, loss = 611.08651832
Iteration 18, loss = 600.23306328
Iteration 19, loss = 619.63909244
Iteration 20, loss = 606.87085844
Iteration 21, loss = 602.75245794
Iteration 22, loss = 606.71791568
Iteration 23, loss = 597.50778019
Iteration 24, loss = 592.30270525
Iteration 25, loss = 592.93345494
Iteration 26, loss = 584.47362844
Iteration 27, loss = 580.58959694
Iteration 28, loss = 610.97181941
Iteration 29, loss = 587.45732945
Iteration 30, loss = 59

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(3,), learning_rate='constant',
             learning_rate_init=0.01, max_fun=15000, max_iter=1000,
             momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
             power_t=0.5, random_state=1234, shuffle=True, solver='adam',
             tol=0.0001, validation_fraction=0.1, verbose=True,
             warm_start=False)

In [422]:
pred_train = mod.predict(X_train)
pred_test = mod.predict(X_test)
print('train set mse:', metrics.mean_squared_error(y_train,pred_train))
print('test set mse:', metrics.mean_squared_error(y_test,pred_test))
print('train set rmse:', (metrics.mean_squared_error(y_train,pred_train))**0.5)
print('test set rmse:', (metrics.mean_squared_error(y_test,pred_test))**0.5)

train set mse: 914.3721116110105
test set mse: 820.1219348716304
train set rmse: 30.238586468467908
test set rmse: 28.63777112262109


## Grid Search

In [423]:
hls = [(2,), (3,), (2, 2), (3, 3), (2,2,2), (3, 3, 3)]

param_grid = {"hidden_layer_sizes": hls, "alpha": [0.0001, 0.001, 0.01]}

mod_cv = GridSearchCV(
    mod, 
    param_grid, 
    scoring= 'neg_root_mean_squared_error', 
    cv=5, n_jobs=4, refit=True, verbose=5
)
#mod_cv.fit(X_train, y_train)

In [424]:
mod_cv.fit(X_train, y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:   19.1s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:  1.5min
[Parallel(n_jobs=4)]: Done  90 out of  90 | elapsed:  2.1min finished


Iteration 1, loss = 833.95553722
Iteration 2, loss = 782.83349743
Iteration 3, loss = 732.64085086
Iteration 4, loss = 683.51103664
Iteration 5, loss = 648.01608067
Iteration 6, loss = 631.74586572
Iteration 7, loss = 628.67985307
Iteration 8, loss = 631.26124732
Iteration 9, loss = 618.95113637
Iteration 10, loss = 617.06907454
Iteration 11, loss = 615.69371873
Iteration 12, loss = 611.76726131
Iteration 13, loss = 611.88080163
Iteration 14, loss = 603.96450894
Iteration 15, loss = 596.68260589
Iteration 16, loss = 602.85113190
Iteration 17, loss = 611.08651832
Iteration 18, loss = 600.23306328
Iteration 19, loss = 619.63909244
Iteration 20, loss = 606.87085844
Iteration 21, loss = 602.75245794
Iteration 22, loss = 606.71791568
Iteration 23, loss = 597.50778019
Iteration 24, loss = 592.30270525
Iteration 25, loss = 592.93345494
Iteration 26, loss = 584.47362844
Iteration 27, loss = 580.58959694
Iteration 28, loss = 610.97181941
Iteration 29, loss = 587.45732945
Iteration 30, loss = 59

GridSearchCV(cv=5, error_score=nan,
             estimator=MLPRegressor(activation='relu', alpha=0.0001,
                                    batch_size='auto', beta_1=0.9, beta_2=0.999,
                                    early_stopping=False, epsilon=1e-08,
                                    hidden_layer_sizes=(3,),
                                    learning_rate='constant',
                                    learning_rate_init=0.01, max_fun=15000,
                                    max_iter=1000, momentum=0.9,
                                    n_iter_no_change=10,
                                    nesterovs_momentum=True, power_t=0.5,
                                    random_state=1234, shuffle=True,
                                    solver='adam', tol=0.0001,
                                    validation_fraction=0.1, verbose=True,
                                    warm_start=False),
             iid='deprecated', n_jobs=4,
             param_grid={'alpha': [0.0001, 

In [425]:
pred_train = mod_cv.best_estimator_.predict(X_train)
pred_test = mod_cv.best_estimator_.predict(X_test)
print('train set mse:', metrics.mean_squared_error(y_train,pred_train))
print('test set mse:', metrics.mean_squared_error(y_test,pred_test))

train set mse: 914.3721116110105
test set mse: 820.1219348716304


In [427]:
print('train set rmse:', (metrics.mean_squared_error(y_train,pred_train))**0.5)
print('test set rmse:', (metrics.mean_squared_error(y_test,pred_test))**0.5)

train set rmse: 30.238586468467908
test set rmse: 28.63777112262109
