In [104]:
import pandas as pd

In [105]:
data = pd.read_csv('cleaned_data.csv').drop(['Unnamed: 0', 'Unnamed: 0.1'], axis=1)


In [106]:
data0 = data[data['GRP'] != 0]
data1 = data[data['GRP'] == 0]
print('data used to build the model:', len(data0), 'rows')
print('data used to predict:', len(data1), 'rows')

data used to build the model: 56574 rows
data used to predict: 626783 rows


## Data preparation 

In [107]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPRegressor
#from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [108]:
data0.columns

Index(['Access', 'DMA', 'Date Aired', 'Estimate', 'GRP', 'Impressions',
       'Length', 'Market', 'Media', 'Spot Cost', 'Spot Type', 'Station ID',
       'Time Aired', 'DOW', 'daypart'],
      dtype='object')

In [109]:
# identify outlier stations
# Outlier = 
outlier_stations = pd.read_pickle('Code/outlier_stations.pickle')
Outlier = list(outlier_stations[outlier_stations>=5].index)
Outlier

[216, 184, 352, 311, 200]

In [110]:
feature = ['Estimate', 'Access', 'Market', 'Length', 'Spot Cost', 'Station ID']
x = data0.loc[:,feature]
x.index = range(len(x))
y = data0.loc[:,'GRP']

Estimate = pd.get_dummies(x['Estimate'])
Access = pd.get_dummies(x['Access'])
Market = pd.get_dummies(x['Market'])
#Station = pd.get_dummies(x['Station ID'])
Station = pd.DataFrame({'Outlier_station': [(i in Outlier) for i in x['Station ID']]})
x = pd.concat([x, Estimate, Access, Market, Station], axis = 1).drop(['Estimate', 'Access', 'Market','Station ID'], axis=1)




In [54]:
feature = ['Estimate', 'Access', 'Market', 'Length', 'Spot Cost', 'Station ID','DOW', 'daypart']
x = data0.loc[:,feature]
y = data0.loc[:,'GRP']

Estimate = pd.get_dummies(x['Estimate'])
Access = pd.get_dummies(x['Access'])
Market = pd.get_dummies(x['Market'])
#Station = pd.get_dummies(x['Station ID'])
Station = pd.DataFrame([(i in Outlier) for i in x['Station ID']])
DOW = pd.get_dummies(x['DOW'])
daypart = pd.get_dummies(x['daypart'])
x = pd.concat([x, Estimate, Access, Market, Station, DOW, daypart], axis = 1).drop(['Estimate', 'Access', 'Market','Station ID', 'DOW', 'daypart'], axis=1)



In [111]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

## Basic NN model

In [112]:
mod = MLPRegressor(
    hidden_layer_sizes = (3,),
    activation = 'relu',
    solver="adam",
    alpha=0.0001,
    learning_rate_init=0.01,
    max_iter=1000,
    verbose = True,
    random_state=1234
)
mod.fit(X_train, y_train)

Iteration 1, loss = 234.73806342
Iteration 2, loss = 0.02359299
Iteration 3, loss = 0.01306919
Iteration 4, loss = 0.00674757
Iteration 5, loss = 0.00354581
Iteration 6, loss = 0.00208857
Iteration 7, loss = 0.00144131
Iteration 8, loss = 0.00115404
Iteration 9, loss = 0.00101963
Iteration 10, loss = 0.00094826
Iteration 11, loss = 0.00090537
Iteration 12, loss = 0.00088063
Iteration 13, loss = 0.00086334
Iteration 14, loss = 0.00084381
Iteration 15, loss = 0.00082894
Iteration 16, loss = 0.00082001
Iteration 17, loss = 0.00081361
Iteration 18, loss = 0.00080763
Iteration 19, loss = 0.00080372
Iteration 20, loss = 0.00080114
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.


MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(3,), learning_rate='constant',
             learning_rate_init=0.01, max_fun=15000, max_iter=1000,
             momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
             power_t=0.5, random_state=1234, shuffle=True, solver='adam',
             tol=0.0001, validation_fraction=0.1, verbose=True,
             warm_start=False)

In [113]:
pred_train = mod.predict(X_train)
pred_test = mod.predict(X_test)
print('train set mse:', metrics.mean_squared_error(y_train,pred_train))
print('test set mse:', metrics.mean_squared_error(y_test,pred_test))

train set mse: 0.0015977625173717084
test set mse: 0.0015969791311892844


## Grid Search

In [114]:
hls = [(2,), (3,), (4,), (2, 2), (3, 3), (4, 4), (2,2,2), (3, 3, 3), (4, 4, 4)]

param_grid = {"hidden_layer_sizes": hls, "alpha": [0.0001, 0.001, 0.01, 0.05]}

mod_cv = GridSearchCV(
    mod, 
    param_grid, 
    scoring= 'neg_root_mean_squared_error', 
    cv=5, n_jobs=4, refit=True, verbose=5
)
#mod_cv.fit(X_train, y_train)

In [115]:
mod_cv.fit(X_train, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    5.5s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:   27.8s
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed:  1.1min
[Parallel(n_jobs=4)]: Done 180 out of 180 | elapsed:  1.3min finished


Iteration 1, loss = 0.03052731
Iteration 2, loss = 0.00065570
Iteration 3, loss = 0.00065224
Iteration 4, loss = 0.00065148
Iteration 5, loss = 0.00064429
Iteration 6, loss = 0.00063262
Iteration 7, loss = 0.00060585
Iteration 8, loss = 0.00055754
Iteration 9, loss = 0.00054896
Iteration 10, loss = 0.00054065
Iteration 11, loss = 0.00054377
Iteration 12, loss = 0.00054238
Iteration 13, loss = 0.00053961
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.


GridSearchCV(cv=5, error_score=nan,
             estimator=MLPRegressor(activation='relu', alpha=0.0001,
                                    batch_size='auto', beta_1=0.9, beta_2=0.999,
                                    early_stopping=False, epsilon=1e-08,
                                    hidden_layer_sizes=(3,),
                                    learning_rate='constant',
                                    learning_rate_init=0.01, max_fun=15000,
                                    max_iter=1000, momentum=0.9,
                                    n_iter_no_change=10,
                                    nesterovs_momentum=True, power_t=0.5,
                                    random_state=12...ffle=True,
                                    solver='adam', tol=0.0001,
                                    validation_fraction=0.1, verbose=True,
                                    warm_start=False),
             iid='deprecated', n_jobs=4,
             param_grid={'alpha': [0.0001, 0.00

In [116]:
pred_train = mod_cv.best_estimator_.predict(X_train)
pred_test = mod_cv.best_estimator_.predict(X_test)
print('train set mse:', metrics.mean_squared_error(y_train,pred_train))
print('test set mse:', metrics.mean_squared_error(y_test,pred_test))

train set mse: 0.001041105668518042
test set mse: 0.0010526347046692164
