In [248]:
import pandas as pd
# Load the cleaned spot-level data and discard the leftover 'Unnamed: 0' column.
data = (
    pd.read_csv('Data/cleaned_data.csv')
    .drop(columns=['Unnamed: 0'])
)

In [249]:
# Preview the first rows of the loaded frame as a quick sanity check.
data.head()

Unnamed: 0,Access,DMA,Date Aired,Estimate,GRP,Impressions,Length,Market,Media,Spot Cost,Spot Type,Station ID,Time Aired
0,30GD,900,2018-12-31,Q119,0.0,0.0,30,National Network,TV,56.0,INV,37,00:06:00
1,30TN,900,2018-12-31,Q119,0.0,0.0,30,Cable,TV,56.0,INV,202,00:07:00
2,30PM,900,2018-12-31,Q119,0.016,27.2,30,Cable,TV,142.4,INV,253,00:09:00
3,30PM,900,2018-12-31,Q119,0.128,158.4,30,Cable,TV,64.0,INV,340,00:09:00
4,30GD,900,2018-12-31,Q119,0.0,0.0,30,Cable,TV,56.0,INV,168,00:10:00


In [250]:
# Station lookup table; the preview below shows which columns it provides.
station_universe = pd.read_excel(io='Data/station universe.xlsx')
station_universe.head(n=5)

Unnamed: 0,Station ID,Station Category,subscribers (millions)
0,1,Lifestyle,50.0
1,2,Variety,0.4
2,4,Sports,26.0
3,5,News,26.0
4,6,Variety,26.0


In [251]:
# Attach the station attributes to every row of `data`. A left join keeps rows
# whose 'Station ID' has no match in `station_universe` (their station columns
# are then NaN).
# NOTE: this cell rebinds `data`, so run it only once per kernel session.
data = data.merge(station_universe, how='left', on='Station ID')

# Target column: GRP multiplied by the station's subscriber figure.
data['Absolute Audience Size'] = data['subscribers (millions)'] * data['GRP']
data.head()

Unnamed: 0,Access,DMA,Date Aired,Estimate,GRP,Impressions,Length,Market,Media,Spot Cost,Spot Type,Station ID,Time Aired,Station Category,subscribers (millions),Absolute Audience Size
0,30GD,900,2018-12-31,Q119,0.0,0.0,30,National Network,TV,56.0,INV,37,00:06:00,News,41.0,0.0
1,30TN,900,2018-12-31,Q119,0.0,0.0,30,Cable,TV,56.0,INV,202,00:07:00,Variety,30.0,0.0
2,30PM,900,2018-12-31,Q119,0.016,27.2,30,Cable,TV,142.4,INV,253,00:09:00,Sports,41.0,0.656
3,30PM,900,2018-12-31,Q119,0.128,158.4,30,Cable,TV,64.0,INV,340,00:09:00,Variety,60.0,7.68
4,30GD,900,2018-12-31,Q119,0.0,0.0,30,Cable,TV,56.0,INV,168,00:10:00,Lifestyle,12.5,0.0


In [252]:
# Rows whose Impressions value is exactly 0 form the prediction set (data1);
# all remaining rows, minus any row containing a missing value, form the
# model-building set (data0).
zero_impressions = data['Impressions'].eq(0)
data1 = data.loc[zero_impressions]
data0 = data.loc[~zero_impressions].dropna()

print('data used to build the model:', len(data0), 'rows')
print('data used to predict:', len(data1), 'rows')

data used to build the model: 79484 rows
data used to predict: 603873 rows


## Data preparation 

In [253]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPRegressor
#from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn import metrics
import datetime

In [254]:
# List the columns available in the model-building subset.
data0.columns

Index(['Access', 'DMA', 'Date Aired', 'Estimate', 'GRP', 'Impressions',
       'Length', 'Market', 'Media', 'Spot Cost', 'Spot Type', 'Station ID',
       'Time Aired', 'Station Category', 'subscribers (millions)',
       'Absolute Audience Size'],
      dtype='object')

In [255]:
# identify outlier stations
# NOTE(review): unpickling can execute arbitrary code; only load this file if
# its origin is trusted.
outlier_stations = pd.read_pickle('Code/outlier_stations.pickle')

# Keep the index labels whose stored value is at least 5.
at_least_five = outlier_stations >= 5
Outlier = outlier_stations[at_least_five].index.tolist()
Outlier

[216, 184, 352, 311, 200]

In [256]:
# "week", "hour", "outlier"
#Outlier = [216, 184, 352, 311, 200, 10, 15, 192, 224, 280, 252, 72]

# Build the design matrix `x` and the target `y` from the model-building rows.
feature = ['Estimate', 'Length', 'Market', 'Spot Cost', 'Station ID',
           'Station Category', 'Date Aired', 'Time Aired']

# Give x and y the same fresh 0..n-1 index so that they (and every derived
# block below) stay aligned row-for-row, both positionally and by label.
x = data0.loc[:, feature].reset_index(drop=True)
y = data0['Absolute Audience Size'].reset_index(drop=True)

# One-hot encodings of the categorical columns.
Estimate = pd.get_dummies(x['Estimate'])
Market = pd.get_dummies(x['Market'])
Category = pd.get_dummies(x['Station Category'])

# 1 when the spot aired on a station flagged in `Outlier`, otherwise 0.
Station = x['Station ID'].isin(Outlier).astype(int).to_frame('Outlier_station')

# Day of week as an integer (Monday = 0 ... Sunday = 6), parsed from 'YYYY-MM-DD'.
Weekday = (
    pd.to_datetime(x['Date Aired'], format='%Y-%m-%d')
    .dt.weekday
    .to_frame('Weekday')
)

# One-hot encoding of the first two characters of 'Time Aired' (the hour in
# the 'HH:MM:SS' strings), taken from `x` so it shares the index used above.
Hour = pd.get_dummies(x['Time Aired'].str[:2])

# Replace the raw categorical/date columns with their encoded counterparts;
# the remaining columns of `x` stay first.
raw_columns = ['Estimate', 'Market', 'Station ID', 'Station Category',
               'Time Aired', 'Date Aired']
x = pd.concat(
    [x.drop(columns=raw_columns), Estimate, Market, Station, Category, Weekday, Hour],
    axis=1,
)



In [257]:
# Hold out 40% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=100,
    test_size=0.4,
)
X_train.head()

Unnamed: 0,Length,Spot Cost,Q119,Q219,Q319,Q419,Cable,DirecTV,Dish Network,National Network,...,14,15,16,17,18,19,20,21,22,23
66645,30,64.0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
60208,30,200.0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14106,30,256.0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
61018,30,192.0,1,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
16360,30,96.0,0,0,0,1,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0


## Basic NN model

In [258]:
# Baseline network: one hidden layer with 3 ReLU units, trained with Adam.
baseline_params = dict(
    hidden_layer_sizes=(3,),
    activation='relu',
    solver="adam",
    alpha=0.0001,
    learning_rate_init=0.01,
    max_iter=1000,
    verbose=True,
    random_state=1234,
)
mod = MLPRegressor(**baseline_params)
mod.fit(X_train, y_train)

Iteration 1, loss = 233.15786218
Iteration 2, loss = 6.10685823
Iteration 3, loss = 5.17734604
Iteration 4, loss = 4.62076109
Iteration 5, loss = 4.36152733
Iteration 6, loss = 4.26620031
Iteration 7, loss = 4.23864648
Iteration 8, loss = 4.23250052
Iteration 9, loss = 4.23148826
Iteration 10, loss = 4.23133010
Iteration 11, loss = 4.23136220
Iteration 12, loss = 4.23136510
Iteration 13, loss = 4.23142781
Iteration 14, loss = 4.23139197
Iteration 15, loss = 4.23147358
Iteration 16, loss = 4.23133148
Iteration 17, loss = 4.23148036
Iteration 18, loss = 4.23154759
Iteration 19, loss = 4.23151818
Iteration 20, loss = 4.23139754
Iteration 21, loss = 4.23157871
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.


MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(3,), learning_rate='constant',
             learning_rate_init=0.01, max_fun=15000, max_iter=1000,
             momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
             power_t=0.5, random_state=1234, shuffle=True, solver='adam',
             tol=0.0001, validation_fraction=0.1, verbose=True,
             warm_start=False)

In [259]:
# Score the baseline model on both splits; each MSE is computed once and
# reused for the RMSE.
pred_train = mod.predict(X_train)
pred_test = mod.predict(X_test)

mse_train = metrics.mean_squared_error(y_train, pred_train)
mse_test = metrics.mean_squared_error(y_test, pred_test)

print('train set mse:', mse_train)
print('test set mse:', mse_test)
print('train set rmse:', mse_train ** 0.5)
print('test set rmse:', mse_test ** 0.5)

train set mse: 8.462704956530528
test set mse: 8.728546467564398
train set rmse: 2.9090728688932024
test set rmse: 2.9544113572020394


## Grid Search

In [286]:
# Candidate architectures (hidden-layer widths) and L2 penalties to search over.
hls = [(4,), (4, 3, 2), (4, 3, 2, 2)]
param_grid = {
    "hidden_layer_sizes": hls,
    "alpha": [0.0001, 0.001, 0.01],
}

# 5-fold cross-validated search scored by negative RMSE; with refit=True the
# best configuration is refitted on the full training set afterwards.
mod_cv = GridSearchCV(
    estimator=mod,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5,
    n_jobs=4,
    refit=True,
    verbose=5,
)
mod_cv.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:   18.9s
[Parallel(n_jobs=4)]: Done  45 out of  45 | elapsed:  1.8min finished


Iteration 1, loss = 2.70611640
Iteration 2, loss = 2.28475253
Iteration 3, loss = 1.95195465
Iteration 4, loss = 1.86404478
Iteration 5, loss = 1.85889898
Iteration 6, loss = 1.81395234
Iteration 7, loss = 1.76145511
Iteration 8, loss = 1.75444518
Iteration 9, loss = 1.76692298
Iteration 10, loss = 1.75628031
Iteration 11, loss = 1.70799940
Iteration 12, loss = 1.72398239
Iteration 13, loss = 1.68555342
Iteration 14, loss = 1.64554177
Iteration 15, loss = 1.69048782
Iteration 16, loss = 1.70427386
Iteration 17, loss = 1.67012871
Iteration 18, loss = 1.68066722
Iteration 19, loss = 1.68100563
Iteration 20, loss = 1.63021145
Iteration 21, loss = 1.68766304
Iteration 22, loss = 1.63698378
Iteration 23, loss = 1.62219610
Iteration 24, loss = 1.62973052
Iteration 25, loss = 1.58921497
Iteration 26, loss = 1.62754587
Iteration 27, loss = 1.70273509
Iteration 28, loss = 1.62052813
Iteration 29, loss = 1.59143050
Iteration 30, loss = 1.59411535
Iteration 31, loss = 1.56394158
Iteration 32, los

GridSearchCV(cv=5, error_score=nan,
             estimator=MLPRegressor(activation='relu', alpha=0.0001,
                                    batch_size='auto', beta_1=0.9, beta_2=0.999,
                                    early_stopping=False, epsilon=1e-08,
                                    hidden_layer_sizes=(3,),
                                    learning_rate='constant',
                                    learning_rate_init=0.01, max_fun=15000,
                                    max_iter=1000, momentum=0.9,
                                    n_iter_no_change=10,
                                    nesterovs_momentum=True, power_t=0.5,
                                    random_state=1234, shuffle=True,
                                    solver='adam', tol=0.0001,
                                    validation_fraction=0.1, verbose=True,
                                    warm_start=False),
             iid='deprecated', n_jobs=4,
             param_grid={'alpha': [0.0001, 

In [287]:
# Parameter combination that obtained the best cross-validated score in the search.
mod_cv.best_params_

{'alpha': 0.0001, 'hidden_layer_sizes': (4, 3, 2)}

In [288]:
# Score the refitted best estimator from the grid search on both splits.
best_mod = mod_cv.best_estimator_
pred_train = best_mod.predict(X_train)
pred_test = best_mod.predict(X_test)

rmse_train = metrics.mean_squared_error(y_train, pred_train) ** 0.5
rmse_test = metrics.mean_squared_error(y_test, pred_test) ** 0.5

print('train set rmse:', rmse_train)
print('test set rmse:', rmse_test)

train set rmse: 1.7872094542195773
test set rmse: 1.8833349190587856
