In [15]:
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.linear_model import LinearRegression, RidgeCV, Ridge
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

import warnings

warnings.filterwarnings("ignore")

sns.set(rc={"figure.figsize": (11.7, 8.27)})

In [13]:
dat = pd.read_csv('../Data/dat_features.csv')

In [14]:
unseen = dat[(dat.Impressions.eq(0.0)) & (dat.GRP.eq(0.0)) | (dat.active_flag.eq(1))]
training = dat[dat.Impressions > 0.0]

In [16]:
labels = [
        'Q119', 'Q219',
        'Q319', 'Q419',
        'BP', 'DC', 'DE', 'DP',
        'GD', 'GX', 'PL', 'PM',
        'PN', 'PT', 'SR', 'SV',
        'TN', 'VE',
        "Length",
        "Spot_Cost",
        "Cable",
        "DirecTV",
        "Dish_Network",
        "National_Network",
        "Over-the-top_content",
        'Q1', 'Q2', 'Q3', 'Q4',
        'bin_1', 'bin_2',
        'bin_3', 'bin_4', 'bin_5',
        'Daytime', 'Early_Fringe',
        'Late_Fringe', 'Late_Night',
        'Morning', 'Overnight',
        'Primetime',
        'midnight', 'one_am', 'two_am', 'three_am', 'four_am', 'five_am',
        'six_am', 'seven_am', 'eight_am', 'nine_am', 'ten_am', 'eleven_am',
        'noon', 'one_pm', 'two_pm', 'three_pm', 'four_pm', 'five_pm', 'six_pm',
        'seven_pm', 'eight_pm', 'nine_pm', 'ten_pm', 'eleven_pm'
    ]

X = training.loc[:,labels]
y = training.loc[:, "Impressions"]

scaler_x = MinMaxScaler()
scaler_y = MinMaxScaler()

scaler_x.fit(X)
X_scale = scaler_x.transform(X)

scaler_y.fit(y.values.reshape(1, -1))
y_scale = scaler_y.fit_transform(y.values.reshape(-1, 1))

X_train, X_test, y_train, y_test = train_test_split(X_scale, y_scale, test_size=0.30)

In [26]:
pca = PCA(0.95)
pca.fit(X_train)
pca.fit(X_test)

PCA(copy=True, iterated_power='auto', n_components=0.95, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [27]:
ridge = Ridge(alpha=1e-10, solver='sag')

ridge.fit(X_train, y_train)

Ridge(alpha=1e-10, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='sag', tol=0.001)

In [30]:
preds = scaler_y.inverse_transform(ridge.predict(X_test))

print(np.sqrt(MSE(scaler_y.inverse_transform(y_test), preds)))

35.204884658592526
