In [17]:
import os
import time
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, explained_variance_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from cslib import fetch_ts, engineer_features

# 1. Ingesting Data
### looking at the end of the data

In [18]:
data_dir = os.path.join("..", "capstone-w","cs-train")
ts_all = fetch_ts(data_dir,clean=False)
ts_all['all'].tail()

... processing data for loading


Unnamed: 0,date,purchases,unique_invoices,unique_streams,total_views,year_month,revenue
602,2019-06-26,1358,67,999,6420,2019-06,4903.17
603,2019-06-27,1620,80,944,9435,2019-06,5499.38
604,2019-06-28,1027,70,607,5539,2019-06,3570.6
605,2019-06-29,0,0,0,0,2019-06,0.0
606,2019-06-30,602,27,423,2534,2019-06,1793.98


# 2. feature fngineering and train-test split

### generate features using 7, 14, 28, 70-day time windows, the monthly sum of previous year, the average number of invoices and the total views in recent 30 days

In [19]:
X,y,dates = engineer_features(ts_all['all'])
        
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=True, random_state=42)

In [20]:
X.head()

Unnamed: 0,previous_7,previous_14,previous_28,previous_70,previous_year,recent_invoices,recent_views
0,14450.54,14450.54,14450.54,14450.54,0.0,5.928571,537.392857
1,27862.5,27862.5,27862.5,27862.5,0.0,10.310345,1007.103448
2,41152.75,41152.75,41152.75,41152.75,0.0,14.966667,1499.066667
3,50840.03,50840.03,50840.03,50840.03,0.0,18.533333,1916.533333
4,52283.29,52283.29,52283.29,52283.29,0.0,19.6,2026.966667


In [21]:
y[:5]

array([183857.22, 170445.26, 157155.01, 147467.73, 147991.02])

# 3. Model Training and Perforamcne Comparision  

### 3.1. Random Forest Model

In [22]:
param_grid_rf = {
    'rf__criterion': ['mse','mae'],
    'rf__n_estimators': [10,15,20,25,50,100]
    }
time_start = time.time()
pipe_rf = Pipeline(steps=[('scaler', StandardScaler()), ('rf', RandomForestRegressor())])

grid = GridSearchCV(pipe_rf, param_grid=param_grid_rf, cv=5, n_jobs=-1)
grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)

rf_mae =  mean_absolute_error(y_test, y_pred)
rf_mse =  mean_squared_error(y_test, y_pred)
rf_r2_score = r2_score(y_test, y_pred)
rf_explained_variance_score = explained_variance_score(y_test, y_pred)

print("train time = ", time.strftime('%H:%M:%S', time.gmtime(time.time()-time_start)))
print("mae = ".format(rf_mae))
print("mse = ".format(rf_mse))
print("r2_score = ".format(rf_r2_score))
print("explained_variance_score = ".format(rf_explained_variance_score))
print("best params =", grid.best_params_)
print("--------------------------------------------------------------------------------------")

train time =  00:00:04
mae = 
mse = 
r2_score = 
explained_variance_score = 
best params = {'rf__criterion': 'mse', 'rf__n_estimators': 100}
--------------------------------------------------------------------------------------


### 3.2. Gradient Boosting Model

In [23]:
param_grid_gb = {
    'gb__criterion': ['mse','mae'],
    'gb__n_estimators': [10,15,20,25,50,100]
    }

time_start = time.time()
pipe_gb = Pipeline(steps=[('scaler', StandardScaler()), ('gb', GradientBoostingRegressor())])

grid = GridSearchCV(pipe_gb, param_grid=param_grid_gb, cv=5, n_jobs=-1)
grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)

gb_mae =  mean_absolute_error(y_test, y_pred)
gb_mse =  mean_squared_error(y_test, y_pred)
gb_r2_score = r2_score(y_test, y_pred)
gb_explained_variance_score = explained_variance_score(y_test, y_pred)

print("training time = ", time.strftime('%H:%M:%S', time.gmtime(time.time()-time_start)))
print("mae = ".format(gb_mae))
print("mse = ".format(gb_mse))
print("r2_score = ".format(gb_r2_score))
print("explained_variance_score = ".format(gb_explained_variance_score))
print("best parameters =", grid.best_params_)
print("--------------------------------------------------------------------------------------")

training time =  00:00:01
mae = 
mse = 
r2_score = 
explained_variance_score = 
best parameters = {'gb__criterion': 'mse', 'gb__n_estimators': 100}
--------------------------------------------------------------------------------------
