## Train a model to make predictions for upcoming videos based on show cast 

In [2]:
# Setup window width
from IPython.display import display, HTML
display(HTML("<style>.container { width:85% !important; }</style>"))

# Supress warnings
import warnings
warnings.filterwarnings('ignore')

# Import
import pandas as pd
import numpy as np
import datetime as dt
import joblib

# Set pandas options
pd.set_option('display.max_columns', 50)
pd.set_option('display.precision', 2)

# Plot
from matplotlib import pyplot as plt
import seaborn as sns

sns.set()

In [20]:
# ML imports
from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, median_absolute_error,\
                            mean_squared_error, mean_squared_log_error, r2_score 
from sklearn.preprocessing import StandardScaler

In [4]:
# Load data
df = pd.read_csv("razgony_data.csv", sep=",", index_col="id")

In [5]:
X = df.drop(columns=["release_date", "names", "likes", "dislikes", "views"])
y = df["views"]

In [6]:
# Save columns names
columns = X.columns

In [7]:
# Standardize data due to big numeric difference
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [8]:
# Split data for train and holdout 
X_train, X_val, y_train, y_val = X_scaled[:61], X_scaled[61:], y.iloc[:61], y.iloc[61:]

In [9]:
# Set time series data splitter
tss = TimeSeriesSplit(n_splits=3)

## Train linear Lasso model

Lasso model showed best results on given sparse dataset

In [10]:
# Cross validate Lasso on Time Series Split
cv_lasso = LassoCV(cv=tss)

In [11]:
cv_lasso.fit(X_train, y_train)

In [12]:
prediction = cv_lasso.predict(X_val)

## Metrics

In [30]:
def metric_printer(y_true, y_pred):
    for metric in [mean_absolute_error, mean_absolute_percentage_error, median_absolute_error, 
                    mean_squared_error, mean_squared_log_error, r2_score]:
        print(f"{metric.__name__.upper():>32} = {metric(y_true, y_pred):.2f}")

metric_printer(y_val, prediction)

             MEAN_ABSOLUTE_ERROR = 127442.03
  MEAN_ABSOLUTE_PERCENTAGE_ERROR = 0.17
           MEDIAN_ABSOLUTE_ERROR = 100906.82
              MEAN_SQUARED_ERROR = 26930700587.97
          MEAN_SQUARED_LOG_ERROR = 0.04
                        R2_SCORE = 0.11


In [13]:
z = pd.DataFrame({"true": y_val, "pred": prediction})
z["diff"] = z['pred'] - z["true"]

In [14]:
# Feature coefficients
coef_df = pd.DataFrame({"cols": columns, "coef": cv_lasso.coef_}).sort_values(by="coef", ascending=False)

In [15]:
# Top 10 coeffiecients
coef_df[:10]

Unnamed: 0,cols,coef
0,duration,143959.19
1,age_days,114354.26
101,Сергей_Орлов,111192.47
51,Артур_Чапарян,79134.89
52,Иван_Ильин,62571.34
90,Филипп_Воронин,59150.88
84,Саша_Малой,45596.95
37,Ваня_Усович,38723.97
59,Эльдар_Гусейнов,26088.39
93,Идрак_Мирзализаде,16447.05


In [22]:
# Dump Lasso model object to a file for further load
joblib.dump(cv_lasso, "./dumps/lasso_model.joblib")
# Dump scaler object
joblib.dump(scaler, "./dumps/scaler.joblijb")
# Dump dataframe columns
joblib.dump(columns, "./dumps/data_columns.joblib")

['data_columns.joblib']