# Code for everything

## Imports

In [21]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import PredefinedSplit
from xgboost import XGBRegressor
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

## Data Loading

In [85]:
df_train = pd.read_csv('../data/US_youtube_trending_train_20231122.csv')
df_test = pd.read_csv('../data/US_youtube_trending_test_20231122.csv')
df_val = pd.read_csv('../data/US_youtube_trending_validation_20231122.csv')

In [86]:
df_train['face_detected'] = df_train['Face Detected'].astype('bool')
df_train['text_detected'] = df_train['Text Detected'].astype('bool')
df_train.rename({"Color Palette": "color_palette"}, axis=1, inplace=True)

df_test['face_detected'] = df_test['Face Detected'].astype('bool')
df_test['text_detected'] = df_test['Text Detected'].astype('bool')
df_test.rename({"Color Palette": "color_palette"}, axis=1, inplace=True)

df_val['face_detected'] = df_val['Face Detected'].astype('bool')
df_val['text_detected'] = df_val['Text Detected'].astype('bool')
df_val.rename({"Color Palette": "color_palette"}, axis=1, inplace=True)

## Selecting columns of interest

In [87]:
cols = [
    # Target Variable
    'view_count',
    
    # Thumbnail Analysis
    'Brightness', 'top3_contains_black','top3_contains_blue','top3_contains_brown','top3_contains_green','top3_contains_grey',
    'top3_contains_orange','top3_contains_pink','top3_contains_purple','top3_contains_red','top3_contains_white','top3_contains_yellow',
    'Saturation', 'Contrast', 'color_palette', 'text_detected', 'face_detected',
    
    # Last  Video Data
    #'last_video_log_comment_count', 'last_video_log_days_on_trending', 'last_video_log_dislikes', 'last_video_log_likes', 'last_video_log_views',
    
    # Video metadata
    'categoryId',  'comments_disabled', 'ratings_disabled', 
    #'hour_of_day_published', 'local_hour_of_day_published',
    
    # Published Time Data
    #'hour_published_normalized', 
    'time_of_day_published', 'day_of_week_published', 'is_weekend_published', 
    'local_time_of_day_published',
    #'local_hour_published_normalized', 
    'published_on_holiday',
    
    # Previous year data
    #'previous_365_days_geo_avg_like_dislike_ratio', 
    'previous_365_days_log_avg_comment_count', 'previous_365_days_log_avg_days_on_trending',
    'previous_365_days_log_avg_dislikes', 'previous_365_days_log_avg_likes', 'previous_365_days_log_avg_views', 'previous_365_days_log_videos_count',

    # Title Analysis
    'title_length_chars', 'title_length_words', 'title_avg_word_length', 'title_longest_word_length', 'title_all_upcase',
    'title_first_upcase', 'title_any_upcase', 'title_prop_upcase', 'title_all_lowercase', 'title_sentiment', 'title_contains_digit',
    'title_starts_digit', 'title_contains_question', 'title_exclamation_count', 'title_punctuation_count',
    'title_stop_words_count', 'title_stop_words_prop', 'title_contains_quote', 
]

In [88]:
set(df_train.columns) - set(cols)

{'Face Detected',
 'Text Detected',
 'channelId',
 'channelTitle',
 'comment_count',
 'days_on_trending',
 'days_since_published',
 'description',
 'dislikes',
 'dislikes_scaled',
 'hour_of_day_published',
 'hour_published_normalized',
 'is_weekend_trending',
 'last_video_comment_count',
 'last_video_days_on_trending',
 'last_video_dislikes',
 'last_video_likes',
 'last_video_log_comment_count',
 'last_video_log_days_on_trending',
 'last_video_log_dislikes',
 'last_video_log_likes',
 'last_video_log_views',
 'last_video_views',
 'likes',
 'likes_scaled',
 'local_hour_of_day_published',
 'local_hour_published_normalized',
 'previous_365_days_avg_comment_count',
 'previous_365_days_avg_days_on_trending',
 'previous_365_days_avg_dislikes',
 'previous_365_days_avg_likes',
 'previous_365_days_avg_views',
 'previous_365_days_geo_avg_like_dislike_ratio',
 'previous_365_days_videos_count',
 'previous_avg_days_on_trending',
 'publishedAt',
 'tags',
 'thumbnail_link',
 'title',
 'trending_date',

In [89]:
df_train = df_train[cols]
df_test = df_test[cols]
df_val = df_val[cols]

In [90]:
df_train['text_detected'] = df_train['text_detected'].astype(int)
df_train['face_detected'] = df_train['face_detected'].astype(int)
df_train['comments_disabled'] = df_train['comments_disabled'].astype(int)
df_train['ratings_disabled'] = df_train['ratings_disabled'].astype(int)
df_train['is_weekend_published'] = df_train['is_weekend_published'].astype(int)
df_train['published_on_holiday'] = df_train['published_on_holiday'].astype(int)
df_train['title_all_upcase'] = df_train['title_all_upcase'].astype(int)

df_test['text_detected'] = df_test['text_detected'].astype(int)
df_test['face_detected'] = df_test['face_detected'].astype(int)
df_test['comments_disabled'] = df_test['comments_disabled'].astype(int)
df_test['ratings_disabled'] = df_test['ratings_disabled'].astype(int)
df_test['is_weekend_published'] = df_test['is_weekend_published'].astype(int)
df_test['published_on_holiday'] = df_test['published_on_holiday'].astype(int)
df_test['title_all_upcase'] = df_test['title_all_upcase'].astype(int)

df_val['text_detected'] = df_val['text_detected'].astype(int)
df_val['face_detected'] = df_val['face_detected'].astype(int)
df_val['comments_disabled'] = df_val['comments_disabled'].astype(int)
df_val['ratings_disabled'] = df_val['ratings_disabled'].astype(int)
df_val['is_weekend_published'] = df_val['is_weekend_published'].astype(int)
df_val['published_on_holiday'] = df_val['published_on_holiday'].astype(int)
df_val['title_all_upcase'] = df_val['title_all_upcase'].astype(int)

## Check NaN

In [91]:
(df_train[cols].isna().sum() / len(df_train)).sort_values(ascending = False).head(10)

previous_365_days_log_avg_comment_count       0.169631
previous_365_days_log_avg_days_on_trending    0.169631
previous_365_days_log_avg_dislikes            0.169631
previous_365_days_log_avg_likes               0.169631
previous_365_days_log_avg_views               0.169631
top3_contains_white                           0.038299
color_palette                                 0.038299
Contrast                                      0.038299
Saturation                                    0.038299
Brightness                                    0.038299
dtype: float64

## Handle nan

In [92]:
df_train = df_train.fillna(0)
df_test = df_test.fillna(0)
df_val = df_val.fillna(0)

## Selecting categorical columns

In [93]:
cols_dtypes = df_train[cols].dtypes
dummy_cols = cols_dtypes[cols_dtypes == "object"].index
dummy_cols

Index(['color_palette', 'time_of_day_published', 'day_of_week_published',
       'local_time_of_day_published'],
      dtype='object')

In [94]:
df_train.color_palette.unique()

array(['darkslategray', 'lightslategray', 'black', 'silver', 'royalblue',
       0, 'midnightblue', 'slategray', 'rosybrown', 'burlywood', 'peru',
       'saddlebrown', 'gainsboro', 'darkgray', 'lightsteelblue',
       'dimgray', 'lime', 'whitesmoke', 'tan', 'wheat', 'indianred',
       'lightgray', 'darkolivegreen', 'thistle', 'maroon',
       'darkslateblue', 'powderblue', 'mistyrose', 'gray', 'skyblue',
       'forestgreen', 'linen', 'brown', 'sienna', 'cornflowerblue',
       'darkkhaki', 'lavender', 'goldenrod', 'khaki', 'seagreen',
       'steelblue', 'lightcoral', 'darksalmon', 'palevioletred',
       'darkcyan', 'darkseagreen', 'darkgreen', 'white', 'cadetblue',
       'lightskyblue', 'plum', 'lightblue', 'beige', 'palegreen',
       'indigo', 'orchid', 'lightpink', 'slateblue', 'blueviolet',
       'darkgoldenrod', 'olivedrab', 'mediumaquamarine', 'mediumpurple',
       'paleturquoise', 'lightgreen', 'snow', 'teal', 'mediumorchid',
       'bisque', 'sandybrown', 'yellowgreen',

In [95]:
set(df_train.color_palette.unique()) - set(df_val.color_palette.unique())

{'aliceblue',
 'aquamarine',
 'blueviolet',
 'darkblue',
 'darkorange',
 'darkred',
 'darkviolet',
 'deeppink',
 'deepskyblue',
 'gold',
 'greenyellow',
 'hotpink',
 'lawngreen',
 'lightsalmon',
 'lime',
 'limegreen',
 'magenta',
 'mediumblue',
 'mediumslateblue',
 'mintcream',
 'oldlace',
 'olive',
 'orangered',
 'orchid',
 'slateblue',
 'turquoise'}

In [96]:
all_categories = set(df_train['color_palette'].unique()) | \
                 set(df_test['color_palette'].unique()) | \
                 set(df_val['color_palette'].unique())

category_mapping = {category: i for i, category in enumerate(all_categories)}

def create_dummies(df, category_mapping):
    dummies = pd.get_dummies(df['color_palette'])
    # Add missing columns based on the full category list
    for category in category_mapping:
        if category not in dummies:
            dummies[category] = 0
    # Ensure consistent column order
    sorted_keys = sorted(category_mapping.keys(), key=str)
    dummies = dummies[sorted_keys]
    return dummies

train_dummies = create_dummies(df_train, category_mapping)
test_dummies = create_dummies(df_test, category_mapping)
validation_dummies = create_dummies(df_val, category_mapping)

df_train = df_train.drop('color_palette', axis=1).join(train_dummies)
df_test = df_test.drop('color_palette', axis=1).join(test_dummies)
df_val = df_val.drop('color_palette', axis=1).join(validation_dummies)

In [97]:
# dummies for color_palette already created
dummy_cols = dummy_cols.delete(0)

In [98]:
df_train = pd.get_dummies(df_train, columns=dummy_cols)
df_test = pd.get_dummies(df_test, columns=dummy_cols)
df_val = pd.get_dummies(df_val, columns=dummy_cols)

In [100]:
categories = [24,  1, 26, 20, 28, 10, 27, 25, 22, 23,  2, 17, 15, 19, 29]
df_train['categoryId'] = pd.Categorical(df_train['categoryId'], categories = categories)
df_test['categoryId'] = pd.Categorical(df_test['categoryId'], categories = categories)
df_val['categoryId'] = pd.Categorical(df_val['categoryId'], categories = categories)

categories = [-1, 0, 1]
df_train['title_sentiment'] = pd.Categorical(df_train['title_sentiment'], categories = categories)
df_test['title_sentiment'] = pd.Categorical(df_test['title_sentiment'], categories = categories)
df_val['title_sentiment'] = pd.Categorical(df_val['title_sentiment'], categories = categories)

In [101]:
df_train = pd.get_dummies(df_train, columns = ['categoryId', 'title_sentiment'])
df_test = pd.get_dummies(df_test, columns = ['categoryId', 'title_sentiment'])
df_val = pd.get_dummies(df_val, columns = ['categoryId', 'title_sentiment'])

In [102]:
df_train

Unnamed: 0,view_count,Brightness,top3_contains_black,top3_contains_blue,top3_contains_brown,top3_contains_green,top3_contains_grey,top3_contains_orange,top3_contains_pink,top3_contains_purple,...,categoryId_22,categoryId_23,categoryId_2,categoryId_17,categoryId_15,categoryId_19,categoryId_29,title_sentiment_-1,title_sentiment_0,title_sentiment_1
0,2291958,95.544693,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,True,False,False
1,2527029,95.544693,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,True,False,False
2,2542208,95.544693,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,True,False,False
3,2551698,95.544693,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,True,False,False
4,2558032,95.544693,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73077,235398,124.405790,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,True,False,False,False,True
73078,263205,124.405790,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,True,False,False,False,True
73079,280751,124.405790,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,True,False,False,False,True
73080,294538,124.405790,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,True,False,False,False,True


## Replacing -inf values in the log columns

In [103]:
df_train = df_train.replace([np.inf, -np.inf], 0)
df_test = df_test.replace([np.inf, -np.inf], 0)
df_val = df_val.replace([np.inf, -np.inf], 0)

## Training

In [104]:
X_train = df_train.loc[:, df_train.columns != "view_count"]
y_train = df_train["view_count"]

X_test = df_test.loc[:, df_test.columns != "view_count"]
y_test = df_test["view_count"]

X_val = df_val.loc[:, df_val.columns != "view_count"]
y_val = df_val["view_count"]

In [105]:
# Create an array where training instances are -1 and validation instances are 0
test_fold = [-1 for _ in range(X_train.shape[0])] + [0 for _ in range(X_val.shape[0])]
ps = PredefinedSplit(test_fold)

X_combined = np.concatenate((X_train, X_val))
X_val.columns = X_val.columns.astype(str)
y_combined = np.concatenate((y_train, y_val))

### Random Forest

In [109]:
from sklearn.model_selection import GridSearchCV, KFold

In [112]:
rf = RandomForestRegressor(random_state = 42)
rf_params = {'n_estimators': [50, 100, 200, 500], 'max_depth': [3, 6, 9, 12, 15]}
rf_gs = GridSearchCV(rf, rf_params, cv = ps, n_jobs = -1, verbose = 3)
rf_gs.fit(X_combined, y_combined)
rf_best = rf_gs.best_estimator_

Fitting 3 folds for each of 9 candidates, totalling 27 fits


In [122]:
xgb = XGBRegressor(random_state = 42)
xgb_params = {'n_estimators': [50, 100, 200, 500, 1000], 'max_depth': [3, 5, 7, 9, 11, 13, 15], 'learning_rate': [0.1, 0.05, 0.025, 0.01]}
xgb_gs = GridSearchCV(xgb, xgb_params, cv = ps, n_jobs = -1, verbose = 3)
xgb_gs.fit(X_combined, y_combined)
xgb_best = xgb_gs.best_estimator_

Fitting 1 folds for each of 140 candidates, totalling 140 fits


In [123]:
models = [rf_best, xgb_best]
for model in models:
    predictions = model.predict(X_test.to_numpy())
    mae = mean_absolute_error(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    print(f'Model: {model.__class__.__name__}, MAE: {mae}, MSE: {mse}, R2: {r2}')

Model: RandomForestRegressor, MAE: 2081468.8099442536, MSE: 56762681022527.586, R2: 0.612724543456437
Model: XGBRegressor, MAE: 2089616.3449110533, MSE: 55098686712925.125, R2: 0.6240774983262295


In [19]:
# rf = RandomForestRegressor(random_state=42)
# rf_params = {'n_estimators': [100, 200], 'max_depth': [10, 20]}
# #rf_params = {'n_estimators': [100], 'max_depth': [15]}

# rf_gs = GridSearchCV(rf, rf_params, cv=ps, n_jobs=-1, verbose=3)
# rf_gs.fit(X_combined, y_combined)
# rf_best = rf_gs.best_estimator_

Fitting 1 folds for each of 4 candidates, totalling 4 fits


### Interpretation stuff

feats = {} # a dict to hold feature_name: feature_importance
for feature, importance in zip(X_train.columns, rf_best.feature_importances_):
    feats[feature] = importance #add the name/value pair 

importances = pd.DataFrame.from_dict(feats, orient='index').rename(columns={0: 'importance'})
importances.sort_values(by='importance')[-5:].plot(kind='bar', rot=45)

#### Export the first three decision trees from the forest

for i in range(3):
    tree = best_rf.estimators_[i]
    dot_data = export_graphviz(tree,
                               feature_names=X_train.columns,  
                               filled=True,  
                               max_depth=2, 
                               impurity=False, 
                               proportion=True)
    graph = graphviz.Source(dot_data)
    display(graph)

### XGBoost

In [20]:
xgb = XGBRegressor(random_state=42)
xgb_params = {'n_estimators': [100, 200], 'max_depth': [3, 6], 'learning_rate': [0.1, 0.01]}

xgb_gs = GridSearchCV(xgb, xgb_params, cv=ps, n_jobs=-1, verbose=3)
xgb_gs.fit(X_combined, y_combined)
xgb_best = xgb_gs.best_estimator_

Fitting 1 folds for each of 8 candidates, totalling 8 fits


### Lasso

<sup><sub>ROBUST NOT SPARSE</sup></sub>

In [21]:
lasso = Lasso()
lasso_params = {'alpha': [0.1, 1, 3]}

lasso_gs = GridSearchCV(lasso, lasso_params, cv=ps, n_jobs=-1, verbose=3)
lasso_gs.fit(X_combined, y_combined)
lasso_best = lasso_gs.best_estimator_

Fitting 1 folds for each of 3 candidates, totalling 3 fits


  model = cd_fast.enet_coordinate_descent(


### Model evaluation

In [113]:
models = [rf_best, xgb_best, lasso_best]#,nn]
for model in models:
    #if model != nn:  # For non-neural network models
    #    predictions = model.predict(X_test)
    #else:
    #    predictions = nn.predict(X_test).flatten()
    predictions = model.predict(X_test.to_numpy())
    mae = mean_absolute_error(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    print(f'Model: {model.__class__.__name__}, MAE: {mae}, MSE: {mse}, R2: {r2}')

NameError: name 'xgb_best' is not defined

#### With previous video data

Model: RandomForestRegressor, MAE: 2074996.7017330027, MSE: 55872604602256.77, R2: 0.6187972790251152

Model: XGBRegressor, MAE: 2150861.698505149, MSE: 77971771326171.17, R2: 0.46802101673335283

Model: Lasso, MAE: 3252643.0836352184, MSE: 138987867550888.66, R2: 0.0517257308825082

#### Without previous video data

Model: RandomForestRegressor, MAE: 2081482.959581667, MSE: 56436427243572.86, R2: 0.6149504792106627

Model: XGBRegressor, MAE: 2071499.5344590796, MSE: 64559375522716.67, R2: 0.559529937992099

Model: Lasso, MAE: 3239764.3821198265, MSE: 140310015863701.23, R2: 0.04270509298734637

### Saving the model

In [87]:
import pickle

with open('../data/rf_model_without_previous.pkl', 'wb') as out:
    pickle.dump(rf_best, out, pickle.HIGHEST_PROTOCOL)

## Baselines

#### Baseline 1: Predicting average views

In [23]:
avg_views = y_train.mean()
y_baseline_1 = [avg_views] * len(y_test)

mae = mean_absolute_error(y_test, y_baseline_1)
mse = mean_squared_error(y_test, y_baseline_1)
r2 = r2_score(y_test, y_baseline_1)
print(f'Baseline 1, MAE: {mae}, MSE: {mse}, R2: {r2}')

Baseline 1, MAE: 3324987.6932239714, MSE: 147921560488945.4, R2: -0.00922628810040127


#### Baseline 2: Predicting previous video views

In [44]:
y_baseline_2 = np.exp(X_test.last_video_log_views)

mae = mean_absolute_error(y_test, y_baseline_2)
mse = mean_squared_error(y_test, y_baseline_2)
r2 = r2_score(y_test, y_baseline_2)
print(f'Baseline 2, MAE: {mae}, MSE: {mse}, R2: {r2}')

Baseline 2, MAE: 2192965.9635579074, MSE: 57477095865023.94, R2: 0.607850296340106
