# Code for everything

## Imports

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import PredefinedSplit
from xgboost import XGBRegressor
#from keras.models import Sequential
#from keras.layers import Dense, Activation
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

## Data Loading

In [2]:
df_train = pd.read_csv('../data/US_youtube_trending_train.csv')
df_test = pd.read_csv('../data/US_youtube_trending_test.csv')
df_val = pd.read_csv('../data/US_youtube_trending_validation.csv')

In [3]:
df_train['face_detected'] = df_train['Face Detected'].astype('bool')
df_train['text_detected'] = df_train['Text Detected'].astype('bool')
df_train.rename({"Color Palette": "color_palette"}, axis=1, inplace=True)

df_test['face_detected'] = df_test['Face Detected'].astype('bool')
df_test['text_detected'] = df_test['Text Detected'].astype('bool')
df_test.rename({"Color Palette": "color_palette"}, axis=1, inplace=True)

df_val['face_detected'] = df_val['Face Detected'].astype('bool')
df_val['text_detected'] = df_val['Text Detected'].astype('bool')
df_val.rename({"Color Palette": "color_palette"}, axis=1, inplace=True)

## Selecting columns of interest

In [4]:
cols = [
'categoryId', 'view_count', 'comments_disabled', 'ratings_disabled', 'hour_of_day_published',
'hour_published_normalized', 'time_of_day_published', 'day_of_week_published', 'is_weekend_published',
'local_time_of_day_published', 'local_hour_published_normalized', 'published_on_holiday', 'previous_365_days_videos_count',
'previous_365_days_avg_views', 'previous_365_days_avg_likes', 'previous_365_days_avg_dislikes', 'previous_365_days_geo_avg_like_dislike_ratio',
'previous_365_days_avg_days_on_trending', 'previous_365_days_avg_comment_count', 'last_video_views', 'last_video_likes', 'last_video_dislikes',
'last_video_days_on_trending', 'last_video_comment_count', 'previous_avg_days_on_trending',
'title_length_chars', 'title_length_words', 'title_avg_word_length', 'title_longest_word_length', 'title_all_upcase',
'title_first_upcase', 'title_any_upcase', 'title_prop_upcase', 'title_all_lowercase', 'title_sentiment', 'title_contains_digit',
'title_starts_digit', 'title_contains_question', 'title_exclamation_count', 'title_punctuation_count',
'title_stop_words_count', 'title_stop_words_prop', 'title_contains_quote', 'Saturation', 'Contrast', 'color_palette', 'text_detected', 'face_detected'
]

In [5]:
set(df_train.columns) - set(cols)

{'Face Detected',
 'Text Detected',
 'channelId',
 'channelTitle',
 'comment_count',
 'days_on_trending',
 'days_since_published',
 'description',
 'dislikes',
 'dislikes_scaled',
 'is_weekend_trending',
 'likes',
 'likes_scaled',
 'local_hour_of_day_published',
 'publishedAt',
 'tags',
 'thumbnail_link',
 'title',
 'trending_date',
 'trending_day_of_week',
 'video_id',
 'view_count_scaled'}

In [6]:
df_train = df_train[cols]
df_test = df_test[cols]
df_val = df_val[cols]

## Check NaN

In [7]:
(df_train[cols].isna().sum()/len(df_train)).sort_values(ascending=False).head(10)

previous_365_days_geo_avg_like_dislike_ratio    0.436441
previous_365_days_avg_likes                     0.169631
previous_365_days_avg_views                     0.169631
last_video_comment_count                        0.169631
last_video_days_on_trending                     0.169631
last_video_dislikes                             0.169631
last_video_likes                                0.169631
last_video_views                                0.169631
previous_365_days_avg_comment_count             0.169631
previous_365_days_avg_days_on_trending          0.169631
dtype: float64

# TODO: handle nan

In [8]:
df_train = df_train.fillna(0)
df_test = df_test.fillna(0)
df_val = df_val.fillna(0)

## Selecting categorical columns

In [9]:
cols_dtypes = df_train[cols].dtypes
dummy_cols = cols_dtypes[cols_dtypes == "object"].index
dummy_cols

Index(['time_of_day_published', 'day_of_week_published',
       'local_time_of_day_published', 'color_palette'],
      dtype='object')

In [10]:
df_train.color_palette.unique()

array(['darkslategray', 'lightslategray', 'black', 'silver', 'royalblue',
       'midnightblue', 'slategray', 'rosybrown', 'burlywood', 'peru',
       'saddlebrown', 'gainsboro', 'darkgray', 'lightsteelblue',
       'dimgray', 'lime', 'whitesmoke', 'tan', 'wheat', 'indianred',
       'lightgray', 'darkolivegreen', 'thistle', 0, 'maroon',
       'darkslateblue', 'powderblue', 'mistyrose', 'gray', 'skyblue',
       'forestgreen', 'linen', 'brown', 'sienna', 'cornflowerblue',
       'darkkhaki', 'lavender', 'goldenrod', 'khaki', 'seagreen',
       'steelblue', 'lightcoral', 'darksalmon', 'palevioletred',
       'darkcyan', 'darkseagreen', 'darkgreen', 'white', 'cadetblue',
       'lightskyblue', 'plum', 'lightblue', 'beige', 'palegreen',
       'indigo', 'orchid', 'lightpink', 'slateblue', 'blueviolet',
       'darkgoldenrod', 'olivedrab', 'mediumaquamarine', 'mediumpurple',
       'paleturquoise', 'lightgreen', 'snow', 'sandybrown', 'teal',
       'mediumorchid', 'bisque', 'yellowgreen',

In [11]:
set(df_train.color_palette.unique()) - set(df_val.color_palette.unique())

{'aliceblue',
 'aquamarine',
 'blueviolet',
 'darkblue',
 'darkorange',
 'darkviolet',
 'deeppink',
 'deepskyblue',
 'gold',
 'greenyellow',
 'hotpink',
 'lawngreen',
 'lightsalmon',
 'lime',
 'limegreen',
 'magenta',
 'mediumblue',
 'mediumslateblue',
 'mintcream',
 'oldlace',
 'olive',
 'orangered',
 'orchid',
 'turquoise'}

In [12]:
all_categories = set(df_train['color_palette'].unique()) | \
                 set(df_test['color_palette'].unique()) | \
                 set(df_val['color_palette'].unique())

category_mapping = {category: i for i, category in enumerate(all_categories)}

def create_dummies(df, category_mapping):
    dummies = pd.get_dummies(df['color_palette'])
    # Add missing columns based on the full category list
    for category in category_mapping:
        if category not in dummies:
            dummies[category] = 0
    # Ensure consistent column order
    sorted_keys = sorted(category_mapping.keys(), key=str)
    dummies = dummies[sorted_keys]
    return dummies

train_dummies = create_dummies(df_train, category_mapping)
test_dummies = create_dummies(df_test, category_mapping)
validation_dummies = create_dummies(df_val, category_mapping)

df_train = df_train.drop('color_palette', axis=1).join(train_dummies)
df_test = df_test.drop('color_palette', axis=1).join(test_dummies)
df_val = df_val.drop('color_palette', axis=1).join(validation_dummies)

In [13]:
# dummies for color_palette already created
dummy_cols = dummy_cols.delete(3)

df_train = pd.get_dummies(df_train, columns=dummy_cols)
df_test = pd.get_dummies(df_test, columns=dummy_cols)
df_val = pd.get_dummies(df_val, columns=dummy_cols)

## Training

In [32]:
X_train = df_train.loc[:, df_train.columns != "view_count"]
y_train = df_train["view_count"]

X_test = df_test.loc[:, df_test.columns != "view_count"]
y_test = df_test["view_count"]

X_val = df_val.loc[:, df_val.columns != "view_count"]
y_val = df_val["view_count"]

In [33]:
# Create an array where training instances are -1 and validation instances are 0
test_fold = [-1 for _ in range(X_train.shape[0])] + [0 for _ in range(X_val.shape[0])]
ps = PredefinedSplit(test_fold)

X_combined = np.concatenate((X_train, X_val))
X_val.columns = X_val.columns.astype(str)
y_combined = np.concatenate((y_train, y_val))

### Random Forest

In [16]:
rf = RandomForestRegressor(random_state=42)
rf_params = {'n_estimators': [100, 200], 'max_depth': [10, 20]}
#rf_params = {'n_estimators': [100], 'max_depth': [15]}

rf_gs = GridSearchCV(rf, rf_params, cv=ps, n_jobs=-1, verbose=3)
rf_gs.fit(X_combined, y_combined)
rf_best = rf_gs.best_estimator_

Fitting 1 folds for each of 4 candidates, totalling 4 fits


predictions = rf_best.predict(X_test)
mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions, squared=False)
r2 = r2_score(y_test, predictions)
print(f'Model: rf_best, MAE: {round(mae,2)}, RMSE: {round(mse,2)}, R2: {round(r2,3)*100}%')

feats = {} # a dict to hold feature_name: feature_importance
for feature, importance in zip(X_train.columns, rf_best.feature_importances_):
    feats[feature] = importance #add the name/value pair 

importances = pd.DataFrame.from_dict(feats, orient='index').rename(columns={0: 'importance'})
importances.sort_values(by='importance')[-5:].plot(kind='bar', rot=45)

#### Export the first three decision trees from the forest

for i in range(3):
    tree = best_rf.estimators_[i]
    dot_data = export_graphviz(tree,
                               feature_names=X_train.columns,  
                               filled=True,  
                               max_depth=2, 
                               impurity=False, 
                               proportion=True)
    graph = graphviz.Source(dot_data)
    display(graph)

### XGBoost

In [17]:
xgb = XGBRegressor(random_state=42)
xgb_params = {'n_estimators': [100, 200], 'max_depth': [3, 6], 'learning_rate': [0.1, 0.01]}

xgb_gs = GridSearchCV(xgb, xgb_params, cv=ps, n_jobs=-1, verbose=3)
xgb_gs.fit(X_combined, y_combined)
xgb_best = xgb_gs.best_estimator_

Fitting 1 folds for each of 8 candidates, totalling 8 fits


### Deep Neural Network

In [None]:
#nn = Sequential()
#nn.add(Dense(64, input_dim=X_train.shape[1]))  # Input layer
#nn.add(Activation('relu'))  # Activation for hidden layer
#nn.add(Dense(32))  # Hidden layer
#nn.add(Activation('relu'))  # Activation for hidden layer
#nn.add(Dense(1))  # Output layer

#nn.compile(optimizer='adam', loss='mean_squared_error')
#nn.fit(X_train, y_train, epochs=100, batch_size=32)  # You might need to adjust epochs and batch_size

### Lasso

<sup><sub>ROBUST NOT SPARSE</sup></sub>

In [18]:
lasso = Lasso()
lasso_params = {'alpha': [0.1, 1, 3]}

lasso_gs = GridSearchCV(lasso, lasso_params, cv=ps, n_jobs=-1, verbose=3)
lasso_gs.fit(X_combined, y_combined)
lasso_best = lasso_gs.best_estimator_

Fitting 1 folds for each of 3 candidates, totalling 3 fits


  model = cd_fast.enet_coordinate_descent(


### Model evaluation

In [37]:
models = [rf_best, xgb_best, lasso_best]#,nn]
for model in models:
    #if model != nn:  # For non-neural network models
    #    predictions = model.predict(X_test)
    #else:
    #    predictions = nn.predict(X_test).flatten()
    predictions = model.predict(X_test.to_numpy())
    mae = mean_absolute_error(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    print(f'Model: {model.__class__.__name__}, MAE: {mae}, MSE: {mse}, R2: {r2}')

Model: RandomForestRegressor, MAE: 2105430.5032766247, MSE: 60550869670722.95, R2: 0.5868788212006337
Model: XGBRegressor, MAE: 2070499.7446413354, MSE: 55345153117759.76, R2: 0.6223959288912498
Model: Lasso, MAE: 2256721.077642066, MSE: 56903010965667.805, R2: 0.6117671125913446
