## Preliminaries

In [1]:
import pandas as pd
from langdetect import detect
from textblob import TextBlob
import string
import nltk
from nltk.corpus import stopwords
import sys
import shap

In [2]:
# read combined dataset
df = pd.read_csv("../data/US_youtube_trending_data.csv", parse_dates = ["publishedAt", "trending_date"])

In [3]:
# create train data
train_data = df
train_data['timestamp'] = pd.to_datetime(train_data['trending_date'])
train_data = train_data[train_data['timestamp'].dt.year == 2022]

In [4]:
sys.path.insert(1, '../youtube_views_predictor')
from text_feature_engineering import TextFeatureExtractor

## Identify Keywords

In [5]:
def remove_punctuation_within_word(word):
    return ''.join(char for char in word if char not in string.punctuation)

def identify_frequent_words(df):

    # identify frequently occuring words
    word_counts = df['title'].str.lower().str.split(expand = True).stack().apply(remove_punctuation_within_word).value_counts()
    frequent_words = word_counts[word_counts >= 1000].index
    
    # remove stop words and punctuation
    stop_words = set(stopwords.words('english'))
    punctuation = set(string.punctuation)
    frequent_words = [word for word in frequent_words if word.lower() not in stop_words and word not in punctuation]

    # determine words associated with higher average views
    result_df = pd.DataFrame(columns = frequent_words)
    
    for word in frequent_words:
        df['title'].fillna('', inplace = True) 
        word_mask = df['title'].str.lower().str.contains(word.lower())
        avg_views = df.loc[word_mask, 'view_count'].mean()
        
        if not result_df.empty:
            result_df = pd.concat([result_df, pd.DataFrame({'Word': [word], 'Average_Views': [avg_views]})], ignore_index=True)
        else:
            result_df = pd.DataFrame({'Word': [word], 'Average_Views': [avg_views]})
    
    result_df = result_df.sort_values(by = 'Average_Views', ascending = False)
    return result_df.head(20)

## Clean Data

In [6]:
keywords = identify_frequent_words(train_data).iloc[:, 0].tolist() + ['How']
keywords.remove('')


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [7]:
keywords

['trailer',
 'shorts',
 'full',
 'official',
 'video',
 '100',
 'music',
 'world',
 '2022',
 'highlights',
 'vs',
 'day',
 '1',
 '2',
 'ft',
 'game',
 '3',
 '4',
 'nba',
 'How']

In [44]:
feature_extractor = TextFeatureExtractor(keywords)
df_with_features = feature_extractor.create_text_features(df)

In [46]:
df_with_features.columns

Index(['video_id', 'title', 'publishedAt', 'channelId', 'channelTitle',
       'categoryId', 'trending_date', 'tags', 'view_count', 'likes',
       'dislikes', 'comment_count', 'thumbnail_link', 'comments_disabled',
       'ratings_disabled', 'description', 'timestamp', 'trailer', 'shorts',
       'full', 'official', 'video', '100', 'music', 'world', '2022',
       'highlights', 'vs', 'day', '1', '2', 'ft', 'game', '3', '4', 'nba',
       'How', 'title_length_chars', 'title_length_words',
       'title_avg_word_length', 'title_longest_word_length',
       'title_all_upcase', 'title_first_upcase', 'title_any_upcase',
       'title_prop_upcase', 'title_all_lowercase', 'title_sentiment',
       'title_contains_digit', 'title_starts_digit', 'title_contains_question',
       'title_exclamation_count', 'title_punctuation_count',
       'title_stop_words_count', 'title_stop_words_prop',
       'title_contains_quote'],
      dtype='object')

# Test Models

In [47]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV, train_test_split

In [49]:
X = df_with_features[
    keywords + [
        'title_length_chars', 'title_length_words', 'title_avg_word_length',
        'title_longest_word_length', 'title_all_upcase', 'title_first_upcase',
        'title_any_upcase', 'title_prop_upcase', 'title_all_lowercase', 'title_sentiment',
        'title_contains_digit', 'title_starts_digit', 'title_contains_question', 
        'title_exclamation_count', 'title_punctuation_count', 'title_stop_words_count', 
        'title_stop_words_prop', 'title_contains_quote'
    ]
]

y = df_with_features['view_count']

In [50]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [51]:
model = RandomForestRegressor(n_estimators = 50)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
print("R-squared:", r2)

KeyboardInterrupt: 

In [14]:
explainer = shap.TreeExplainer(model)

In [None]:
shap_values = explainer(X_test)

In [28]:
shap_values

.values =
array([-1.62110409e-02, -4.72495092e-03, -1.82884027e-02, -5.81455912e-03,
       -1.68095011e-03, -1.03275355e-02, -1.46853195e-03, -1.00964817e-02,
       -3.16498311e-03, -2.91928509e-04, -1.54341826e-03, -6.06885396e-03,
       -8.88115692e-04, -8.73807138e-04, -1.06239520e-03, -8.79576564e-03,
       -8.94356457e-04, -6.90766519e-04, -1.02306888e-03, -5.62778648e-04,
       -1.52264826e-03, -3.52828354e-02,  2.94029472e-02, -2.91678258e-05,
        3.06675782e-03, -3.90008851e-05, -7.00410870e-03,  4.10276015e-03,
        4.24236000e-02, -1.28340880e-03, -1.19453419e-03, -3.96953328e-03,
       -1.06251234e-03,  6.31075635e-03,  1.38383218e-04,  1.23080614e-01,
       -3.97288259e-03, -1.21421070e-02, -4.79357571e-03])

.base_values =
array([-1.06142599e-05, -1.06142599e-05, -1.06142599e-05, -1.06142599e-05,
       -1.06142599e-05, -1.06142599e-05, -1.06142599e-05, -1.06142599e-05,
       -1.06142599e-05, -1.06142599e-05, -1.06142599e-05, -1.06142599e-05,
       -1.06142

In [29]:
shap.summary_plot(
    shap_values, 
    X_test.iloc[0], 
    feature_names = keywords + [
        'title_length_chars', 'title_length_words', 'title_avg_word_length',
        'title_longest_word_length', 'title_all_upcase', 'title_first_upcase',
        'title_any_upcase', 'title_prop_upcase', 'title_all_lowercase', 'title_sentiment',
        'title_contains_digit', 'title_starts_digit', 'title_contains_question', 
        'title_exclamation_count', 'title_punctuation_count', 'title_stop_words_count', 
        'title_stop_words_prop', 'title_contains_quote'
    ]
)

AssertionError: Summary plots need a matrix of shap_values, not a vector.

In [20]:
# model = LinearRegression()
# model.fit(X_train, y_train)
# y_pred = model.predict(X_test)
# r2 = r2_score(y_test, y_pred)
# print("R-squared:", r2)

R-squared: 0.09812679288986526


In [26]:
# model = RandomForestRegressor(n_estimators = 100)
# model.fit(X_train, y_train)
# y_pred = model.predict(X_test)
# r2 = r2_score(y_test, y_pred)
# print("R-squared:", r2)

R-squared: 0.8440691365529528


In [21]:
# model = XGBRegressor(n_estimators = 50) 
# model.fit(X_train, y_train)
# y_pred = model.predict(X_test)
# r2 = r2_score(y_test, y_pred)
# print("R-squared:", r2)

R-squared: 0.4951242763196909


In [25]:
# param_grid = {
#     'n_estimators': [100, 200, 300],
#     'max_depth': [3, 4, 5],
#     'learning_rate': [0.01, 0.1, 0.2],
#     'min_child_weight': [1, 3, 5],
# }
# model = XGBRegressor()
# grid_search = GridSearchCV(model, param_grid, cv = 5, scoring = 'r2')
# grid_search.fit(X_train, y_train)
# best_model = grid_search.best_estimator_
# y_pred = best_model.predict(X_test)
# r2 = r2_score(y_test, y_pred)
# print("R-squared:", r2)

R-squared: 0.5694052461503023
