## Preliminaries

In [1]:
import pandas as pd
from langdetect import detect
from textblob import TextBlob
import string
import nltk
from nltk.corpus import stopwords
import sys

In [2]:
# Put the sibling project directory on the import path (position 1, i.e. right
# after the first entry) so that the local text_feature_engineering module can
# be imported on the next line.
sys.path.insert(1, '../youtube_views_predictor')
from text_feature_engineering import TextFeatureExtractor

In [3]:
# Read the combined dataset from the cleaned CSV; the two timestamp columns
# are parsed as dates while loading.
df = pd.read_csv(
    "../data/GB_youtube_trending_data_cleaned.csv",
    parse_dates=["publishedAt", "trending_date"],
)

## Identify Keywords

In [4]:
def remove_punctuation_within_word(word):
    """Return ``word`` with every ``string.punctuation`` character removed.

    All other characters (letters, digits, whitespace, symbols outside the
    ASCII punctuation set) are kept in their original order. A word made up
    solely of punctuation therefore becomes the empty string.
    """
    kept_chars = []
    for symbol in word:
        if symbol in string.punctuation:
            continue
        kept_chars.append(symbol)
    return ''.join(kept_chars)

# Identify frequently occurring title words.
# Titles are lower-cased and split on whitespace, then every token has ASCII
# punctuation stripped so that e.g. "(official" and "official" are counted
# together.
MIN_WORD_OCCURRENCES = 1000  # minimum number of appearances for a word to be kept

word_counts = (
    df['title']
    .str.lower()
    .str.split()
    .explode()   # one row per token, without building a wide intermediate frame
    .dropna()    # missing or empty titles contribute no tokens
    .map(remove_punctuation_within_word)
    .value_counts()
)
frequent_words = word_counts[word_counts >= MIN_WORD_OCCURRENCES].index

# Remove stop words, punctuation and empty tokens.
# Tokens that consisted only of punctuation (such as "-" or "|") were reduced
# to '' by the stripping step above. An empty string is neither a stop word nor
# a punctuation character, so it has to be rejected explicitly; otherwise it
# ends up in the keyword candidates, where a substring search for '' matches
# every title.
# NOTE(review): contractions are counted without their apostrophe ("don't" ->
# "dont"), so they may not match the apostrophised stop-word entries - confirm
# whether that matters for this analysis.
stop_words = set(stopwords.words('english'))
punctuation = set(string.punctuation)
frequent_words = [
    word
    for word in frequent_words
    if word and word.lower() not in stop_words and word not in punctuation
]

In [5]:
# Determine which frequent words are associated with higher average views.

# Replace missing titles with '' once, before the loop. Assigning the result
# back to the column updates `df` itself; an in-place fillna on the selected
# column is a chained operation that pandas deprecates and that leaves `df`
# unchanged under copy-on-write.
df['title'] = df['title'].fillna('')

# Lower-casing is independent of the word being tested, so do it a single time.
titles_lower = df['title'].str.lower()

word_view_records = []
for word in frequent_words:
    # Substring match on the lower-cased title. regex=False makes sure the
    # word is searched for literally instead of being compiled as a pattern.
    word_mask = titles_lower.str.contains(word.lower(), regex=False)
    avg_views = df.loc[word_mask, 'view_count'].mean()
    word_view_records.append({'Word': word, 'Average_Views': avg_views})

# Build the frame in one step rather than concatenating a one-row frame per
# word. The explicit column list keeps both columns present even when there
# are no frequent words, so the sort below still works.
result_df = pd.DataFrame(word_view_records, columns=['Word', 'Average_Views'])

result_df = result_df.sort_values(by='Average_Views', ascending=False)
result_df.head(20)

Unnamed: 0,Word,Average_Views
50,mv,15288760.0
143,bts,12446170.0
15,shorts,8218931.0
174,black,5507017.0
56,teaser,5180543.0
1,official,4105751.0
108,among,3945388.0
5,trailer,3834790.0
9,music,3604624.0
146,prix,3555647.0


## Create Text Features

In [6]:
# Keyword list: the first column of the top 20 rows of result_df, plus the
# manually added 'How'.
top_words = result_df.iloc[:20, 0].tolist()
keywords = top_words + ['How']

# Hand the keywords to the project's extractor and build the feature frame.
feature_extractor = TextFeatureExtractor(keywords)
df_with_features = feature_extractor.create_text_features(df)
df_with_features.head()

Unnamed: 0,video_id,title,publishedAt,channelId,channelTitle,categoryId,trending_date,tags,view_count,likes,...,title_all_lowercase,title_sentiment,title_contains_digit,title_starts_digit,title_contains_question,title_exclamation_count,title_punctuation_count,title_stop_words_count,title_stop_words_prop,title_contains_quote
0,J78aPJ3VyNs,I left youtube for a month and THIS is what ha...,2020-08-11 16:34:06+00:00,UCYzPXprvl5Y-Sf0g4vX-m6g,jacksepticeye,24,2020-08-12 00:00:00+00:00,"['jacksepticeye', 'funny', 'funny meme', 'meme...",2038853,353790,...,0,-1,0,0,0,0,1,7,0.636364,0
1,9nidKH8cM38,TAXI CAB SLAYER KILLS 'TO KNOW HOW IT FEELS',2020-08-11 20:00:45+00:00,UCFMbX7frWZfuWdjAML0babA,Eleanor Neale,27,2020-08-12 00:00:00+00:00,"['eleanor', 'neale', 'eleanor neale', 'eleanor...",236830,16423,...,0,-1,0,0,0,0,2,2,0.222222,1
2,M9Pmf9AB4Mo,Apex Legends | Stories from the Outlands The ...,2020-08-11 17:00:10+00:00,UC0ZV6M2THA81QT9hrVWJG3A,Apex Legends,20,2020-08-12 00:00:00+00:00,"['Apex Legends', 'Apex Legends characters', 'n...",2381688,146739,...,0,-1,0,0,0,0,1,3,0.333333,0
3,kgUV1MaD_M8,Nines - Clout (Official Video),2020-08-10 18:30:28+00:00,UCvDkzrj8ZPlBqRd6fIxdhTw,Nines,24,2020-08-12 00:00:00+00:00,"['Nines', 'Trapper of the year', 'Crop Circle'...",613785,37567,...,0,-1,0,0,0,0,3,0,0.0,0
4,49Z6Mv4_WCA,i don't know what im doing anymore,2020-08-11 20:24:34+00:00,UCtinbF-Q-fVthA0qrFQTgXQ,CaseyNeistat,22,2020-08-12 00:00:00+00:00,,940036,87113,...,1,-1,0,0,0,0,1,4,0.571429,0


## Test Models

In [24]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV, train_test_split

In [18]:
# Title-derived feature columns selected from df_with_features.
title_feature_columns = [
    'title_length_chars',
    'title_length_words',
    'title_avg_word_length',
    'title_longest_word_length',
    'title_all_upcase',
    'title_first_upcase',
    'title_any_upcase',
    'title_prop_upcase',
    'title_all_lowercase',
    'title_sentiment',
    'title_contains_digit',
    'title_starts_digit',
    'title_contains_question',
    'title_exclamation_count',
    'title_punctuation_count',
    'title_stop_words_count',
    'title_stop_words_prop',
    'title_contains_quote',
]

# Predictors: the columns named after each keyword, followed by the title features.
X = df_with_features[keywords + title_feature_columns]

# Target column for the regressors.
y = df_with_features['view_count_scaled']

In [19]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
# NOTE(review): the frame has both video_id and trending_date columns. If one
# video can appear on several rows, a purely random split may place rows with
# the same title in both train and test - confirm whether a grouped split is
# needed.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [20]:
# Baseline: linear regression fitted on the training split and scored with
# R-squared on the held-out split.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print("R-squared:", r2)

R-squared: 0.09812679288986526


In [22]:
# Random forest with 50 trees, fitted on the training split and scored with
# R-squared on the held-out split.
# The seed is fixed because the forest's bootstrap sampling and feature
# selection are randomised; without it the printed score changes from run to
# run and cannot be reproduced.
model = RandomForestRegressor(n_estimators = 50, random_state = 42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
print("R-squared:", r2)

R-squared: 0.8448089288795526


In [21]:
# Gradient-boosted trees (50 boosting rounds), fitted on the training split
# and scored with R-squared on the held-out split.
model = XGBRegressor(n_estimators=50).fit(X_train, y_train)
y_pred = model.predict(X_test)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print("R-squared:", r2)

R-squared: 0.4951242763196909


In [None]:
# Hyper-parameter grid for XGBoost: every combination of the values below is
# evaluated with 5-fold cross-validation on the training split, using
# R-squared as the selection metric.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 4, 5],
    learning_rate=[0.01, 0.1, 0.2],
    min_child_weight=[1, 3, 5],
)
model = XGBRegressor()
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='r2',
    cv=5,
).fit(X_train, y_train)

# Score the best estimator found by the search on the held-out split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print("R-squared:", r2)