In [6]:
# Import everything necessary:
from tensorflow.keras.models import Model
from keras.preprocessing.text import Tokenizer                   
from keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from tensorflow import keras
import guidedlda
import pandas as pd
import numpy as np

import nltk 
nltk.download('words')

from nltk.corpus import stopwords
nltk.download('stopwords')

from nltk import word_tokenize
from nltk.stem import *
from sklearn.feature_extraction.text import CountVectorizer

import nltk
nltk.download('punkt')

import regex as re

from google.colab import drive

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Predicting Youtube Video Trend Time
## Topic Modeling with GuidedLDA
### Evan Phillips, Jaypal Bhatia, and Noor Gill

In [2]:
# For faster data loading: 
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Read in datasets from countries that are predominantly English-speaking (United States, Great Britian, and Canada): 
CA_data = pd.read_csv('/content/drive/MyDrive/266 Final Project/Data/CA_youtube_trending_data.csv')
GB_data = pd.read_csv('/content/drive/MyDrive/266 Final Project/Data/GB_youtube_trending_data.csv')
US_data = pd.read_csv('/content/drive/MyDrive/266 Final Project/Data/US_youtube_trending_data.csv')

In [4]:
# Concatenate all videos into a single df: 
df = pd.concat([CA_data, GB_data, US_data])
df.head()

Unnamed: 0,video_id,title,publishedAt,channelId,channelTitle,categoryId,trending_date,tags,view_count,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,description
0,KX06ksuS6Xo,Diljit Dosanjh: CLASH (Official) Music Video |...,2020-08-11T07:30:02Z,UCZRdNleCgW-BGUJf-bbjzQg,Diljit Dosanjh,10,2020-08-12T00:00:00Z,clash diljit dosanjh|diljit dosanjh|diljit dos...,9140911,296541,6180,30059,https://i.ytimg.com/vi/KX06ksuS6Xo/default.jpg,False,False,CLASH official music video performed by DILJIT...
1,J78aPJ3VyNs,I left youtube for a month and THIS is what ha...,2020-08-11T16:34:06Z,UCYzPXprvl5Y-Sf0g4vX-m6g,jacksepticeye,24,2020-08-12T00:00:00Z,jacksepticeye|funny|funny meme|memes|jacksepti...,2038853,353797,2628,40222,https://i.ytimg.com/vi/J78aPJ3VyNs/default.jpg,False,False,I left youtube for a month and this is what ha...
2,M9Pmf9AB4Mo,Apex Legends | Stories from the Outlands – “Th...,2020-08-11T17:00:10Z,UC0ZV6M2THA81QT9hrVWJG3A,Apex Legends,20,2020-08-12T00:00:00Z,Apex Legends|Apex Legends characters|new Apex ...,2381688,146740,2794,16549,https://i.ytimg.com/vi/M9Pmf9AB4Mo/default.jpg,False,False,"While running her own modding shop, Ramya Pare..."
3,3C66w5Z0ixs,I ASKED HER TO BE MY GIRLFRIEND...,2020-08-11T19:20:14Z,UCvtRTOMP2TqYqu51xNrqAzg,Brawadis,22,2020-08-12T00:00:00Z,brawadis|prank|basketball|skits|ghost|funny vi...,1514614,156914,5857,35331,https://i.ytimg.com/vi/3C66w5Z0ixs/default.jpg,False,False,SUBSCRIBE to BRAWADIS ▶ http://bit.ly/Subscrib...
4,VIUo6yapDbc,Ultimate DIY Home Movie Theater for The LaBran...,2020-08-11T15:10:05Z,UCDVPcEbVLQgLZX0Rt6jo34A,Mr. Kate,26,2020-08-12T00:00:00Z,The LaBrant Family|DIY|Interior Design|Makeove...,1123889,45803,964,2198,https://i.ytimg.com/vi/VIUo6yapDbc/default.jpg,False,False,Transforming The LaBrant Family's empty white ...


## Data Cleaning:

In [7]:
# Remove non-English words from titles: 
def valid_words_only(column):
  values = []
  for i in df[column]:
    i = re.sub('[^a-zA-Z0-9 ]', " ", i)
    values.append(i)
  df[column] = values

valid_words_only('title')

## Unsupervised Learning: 

In [8]:
# Pre-process by accounting for English stopwords and terms with no definition value
stop_words = list(stopwords.words("english"))

In [9]:
# Tokenize words based on GloVe:
tokenizer = Tokenizer() 
tokenizer.fit_on_texts(df.title.values)
words_to_index = tokenizer.word_index

# Pad sequences:
sequences = tokenizer.texts_to_sequences(df.title.values)
X = pad_sequences(sequences, padding='post')

# Check size:
print(X.shape) 

(350330, 24)


In [10]:
# Tokenize with Porter Stemmer due to error: 
porter_stemmer = PorterStemmer()

def stem_tokenizer(text):
  ''' Tokenizes and stems the words in a text'''
  return [porter_stemmer.stem(token) for token in word_tokenize(text.lower())]

In [11]:
# Use Glove vectorizer: 
token_vectorizer = CountVectorizer(tokenizer=stem_tokenizer, stop_words=stop_words)
X = token_vectorizer.fit_transform(df.title)

  % sorted(inconsistent)


In [12]:
# Data validation purposes:
X[0,:]

<1x21678 sparse matrix of type '<class 'numpy.int64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [13]:
# Create a  dictionary of tags terms from vector:
tf_feature_names = token_vectorizer.get_feature_names()
word2id = dict((v, idx) for idx, v in enumerate(tf_feature_names))



In [14]:
# Print words/phrases from generated dictionary (first 10):
list(word2id.items())[:10]

[('0', 0),
 ('00', 1),
 ('000', 2),
 ('0000001', 3),
 ('000cal', 4),
 ('000fp', 5),
 ('000ft', 6),
 ('000g', 7),
 ('000hp', 8),
 ('000th', 9)]

In [15]:
# Create seed words for model to converge to (based on each video category):
gaming = ['xbox', 'playstation','ps3','ps4','ps5',
          'wii', 'gaming','console','roblox','twitch',
          'minecraft','fortnite','pubg','league of legends',
          'gta', 'battleground', 'pewdiepie', 'streaming',
          'call of duty', 'jumpscare', 'faze', 'warcraft', 'mobile legends',
          'lego']

fashion_and_beauty = ['fashion', 'beauty', 'viral makeup', 'style', 'shoes', 
                      'shirt', 'purse', 'mua', 'handbag', 'pants', 
                      'jacket', 'dress', 'jeans', 'try on haul', 'classy',
                      'y2k', 'fashion week', 'must have', 'closet', 'skincare',
                      'vogue', 'grwm', 'ootd', 'styling', 'makeover']

reactions = ['tried', 'happened', 'pranked', 'surprised', 'shocked', 
             'first time', 'react', 'reaction', 'prank', 'surprise',
             'challenge', 'best moments', 'compilation']

learning = ['calculus', 'academy','diy','how to','tutorial',
            'math','science','physics','chemistry','trigonometry',
            'statistics','machine learning','algebra','java','python',
            'build', 'stata','construct', 'learn', 'develop', 
            'create', 'SAT','GMAT','ACT','GRE','PSAT', 'artificial intelligence',
            'STEM', 'careers', 'resume', 'application', 'workshop', 'education',
            'probability', 'explained']

sports = ['sports', 'athlete', 'highlights', 'nbc sports', 'wwe', 
          'nba', 'wnba', 'mlb', 'nfl', 'mma',
          'ufc', 'world cup', 'superbowl', 'espn', 'league', 
          'boxing', 'football', 'basketball', 'soccer', 'christiano ronaldo', 
          'lebron james', 'kobe bryant', 'michael jordan', 'pat mahomes', 'lionel messi']

music = ['bts', 'harry styles', 'justin bieber', 'selena gomez', 'original song', 
         'cover', 'remix', 'mashup', 'music', 'official music video', 
         'pop', 'r&b', 'drake', 'punjabi song', 'ed sheeran', 
         'rihanna', 'ariana grande', 'taylor swift', 'billie eilish', 'bad bunny',
         'kpop']

# Ensure all words from the original list are in the word2id list: 
gaming = [x for x in gaming if x in list(word2id.keys())]
fashion_and_beauty = [x for x in fashion_and_beauty if x in list(word2id.keys())]
learning = [x for x in learning if x in list(word2id.keys())]
sports = [x for x in sports if x in list(word2id.keys())]
music = [x for x in music if x in list(word2id.keys())]

# Create an array of topics (categories) with all relevent words:
seed_category_list = [gaming, fashion_and_beauty, learning, sports, music, ]
categories = ['gaming', 'fashion_and_beauty', 'learning', 'sports', 'music','other']

In [16]:
# Install guidedLDA this way due to error:
!pip install git+https://github.com/CatalinVoss/GuidedLDA

Collecting git+https://github.com/CatalinVoss/GuidedLDA
  Cloning https://github.com/CatalinVoss/GuidedLDA to /tmp/pip-req-build-y42cxff_
  Running command git clone -q https://github.com/CatalinVoss/GuidedLDA /tmp/pip-req-build-y42cxff_


In [17]:
# Initiate guidedLDA Model
model = guidedlda.GuidedLDA(n_topics=6, n_iter=100, random_state=7, refresh=10)
seed_categories = {} 
for t_id, st in enumerate(seed_category_list): 
    for word in st: 
        seed_categories[word2id[word]] = t_id 
model.fit(X, seed_topics=seed_categories, seed_confidence=0.15) 

INFO:guidedlda:n_documents: 350330
INFO:guidedlda:vocab_size: 21678
INFO:guidedlda:n_words: 2338916
INFO:guidedlda:n_topics: 6
INFO:guidedlda:n_iter: 100
INFO:guidedlda:<0> log likelihood: -29064510
INFO:guidedlda:<10> log likelihood: -19972432
INFO:guidedlda:<20> log likelihood: -19777438
INFO:guidedlda:<30> log likelihood: -19620391
INFO:guidedlda:<40> log likelihood: -19444936
INFO:guidedlda:<50> log likelihood: -19242745
INFO:guidedlda:<60> log likelihood: -19042008
INFO:guidedlda:<70> log likelihood: -18871959
INFO:guidedlda:<80> log likelihood: -18723184
INFO:guidedlda:<90> log likelihood: -18603817
INFO:guidedlda:<99> log likelihood: -18500782


<guidedlda.guidedlda.GuidedLDA at 0x7f8e691ca8d0>

In [18]:
# Create numeric matrix containing the category for each subject line:
probabilities = model.transform(X)



In [19]:
# Create a dataframe for categories and associated values:
df_categories_probs = pd.DataFrame(probabilities, columns = categories)
df_categories_probs = df_categories_probs.astype('float64')
df_categories_probs.dtypes

gaming                float64
fashion_and_beauty    float64
learning              float64
sports                float64
music                 float64
other                 float64
dtype: object

In [20]:
# Assign the one with the highest probability as the category:
df_categories_probs['category'] = df_categories_probs.idxmax(axis=1)
df_categories_probs

Unnamed: 0,gaming,fashion_and_beauty,learning,sports,music,other,category
0,0.000844,0.002049,0.002441,0.000641,0.993276,0.000749,music
1,0.988613,0.001724,0.004962,0.003298,0.000821,0.000583,gaming
2,0.001519,0.005792,0.991734,0.000244,0.000230,0.000480,learning
3,0.013835,0.967836,0.012628,0.002302,0.002366,0.001034,fashion_and_beauty
4,0.132556,0.002477,0.863577,0.000597,0.000432,0.000361,learning
...,...,...,...,...,...,...,...
350325,0.000673,0.001794,0.003855,0.279856,0.001013,0.712809,other
350326,0.003715,0.212326,0.155770,0.622818,0.002832,0.002539,sports
350327,0.001712,0.000545,0.001239,0.808646,0.000470,0.187387,sports
350328,0.000917,0.004959,0.980925,0.004564,0.001171,0.007464,learning


In [21]:
# Insert titles into the dataframe 
titles = df['title'].values
df_categories_probs['title'] = titles

df_categories_probs.tail(50)

Unnamed: 0,gaming,fashion_and_beauty,learning,sports,music,other,category,title
350280,0.003476,0.537959,0.25164,0.2025634,0.0004269892,0.003935,fashion_and_beauty,Putin s war on Ukraine explained
350281,0.258269,0.002563,0.001865,0.5565467,0.1804568,0.0003,sports,The Biggest Music Industry Screw Job Ever ...
350282,0.993566,0.001712,0.002356,0.001157553,0.0008434127,0.000365,gaming,Constrained Writing shorts
350283,0.012346,0.37339,0.001239,0.1397668,0.0009419093,0.472316,other,Las Vegas mom hid note in daughter s sock to a...
350284,0.022131,0.963402,0.007878,0.004456038,0.001217906,0.000915,fashion_and_beauty,GOTTA RESPECT DAD WHEN HE S GOT YOUR BACK S...
350285,0.997017,0.000591,0.001302,0.0006868291,0.000193059,0.000211,gaming,Endurance wreck Ernest Shackleton s lost ship...
350286,0.994809,0.000962,0.001931,0.000854062,0.0001161113,0.001328,gaming,Ranking the Elden Ring Bosses from Easiest to ...
350287,0.003529,0.001854,0.001948,0.001360579,0.9890091,0.002299,music,King Von Too Real Official Video
350288,0.005383,0.003772,0.987962,0.001276613,0.0003287226,0.001277,learning,I Sent A MrBeast Burger To Space
350289,0.000102,9.8e-05,0.000502,0.9984871,7.772584e-05,0.000734,sports,Russell Wilson headed to Denver Broncos in blo...


In [22]:
# Generate Category-Keyword Matrix:
df_category_keywords = pd.DataFrame(model.components_) 

# Assign Column and Index:
df_category_keywords.columns = token_vectorizer.get_feature_names()
df_category_keywords.index = categories

df_category_keywords = df_category_keywords.transpose()
df_category_keywords



Unnamed: 0,gaming,fashion_and_beauty,learning,sports,music,other
0,9.896131e-04,7.393845e-04,7.930337e-04,9.799116e-04,1.634851e-04,9.402772e-03
00,4.850980e-05,2.811241e-08,7.983115e-06,3.109082e-05,2.476671e-08,2.276162e-05
000,9.212696e-03,1.453440e-03,1.283690e-03,2.600814e-03,2.476671e-08,4.800150e-04
0000001,3.528586e-05,2.811241e-08,2.652198e-08,2.823871e-08,2.476671e-08,2.526262e-08
000cal,2.203989e-08,2.811241e-08,4.511389e-05,2.823871e-08,2.476671e-08,2.526262e-08
...,...,...,...,...,...,...
zuko,2.203989e-08,2.811241e-08,2.652198e-08,2.261921e-05,3.717483e-05,2.526262e-08
zuu,2.203989e-08,2.811241e-08,1.328751e-05,2.823871e-08,2.476671e-08,2.526262e-08
zverev,2.203989e-08,2.811241e-08,2.652198e-08,4.803405e-05,3.222149e-05,2.526262e-08
zy,2.203989e-08,2.811241e-08,2.652198e-08,2.823871e-08,9.931451e-06,2.526262e-08


In [23]:
# Get the category of a new title inputted by the user:
title = input("Enter a title for your video: ")

def predict_category(text):
    """ Returns the probability vector for the input text to belong to each of the topics"""
    text_vec = token_vectorizer.transform([text])
    doc_topic = model.transform(text_vec)
    df_categories_probs = pd.DataFrame(doc_topic, columns = categories).astype('float64')
    category = df_categories_probs.idxmax(axis=1)
    return str(category[0])

predict_category(title)

Enter a title for your video: sick dunks!




'other'