In [1]:
import numpy as np
import pandas as pd
from collections import Counter
import re
import nltk
from nltk.tokenize import word_tokenize, wordpunct_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer

## lda
# !pip install gensim # can install by uncommenting this line
from gensim import corpora
import gensim
## visualizing LDA--likely need to install
# !pip install pyLDAvis # can install by uncommenting this line
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
pyLDAvis.enable_notebook()
## specify to print all output in a call
## and not just first
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import os 
data_path = "../../../data/youtube/"
output_path = "../../../output/sentiment_analysis/youtube/fenty/"

os.makedirs(output_path, exist_ok=True)

In [3]:
## spacy --- if you get an error at the load step
## need to download en_core_web_sm (google or try the next line)
#!python -m spacy download en_core_web_sm
import spacy
sp = spacy.load('en_core_web_sm')
import en_core_web_sm
nlp = en_core_web_sm.load()

  from click.parser import split_arg_string
  from click.parser import split_arg_string


In [4]:
fenty_df = pd.read_csv(data_path + "youtube_fenty.csv")
fenty_df.head()

Unnamed: 0,video_id,title,description,channel,published,views,likes,comments
0,GY90IvsNbvw,TOP 5 FENTY BEAUTY PRODUCTS,MENTIONED Gloss Bomb https://go.magik.ly/ml/1r...,Morgan Turner,2023-02-14T22:00:07Z,254400,17591.0,87.0
1,A50_AmSTdVE,Rihanna’s before &amp; after using THIS founda...,,Fenty Beauty By Rihanna,2024-08-07T21:49:02Z,4308333,216894.0,1990.0
2,4LwpGaDKmZ8,FENTY BEAUTY by RIHANNA... Is It Jeffree Star ...,HEY EVERYONE! Today I'm doing a review and fir...,jeffreestar,2017-09-08T21:50:21Z,13370589,322285.0,19902.0
3,N_zDcuX1Y54,RIHANNA: FENTY BEAUTY - Review + First Impress...,Make sure you subscribe to my channel and hit ...,NikkieTutorials,2017-09-19T20:33:57Z,10381946,252956.0,10111.0
4,bH7M3vBcdcw,FENTY BEAUTY BY RIHANNA | FULL FACE + REVIEW |...,I've been dying to get my hands on the NEW Fen...,Jasmine Brown,2017-09-10T16:12:13Z,668028,24543.0,705.0


In [5]:
## clean data first
def clean_text(text):
    if pd.isnull(text):
        return ""
    
    # Remove URLs and links
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    # Remove punctuation (optional if you want cleaner tokens)
    text = re.sub(r'[^\w\s]', '', text)
    
    # Remove numbers (optional)
    text = re.sub(r'\d+', '', text)
    
    # Remove extra spaces and lowercase
    text = text.lower().strip()
    
    return text


In [6]:
fenty_df['text_clean'] = (fenty_df['title'].fillna('') + ' ' + fenty_df['description'].fillna('')).apply(clean_text)

In [7]:
fenty_df

Unnamed: 0,video_id,title,description,channel,published,views,likes,comments,text_clean
0,GY90IvsNbvw,TOP 5 FENTY BEAUTY PRODUCTS,MENTIONED Gloss Bomb https://go.magik.ly/ml/1r...,Morgan Turner,2023-02-14T22:00:07Z,254400,17591.0,87.0,top fenty beauty products mentioned gloss bom...
1,A50_AmSTdVE,Rihanna’s before &amp; after using THIS founda...,,Fenty Beauty By Rihanna,2024-08-07T21:49:02Z,4308333,216894.0,1990.0,rihannas before amp after using this foundatio...
2,4LwpGaDKmZ8,FENTY BEAUTY by RIHANNA... Is It Jeffree Star ...,HEY EVERYONE! Today I'm doing a review and fir...,jeffreestar,2017-09-08T21:50:21Z,13370589,322285.0,19902.0,fenty beauty by rihanna is it jeffree star app...
3,N_zDcuX1Y54,RIHANNA: FENTY BEAUTY - Review + First Impress...,Make sure you subscribe to my channel and hit ...,NikkieTutorials,2017-09-19T20:33:57Z,10381946,252956.0,10111.0,rihanna fenty beauty review first impression...
4,bH7M3vBcdcw,FENTY BEAUTY BY RIHANNA | FULL FACE + REVIEW |...,I've been dying to get my hands on the NEW Fen...,Jasmine Brown,2017-09-10T16:12:13Z,668028,24543.0,705.0,fenty beauty by rihanna full face review ja...
...,...,...,...,...,...,...,...,...,...
3813,XWrB4qOF_UA,FENTY BEAUTY BODY LAVA &amp; FAIRY BOMB | HIT ...,HEY EVERYONE! Where are all my glow addicts at...,jeffreestar,2018-04-08T17:15:51Z,4871365,154354.0,10915.0,fenty beauty body lava amp fairy bomb hit or ...
3814,vxicEG2iYUE,FENTY BEAUTY IN INDIA #fentybeauty #glossylips...,FENTY BEAUTY NOW IN INDIA @mynykaa Cross Borde...,Zohainsight,2024-03-12T19:06:39Z,13375,244.0,3.0,fenty beauty in india fentybeauty glossylips l...
3815,YtUxmVmsJJA,FENTY PRO FILT&#39;R CONCEALER VS TARTE SHAPE ...,Hola my babes! Hope you're all doing great tod...,Dilan Sabah,2019-01-14T19:04:08Z,80635,1168.0,96.0,fenty pro filtr concealer vs tarte shape tape ...
3816,bcZNf6U5fIg,FENTY BEAUTY LIPSTICK DUPE- ₹2500 vs ₹599😱 | #...,Part III- FAV LIPSTICK DUPES!! #episode3 I lov...,Disha Batra,2021-09-09T13:58:21Z,24591,754.0,6.0,fenty beauty lipstick dupe vs shorts dupes ...


In [8]:
## tokenize
tokens_pos_list = []
for description in fenty_df['text_clean'].dropna():
    description = str(description)
    tokens = word_tokenize(description)
    tokens_pos = pos_tag(tokens)
    tokens_pos_list.append(tokens_pos)

In [79]:
## Stem
custom_words_toad = ['fenty', 'rihanna', 'video', 'fentybeauty','youtube', 'youtuber', 'likes', 'comments', 'today', 'hey','subcribe','makeup','skincare','skin','beauty','new','product']

def preprocess(df_col, custom_words_toad):
    porter = PorterStemmer()
    list_stopwords = stopwords.words("english")
    new_stopwords = set(list_stopwords + custom_words_toad)

    corpus_lower = df_col.fillna("").str.lower().to_list()

    nostop_listing = []
    for text in corpus_lower:
        # Clean URLs
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        # Tokenize and remove stopwords
        tokens = [
            word for word in wordpunct_tokenize(text)
            if word.isalpha() and word not in new_stopwords
        ]
        # Apply stemming
        stemmed_tokens = [porter.stem(word) for word in tokens if len(word) > 2]
        nostop_listing.append(stemmed_tokens)

    return nostop_listing
    
fenty_df['cleaned_description'] = preprocess(fenty_df['description'], custom_words_toad)

In [80]:

## function provided
def create_dtm(list_of_strings, metadata):
    """ 
    Function to create dense document-term matrix (DTM) from a list of strings and provided metadata. 
    A sparse DTM is a list of term_index/doc_index tuples: if a given term occurs in a given doc at least once, 
        then this count is listed as a tuple; if not, that term/doc pair is omitted. 
    In a dense DTM, each row is one text (e.g., an Airbnb listing), each column is a term, and 
        each cell indicates the frequency of that word in that text. 
    
    Parameters:
        list_of_strings (Series): each row contains a preprocessed string (need not be tokenized)
        metadata (DataFrame): contains document-level covariates
    
    Returns:
        Dense DTM with metadata on left and then one column per word in lexicon
    """
    
    # initialize a sklearn tokenizer; this helps us tokenize the preprocessed string input
    vectorizer = CountVectorizer(lowercase = True) 
    dtm_sparse = vectorizer.fit_transform(list_of_strings)
    print('Sparse matrix form:\n', dtm_sparse[:3]) # take a look at sparse representation
    print()
    
    # switch the dataframe from the sparse representation to the normal dense representation (so we can treat it as regular dataframe)
    dtm_dense_named = pd.DataFrame(dtm_sparse.todense(), columns=vectorizer.get_feature_names_out ())
    print('Dense matrix form:\n', dtm_dense_named.head()) # take a look at dense representation
    dtm_dense_named_withid = pd.concat([metadata.reset_index(), dtm_dense_named], axis = 1) # add back document-level covariates

    return(dtm_dense_named_withid)

In [81]:
string_docs = fenty_df.cleaned_description.apply(lambda tokens: ' '.join(tokens))
dtm_nopre = create_dtm(
    list_of_strings=string_docs,
    metadata=fenty_df[['video_id', 'channel', 'views', 'likes', 'comments']]
)

Sparse matrix form:
   (0, 2255)	1
  (0, 1466)	1
  (0, 390)	1
  (0, 433)	1
  (0, 1309)	1
  (0, 714)	1
  (0, 1016)	1
  (0, 985)	1
  (0, 3611)	1
  (2, 1135)	1
  (2, 2950)	1
  (2, 1306)	1
  (2, 1742)	1
  (2, 2041)	1
  (2, 1975)	1
  (2, 2449)	1
  (2, 1500)	1

Dense matrix form:
    abeg  aber  abh  abl  abloh  abod  abonn  abonnez  aborigin  aboutbest  \
0     0     0    0    0      0     0      0        0         0          0   
1     0     0    0    0      0     0      0        0         0          0   
2     0     0    0    0      0     0      0        0         0          0   
3     0     0    0    0      0     0      0        0         0          0   
4     0     0    0    0      0     0      0        0         0          0   

   ...  继ladi  蕾哈娜和劳伦  글로벌  뷰티걸들과  뷰티플레이  서포터즈입니다  시작합니다  영상에  인터뷰이는  출연한  
0  ...      0       0    0      0      0        0      0    0      0    0  
1  ...      0       0    0      0      0        0      0    0      0    0  
2  ...      0       0    0      0

In [82]:
# Skip metadata columns manually
term_columns = dtm_nopre.columns.difference(['video_id', 'channel', 'views', 'likes', 'comments', 'index'])
top_terms = dtm_nopre[term_columns].sum(axis=0)
top_terms_sorted = top_terms.sort_values(ascending=False)

print(top_terms_sorted.head(20))


review      349
foundat     296
product     262
subscrib    198
lip         192
use         185
guy         183
welcom      176
watch       173
tri         173
love        161
hope        155
pro         155
first       155
get         154
short       151
back        148
gloss       143
shade       140
full        137
dtype: int64


In [83]:

## Step 1: re-tokenize and store in list
## here, i'm doing with the raw random sample of text
## in activity, you should do with the preprocessed texts
text_raw_tokens = fenty_df.cleaned_description



## Step 2: use gensim create dictionary - gets all unique words across documents
text_raw_dict = corpora.Dictionary(text_raw_tokens)
raw_len = len(text_raw_dict) # get length for comparison below

### explore first few keys and values
### see that key is just an arbitrary counter; value is the word itself
{k: text_raw_dict[k] for k in list(text_raw_dict)[:5]}


## Step 3: filter out very rare and very common words
## here, i'm using the threshold that a word needs to appear in at least
## 5% of docs but not more than 95%
## this is an integer count of docs so i round
lower_bound = round(fenty_df.shape[0]*0.05)
upper_bound = round(fenty_df.shape[0]*0.95)

### apply filtering to dictionary
text_raw_dict.filter_extremes(no_below = lower_bound,
                             no_above = upper_bound)
print(f'Filtering out very rare and very common words reduced the \
length of dictionary from {str(raw_len)} to {str(len(text_raw_dict))}.')
{k: text_raw_dict[k] for k in list(text_raw_dict)[:5]} # show first five entries after filtering


## Step 4: apply dictionary to TOKENIZED texts
## this creates a mapping between each word 
## in a specific listing and the key in the dictionary.
## for words that remain in the filtered dictionary,
## output is a list where len(list) == n documents
## and each element in the list is a list of tuples
## containing the mappings
corpus_fromdict = [text_raw_dict.doc2bow(one_text) 
                   for one_text in text_raw_tokens]

### can apply doc2bow(one_text, return_missing = True) to print words
### eliminated from the listing bc they're not in filtered dictionary.
### but feeding that one with missing values to
### the lda function can cause errors
corpus_fromdict_showmiss = [text_raw_dict.doc2bow(one_text, return_missing = True)
                            for one_text in text_raw_tokens]
print('Sample of documents represented in dictionary format (with omitted words noted):')
corpus_fromdict_showmiss[:10]

{0: 'bomb', 1: 'bright', 2: 'conceal', 3: 'drop', 4: 'eaz'}

Filtering out very rare and very common words reduced the length of dictionary from 4033 to 3.


{0: 'review', 1: 'foundat', 2: 'product'}

Sample of documents represented in dictionary format (with omitted words noted):


[([],
  {'bomb': 1,
   'bright': 1,
   'conceal': 1,
   'drop': 1,
   'eaz': 1,
   'fix': 1,
   'gloss': 1,
   'mention': 1,
   'tint': 1}),
 ([], {}),
 ([(0, 1)],
  {'everyon': 1,
   'first': 1,
   'got': 1,
   'impress': 1,
   'last': 1,
   'line': 1,
   'night': 1}),
 ([],
  {'bell': 1,
   'channel': 1,
   'hit': 1,
   'make': 1,
   'miss': 1,
   'notif': 1,
   'subscrib': 1,
   'sure': 1,
   'video': 1}),
 ([(0, 1)],
  {'die': 1,
   'first': 1,
   'get': 1,
   'hand': 1,
   'impress': 1,
   'line': 1,
   'want': 1}),
 ([],
  {'absolut': 1,
   'everyon': 1,
   'face': 1,
   'forget': 1,
   'full': 1,
   'let': 1,
   'look': 1,
   'love': 1,
   'much': 1,
   'thank': 1,
   'watch': 1}),
 ([],
  {'diffus': 1,
   'filt': 1,
   'instant': 1,
   'pore': 1,
   'primer': 3,
   'pro': 1,
   'rare': 1,
   'retouch': 1}),
 ([], {'channel': 1, 'launch': 1, 'sister': 1, 'subscrib': 1, 'video': 1}),
 ([], {'bomb': 1, 'fok': 1, 'gloss': 1, 'must': 1, 'stix': 1}),
 ([], {'code': 1, 'fat': 1, 'get'

In [84]:
## Step 5: we're finally ready to estimate the model!
## full documentation here - https://radimrehurek.com/gensim/models/ldamodel.html
## here, we're feeding the lda function:
## (1) the corpus we created from the dictionary,
## (2) a parameter we decide on for the number of topics (k),
## (3) the dictionary itself,
## (4) parameter for number of passes through training data (more means slower), and
## (5) parameter that returns, for each word remaining in dict, the topic probabilities.
## see documentation for many other arguments you can vary
ldamod = gensim.models.ldamodel.LdaModel(corpus_fromdict, 
                                         num_topics = 5, 
                                         id2word=text_raw_dict, 
                                         passes=6, 
                                         alpha = 'auto',
                                         per_word_topics = True)

print(type(ldamod))

<class 'gensim.models.ldamodel.LdaModel'>


In [85]:
## Post-model 1: explore corpus-wide summary of topics
### getting the topics and top words; can retrieve diff top words
topics = ldamod.print_topics(num_words = 10)
for topic in topics:
    print(topic)

(0, '0.639*"product" + 0.234*"foundat" + 0.127*"review"')
(1, '0.998*"product" + 0.001*"foundat" + 0.001*"review"')
(2, '0.514*"foundat" + 0.478*"review" + 0.008*"product"')
(3, '0.997*"foundat" + 0.002*"product" + 0.001*"review"')
(4, '0.998*"review" + 0.001*"foundat" + 0.001*"product"')


In [86]:
    
## Post-model 2: explore topics associated with each document
### for each item in our original dictionary, get list of topic probabilities
l=[ldamod.get_document_topics(item) for item in corpus_fromdict]
### print result
text_raw_tokens[0:5]
l[0:5]

0    [mention, gloss, bomb, bright, fix, conceal, e...
1                                                   []
2    [everyon, review, first, impress, line, last, ...
3    [make, sure, subscrib, channel, hit, notif, be...
4    [die, get, hand, review, first, impress, line,...
Name: cleaned_description, dtype: object

[[(0, 0.16551188),
  (1, 0.20902297),
  (2, 0.17502663),
  (3, 0.21745461),
  (4, 0.23298393)],
 [(0, 0.16551188),
  (1, 0.20902297),
  (2, 0.17502663),
  (3, 0.21745461),
  (4, 0.23298393)],
 [(0, 0.083798826),
  (1, 0.10582017),
  (2, 0.08952449),
  (3, 0.11008888),
  (4, 0.6107676)],
 [(0, 0.16551188),
  (1, 0.20902297),
  (2, 0.17502663),
  (3, 0.21745461),
  (4, 0.23298393)],
 [(0, 0.08379882),
  (1, 0.10582017),
  (2, 0.089504845),
  (3, 0.11008888),
  (4, 0.61078733)]]

In [87]:
lda_display = gensimvis.prepare(ldamod, corpus_fromdict, text_raw_dict)
pyLDAvis.display(lda_display)

In [15]:
all_nouns = [
    word for one_tok in tokens_pos_list for (word, tag) in one_tok if tag == "NNP"
]

all_adj_and_nouns = [
    word for one_tok in tokens_pos_list for (word, tag) in one_tok if tag == "JJ" or tag == "NN"
]

In [10]:
# Top 20 most common proper nouns (e.g. brand names, product names)
top_nouns = Counter(all_nouns).most_common(20)
print("🔠 Top Proper Nouns:", top_nouns)

# Top 20 adjectives and nouns (for common product features, sentiments)
top_adj_nouns = Counter(all_adj_and_nouns).most_common(20)
print("💬 Top Adjs/Nouns:", top_adj_nouns)

🔠 Top Proper Nouns: [('x', 97), ('_', 9), ('von', 6), ('kylie', 5), ('à', 5), ('_the', 4), ('__', 4), ('youtuber', 3), ('म', 3), ('makeup', 2), ('october', 2), ('함께', 2), ('한', 2), ('뷰티', 2), ('토크', 2), ('éxito', 2), ('mattemoiselle', 2), ('मकब', 2), ('बकस', 2), ('सऊद', 2)]
💬 Top Adjs/Nouns: [('fenty', 4195), ('beauty', 3628), ('rihanna', 1317), ('makeup', 1094), ('new', 1092), ('fentybeauty', 762), ('review', 752), ('i', 741), ('foundation', 726), ('skin', 709), ('video', 457), ('lip', 426), ('gloss', 424), ('bomb', 350), ('skincare', 334), ('first', 305), ('tutorial', 296), ('haul', 283), ('full', 282), ('amp', 276)]


{'huez',
 'theyshe',
 'castle',
 'indianasian',
 'tongue',
 'shab',
 'summer',
 'plush',
 'p',
 'rockys',
 'questionable',
 'macchiato',
 'criss',
 'instinto',
 'sight',
 'muoka',
 'lbs',
 'impressionsreview',
 'clã',
 'iv',
 'few',
 'simiuk',
 'usando',
 'profilter',
 'delight',
 'avis',
 'depth',
 'agenda',
 'hallo',
 'interviewing',
 'arianagrande',
 'white',
 'excellent',
 'swarovski',
 'news',
 'empowerment',
 'kay',
 'blogger',
 'crystal',
 'weekly',
 'contact',
 'tense',
 'shadowstix',
 'monday',
 'reviewer',
 'sangria',
 'unleash',
 'sephorasale',
 'forbes',
 'freckle',
 'comfortable',
 'texturedacne',
 'charitable',
 'facebook',
 'co',
 'size',
 'preppygrwm',
 'sparkly',
 'yan',
 'aka',
 'brightfix',
 'kardashianjenner',
 'edc',
 'demi',
 'balance',
 'tom',
 'kulfibeauty',
 'valerian',
 'redhead',
 'nobie',
 'banana',
 'treslucebeauty',
 'generous',
 'ownit',
 'jamaica',
 'selfimage',
 'anyone',
 'affiliate',
 'downright',
 'travel',
 'melissajackson',
 'literal',
 'boutique',

Brand videos: 153
Influencer videos: 547


In [18]:
all_words = ' '.join(df['title'].dropna()).lower()
words = re.findall(r'\b[a-z]+\b', all_words)
word_counts = Counter(words)

# Most common non-boring words
common = [w for w in word_counts.items() if w[0] not in {'the','and','with','for','of','to','in', 'by', 'shorts', 'is', 'fenty','beauty'}]
print(sorted(common, key=lambda x: x[1], reverse=True)[:20])


[('fentybeauty', 193), ('rihanna', 164), ('gloss', 154), ('new', 152), ('bomb', 119), ('makeup', 112), ('lip', 107), ('s', 92), ('foundation', 84), ('stix', 75), ('review', 63), ('skin', 56), ('soft', 48), ('swatches', 45), ('amp', 44), ('lipstick', 40), ('lit', 37), ('shade', 35), ('sephora', 35), ('powder', 31)]


In [22]:
pos_tags = pos_tag(words)
for word, pos_tag in pos_tags:
    print(f"{word}: {pos_tag}")