<h2>Commentary Analysis - On one video</h2>

In [None]:
# To customize depending on the user:
# base location that is prepended to 'StopWords_mine.csv' when the custom
# stop-word list is loaded further down in the notebook.
path = "C:\\Users\\john.doe\\Documents\\My_Directory"

In [None]:
#Import libraries

import itertools
import os
import time

import numpy as np
import pandas as pd

#Libraries for web scraping
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

#Libraries for counting word frequencies
from collections import Counter

#Libraries for preprocessing
from gensim.parsing.preprocessing import remove_stopwords
import string
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from french_lefff_lemmatizer.french_lefff_lemmatizer import FrenchLefffLemmatizer
from spacy.lang.fr.stop_words import STOP_WORDS as fr_stop
from textblob import TextBlob
#nltk.download('punkt')

#Libraries for vectorisation and clustering
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import KMeans

#Libraries for visualization
import webcolors
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import plotly.express as px

<h2>Analysing comments under one video</h2>

<h3>Retrieve the video comments</h3>

In [None]:
# Identifier of the video to analyse; it is appended to the YouTube watch URL when the page is opened.
videoId = 'My_video_id'

In [None]:
# Open the video page with Selenium and collect the text of each comment and of each like counter.

with Chrome() as driver:
    wait = WebDriverWait(driver, 10)
    driver.get("https://www.youtube.com/watch?v=" + videoId)

    # Send the END key to the page body three times, pausing after each press
    # (presumably so that the comment section has time to load — TODO confirm).
    for _ in range(3):
        page_body = wait.until(EC.visibility_of_element_located((By.TAG_NAME, "body")))
        page_body.send_keys(Keys.END)
        time.sleep(3)

    Comments, Likes, URLs = [], [], []  # URLs is initialised but not populated here

    comment_nodes = wait.until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#comment #content-text"))
    )
    Comments.extend(node.text.strip('\n') for node in comment_nodes)

    like_nodes = wait.until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#vote-count-middle"))
    )
    Likes.extend(node.text for node in like_nodes)

    # Quick preview of what was scraped
    print(Comments[0:5])
    print(Likes[0:5])

<h4> Analyse the word cloud of the comments</h4>

In [None]:
# Tokenization

# Regex tokenizer that keeps runs of word characters; produces one token list per comment.
test = nltk.RegexpTokenizer(r'\w+')
SaveWords = [test.tokenize(comment_text) for comment_text in Comments]
    
def flatten_list(_2d_list):
    """Return one list containing the items of every inner iterable, in order.

    Only a single level of nesting is flattened.
    """
    return [item for element in _2d_list for item in element]

# Merge the per-comment token lists into one flat list, then preview the first ten tokens.
SaveWords = flatten_list(SaveWords)
SaveWords[:10]

In [None]:
# Most commonly-used words

# Normalise every token to a lower-case string, then rank the tokens by frequency.
SaveWords = [str(token.lower()) for token in SaveWords]
CountWords = Counter(SaveWords)
CountWords.most_common(10)

In [None]:
# Most commonly-used words - no stop words

#nltk.download("stopwords")
# Keep only the tokens that are not in the French stop-word set, then recount.
SaveWords_filtered = list(filter(lambda token: token not in fr_stop, SaveWords))
CountWords_filtered = Counter(SaveWords_filtered)
CountWords_filtered.most_common(10)

In [None]:
# Most commonly-used words - last results
# Discard one-character tokens, then recount.
SaveWords_filtered = [token for token in SaveWords_filtered if len(token) > 1]
CountWords_filtered = Counter(SaveWords_filtered)
CountWords_filtered.most_common(10)

In [None]:
# Load the user-maintained stop-word list.
# os.path.join inserts the directory separator between `path` and the file name;
# plain string concatenation would glue the file name onto the directory name.
stopwords_file = os.path.join(path, 'StopWords_mine.csv')
SW = pd.read_csv(stopwords_file, encoding='latin-1')

# Each cell of the "StopWords" column is split on commas after removing single quotes.
# Entries are trimmed (a token never contains surrounding blanks, so an untrimmed
# entry could never match) and empty cells / empty entries are skipped.
SW = [
    entry.strip()
    for cell in SW["StopWords"].dropna()
    for entry in cell.replace("'", "").split(',')
    if entry.strip()
]

# A set makes the membership test constant-time; SW itself stays a list for later cells.
custom_stop_words = set(SW)
SaveWords_filtered = [word for word in SaveWords_filtered if word not in custom_stop_words]
CountWords_filtered = Counter(SaveWords_filtered)
CountWords_filtered.most_common()[0:10]

In [None]:
# Word cloud

# WordCloud.generate expects plain text. Joining the tokens with spaces gives it exactly
# the words to draw; str(list) would instead hand over the list's repr, whose quotes,
# commas and brackets get mixed into the tokenisation.
wordcloud_text = ' '.join(SaveWords_filtered)
wordcloud = WordCloud(background_color='white', max_font_size=50).generate(wordcloud_text)
plt.figure()
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

<h3> Comment likes versus sentiment expressed </h3>

In [None]:
#!pip install textblob
#!pip install textblob-fr
from textblob import TextBlob
from textblob_fr import PatternTagger, PatternAnalyzer

In [None]:
# Scoring depending on the feeling expressed in the comment
# The French tagger and analyzer do not depend on the comment, so they are created once
# and shared by every TextBlob instead of being re-instantiated for each comment.
fr_tagger = PatternTagger()
fr_analyzer = PatternAnalyzer()

# One score per comment: the first component of the sentiment result.
Scoring = [
    TextBlob(c_, pos_tagger=fr_tagger, analyzer=fr_analyzer).sentiment[0]
    for c_ in Comments
]

In [None]:
def _parse_like_count(raw):
    """Convert the text of a like counter into a number.

    Handles plain integers ("12"), abbreviated counters with a k/m suffix in either
    case ("1.2K", "3M"), a decimal comma and embedded blanks ("1,2 k").
    Returns NaN when the text is empty or cannot be interpreted.

    NOTE(review): a comma is always read as a decimal separator — confirm that the
    scraped counters never use it as a thousands separator.
    """
    # str.split() with no argument also removes non-breaking spaces.
    cleaned = ''.join(str(raw).split()).lower()
    factor = 1.0
    if cleaned.endswith('k'):
        factor, cleaned = 1e3, cleaned[:-1]
    elif cleaned.endswith('m'):
        factor, cleaned = 1e6, cleaned[:-1]
    try:
        return float(cleaned.replace(',', '.')) * factor
    except ValueError:
        return float('nan')


# pd.to_numeric would raise on abbreviated counters, hence the dedicated parser.
Likes_int = np.array([_parse_like_count(text) for text in Likes], dtype=float)
d = {'Comments': Comments, 'Scoring': Scoring, 'Likes': Likes_int}
Results = pd.DataFrame(data=d)
Results = Results.sort_values('Scoring', ascending=False)
# Counters that are empty or unreadable are counted as zero likes.
Results['Likes'] = Results['Likes'].fillna(0)
Results[0:6]

In [None]:
# Likes vs. sentiment score. Missing like counts are stored as 0, and a plain log axis
# cannot display 0, so a symmetric-log axis is used: linear near zero, logarithmic above.
plt.scatter(Results['Scoring'], Results['Likes'])
plt.yscale('symlog')
plt.xlabel('Negativity -> Positivity')
plt.ylabel('Number of likes for the comments')

In [None]:
# Correlation-coefficient matrix between the sentiment score and the number of likes.
sentiment_scores = Results['Scoring']
like_counts = Results['Likes']
rho = np.corrcoef(sentiment_scores, like_counts)
rho

<h3> Topic extraction from comments </h3>

In [None]:
# Lemmatisation
# Each comment is tokenised and lower-cased, then lemmatised token by token: the
# lemmatiser is given one word at a time rather than a whole sentence.
# NOTE(review): lemmatize() is called with its default part of speech — confirm that
# this is the intended setting for these comments.
lemmatizer = FrenchLefffLemmatizer()
Comments_lemm = []
for comment_text in Comments:
    tokens = [token.lower() for token in test.tokenize(str(comment_text))]
    Comments_lemm.append(' '.join(lemmatizer.lemmatize(token) for token in tokens))

# Stop words + one-letter word filtering, applied to the lemmatised comments so that
# the lemmatisation above actually reaches the text used for topic modelling.
Comments_final = []
for lemmatised_text in Comments_lemm:
    kept_words = [
        word
        for word in lemmatised_text.split()
        if word not in fr_stop and word not in SW and len(word) > 1
    ]
    Comments_final.append(' '.join(kept_words))


In [None]:
# Modelling parameters
n_features, n_components, n_top_words = 100, 2, 10

# Turn the cleaned comments into a matrix of term counts with scikit-learn's CountVectorizer
# ("tf" presumably stands for term frequency; TensorFlow is not involved).
vectorizer_options = dict(analyzer='word', max_features=n_features, max_df=0.95, min_df=2)
tf_vectorizer = CountVectorizer(**vectorizer_options)
tf = tf_vectorizer.fit_transform(Comments_final)

# Fit the LDA topic model on the term-count matrix
lda_options = dict(
    n_components=n_components,
    max_iter=20,
    # author's note: "batch" gives the same results, prefer "online" for large datasets
    learning_method="online",
    learning_decay=0.7,    # controls the learning rate of the "online" method
    learning_offset=50.0,
    random_state=0,
)
lda = LatentDirichletAllocation(**lda_options)
lda.fit(tf)

# Vocabulary terms, aligned with the columns of the term-count matrix
tf_feature_names = tf_vectorizer.get_feature_names_out()

In [None]:
# Plot the topics

def plot_top_words(model, feature_names, n_top_words, title):
    """Draw one horizontal bar chart per topic showing its highest-weighted words.

    Parameters
    ----------
    model : fitted topic model exposing ``components_`` (one row of word weights per topic)
    feature_names : sequence mapping a column index of ``components_`` to its word
    n_top_words : number of words displayed for each topic
    title : title placed above the whole figure
    """
    # One panel per topic, whatever the number of components. The width grows with the
    # topic count (30 for two topics), and squeeze=False keeps `axes` an array even when
    # there is a single topic.
    n_topics = len(model.components_)
    fig, axes = plt.subplots(1, n_topics, figsize=(15 * n_topics, 30), sharex=True, squeeze=False)
    axes = axes.flatten()

    for topic_idx, (ax, topic) in enumerate(zip(axes, model.components_)):
        # Indices of the n_top_words largest weights, in decreasing order
        top_features_ind = topic.argsort()[:-n_top_words - 1:-1]
        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]

        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f"Topic {topic_idx +1}", fontdict={"fontsize": 30})
        ax.invert_yaxis()  # heaviest word at the top
        ax.tick_params(axis="both", which="major", labelsize=20)
        for side in "top right left".split():
            ax.spines[side].set_visible(False)

    # The figure title only needs to be set once, not once per topic.
    fig.suptitle(title, fontsize=40)
    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.show()

plot_top_words(lda, tf_feature_names, n_top_words, "Topics of the video commentary")

<h3> Potential improvements</h3>

<h4> Remove non-French comments</h4>

In [None]:
# Language guesser tried out as a way to detect non-French comments.
import ngramfreq
# Created once here; the following cells call its guess_language() method.
text_categorizer = ngramfreq.NGramBasedTextCategorizer()

In [None]:
# Try the guesser on a fixed English sentence.
print(text_categorizer.guess_language("Brian is in the kitchen."))

In [None]:
# Try the guesser on the first collected comment.
print(text_categorizer.guess_language(str(Comments[0])))

<font color='green'>**TBD:** The results are far from satisfactory across all comments. To be tested further.</font>

<h4> Use another 'feeling' metric</h4>

In [None]:
# Compare, for an English sample and for the first comment, the default TextBlob
# polarity with the score obtained through the textblob_fr tagger/analyzer.
samples = (
    ('english comment -->', 'My feelings are good'),
    ('french comment -->', Comments[0]),
)
for label, text in samples:
    sent = TextBlob(text)
    polarity = sent.sentiment.polarity
    current = TextBlob(text, pos_tagger=PatternTagger(), analyzer=PatternAnalyzer()).sentiment[0]
    print(label, polarity, 'versus', current)

<font color='green'>**TBD:** Check whether there is a French equivalent of 'polarity'. Another option would be to investigate the Stanford Log-Linear Part-Of-Speech Tagger.</font>

In [None]:
#!pip install transformers
from transformers import pipeline
# NOTE(review): no model is named, so the library picks its default sentiment model —
# confirm that this model handles French before relying on the labels.
classifier = pipeline('sentiment-analysis')
# Both samples are scored in a single call: a cell only displays its last expression,
# so two separate calls would leave the first result unseen.
classifier([Comments[0], 'Je ne suis pas du tout d accord'])