<a href="https://colab.research.google.com/github/michael-L-i/Movies-Topic-Modelling/blob/main/Main_pynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# packages for data scraping

import pandas as pd
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import time
import random
import numpy as np

!pip install selenium
!apt-get update
!apt install chromium-chromedriver
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

!apt install firefox
!apt install xvfb
!pip install pyvirtualdisplay
from selenium.webdriver import FirefoxOptions

In [None]:
# packages for topic modelling and sentiment analysis

!pip install bertopic

import pandas as pd
import numpy as np
from collections import defaultdict

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('vader_lexicon')

import string
import os

from sentence_transformers import SentenceTransformer
from bertopic.vectorizers import ClassTfidfTransformer
from hdbscan import HDBSCAN
from bertopic import BERTopic
from umap import UMAP
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance

In [None]:
# packages for topic labelling and plotting
!pip install openai
from openai import OpenAI
from sklearn.metrics.pairwise import cosine_similarity
import plotly.express as px

In [None]:
# Chrome driver configuration shared by every Chrome session in this notebook.
# Colab has no display and runs as root inside a container, hence the
# headless / no-sandbox / shared-memory flags.
options = webdriver.ChromeOptions()
for _flag in (
    "--verbose",
    '--no-sandbox',
    '--headless',
    '--disable-gpu',
    "--window-size=1920, 1200",
    '--disable-dev-shm-usage',
):
  options.add_argument(_flag)

# **Collection of Reviews Given Movie Name**

**1. Overview**

We gather movie review data from [IMDb](https://www.imdb.com/). The get_movie_info() function takes in a query, which is the name of the movie the user wants reviews for.

**2. Process Structure**

IMDb includes many reviews for each movie. The get_movie_info() function first takes a user query for a movie name, like "Mission: Impossible - Dead Reckoning Part One", and finds the corresponding movie using IMDb's search feature.

Then, once on the review page for the found movie, we extract all the reviews. To extract more, we click the 'load more' button several times and compile all the available reviews.

**3. Technicals**

The first step is to find the movie URL, which on IMDb is specified by a movie tag embedded within the URL rather than by the actual movie name. We do so using Selenium. Once the movie name is passed into the function, we build the IMDb search URL to find matching movies. Then, using Selenium, we extract the URL stored for the top search result.

After getting the actual movie url, we run another driver for the reviews site. One barrier for getting reviews from IMDb is that many of the reviews are hidden, and they only appear when a 'load more' button is clicked. Selenium comes in handy in that it can simulate this process, allowing us to access many reviews on the page. Then, it is a simple process using BeautifulSoup to compile all the reviews.



In [None]:
def get_movie_info(querie, reviews):
  """Scrape IMDb user reviews for the top search hit of a movie title.

  The text of every review found is appended to ``reviews`` in place;
  nothing is returned.

  Args:
    querie: Movie title typed by the user, e.g.
      "Mission: Impossible - Dead Reckoning Part One".
    reviews: List that receives the review texts (mutated in place).

  Raises:
    selenium.common.exceptions.TimeoutException: If no search result appears
      within 10 seconds.
  """
  # Function-scope imports so this cell does not depend on extra top-level imports.
  from urllib.parse import quote_plus
  from selenium.common.exceptions import WebDriverException

  # Titles routinely contain spaces, ':', '&' or '#'; unescaped, those would
  # truncate or corrupt the search query string.
  querieURL = "https://www.imdb.com/find?q=" + quote_plus(querie)

  # finding URL of review page
  opts = FirefoxOptions()
  opts.add_argument("--headless")

  driverFirefox = webdriver.Firefox(options=opts)
  try:
    driverFirefox.get(querieURL)
    # The first element with this class is the link of the top search result.
    movie = WebDriverWait(driverFirefox, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, 'ipc-metadata-list-summary-item__t'))
    )
    movie_url = movie.get_attribute("href")
  finally:
    driverFirefox.quit()

  # Turn ".../title/<tag>/?ref..." into ".../title/<tag>/reviews/?ref...".
  movie_url = "reviews/?".join(movie_url.split('?'))

  # extracting reviews (Chrome uses the module-level `options`)
  driver = webdriver.Chrome(options=options)
  try:
    driver.get(movie_url)

    # At most 10 clicks on 'load more', for the sake of runtime.
    for _ in range(10):
      try:
        load_more_button = driver.find_element(By.CSS_SELECTOR, ".ipl-load-more__button")
        load_more_button.click()
      except WebDriverException:
        # Button missing or not clickable: nothing more to load.
        break
      # Random 3.0-3.9 s pause so the next batch can load.
      time.sleep(float(np.random.randint(30,40)) / 10)

    html = driver.page_source
  finally:
    # Always release the browser process, even if the page fails to load.
    driver.quit()

  soup = BeautifulSoup(html, "html.parser")

  # two different classes for texts based on IMDb's website structure
  for review_class in ("text show-more__control", "text show-more__control clickable"):
    for review in soup.find_all("div", class_=review_class):
      reviews.append(review.get_text())

# **Topic Modelling**

**1. Overview**

This step will look at the reviews and pull out major themes and topics for a particular movie. For example, for a superhero movie, themes that may come out could be: *plot, action, sacrifice, visuals, music, etc.* We will assign these labels later, but we want some sort of way to extract different aspects and themes of a movie.

**2. Process Structure**

To implement this process, we will be using the [BERTopic](https://maartengr.github.io/BERTopic/index.html) package. It was developed by Data Scientist Maarten Grootendorst in 2020. Essentially, utilizing transformers and c-TF-IDF, it is able to cluster texts in a corpus into clear categories. We will apply BERTopic to our segmentation of movie reviews into themes. This process is known in the NLP field as topic modelling.

Before this, however, we need to clean the data to optimize the performance of our eventual BERTopic model. We will split the entire corpus into smaller chunks (each at most 3 sentences long). Essentially, we treat the corpus as containing many 'documents', each one being at most 3 sentences. Further, we will remove most instances of actor names, as they often dominate the topic representations when we would rather look at other keywords.

**3. Technicals**

*BERTopic structure:*

To do this unsupervised classification, we will be using the BERTopic model. Its pipeline involves several components. The first is embedding the texts into vectors in dense vector space, using an embedding model called [all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2). The process follows with dimensionality reduction to make finding clusters easier, and then finding those well-formed clusters of these vectors. Next, we want to extract key words that represent that topic from each cluster. We first employ a slightly altered version of TF-IDF that BERTopic provides. Then we chose to use MaximalMarginalRelevance to find keywords that best define that cluster. It does so by finding a balance between relevance to a unified topic within a cluster and diversity between topics in other clusters.

Finally, we should get some well defined topics for our movie reviews!

*Hyperparameter Tuning*

Hyperparameter tuning is very important here, as results are not optimal under certain parameters. The main components subject to tuning were the UMAP model, to identify the optimal dimensionality reduction that forms clear clusters while keeping them sufficiently distinct from each other, and HDBSCAN, which was optimized to create clusters that lead to the best topic modelling.



In [None]:
def remove_people_names(text):
  """Return ``text`` without the tokens tagged as singular proper nouns.

  The text is tokenized and part-of-speech tagged with NLTK; every token
  whose tag is 'NNP' is dropped and the remaining tokens are joined back
  together with single spaces.
  """
  tokens = nltk.word_tokenize(text)
  kept_tokens = []
  for token, tag in pos_tag(tokens):
    if tag == 'NNP':
      continue  # proper noun: most likely an actor/character name
    kept_tokens.append(token)
  return ' '.join(kept_tokens)

# splits text into sentences of length <= 3
def split_into_paragraphs(text_list, indexingDocs):
    """Split every text into chunks of at most 3 sentences.

    Args:
        text_list: Iterable of review texts.
        indexingDocs: Mapping filled in place with ``chunk -> position`` of
            the text (within ``text_list``) the chunk was cut from. If the
            same chunk occurs in several texts, the last one wins.

    Returns:
        List of all chunks, in order of appearance.
    """
    paragraphs = []

    for index, text in enumerate(text_list):
        sentences = sent_tokenize(text)
        for start in range(0, len(sentences), 3):
            chunk = ' '.join(sentences[start:start + 3])
            paragraphs.append(chunk)
            # Record every chunk, including a trailing one shorter than
            # 3 sentences; otherwise it cannot be traced back to its review.
            indexingDocs[chunk] = index
    return paragraphs

"""
BERTopic Model
"""
def BERTopicModel(docs, reviewsDf):
  """Build the (unfitted) BERTopic pipeline used for the review chunks.

  Args:
    docs: Corpus of text chunks; its size scales HDBSCAN's minimum cluster size.
    reviewsDf: DataFrame of original reviews; its length is used as the
      vectorizer's ``max_df``.

  Returns:
    A configured ``BERTopic`` instance; call ``fit_transform(docs)`` on it.
  """
  # embedding, dimensionality reduction, and clustering
  embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
  umap_model = UMAP(n_neighbors = 25, n_components=5, min_dist=0.0, metric='cosine', random_state=0)
  # HDBSCAN rejects a minimum cluster size below 2, which len(docs)//75
  # would produce for corpora of fewer than 150 chunks.
  min_cluster_size = max(2, len(docs)//75)
  hdbscan_model = HDBSCAN(min_cluster_size=min_cluster_size, metric='euclidean', cluster_selection_method='leaf', prediction_data=True)

  # create topic representation from the clusters
  vectorizer_model = CountVectorizer(stop_words="english", min_df=2, max_df = len(reviewsDf), ngram_range=(1,2))
  ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
  representation_model = MaximalMarginalRelevance(diversity=0.3, top_n_words=10)

  # define the pipeline
  BERTopic_model = BERTopic(
    embedding_model = embedding_model,
    umap_model = umap_model,
    hdbscan_model = hdbscan_model,
    vectorizer_model = vectorizer_model,
    ctfidf_model = ctfidf_model,
    representation_model = representation_model,
    calculate_probabilities = True,
    top_n_words=10
  )
  return BERTopic_model

# **Sentiment Analysis**

**1. Overview**

Now with clear set topics, we want to analyze the sentiment of the documents belonging to each category/topic. In the end, we should be able to graph the proportion of positive, neutral, and negative ratings for each topic.

**2. Process Structure**

The goal is to create a visualization of the proportion of positive, neutral, and negative reviews.

One pertinent issue is that by splitting reviews into smaller sentence chunks, some of these chunks may only capture plot description which is common in movie reviews. In short, the plot may create an extremely negative sentiment score when that segment did not even include the review's opinion, which might be present in the rest of the review, but which we had cut up.

To fix this, we tie each text segment back to its original review and evaluate the sentiment of that original review. We then use this score to represent each of its text segments. In cases where a review is overwhelmingly positive except for one section on a topic, this design will suffer. However, considering the large dataset size and the general style of reviews (in terms of the effect of bias), this design choice seems to provide good results.

**3. Technicals**

For the specific sentiment analysis, we will be using NLTK's SentimentIntensityAnalyzer which assigns a score of -1 to 1 (-1 being negative, 0 being neutral, and 1 being positive) sentiment in a text.

Following the design structure described above, we will iterate through each sub-text and assign a sentiment score to each using the NLTK package. For each topic, we then count the number of positive, neutral, and negative reviews.

We categorize sentiment by the score as follows:

```
positive: score >= 0.25
negative: score <= -0.25
neutral: -0.25 < score < 0.25
```




In [None]:
"""
reviewsDict: Dictionary with keys (0 - #topics) with list of
all ORIGINAL reviews that have sub-texts belonging to that topic
"""
def documentReversal(topicsDf, reviewsDf, docs, indexingDocs, probs):
  """Group the ORIGINAL reviews by the most probable topic of their chunks.

  For each row of ``probs`` the column with the highest probability is taken
  as the topic of the matching chunk in ``docs``; the original review that
  chunk maps to (via ``indexingDocs``) is appended to that topic's list.
  A review is therefore listed once per chunk assigned to the topic.

  Args:
    topicsDf: Unused; kept for interface compatibility.
    reviewsDf: DataFrame with a "Reviews" column of original reviews.
    docs: Chunks, aligned row-by-row with ``probs``.
    indexingDocs: Mapping ``chunk -> index of its review`` in ``reviewsDf``.
    probs: Per-chunk topic probabilities (one row per chunk).

  Returns:
    defaultdict(list) mapping topic index -> list of original reviews.
  """
  original_reviews = reviewsDf["Reviews"]
  reviewsDict = defaultdict(list)

  for position, topic_probs in enumerate(probs):
    best_topic = np.argmax(topic_probs)
    review_index = indexingDocs[docs[position]]
    reviewsDict[best_topic].append(original_reviews[review_index])

  return reviewsDict

"""
Sentiment Analysis
"""
def sentimentSegmentation(neg, neu, pos, reviewsDict):
  """Count negative / neutral / positive reviews for every topic.

  Each review is scored with VADER's compound score and bucketed as
  negative (score <= -0.25), positive (score >= 0.25) or neutral (otherwise).

  Args:
    neg, neu, pos: Per-topic counters, indexed by topic and updated in place.
    reviewsDict: Mapping topic index -> list of review texts.

  Returns:
    None; the results are written into ``neg``, ``neu`` and ``pos``.
  """
  sia = SentimentIntensityAnalyzer()

  # Visit every topic that has a counter slot, including the last one.
  # .get() avoids inserting empty entries for topics without any review.
  for topic in range(len(neg)):
    for text in reviewsDict.get(topic, ()):
      sentiment_score = sia.polarity_scores(text)['compound']
      if sentiment_score <= -0.25:
        neg[topic] += 1
      elif sentiment_score >= 0.25:
        pos[topic] += 1
      else:
        neu[topic] += 1

# **Topic Labelling**

**1. Overview**

With defined clusters and keyword representation of those clusters, we will begin to assign topic labels to those clusters. The goal is to create a plot of the sentiment towards each cluster paired with a topic label for that cluster.

**2. Process Structure**

The first step is to generate the labels for each category, which we do using OpenAI's GPT-3.5 API. This, however, generates often overlapping categories, and some cleaning is required. This is done through another embedding and comparison process to essentially group closely related topic labels into one. Then we should be able to plot the sentiment with a corresponding topic label, concluding the main goal of this project!

**3. Technicals**

For the first step, we generate topic labels using GPT-3.5 by passing in the keywords and representative documents for each topic, paired with some prompt engineering.

Next, we want to iron out the topics so that there is sufficient diversity between the labels by grouping related categories. We do so by first embedding an array of the keywords for each topic alongside the assigned topic label. Next, we calculate the similarity between each pair of embeddings and treat pairs with a cosine similarity above 0.5 as similar enough to be grouped. The grouping is done in the relabelling() function.

In the end, we should end up with fairly distinct topics that would represent major themes or components of a movie.

In [None]:
def generate_labels(topicsDf):
  """Ask GPT-3.5 for a short label for every topic in ``topicsDf``.

  All topics are labelled within one growing conversation: each request
  contains the system instructions plus every earlier prompt and answer.

  The OpenAI API key is read from the ``OPENAI_API_KEY`` environment
  variable; it must never be hard-coded in the notebook.

  Args:
    topicsDf: DataFrame with 'Representation' (keywords) and
      'Representative_Docs' (example chunks) columns, one row per topic.

  Returns:
    List of label strings, one per row of ``topicsDf``, in row order.
  """
  # OpenAI() picks up the key from the OPENAI_API_KEY environment variable.
  client = OpenAI()

  INSTRUCTIONS = """You are a topic labeler for a movie review segmentation model. The developers have already clustered movie reviews into categories, and each category was assigned a list of the most common words (called 'Representation') that represent it. You will be given this list of representative words along with a few example reviews (called 'Representative_Docs') from that category.

Your task is to output a single, concise category label that best describes the given representative words and example reviews, regardless of the sentiment expressed in the reviews. The label should be a short phrase or a few words, such as 'Music', 'Acting', 'Story Arc', 'Math/Professor Elements', 'Character Depth', 'Emotional Engagement', or 'Messaging/Morals'.

The input will be provided in the following format:

Here are the representative words for this cluster:
[list of representative words separated by commas or newlines]

Here are some representative reviews:
[multiple example reviews separated by newlines]

Give a label for this segmented category.

Please respond with only the category label, without any additional explanation or context. If you cannot determine a suitable label from the given information, respond with 'Unclear'. When determining the category label, focus on the topics or aspects of the movie being discussed, rather than the overall sentiment or opinion expressed in the reviews."""

  responses = []
  messages = [
      {"role": "system", "content": INSTRUCTIONS}
      ]

  # Read the fields from the row itself so the result does not depend on the
  # DataFrame having a 0..n-1 index.
  for _, row in topicsDf.iterrows():
    messages.append(
        {"role": "user", "content": f"Here are the representative words for this cluster:\n\n{row['Representation']}\n\nHere are some representative reviews:{row['Representative_Docs'][:3]}. Give a label for this segmented category."},
    )
    response = client.chat.completions.create(
      model="gpt-3.5-turbo",
      messages=messages
    )
    label = response.choices[0].message.content
    responses.append(label)
    # Keep the answer in the conversation so later requests see earlier labels.
    messages.append(
        {"role": "assistant", "content": label},
    )

  return responses


def preprocess_embeddings(labels, topicsDf):
  """Build, for every topic, the list ``[label, keyword_1, ..., keyword_k]``.

  Args:
    labels: Topic labels, one per row of ``topicsDf`` and in the same order.
    topicsDf: DataFrame whose 'Representation' column holds each topic's
      keyword list.

  Returns:
    List with one entry per topic: the topic's own label followed by at most
    its first 10 keywords.

  Raises:
    IndexError: If ``labels`` has fewer entries than ``topicsDf`` has rows.
  """
  keywordsList = []

  # Row n of topicsDf belongs to labels[n] (not labels[n - 1]).
  for position, representation in enumerate(topicsDf['Representation']):
    key_words = list(representation[:10])
    keywordsList.append([labels[position]] + key_words)

  return keywordsList


def embed(labels, topicsDf):
  """Return one OpenAI embedding vector per topic.

  The OpenAI API key is read from the ``OPENAI_API_KEY`` environment
  variable; it must never be hard-coded in the notebook.

  Args:
    labels: Topic labels, one per row of ``topicsDf``.
    topicsDf: DataFrame with a 'Representation' column of keyword lists.

  Returns:
    List of embedding vectors (lists of floats), one per topic, in row order.
  """
  # OpenAI() picks up the key from the OPENAI_API_KEY environment variable.
  client = OpenAI()

  embeddings = []

  for keywords in preprocess_embeddings(labels, topicsDf):
    response = client.embeddings.create(
      input=keywords,
      model="text-embedding-3-small"
    )
    # NOTE(review): `input` is a list, so the API returns one embedding per
    # string; data[0] is the embedding of the first entry (the label) only.
    # Confirm whether the keywords were meant to contribute as well.
    embeddings.append(response.data[0].embedding)
  return embeddings


def calcSimilarities(embeddings):
  """Find the pairs of topics whose embeddings are similar enough to merge.

  Args:
    embeddings: Sequence of equally sized embedding vectors, one per topic.

  Returns:
    A float array of rows ``[i, j, similarity]`` (with i < j) for every pair
    whose cosine similarity is strictly above 0.5, ordered from most to least
    similar; an empty list when no pair qualifies.
  """
  similarity_threshold = 0.5
  similarities = cosine_similarity(np.array(embeddings))

  # The strict upper triangle lists each unordered pair (i < j) exactly once,
  # row by row.
  first, second = np.triu_indices(len(similarities), k=1)
  scores = similarities[first, second]
  close_enough = scores > similarity_threshold

  if not close_enough.any():
    return []

  pairs = np.column_stack((first[close_enough], second[close_enough], scores[close_enough]))
  descending = pairs[:, 2].argsort()[::-1]
  return pairs[descending]


def relabelling(similaritiesArrSorted, neg, neu, pos, labels):
  """Merge similar topics, drop empty ones and normalise counts to proportions.

  Pairs are processed in the given order (most similar first). For a pair
  ``[i, j, similarity]`` the counts of topic j are added to topic i and
  topic j is emptied. Merging stops as soon as only 5 non-empty topics are
  left. Afterwards topics without any review are removed and the remaining
  counts are turned into fractions that sum to 1 per topic.

  All four lists are modified in place; nothing is returned.

  Args:
    similaritiesArrSorted: Iterable of ``[i, j, similarity]`` rows.
    neg, neu, pos: Per-topic sentiment counts.
    labels: Per-topic labels, aligned with the count lists.
  """
  min_topics = 5

  def is_empty(topic):
    return neg[topic] == 0 and neu[topic] == 0 and pos[topic] == 0

  # Track the number of topics that will survive. len(labels) cannot be used
  # for this because labels are only removed after the merge loop.
  remaining = sum(1 for topic in range(len(neg)) if not is_empty(topic))

  for ele in similaritiesArrSorted:
    if remaining <= min_topics:
      break

    keep, drop = int(ele[0]), int(ele[1])
    # Only a merge of two non-empty topics reduces the number of topics.
    if not is_empty(keep) and not is_empty(drop):
      remaining -= 1

    neg[keep] += neg[drop]
    neu[keep] += neu[drop]
    pos[keep] += pos[drop]

    neg[drop] = 0
    neu[drop] = 0
    pos[drop] = 0

  # Remove emptied topics; slice assignment keeps the callers' list objects.
  survivors = [topic for topic in range(len(neg)) if not is_empty(topic)]
  labels[:] = [labels[topic] for topic in survivors]
  neg[:] = [neg[topic] for topic in survivors]
  neu[:] = [neu[topic] for topic in survivors]
  pos[:] = [pos[topic] for topic in survivors]

  # Every surviving topic has at least one review, so tot is never zero.
  for i in range(len(neg)):
    tot = neg[i] + neu[i] + pos[i]
    neg[i] = neg[i] / tot
    neu[i] = neu[i] / tot
    pos[i] = pos[i] / tot

# **Running Everything**

Finally, we create the run() function that will compile all our functions and create a plot of the sentiment for each topic/theme in a movie.

In [None]:
def run():
  """Run the whole pipeline for one movie and plot sentiment per topic.

  Steps: ask for a movie title, scrape its IMDb reviews, strip proper nouns,
  split the reviews into chunks, fit BERTopic, score the sentiment of each
  topic, label the topics with GPT-3.5, merge similar labels and show a
  stacked horizontal bar chart of the negative/neutral/positive proportions.
  """
  reviews = []
  querie = input("Input a movie to gather reviews! ")
  get_movie_info(querie, reviews)

  if not reviews:
    # Nothing to model; the topic model cannot be fitted on an empty corpus.
    print("No reviews were found for this movie.")
    return

  reviewsDf = pd.DataFrame({"Reviews": reviews})
  reviewsDf['Reviews'] = reviewsDf['Reviews'].apply(remove_people_names)

  indexingDocs = defaultdict(int)
  # docs is the new corpus to be inputted into BERTopic model
  docs = split_into_paragraphs(reviewsDf['Reviews'], indexingDocs)

  # topics stores which topic each 'document' in the corpus belongs to
  # probs stores the probability of each 'document' belonging to all categories
  # -1 label for a topic are attributed to 'documents' that don't fit well with topics
  BERTopic_model = BERTopicModel(docs, reviewsDf)
  topics, probs = BERTopic_model.fit_transform(docs)

  topicsDf = BERTopic_model.get_topic_info() # DataFrame with information about each topic
  # Drop the outlier topic by its id: it is only the first row when outliers
  # exist, so slicing off row 0 would otherwise discard a real topic.
  topicsDf = topicsDf[topicsDf["Topic"] != -1].reset_index(drop=True)

  # cleaning topicsDf for categories with < 10 representative keywords
  # (their keyword list is padded with '' at the end). All columns are
  # removed from probs in one call so the column indices stay valid.
  incomplete = [
      position
      for position, keywords in enumerate(topicsDf["Representation"])
      if keywords[-1] == ''
  ]
  if incomplete:
    topicsDf = topicsDf.drop(index=incomplete).reset_index(drop=True)
    probs = np.delete(probs, incomplete, axis = 1)

  if len(topicsDf) == 0:
    print("No usable topics were found for this movie.")
    return

  # sentiment evaluation for each topic
  reviewsDict = documentReversal(topicsDf, reviewsDf, docs, indexingDocs, probs)
  # One counter per topic, so the counters stay aligned with the labels even
  # if some topic ends up without any review.
  topic_count = len(topicsDf)
  neg = [0]*topic_count
  neu = [0]*topic_count
  pos = [0]*topic_count
  sentimentSegmentation(neg, neu, pos, reviewsDict)

  # initial topic labels
  labels = generate_labels(topicsDf)

  # fine tuning topic labels
  embeddings = embed(labels, topicsDf)
  similaritiesArrSorted = calcSimilarities(embeddings)
  relabelling(similaritiesArrSorted, neg, neu, pos, labels)

  finalDf = pd.DataFrame({"Topics": labels, "Negative": neg, "Neutral": neu, "Positive": pos})

  # deleting rows with "Unclear" label
  finalDf = finalDf[finalDf["Topics"] != "Unclear"].reset_index(drop=True)

  # plotting
  fig = px.bar(finalDf, y="Topics", x=["Negative", "Neutral", "Positive"],
              title="Sentiment Distribution by Topic",
              labels={"value": "Sentiment Distribution", "Topics": "Topics"},
              color_discrete_map={"Negative": "red", "Neutral": "gray", "Positive": "green"},
               orientation = "h"
               )
  fig.update_layout(
    font=dict(
        size=13
    )
  )
  fig.show()

In [None]:
run()