# Imports
Imports of the required libraries to complete lab3

In [2]:
# regex
import re

# filesystem paths
from pathlib import Path

# pandas + numpy
import numpy as np
import pandas as pd

# setting pandas options
pd.set_option('display.max_colwidth', 200)


# storing and loading models
import pickle

# to set types for functions
from typing import Tuple

# Plotting
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots


# gpu debug
import torch

# setting device to use GPU for NLP backend if you have GPU available
device = "cuda" if torch.cuda.is_available() else "cpu"


# SBERT
from sentence_transformers import SentenceTransformer

# UMAP
from umap import UMAP

#HDBSCAN
from hdbscan import HDBSCAN

# topic finding
from sklearn.feature_extraction.text import TfidfVectorizer

# Loading model from pickle if possible, to avoid downloading it again.
# NOTE(review): pickle.load can execute arbitrary code stored in the file, so this
# cache should only ever point at a model-*.pkl that this notebook wrote itself.
model_cache_path = f'model-{device}.pkl'

try:
    # `with` closes the file handle even when unpickling fails
    with open(model_cache_path, 'rb') as model_file:
        model = pickle.load(model_file)

    model_load = True

# `except Exception` keeps the best-effort fallback (missing or unreadable cache ->
# download the model), but unlike a bare `except:` it lets KeyboardInterrupt and
# SystemExit through, so the cell can still be interrupted
except Exception:
    model = SentenceTransformer('all-mpnet-base-v2', device=device)

    # cache the freshly downloaded model for the next run on this device
    with open(model_cache_path, 'wb') as model_file:
        pickle.dump(model, model_file)

    model_load = False

print(f"""
GPUs detected:          {torch.cuda.device_count()}
Using GPU:              {torch.cuda.is_available()}
Device:                 {device}
Got model from pickle:  {model_load}
""")




GPUs detected:          0
Using GPU:              False
Device:                 cpu
Got model from pickle:  True



# Functions
Creating a function that finds the most relevant word per cluster id.

In [3]:
def tfidf_most_relevant_word(input: list, num_words=5) -> list:
  """
  Pick the highest-scoring TF-IDF terms for every cluster.

  Each entry of `input` is treated as its own corpus: a fresh vectorizer
  (with English stop words removed) is fitted on it, the TF-IDF scores are
  summed per term over all documents of that corpus, and the terms with the
  largest sums are kept.

  Args:
      input (list): One collection of title strings per cluster id.
      num_words (int, optional): Number of terms to keep per cluster. Defaults to 5.

  Returns:
      list: One numpy array of terms per entry of `input`, best term first.
  """

  def top_terms(titles):
    # fit a vectorizer on this cluster's titles only
    tfidf = TfidfVectorizer(stop_words='english')
    term_scores = np.asarray(tfidf.fit_transform(titles).sum(axis=0)).ravel()

    # term indices ordered from highest to lowest summed score
    ranking = np.argsort(term_scores)[::-1]
    vocabulary = np.array(tfidf.get_feature_names_out())
    return vocabulary[ranking[:num_words]]

  return [top_terms(titles) for titles in input]


## Cleaning
Here we define a function whose main task is to "clean" a string. We cast all characters in the string to lower case and remove punctuation and other non-alphanumeric characters (whitespace is kept).
The function takes a string as an argument and returns the "cleaned" string.
 

In [4]:
def string_cleaner(input: str) -> str:
    """
    Normalise a string for text processing.

    The text is lower-cased and every character that is neither a word
    character (letter, digit, underscore) nor whitespace is dropped.
    Whitespace itself is left untouched.

    Args:
        input (str): String to be cleaned.

    Returns:
        str: Cleaned string.
    """
    lowered = input.lower()

    # strip punctuation and any other non-word, non-whitespace character
    return re.sub(r'[^\w\s]', '', lowered)

## Topic Modeling
Creating a function that maps topics to cluster ids.
The function returns a dictionary with cluster ids as keys and topics as values.

In [5]:
# NOTE: `tfidf_most_relevant_word` is defined once, in the "Functions" section
# above. The byte-identical second copy that used to live in this cell was removed:
# a later `def` silently shadows an earlier one, so keeping two copies means an
# edit to the first definition would have no effect.
  

def topic_by_clusterId(result: pd.DataFrame) -> dict:
  """
  Build a lookup from cluster id to that cluster's most relevant words.

  Args:
      result (pd.DataFrame): Dataframe with at least the columns
          "titles" and "cluster_label".

  Returns:
      dict: Cluster ids as keys, the words returned by
          `tfidf_most_relevant_word` for that cluster as values.
  """

  # one row per cluster id, holding the list of all titles in that cluster
  titles_per_cluster = (
    result[["titles", "cluster_label"]]
    .groupby("cluster_label")
    .agg(list)
    .reset_index()
  )

  topics_per_cluster = tfidf_most_relevant_word(titles_per_cluster["titles"])

  return dict(zip(titles_per_cluster["cluster_label"], topics_per_cluster))

## Plotting Functions


In [6]:
# when you actually cast the types here, it works with how pandas casts types and you don't have to worry about copying Series
def result_df_maker(embeddings: np.ndarray, cluster_labels: np.ndarray, titles: np.ndarray) -> pd.DataFrame:
  """
  Assemble the plotting dataframe: coordinates, title, cluster label and topic string.

  Args:
      embeddings (np.ndarray): 2D array of embeddings (two columns, used as x and y).
      cluster_labels (np.ndarray): array of cluster labels.
      titles (np.ndarray): array of titles.

  Returns:
      pd.DataFrame: Columns x, y, titles, cluster_label and topics, where topics
          is the space-joined word list of the row's cluster.
  """
  result = pd.DataFrame(embeddings, columns=['x', 'y']).assign(
    titles=titles,
    cluster_label=cluster_labels,
  )

  # cluster id -> most relevant words of that cluster
  topic_dict = topic_by_clusterId(result)

  # look up each row's cluster and flatten its words into one display string
  result["topics"] = result["cluster_label"].apply(lambda label: " ".join(topic_dict[label]))

  return result

def result_splitter(result: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
  """
  Function to split the dataframe into two dataframes, one for clustered and one for outliers.

  Rows whose cluster_label is -1 are treated as outliers; every other row is
  treated as clustered. Row order and index values are kept.

  Args:
      result (pd.DataFrame): Dataframe with a "cluster_label" column (plus whatever
          other columns should be carried along, e.g. embeddings, topics and titles).

  Returns:
      Tuple[pd.DataFrame, pd.DataFrame]: (clustered, outliers) dataframes.
  """

  # evaluate the outlier test once and reuse it for both halves
  is_outlier = result["cluster_label"] == -1

  clustered = result.loc[~is_outlier, :]
  outliers = result.loc[is_outlier, :]
  return clustered, outliers

# the cavalry is not here, but it's fine! Why? I am here!
def result_tracer(clustered: pd.DataFrame, outliers: pd.DataFrame) -> Tuple[go.Scattergl, go.Scattergl]:
  """
  Build the two scatter traces of a cluster plot: coloured clusters and grey outliers.

  Args:
      clustered (pd.DataFrame): clustered rows; needs the columns x, y, cluster_label and topics.
          Points are coloured by cluster_label and show topics + cluster id on hover.
      outliers (pd.DataFrame): outlier rows; needs the columns x and y. Drawn in grey
          with a fixed "Outlier" hover text.

  Returns:
      Tuple[go.Scattergl, go.Scattergl]: (cluster trace, outlier trace).
  """

  # marker colour follows the cluster id, mapped onto the Rainbow colour scale
  cluster_marker = dict(size=2, color=clustered.cluster_label, colorscale="Rainbow")

  # per-point hover payload: column 0 = topics, column 1 = cluster id
  hover_payload = np.column_stack([clustered.topics, clustered.cluster_label])

  trace_cluster = go.Scattergl(
    name="Clustered",
    mode="markers",
    x=clustered.x,
    y=clustered.y,
    marker=cluster_marker,
    customdata=hover_payload,
    hovertemplate="<b>Topics:</b> %{customdata[0]} <br><b>Cluster Id:</b> %{customdata[1]}<extra></extra>",
  )

  trace_outlier = go.Scattergl(
    name="Outliers",
    mode="markers",
    x=outliers.x,
    y=outliers.y,
    marker=dict(size=1, color="grey"),
    hovertemplate="Outlier<extra></extra>",
  )

  return trace_cluster, trace_outlier

def result_tracer_wrapper(uembs: np.ndarray, cluster_labels: np.ndarray, titles: np.ndarray) -> Tuple[go.Scattergl, go.Scattergl]:
  """
  Go from raw arrays to the two scatter traces in one call.

  Chains result_df_maker -> result_splitter -> result_tracer.

  Args:
      uembs (np.ndarray): 2D array of embeddings.
      cluster_labels (np.ndarray): array of cluster labels.
      titles (np.ndarray): array of titles.

  Returns:
      Tuple[go.Scattergl, go.Scattergl]: (cluster trace, outlier trace).
  """

  frames = result_splitter(result_df_maker(uembs, cluster_labels, titles))
  return result_tracer(*frames)

In [7]:
def subplotter(trace_nested_list: list, titles: list, base_size=1000) -> go.Figure:
    """
    Function to make a figure with subplots of the clustered and outliers.

    Args:
        trace_nested_list (list): list holding rows of columns, each column holding the
            traces that are drawn into the same subplot. The number of grid columns is
            taken from the first row.
        titles (list): Titles for the subplots
        base_size (int, optional): Base size of the sub plots. Defaults to 1000.

    Returns:
        go.Figure: Figure with subplots.
    """

    row_count = len(trace_nested_list)
    col_count = len(trace_nested_list[0])

    fig = make_subplots(
        rows=row_count,
        cols=col_count,
        subplot_titles=titles,
        vertical_spacing=0.02,
        horizontal_spacing=0.02
    )

    # plotly grid positions are 1-based, hence start=1
    for row_idx, row in enumerate(trace_nested_list, start=1):
        for col_idx, cell_traces in enumerate(row, start=1):

            # adding both outliers and clustered to the subplot of this cell.
            # The column index has to be passed on as well: with a fixed col=1 every
            # trace of a multi-column row would pile up in the first column.
            for trace in cell_traces:
                fig.add_trace(trace, row=row_idx, col=col_idx)

    # figure settings
    fig.update_xaxes(visible=False)
    fig.update_yaxes(visible=False)

    # total figure size scales with the grid so each subplot stays base_size x base_size
    fig.update_layout(width=base_size*col_count, height=base_size*row_count, plot_bgcolor='rgba(250,250,250,1)')

    return fig

## Saving / Showing Plots

In [8]:
def fig_show_save(fig: go.Figure, filename: str, show=True):
  """
  Function to show and save a figure.

  The figure is written to figures/<filename>.html and figures/<filename>.png.
  The target folder is created when it does not exist yet.

  Args:
      fig (go.Figure): fig to be saved and shown
      filename (str): filename to save the figure, without extension
      show (bool, optional): Option to disable showing of figure (in case too big for notebook). Defaults to True.
  """

  html_path = f"figures/{filename}.html"
  png_path = f"figures/{filename}.png"

  # a fresh checkout has no figures/ folder and writing into a missing directory
  # fails, so make sure the target folder exists first
  Path(html_path).parent.mkdir(parents=True, exist_ok=True)

  # writing both interactible .html and static image .png
  fig.write_html(html_path)
  fig.write_image(png_path)

  if show:
    fig.show()

# Data Part
In the code below we use pandas, as imported, to read the data from the file USvideos.csv in the folder data.

We then copy one column (title) and store it in the variable df.

If no GPU is available, we use a sample (5%) of the dataset instead, so that the text encodings can be computed in a reasonable time.



In [9]:
# got data from kaggle: https://www.kaggle.com/datasets/datasnaek/youtube-new?resource=download

df_whole = pd.read_csv("data/USvideos.csv")

# only the title column is needed; .copy() so later column assignments do not touch df_whole
df = df_whole[["title"]].copy()

# random_state pins the sample, so the embeddings, clusters and topics further down
# are computed on the same rows on every Restart & Run All
# if your computer does not have GPU support, you can use a sample of the dataset instead to make it run in a reasonable time
# if you want to use the full dataset even wihtout GPU in case you have a very strong CPU, then you can just comment out the next line
if device == "cpu": df = df.sample(frac=0.05, random_state=42)

print(df_whole.shape)

print(df)

(40949, 16)
                                                                                                   title
31639  Houston Rockets vs Minnesota Timberwolves - Full Game Highlights  | Game 3 | April 21, 2018 | NBA
36033                                                           WE GOT INTEL'S PROTOTYPE GRAPHICS CARD!!
735                                                                   EARL GREY MACARONS- The Scran Line
40685                                              I Filled My Swimming Pool with 10,000 Water Balloons!
19041                                                                   Ready to Serve | Romney for Utah
...                                                                                                  ...
25107                                                                    Pen Circle Traps Ant (But How?)
25874                      Rihanna Claps Back at Snapchat for Domestic Violence Ad Featuring Chris Brown
4727                                       

## Cleaning
In the code below we're applying the function, string_cleaner, defined above. This function is applied only on one column ('title') and we store the returned column as "title_clean".

At the end we print out the first 5 rows.

In [10]:
# run string_cleaner on every title and keep the result next to the raw title
df["title_clean"] = df["title"].apply(string_cleaner)

# show the first 5 rows to compare raw and cleaned titles
df.head(5)

Unnamed: 0,title,title_clean
31639,"Houston Rockets vs Minnesota Timberwolves - Full Game Highlights | Game 3 | April 21, 2018 | NBA",houston rockets vs minnesota timberwolves full game highlights game 3 april 21 2018 nba
36033,WE GOT INTEL'S PROTOTYPE GRAPHICS CARD!!,we got intels prototype graphics card
735,EARL GREY MACARONS- The Scran Line,earl grey macarons the scran line
40685,"I Filled My Swimming Pool with 10,000 Water Balloons!",i filled my swimming pool with 10000 water balloons
19041,Ready to Serve | Romney for Utah,ready to serve romney for utah


# ML Part
In this part we're focusing on the Machine learning. This is where the machine uses ML algorithms to provide output.

## Getting Encodings
In the first line below we take one column (title_clean) and convert it to a numpy array. This array is passed to the encode function of the model.

The model is defined in the imports cell at the top of the document, where it is either loaded from a pickle file or downloaded.
That cell runs the model 'all-mpnet-base-v2' on the selected device (the CPU when no GPU is available). It is a sentence-transformer model that maps sentences and paragraphs to a 768-dimensional space. The model has been trained in advance on a large and diverse dataset.

First we print the shape of the embeddings to see how many data points we have and how large one embs data point is.

Then we list different information about the embs variable, just to get a better understanding of how it works.

We also print different outputs of the embs variable to get a better understanding of how the sentence-transformer works.

In [11]:
# display the cleaned titles that are fed to the sentence-transformer
df["title_clean"]

31639    houston rockets vs minnesota timberwolves  full game highlights   game 3  april 21 2018  nba
36033                                                           we got intels prototype graphics card
735                                                                 earl grey macarons the scran line
40685                                             i filled my swimming pool with 10000 water balloons
19041                                                                 ready to serve  romney for utah
                                                     ...                                             
25107                                                                    pen circle traps ant but how
25874                   rihanna claps back at snapchat for domestic violence ad featuring chris brown
4727                                                                             sia  candy cane lane
2635                                                               standup battle 

In [12]:
# encode every cleaned title with the sentence-transformer loaded in the imports cell
embs = model.encode(df["title_clean"].to_numpy())

# embs.shape is (number of titles, embedding size)
print(f"The shape of our embeddings: {embs.shape}")

The shape of our embeddings: (2047, 768)


In [13]:
# Quick inspection of the embeddings. The printed labels are Norwegian:
#   "hvor langt er selve embs"     -> "how long is embs itself"        (number of rows)
#   "hvor langt er et element"     -> "how long is one element"        (length of one embedding)
#   "hva er første element i embs" -> "what is the first element of embs"
print(f"""
{type(embs)}
hvor langt er selve embs: {len(embs)}
hvor langt er et element: {len(embs[0])}
hva er første element i embs: 
{embs[0]} 
""")


<class 'numpy.ndarray'>
hvor langt er selve embs: 2047
hvor langt er et element: 768
hva er første element i embs: 
[-2.26884745e-02 -8.31665471e-03  2.42827404e-02 -1.27505111e-02
  1.11368829e-02  1.74464416e-02 -8.17276761e-02  1.84070077e-02
 -1.79314483e-02 -1.54649168e-02 -1.04354843e-02 -1.90188047e-02
  3.55703980e-02  5.01903147e-02  6.82189316e-02 -9.90920067e-02
  1.90044846e-02  1.99982729e-02  5.37842661e-02  1.50026046e-02
  3.45795453e-02 -1.56342881e-04  2.44024601e-02  9.49529186e-03
 -6.11622781e-02  1.85784716e-02 -1.30533697e-02 -2.22329088e-02
 -3.56743932e-02 -5.91970123e-02 -1.01654720e-03 -3.95411178e-02
  4.78973165e-02 -8.46622214e-02  1.68445865e-06 -2.57827360e-02
  1.44640813e-02  1.98779330e-02 -9.83369537e-03  4.73073199e-02
  2.50553824e-02  3.56453508e-02  2.98828315e-02 -1.77229550e-02
 -5.65097062e-03 -6.67793378e-02  2.49763746e-02  3.45051959e-02
  1.50609575e-02  1.52258379e-02  2.00261623e-02  1.24098388e-02
  2.53189821e-02 -3.49230343e-03 -2.03

In [14]:
# list(embs) splits the 2D array into one row vector per title, so that each
# dataframe cell holds the embedding of its own title
df["embs"] = list(embs)

df.head(3)

Unnamed: 0,title,title_clean,embs
31639,"Houston Rockets vs Minnesota Timberwolves - Full Game Highlights | Game 3 | April 21, 2018 | NBA",houston rockets vs minnesota timberwolves full game highlights game 3 april 21 2018 nba,"[-0.022688475, -0.008316655, 0.02428274, -0.012750511, 0.011136883, 0.017446442, -0.081727676, 0.018407008, -0.017931448, -0.015464917, -0.010435484, -0.019018805, 0.035570398, 0.050190315, 0.0682..."
36033,WE GOT INTEL'S PROTOTYPE GRAPHICS CARD!!,we got intels prototype graphics card,"[0.03667365, 0.015063113, -0.017930608, 0.031443328, -0.02084526, -0.01935764, 0.017878868, 0.07568726, -0.028062126, 0.022469163, 0.04301863, 0.019576399, 0.042497408, 0.07550257, 0.0020217155, -..."
735,EARL GREY MACARONS- The Scran Line,earl grey macarons the scran line,"[-0.0040706764, -0.014441631, -0.017189505, -0.007013595, -0.07196707, -0.0029409204, -0.042063713, -0.037196796, -0.019869322, 0.032365736, 0.057181276, 0.03180227, 0.0077950316, 0.06523984, -0.0..."


## Dimensionality Reduction
The code below does exactly what the heading says. As described above, every title is now represented by an embedding vector with 768 dimensions (see the printed shape of embs), which together encode the content of the title.

For a computer these 768 dimensions make perfect sense, but for the human eye and mind we have to reduce the number of dimensions to be able to read them.
We go from many dimensions to 2 dimensions. We create a umap variable, which is an instance of the class UMAP, instantiated with two parameters. Then we use this object's function "fit_transform" on the variable embs and call the new variable embs_2d.

Fit_transform is a function that fits embs into an embedded space and returns the transformed output.
In our case, it returns the embeddings in a 2-dimensional representation.

We then plot the result to show what the embeddings look like when only dimensionality reduction is used. But when we look at this plot, it does not give us the desired information.
Where are the groups, where are the outliers? Is it possible to see them?

At the end we also print some information about embs_2d to get a better understanding of how the transformation works.

In [15]:
# reducing the dimensions on my data to 2 dimensions
# UMAP is stochastic: without a fixed random_state the 2D layout (and with it the
# clusters and topics derived from it below) changes on every run
umap = UMAP(n_neighbors=20, min_dist=0.1, random_state=42)

embs_2d = umap.fit_transform(embs)

fig = px.scatter(x=embs_2d[:,0], y=embs_2d[:,1])

fig.update_layout(width=800, height=800)
fig.update_traces(marker=dict(size=2))

# plotting to show how the embeddings are when just dimensionality reduction is used
fig_show_save(fig, "umap-scatter")

In [16]:
# Quick inspection of the reduced embeddings. The printed labels are Norwegian:
#   "hvor langt er embs_2d"           -> "how long is embs_2d"              (number of rows)
#   "hvor langt er et element"        -> "how long is one element"          (length of one row)
#   "hva er første element i embs_2d" -> "what is the first element of embs_2d"
print(f"""
{type(embs_2d)}
hvor langt er embs_2d: {len(embs_2d)}
hvor langt er et element: {len(embs_2d[0])}
hva er første element i embs_2d: 
{embs_2d[0]} 
""")


<class 'numpy.ndarray'>
hvor langt er embs_2d: 2047
hvor langt er et element: 2
hva er første element i embs_2d: 
[16.309422 11.577305] 



## Clustering 2D data
In this part we divide the data into a number of groups such that data points in the same group are more similar to each other than to those in other groups.

We use HDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications with Noise), which is a clustering algorithm.

We instantiate an HDBSCAN object with some parameters.

Then we apply the function .fit on this object.
This enables us to detect how many clusters the embeddings have and how many of the data points are outliers.


In [17]:
clusters_2d = HDBSCAN(min_cluster_size=10, cluster_selection_method="leaf").fit(embs_2d)

# The label -1 is the outlier group (the same convention result_splitter relies on).
# Remove it from the set of labels instead of subtracting 1 from the count:
# `len(set(labels)) - 1` is one too low whenever no point is labelled -1.
labels_2d = clusters_2d.labels_
n_clusters_2d = len(set(labels_2d) - {-1})
n_outliers_2d = int((labels_2d == -1).sum())

print(f"""
    2D
    Number of clusters: {n_clusters_2d}
    Number of rows as outliers: {n_outliers_2d}
""")


    2D
    Number of clusters: 47
    Number of rows as outliers: 1006



# Results

## Plotting the results
The last part revolves around displaying the results.

To get a better understanding of the plot above, and of the clusters, we give each group its own color. This means that points with the same color belong to the same group. The grey data points are outliers. This means that the HDBSCAN algorithm did not find any group they belong to.

We can hover over the plot to see which cluster_id and topics each point belongs to.



In [18]:
# the distinct cluster labels assigned by HDBSCAN
set(clusters_2d.labels_)

{-1,
 0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46}

In [19]:
# build the coloured cluster trace and the grey outlier trace from the 2D embeddings
trace_cluster_2d, trace_outlier_2d = result_tracer_wrapper(embs_2d, clusters_2d.labels_, df["title_clean"].to_numpy())


# subplotter expects rows -> columns -> traces; here the grid is 1 x 1:
# one column (subplot) holding both traces ...
col11 = [trace_cluster_2d, trace_outlier_2d]


# ... inside one row ...
row1 = [col11]


# ... inside the list of rows
trace_list = [row1]

fig = subplotter(trace_list, ["Topics by HDBSCAN Cluster", ])

# writes figures/topics-by-hdbscan-clusters.html and .png, then shows the figure
fig_show_save(fig, "topics-by-hdbscan-clusters")

## Showing topic per cluster

This last part shows the topics per cluster. We list each cluster_label, which corresponds to the cluster_id. Then we see which topics each cluster_label is about and how many videos are part of each cluster.

In [20]:
result_2d = result_df_maker(embs_2d, clusters_2d.labels_, df["title_clean"].to_numpy())

# number of videos per (cluster, topic string), largest groups first, top 20 only
# NOTE(review): "vidoes_count" is a misspelling of "videos_count"; it is left as is
# here because the column name is part of the displayed output
(
    result_2d[["cluster_label", "topics"]]
    .groupby(["cluster_label", "topics"])["topics"]
    .count()
    .reset_index(name="vidoes_count")
    .sort_values(by="vidoes_count", ascending=False)
    .head(20)
)

Unnamed: 0,cluster_label,topics,vidoes_count
0,-1,official video trailer 2018 music,1006
31,30,meghan markle scene harry bellas,74
13,12,trailers honest star wars jedi,58
35,34,snl kevin trump hart matters,39
41,40,pond diving scuba treasure shell,36
38,37,ft shawn mendes makeup young,35
22,21,game history battle happening emblem,33
33,32,boy audio imagine dragons live,32
30,29,jenner hannah kris instagram rogers,32
21,20,makeup tutorial beauty everyday challenge,32
