## Topic modelling for Italian Documents (BERTopic)

### 1. Imports and Setup

In [None]:
%pip install bertopic hdbscan umap sentence_transformers torch

In [None]:
# Upgrade nbformat to the latest release.
# NOTE(review): presumably required so Plotly figures render inline in the notebook — confirm.
%pip install -U nbformat

In [None]:
from src.utils import load_or_download_embedding, get_embedding

import pandas as pd
import numpy as np
from numpy.linalg import norm
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import torch
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from hdbscan import HDBSCAN
from umap import UMAP
from sentence_transformers import SentenceTransformer
from wordcloud import WordCloud

from nltk.tokenize import sent_tokenize

import plotly.graph_objs as go

##### Detect the runtime device with Torch (GPU if CUDA is available, otherwise CPU)

In [None]:
# Prefer the CUDA device when one is usable; otherwise run everything on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

### 📦 3. Word2vec embedding model

#### Load pre-trained word embeddings (**Word2vec**)

In [None]:
# Announce the (potentially slow) load, then bind the Word2Vec vectors to `model`.
# NOTE(review): judging by its name, the helper downloads the model on first use
# and reuses a local copy afterwards — confirm in src/utils.
print("Loading Word2Vec word embeddings...")
model = load_or_download_embedding("word2vec-google-news-300")

#### Alternatively, load pre-trained **GloVe** word embeddings (running this cell replaces the Word2Vec `model` loaded above)

In [None]:
# Announce the load, then rebind `model` to the GloVe vectors.
# Every later cell that reads `model` uses whichever embedding cell ran last.
print("Loading Glove word embeddings...")
model = load_or_download_embedding("glove-wiki-gigaword-50")

In [None]:
# Report the size of the loaded embedding model: how many entries it holds
# and the length of each vector.
print("Number of word vectors in the model:", len(model))
print("Dimension of each word vector:", model.vector_size)

#### Displaying the word vectors (not very useful for humans 😊)

In [None]:
# Two sample tokens whose vectors we want to look at
word1, word2 = "deep", "learning"

# Fetch the vector of each token from the currently loaded model
embedding1 = get_embedding(word1, model)
embedding2 = get_embedding(word2, model)

# Show only the first 10 components; the full vectors are too long to read
print(f"Embedding for '{word1}' (first 10 dimensions):", embedding1[:10], "...")
print(f"Embedding for '{word2}' (first 10 dimensions):", embedding2[:10], "...")

#### Get most similar words to a given word (**most_similar**)

##### 🔍 How `most_similar` Works

The method `most_similar("word", topn=10)` returns the words that are most similar to the input word based on their embedding vectors.

Internally, the model computes the **cosine similarity** between the vector of the given word and the vectors of all other words in the vocabulary:

$$
\text{similarity}(\vec{v}_1, \vec{v}_2) = \frac{\vec{v}_1 \cdot \vec{v}_2}{\|\vec{v}_1\| \cdot \|\vec{v}_2\|}
$$

Where:
- $ \vec{v}_1 $ is the vector for the input word (e.g., `"learning"`)
- $ \vec{v}_2 $ is the vector for every other word in the vocabulary

The method returns the top `n` words with the highest similarity scores.

> 💡 This kind of similarity works well when the vectors have been trained on large corpora and reflect contextual word usage.


In [None]:
word = "learning"

# Ask the model for the 10 nearest neighbours of the query word
similar_words = model.most_similar(word, topn=10)

# Print results
print(f"Most similar words to {word}:")
# Use dedicated loop names so the query stored in `word` is not overwritten
# by the last neighbour once the loop finishes.
for neighbour, score in similar_words:
    print(f"{neighbour}: {score:.4f}")

#### 📊 Let's visualize word vectors in 2D

We'll use dimensionality reduction to project high-dimensional word embeddings (typically 50–300 dimensions, depending on the model loaded above) down to 2D so we can plot them and visually explore semantic relationships.

##### ✅ Option 1: Simple and fast — PCA (Principal Component Analysis)

In [None]:
def plot_words_pca(model, words, highlight_words=None, figsize=(10, 7), title="PCA of Word Embeddings"):
    """
    Plot a 2D PCA projection of word embeddings.

    Words that are not in the model's vocabulary are skipped. When fewer than
    two words remain, a message is printed and nothing is plotted, because a
    2-component PCA cannot be fitted on a single sample.

    Parameters:
    - model: gensim KeyedVectors
    - words: list of words to plot
    - highlight_words: list of words to highlight (optional)
    - figsize: tuple for figure size
    - title: plot title

    Returns:
    - None (the figure is shown as a side effect)
    """
    # Filter the vocabulary once so words and vectors stay aligned
    filtered_words = [word for word in words if word in model.key_to_index]

    if not filtered_words:
        print("No valid words found in the model.")
        return
    if len(filtered_words) < 2:
        print("Need at least 2 valid words for a 2D PCA plot.")
        return

    vectors = np.array([model[word] for word in filtered_words])

    # Reduce dimensions with PCA
    pca = PCA(n_components=2)
    reduced = pca.fit_transform(vectors)

    # Setup plot
    plt.figure(figsize=figsize)
    plt.title(title, fontsize=16)
    plt.grid(True, alpha=0.3)

    for (x, y), word in zip(reduced, filtered_words):
        is_highlighted = bool(highlight_words) and word in highlight_words
        color = "crimson" if is_highlighted else "steelblue"
        fontsize = 14 if is_highlighted else 12
        plt.scatter(x, y, c=color, s=100 if is_highlighted else 60, edgecolors='k', linewidths=0.5)
        # Small offset so the label does not sit on top of its marker
        plt.text(x + 0.02, y + 0.02, word, fontsize=fontsize, color=color)

    plt.xlabel("PC1", fontsize=13)
    plt.ylabel("PC2", fontsize=13)
    plt.tight_layout()
    plt.show()



In [None]:
# Sample vocabulary: royalty/gender terms followed by capital/country pairs
words = [
    "king", "queen", "man", "woman",
    "paris", "france", "rome", "italy",
]
# Emphasise the geography terms in the plot
highlight_words = [
    "paris", "france", "rome", "italy",
]
plot_words_pca(model, words, highlight_words=highlight_words)


##### 🌈 Option 2: More powerful — t-SNE

Better at preserving nonlinear relationships, but slower and more sensitive to parameters.

In [None]:
def plot_words_tsne(model, words, perplexity=None, max_iter=1000, figsize=(10, 7), title="t-SNE of Word Embeddings"):
    """
    Plot a 2D t-SNE projection of word embeddings.

    Words that are not in the model's vocabulary are skipped. The perplexity
    is chosen automatically when it is not given, and lowered when it is too
    large for the number of valid words (it must stay below the sample count).

    Parameters:
    - model: gensim KeyedVectors
    - words: list of words to plot
    - perplexity: t-SNE perplexity (optional)
    - max_iter: number of iterations
    - figsize: figure size
    - title: plot title

    Returns:
    - None (the figure is shown as a side effect)
    """
    # Filter the vocabulary once so words and vectors stay aligned
    filtered_words = [word for word in words if word in model.key_to_index]

    if len(filtered_words) < 2:
        print("⚠️ Need at least 2 valid words for t-SNE.")
        return

    vectors = np.array([model[word] for word in filtered_words])

    # Set or adjust perplexity.
    # The largest usable value is n_samples - 1. The previous floor of 2 produced
    # an invalid perplexity when only 2 words were valid, so cap from above only.
    max_perplexity = len(vectors) - 1
    if perplexity is None or perplexity > max_perplexity:
        perplexity = min(30, max_perplexity)
        print(f"Using perplexity={perplexity}")

    # Run t-SNE (fixed seed so the layout is reproducible)
    tsne = TSNE(n_components=2, perplexity=perplexity, max_iter=max_iter, random_state=42)
    reduced = tsne.fit_transform(vectors)

    # Plot
    plt.figure(figsize=figsize)
    plt.title(title, fontsize=16)
    plt.grid(True, alpha=0.3)

    for (x, y), word in zip(reduced, filtered_words):
        plt.scatter(x, y, c="darkorange", s=70, edgecolors='k', linewidths=0.5)
        # Offset the label so it does not overlap its marker
        plt.text(x + 1, y + 1, word, fontsize=12, color="black")

    plt.tight_layout()
    plt.show()


In [None]:
# Same sample vocabulary as before, this time projected with t-SNE
words = [
    "king", "queen", "man", "woman",
    "paris", "france", "rome", "italy",
]
plot_words_tsne(model, words=words)

#### 🌍 Plotting Word Embeddings in 3D

In [None]:
from sklearn.decomposition import PCA
import numpy as np
import plotly.graph_objs as go

def plot_words_pca_3d(model, words, title="3D PCA of Word Embeddings"):
    """
    Show an interactive Plotly 3D scatter of PCA-reduced word embeddings.

    Words absent from the model's vocabulary are ignored. If fewer than three
    words remain, a message is printed and no figure is produced.

    Parameters:
    - model: gensim KeyedVectors
    - words: list of words to plot
    - title: plot title
    """
    vocab = model.key_to_index
    vectors = [model[w] for w in words if w in vocab]
    filtered_words = [w for w in words if w in vocab]

    # Three principal components need at least three samples
    if len(vectors) < 3:
        print("Need at least 3 valid words for 3D plot.")
        return

    # Project onto the first three principal components
    coords = PCA(n_components=3).fit_transform(np.array(vectors))

    marker_style = dict(
        size=6,
        color='mediumturquoise',
        line=dict(width=0.5, color='black'),
    )
    # One labelled marker per word
    points = go.Scatter3d(
        x=coords[:, 0],
        y=coords[:, 1],
        z=coords[:, 2],
        mode='markers+text',
        text=filtered_words,
        textposition='top center',
        marker=marker_style,
    )

    axis_titles = dict(xaxis_title='PC1', yaxis_title='PC2', zaxis_title='PC3')
    figure = go.Figure(
        data=[points],
        layout=go.Layout(
            title=title,
            margin=dict(l=0, r=0, b=0, t=40),
            scene=axis_titles,
        ),
    )
    figure.show()


In [None]:
# Same sample vocabulary, explored in three dimensions
words = [
    "king", "queen", "man", "woman",
    "paris", "france", "rome", "italy",
]
plot_words_pca_3d(model, words=words)

In [None]:
# Define the plurality direction
plurality_vector = model["cats"] - model["cat"]
plurality_unit = plurality_vector / norm(plurality_vector)

# Word vectors
puppy = model["puppy"]
puppies = model["puppies"]

# 1. Alignment with plurality direction
cos_singular = np.dot(puppy, plurality_unit) / norm(puppy)
cos_plural = np.dot(puppies, plurality_unit) / norm(puppies)

# 2. Predict plural form
puppies_tilde = puppy + plurality_vector

# 3. Cosine similarity between predicted and real plural
similarity_tilde = np.dot(puppies_tilde, puppies) / (norm(puppies_tilde) * norm(puppies))

# Output
print(f"📐 Alignment with plurality direction:")
print(f" - puppy     (singular): {cos_singular:.4f}")
print(f" - puppies   (plural):   {cos_plural:.4f}")
print()
print(f"🔁 Cosine similarity:")
print(f" - between 'puppies' and predicted 'puppies_tilde': {similarity_tilde:.4f}")


The quality of NLTK's Italian stopword list is very low, so we implement a function that loads a custom list of stopwords from the **resources** folder.

In [None]:
def load_stopwords(it_path='resources/stopwords_it.txt', include_english=True):
    """
    Loads a list of Italian stopwords from file and optionally adds English stopwords from NLTK.

    Entries are stripped of surrounding whitespace and lower-cased; blank lines
    are ignored and duplicates are removed.

    Parameters:
    - it_path: path to the Italian stopwords file (one word per line)
    - include_english: whether to include English stopwords from NLTK

    Returns:
    - A sorted list of unique stopwords (sorted so repeated runs give the same order)
    """
    # Load Italian stopwords from file ('utf-8-sig' also accepts files starting with a BOM)
    with open(it_path, 'r', encoding='utf-8-sig') as file:
        stopwords_it = file.read().splitlines()

    # Optionally include English stopwords from NLTK
    if include_english:
        # Imported locally: the notebook's import cell does not bring in `nltk`
        # or `nltk.corpus.stopwords`, so this branch used to raise NameError.
        import nltk
        from nltk.corpus import stopwords

        nltk.download('stopwords', quiet=True)
        stopwords_en = stopwords.words('english')
    else:
        stopwords_en = []

    # Combine, strip whitespace, lower-case and remove duplicates
    unique_words = {word.strip().lower() for word in stopwords_it + stopwords_en if word.strip()}

    return sorted(unique_words)


### 4. BERTopic Topic Modeling

#### Load dataset (stored in **data** folder)

In [None]:
# Use the following code to perform test on a larger dataset
df = pd.read_csv("data/repubblica_sample.csv")
documents = df["full_text"].values

#### Split the documents into sentences

In [None]:
# Build ONE flat list of sentence strings across all documents.
# The sentence encoder and BERTopic used below expect a flat list of texts,
# not one nested list of sentences per document.
sentences = [
    sentence
    for doc in documents
    for sentence in sent_tokenize(doc, language="italian")
]

In [None]:
# Load the multilingual sentence encoder, then move it to the selected device
sentence_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
sentence_model = sentence_model.to(device)

In [None]:
# Encode every sentence in batches of 128 on the selected device
embeddings = sentence_model.encode(
    sentences,
    batch_size=128,
    show_progress_bar=True,
    device=device,
)

In [None]:
# Use our custom method to load stopwords
stop_words = load_stopwords()
print(f"Total stopwords loaded: {len(stop_words)}")
print(stop_words[:10])  # show a sample

In [None]:
# Bag-of-words vectorizer: unigrams and bigrams, with the custom stopwords removed
vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    stop_words=list(stop_words),
)

# UMAP reducer: 5 output components, cosine metric, fixed seed for reproducibility
umap_model = UMAP(
    n_components=5,
    n_neighbors=50,
    min_dist=0.01,
    metric="cosine",
    random_state=42,
)

# HDBSCAN clusterer configured with a minimum cluster size of 1000
hdbscan_model = HDBSCAN(
    min_cluster_size=1000,
    min_samples=1,
    cluster_selection_epsilon=0.01,
)

In [None]:
# Assemble BERTopic from the vectorizer, reducer and clusterer configured earlier
bertopic_model = BERTopic(
    vectorizer_model=vectorizer,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    verbose=True,
)

# Fit on the texts, passing the embeddings computed beforehand
topics, probs = bertopic_model.fit_transform(sentences, embeddings)

In [None]:
# Fetch the topic summary from the fitted model and save it as CSV
info_topic = bertopic_model.get_topic_info()
# NOTE(review): assumes the results/ directory already exists — confirm,
# since to_csv is given a path inside it
info_topic.to_csv('results/topic_info.csv', index=False)

In [None]:
# Per-text information from the fitted model; as the last expression in the
# cell, the result is displayed by the notebook rather than stored.
bertopic_model.get_document_info(sentences)