## Topic modelling for Italian Documents (LDA)

### 1. Imports and Setup

In [None]:
# %pip install pyLDAvis

In [None]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import word_tokenize
import gensim
from gensim import corpora
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

#### Download NLTK resources

In [None]:
# Tokenizer models used by word_tokenize. Recent NLTK releases load them from
# the 'punkt_tab' package instead of the pickled 'punkt' one, so fetch both to
# keep the notebook working whichever NLTK version is installed.
nltk.download('punkt')
nltk.download('punkt_tab')
# Stopword corpus (load_stopwords reads the English list from it).
nltk.download('stopwords')

The quality of NLTK's Italian stopword list is very low, so we implement a function that loads a custom list of stopwords from a file in the **resources** folder.

In [None]:
def load_stopwords(it_path='resources/stopwords_it.txt', include_english=True):
    """
    Loads a list of Italian stopwords from file and optionally adds English stopwords from NLTK.

    Parameters:
    - it_path: path to the Italian stopwords file (one word per line, UTF-8;
      a leading byte-order mark is tolerated)
    - include_english: whether to include English stopwords from NLTK

    Returns:
    - An alphabetically sorted list of unique, lower-cased stopwords
      (blank lines are skipped)

    Raises:
    - OSError (e.g. FileNotFoundError) if it_path cannot be opened
    """
    # 'utf-8-sig' decodes UTF-8 and drops a BOM if the file starts with one,
    # so the first word is not polluted by an invisible character.
    with open(it_path, 'r', encoding='utf-8-sig') as file:
        stopwords_it = file.read().splitlines()

    # Optionally include English stopwords from NLTK
    if include_english:
        try:
            stopwords_en = stopwords.words('english')
        except LookupError:
            # The corpus is not available locally: download it, then retry.
            # Doing this only on failure avoids a download attempt on every call.
            nltk.download('stopwords', quiet=True)
            stopwords_en = stopwords.words('english')
    else:
        stopwords_en = []

    # Normalise (strip + lowercase), drop empty entries and de-duplicate
    normalized = (word.strip().lower() for word in stopwords_it + stopwords_en)
    unique_words = {word for word in normalized if word}

    # Sort so the result (and any sample printed from it) is the same on every run
    return sorted(unique_words)


In [None]:
# Alternative: use the NLTK Italian stopwords
# stop_words = set(stopwords.words('italian'))

# Use our custom method to load stopwords
stop_words = load_stopwords()
print(f"Total stopwords loaded: {len(stop_words)}")
# Sort before slicing: the sample is then the same on every run, and the line
# also works when stop_words is a set (the NLTK alternative above).
print(sorted(stop_words)[:10])  # show a sample

### 2. Define a simple test corpus

In [None]:
# Small in-memory test corpus: each string is one Italian document.
# Note: the following cell, when run, rebinds `documents` to the "full_text"
# column of a CSV file, replacing this list.
documents = [
    "Amo il deep learning e l'elaborazione del linguaggio naturale.",
    "I modelli di linguaggio naturale sono affascinanti.",
    "Il topic modeling aiuta a scoprire i temi nei testi.",
    "Il machine learning consente la scoperta automatica degli argomenti.",
    "Le reti neurali apprendono rappresentazioni dai dati.",
    "L'intelligenza artificiale sta trasformando le industrie.",
    "Le tecniche di analisi del testo migliorano il recupero delle informazioni.",
    "I modelli linguistici di grandi dimensioni alimentano chatbot e assistenti.",
]

In [None]:
# Use the following code to perform test on a larger dataset
# (running it replaces the small in-memory corpus defined above).
df = pd.read_csv("data/repubblica_sample.csv")
# pandas reads a missing "full_text" cell as NaN (a float), which would make
# preprocess() fail on doc.lower(); drop those rows and make sure every
# remaining document is a string.
documents = df["full_text"].dropna().astype(str).values

### 📦 3. Gensim: creating dictionary and corpus

Documents must be **tokenized** (lists of words) and represented as **bag-of-words** in the format (word_id, count).


#### Simple Preprocessing and **Tokenization** (we could do much more advanced stuff...)

In [None]:
# Tokenization + lowercase + remove stopwords and punctuation

# Hash-based copy of the stopwords: `stop_words` may be a list, and testing
# every token against a list is a linear scan. Built once, outside preprocess().
_stop_word_set = frozenset(stop_words)


def preprocess(doc):
    """
    Turn one raw document into a list of cleaned tokens.

    Parameters:
    - doc: the document text (a string)

    Returns:
    - A list of lower-cased tokens, keeping only purely alphabetic tokens
      (this drops punctuation, numbers and tokens containing an apostrophe)
      that are not in the stopword collection
    """
    # The corpus is Italian, so use the Italian sentence-splitting model
    # rather than word_tokenize's default English one.
    tokens = word_tokenize(doc.lower(), language='italian')
    return [word for word in tokens if word.isalpha() and word not in _stop_word_set]

processed_docs = [preprocess(doc) for doc in documents]

In [None]:
# Inspect the result of the preprocessing step (one entry per document)
processed_docs

In [None]:
# Step 1: Build the vocabulary
# A gensim Dictionary gives every distinct token of the preprocessed documents
# its own integer ID (for instance 'linguaggio' -> 0, 'modello' -> 1, ...)
dictionary = corpora.Dictionary(processed_docs)

# Step 2: Encode each document as Bag-of-Words (BoW)
# doc2bow turns a token list into (word_id, word_count) pairs, i.e. it counts
# how often each known word occurs in that document.
# A document encoded as [(0, 2), (3, 1)] contains word 0 twice and word 3 once.
corpus = list(map(dictionary.doc2bow, processed_docs))



In [None]:
# Display the dictionary as its token -> integer ID mapping
print(dictionary.token2id)

In [None]:
# Display the BoW corpus: one list of (word_id, count) pairs per document
print(corpus)

#### 🛠️ Define a function to convert gensim_corpus to a dataframe (**gensim_corpus_to_dataframe**)

In [None]:
def gensim_corpus_to_dataframe(corpus, dictionary, doc_labels=None):
    """
    Build a dense document-term table (pandas DataFrame) from a Gensim BoW corpus.

    Parameters:
    - corpus: documents in Gensim BoW format, each a list of (word_id, count) pairs
    - dictionary: mapping from word ID to word, indexable by every ID in
      0 .. len(dictionary) - 1 (e.g. a Gensim Dictionary)
    - doc_labels: optional row labels; when omitted, rows are named
      'Doc 1', 'Doc 2', ...

    Returns:
    - A pandas DataFrame with one row per document and one column per word;
      each cell holds the count of that word in that document (0 when absent)
    """
    word_ids = range(len(dictionary))

    # Column headers: the words, in word-ID order
    columns = [dictionary[word_id] for word_id in word_ids]

    def dense_row(doc_bow):
        # Expand the sparse (word_id, count) pairs into one count per word ID
        counts = dict(doc_bow)
        return [counts.get(word_id, 0) for word_id in word_ids]

    table = pd.DataFrame([dense_row(doc_bow) for doc_bow in corpus], columns=columns)

    # Row labels: caller-supplied ones win, otherwise fall back to 'Doc N'
    if doc_labels is not None:
        table.index = doc_labels
    else:
        table.index = [f'Doc {i+1}' for i in range(len(corpus))]

    return table



#### 🧪 Example usage

In [None]:
# Readable BoW table of the corpus, with rows labelled 'Doc 1', 'Doc 2', ...
df_bow = gensim_corpus_to_dataframe(
    corpus,
    dictionary,
    doc_labels=[f'Doc {i+1}' for i, _ in enumerate(documents)],
)

### 📦 4. Scikit-learn: creating the Bag-of-Words matrix

Documents must be converted into **strings** (not token lists), then transformed into a **document-term matrix** using `CountVectorizer`, where each row represents a document and each column a word, and each value is a word count.


In [None]:
# Join tokens back into strings (scikit-learn expects one string per document).
# Entries that are already strings are kept unchanged, so running this cell a
# second time does not split them into space-separated single characters.
processed_docs = [doc if isinstance(doc, str) else ' '.join(doc) for doc in processed_docs]

In [None]:
# Step 1: Create the CountVectorizer
# It turns raw text into a Bag-of-Words matrix of token counts
vectorizer = CountVectorizer()

# Step 2: Learn the vocabulary and encode the documents in one go
# The input has to be a list of strings such as ['I love AI', 'AI is cool'],
# not a list of token lists; the result is a sparse matrix
X_bow = vectorizer.fit_transform(processed_docs)

# Step 3: Show the matrix as a table
# Densify the sparse matrix and label the columns with the learned words:
# rows are documents, columns are words, cells are per-document word counts
pd.DataFrame(
    data=X_bow.toarray(),
    columns=vectorizer.get_feature_names_out(),
)



In [None]:
# Display one row of the sparse matrix as a dense array:
# index 1 selects the second document's word counts
X_bow[1].toarray()

### 5. Train an LDA model with Gensim

In [None]:
from gensim.models import LdaModel

# Step 1: Choose how many latent topics to extract
# (a hyperparameter: try several values and compare the results)
num_topics = 5

# Step 2: Fit the LDA model on the Gensim BoW corpus
lda_model = LdaModel(
    corpus=corpus,              # documents as (word_id, count) lists
    id2word=dictionary,         # lets the model report words instead of IDs
    num_topics=num_topics,      # how many topics to learn
    passes=10,                  # full sweeps over the corpus while training
    alpha='auto',               # let gensim learn the Dirichlet prior from the data
    per_word_topics=True,       # also keep word-level topic assignments (optional)
    random_state=42,            # fixed seed for reproducible results
)



#### 🔍 Inspect the discovered topics

In [None]:
# Print the top words for each topic.
# num_topics=-1 asks gensim for every topic; the default stops at 20, which
# would silently hide topics once the model is trained with more than that.
for i, topic in lda_model.print_topics(num_topics=-1):
    print(f"Topic {i + 1}: {topic}")

#### 🧪 Predict topic(s) for a document

In [None]:
# Get the topic distribution for a specific document (here the first one, index 0)
# NOTE(review): the topic IDs printed here are presumably gensim's own 0-based IDs,
# while the topic listing cell labels topics as i + 1 — confirm before comparing them.
doc_topics = lda_model.get_document_topics(corpus[0])
print(doc_topics)


#### Display topics using **pyLDAvis**

In [None]:
# Prepare the visualization data from the trained model, the BoW corpus and the dictionary
# NOTE(review): sort_topics=False presumably keeps the model's own topic order
# instead of letting pyLDAvis reorder the topics — confirm against the pyLDAvis docs.
lda_display = gensimvis.prepare(lda_model, corpus, dictionary, sort_topics=False)

In [None]:
# Display the prepared topic visualization as the cell's output
pyLDAvis.display(lda_display)