#### Latent Dirichlet Allocation (LDA) Topic Modeling

This notebook applies Latent Dirichlet Allocation (LDA), a technique for discovering the abstract "topics" that occur in a collection of texts.
 * LDA is often used to categorize whole documents, but in this context it categorizes individual paragraphs.
 * The script applies LDA (gensim) to identify recurring themes across the minutes.
 * The number of topics (`num_topics` in gensim) is set to 8. This choice is justified in the appendix, notebook 5.1.
 * The script also displays the most significant words for each of these identified topics.

In [None]:
import glob
import os
from gensim import corpora
from gensim.models import LdaModel
import pandas as pd

In [None]:
# Folder searched for the lemmatized minutes (*.txt files) that feed the LDA corpus.
FOLDER_MINUTES_LEMMATIZED = "./data/processed/copom_minutes_lemmatized"

In [None]:
# Build a paragraph-level corpus: every line of a lemmatized minutes file that
# has more than five whitespace-separated tokens becomes one LDA "document".
all_docs_with_metadata = []  # dicts: {'text': token list, 'filename': source file name}
all_docs_for_lda = []        # token lists, index-aligned with all_docs_with_metadata

# glob.glob() returns matches in arbitrary (filesystem-dependent) order.
# Sorting keeps the document order, and everything derived from it, identical
# from run to run and from machine to machine.
filepaths = sorted(glob.glob(f"{FOLDER_MINUTES_LEMMATIZED}/*.txt"))
if not filepaths:
    # Fail here with a clear message instead of later on an empty corpus.
    raise FileNotFoundError(
        f"No .txt files found in {FOLDER_MINUTES_LEMMATIZED!r}"
    )

for path in filepaths:
    filename = os.path.basename(path)
    with open(path, 'r', encoding='utf-8') as f:
        # Stream the file line by line; each line is treated as one paragraph.
        for line in f:
            tokens = line.split()  # split once, reuse for both filter and payload
            if len(tokens) > 5:    # skip very short lines (5 tokens or fewer)
                all_docs_with_metadata.append({'text': tokens, 'filename': filename})
                all_docs_for_lda.append(tokens)

In [None]:
# Vocabulary built from all tokenized paragraphs.
dictionary = corpora.Dictionary(all_docs_for_lda)

# Bag-of-words encoding of every paragraph, in the same order as all_docs_for_lda.
corpus = list(map(dictionary.doc2bow, all_docs_for_lda))

In [None]:
# Number of topics the model is asked to find.
NUM_TOPICS = 8

# Fit LDA on the bag-of-words corpus; random_state is fixed so that repeated
# runs on the same corpus use the same seed.
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=NUM_TOPICS,
    passes=15,
    random_state=100,
)

In [None]:
# Print the 10 highest-weighted words of each topic returned, one topic per line.
topics = lda_model.print_topics(num_words=10)
for topic_summary in topics:
    print(topic_summary)

In [None]:
# 5. Organize the results in a DataFrame (helper: dominant topic of one document)
def get_dominant_topic(doc_bow, lda_model):
    """Return the id of the most probable topic for one bag-of-words document.

    Args:
        doc_bow: Bag-of-words representation of the document (here, the output
            of ``dictionary.doc2bow`` for one paragraph).
        lda_model: Trained model exposing ``get_document_topics``.

    Returns:
        The id of the topic with the highest probability. On ties, the topic
        that appears first in the distribution returned by the model wins.
    """
    # By default gensim drops topics whose probability is below the model's
    # `minimum_probability`; with many topics that can leave an empty list.
    # Asking for a threshold of 0.0 keeps the full distribution.
    topic_dist = lda_model.get_document_topics(doc_bow, minimum_probability=0.0)
    # max() returns the first maximal pair, the same winner a stable
    # descending sort would put in position 0, without sorting the whole list.
    dominant_topic = max(topic_dist, key=lambda pair: pair[1])[0]
    return dominant_topic

In [None]:
# One row per paragraph: source file, paragraph text (tokens re-joined with
# spaces) and the dominant topic. corpus[idx] is the bag-of-words entry at the
# same position as the metadata record.
results = [
    {
        'filename': meta['filename'],
        'original_text': ' '.join(meta['text']),
        'dominant_topic': get_dominant_topic(corpus[idx], lda_model),
    }
    for idx, meta in enumerate(all_docs_with_metadata)
]

# Persist the full table to Excel, then show the first rows as a quick check.
df_results = pd.DataFrame(results)
df_results.to_excel('./data/processed/lda_results.xlsx', index=False)
print(df_results.head())