In [26]:
import glob
import os
import gensim
from gensim import corpora
from gensim.models import LdaModel
import pandas as pd

In [27]:
# Folder that the loading cell scans for `*.txt` inputs (one file per set of
# lemmatized Copom minutes, judging by the path). Relative to the notebook's
# working directory.
FOLDER_MINUTES_LEMMATIZED = "./data/processed/copom_minutes_lemmatized"

In [28]:
# Load every lemmatized minutes file and turn each sufficiently long line into
# one tokenized "paragraph" document. Two parallel lists are filled in lockstep:
#   - all_docs_with_metadata: {'text': tokens, 'filename': source file name}
#   - all_docs_for_lda:       the bare token lists, in the same order
all_docs_with_metadata = []
all_docs_for_lda = []

# Lines with this many tokens or fewer are discarded as too short to model.
MIN_TOKENS_EXCLUSIVE = 5

# glob returns matches in arbitrary, filesystem-dependent order; sorting keeps
# the paragraph order (and therefore dictionary ids and the seeded LDA fit)
# reproducible across machines and runs.
filepaths = sorted(glob.glob(f"{FOLDER_MINUTES_LEMMATIZED}/*.txt"))
if not filepaths:
    # Fail here with a clear message instead of much later inside gensim.
    raise FileNotFoundError(
        f"No .txt files found in {FOLDER_MINUTES_LEMMATIZED!r}"
    )

for path in filepaths:
    filename = os.path.basename(path)
    with open(path, 'r', encoding='utf-8') as f:
        # Stream the file and split each line exactly once.
        tokenized_lines = (line.split() for line in f)
        paragraphs = [
            tokens for tokens in tokenized_lines
            if len(tokens) > MIN_TOKENS_EXCLUSIVE
        ]
    for p in paragraphs:
        all_docs_with_metadata.append({'text': p, 'filename': filename})
        all_docs_for_lda.append(p)

In [29]:
# Vocabulary: maps every distinct token seen in the paragraphs to an integer id.
dictionary = corpora.Dictionary(documents=all_docs_for_lda)

# Bag-of-words vectors, one per paragraph, in the same order as
# all_docs_for_lda so that positions line up with the metadata list.
corpus = list(map(dictionary.doc2bow, all_docs_for_lda))

In [30]:
# Number of latent topics the model is asked to find.
NUM_TOPICS = 8

# Train LDA on the bag-of-words corpus. The fixed random_state pins the
# random initialisation; passes=15 sets how many times training sweeps the corpus.
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=NUM_TOPICS,
    random_state=100,
    passes=15,
)

In [31]:
# Show the ten highest-weighted words of each topic, one (id, formula) tuple per line.
topics = lda_model.print_topics(num_words=10)
print(*topics, sep="\n")

(0, '0.018*"economy" + 0.016*"economic" + 0.015*"growth" + 0.013*"market" + 0.012*"activity" + 0.011*"demand" + 0.010*"high" + 0.009*"remain" + 0.009*"recovery" + 0.008*"domestic"')
(1, '0.066*"inflation" + 0.054*"rate" + 0.034*"target" + 0.031*"scenario" + 0.027*"meeting" + 0.026*"copom" + 0.026*"exchange" + 0.021*"expectation" + 0.020*"projection" + 0.017*"forecast"')
(2, '0.062*"good" + 0.044*"production" + 0.031*"core" + 0.030*"increase" + 0.026*"capital" + 0.020*"industrial" + 0.019*"month" + 0.017*"consumer" + 0.013*"industry" + 0.012*"decrease"')
(3, '0.049*"inflation" + 0.043*"monetary" + 0.039*"policy" + 0.030*"committee" + 0.022*"copom" + 0.016*"risk" + 0.016*"target" + 0.014*"scenario" + 0.013*"rate" + 0.012*"member"')
(4, '0.062*"billion" + 0.043*"u" + 0.024*"operation" + 0.022*"credit" + 0.021*"increase" + 0.020*"month" + 0.020*"total" + 0.019*"reach" + 0.018*"rate" + 0.018*"average"')
(5, '0.078*"price" + 0.046*"increase" + 0.036*"inflation" + 0.032*"month" + 0.028*"index

In [32]:
# 5. Organize the results in a DataFrame
def get_dominant_topic(doc_bow, lda_model):
    """Return the id of the most probable topic for one document.

    Args:
        doc_bow: Bag-of-words representation of the document; it is passed
            unchanged to ``lda_model.get_document_topics``.
        lda_model: Object exposing ``get_document_topics(bow)`` that returns a
            sequence of ``(topic_id, probability)`` pairs.

    Returns:
        The ``topic_id`` of the pair with the highest probability. When several
        topics tie, the one listed first by the model wins. Returns ``None``
        when the model reports no topics for the document (for example when
        every topic falls below the model's probability cutoff), instead of
        raising ``IndexError``.
    """
    topic_dist = lda_model.get_document_topics(doc_bow)
    if not topic_dist:
        return None
    # A single linear scan is enough; no need to sort the whole distribution.
    return max(topic_dist, key=lambda pair: pair[1])[0]

In [34]:
# all_docs_with_metadata and corpus were built in the same paragraph order, so
# walking them together pairs each paragraph's metadata with its bag-of-words.
results = [
    {
        'filename': doc_info['filename'],  # source file of the paragraph
        'original_text': ' '.join(doc_info['text']),  # tokens re-joined into one string
        'dominant_topic': get_dominant_topic(doc_bow, lda_model),
    }
    for doc_info, doc_bow in zip(all_docs_with_metadata, corpus)
]

# One row per paragraph; persist to Excel and preview the first rows.
df_results = pd.DataFrame(results)
df_results.to_excel('./data/processed/lda_results.xlsx', index=False)
print(df_results.head())

                  filename                                      original_text  \
0  100th Copom minutes.txt  member monetary policy committee analyze recen...   
1  100th Copom minutes.txt  august broad national consumer price index ipc...   
2  100th Copom minutes.txt  general price index rise august compare july t...   
3  100th Copom minutes.txt  alcohol fuel gasoline main individual contribu...   
4  100th Copom minutes.txt  august market price increase july account mont...   

   dominant_topic  
0               3  
1               5  
2               5  
3               5  
4               5  
