In [4]:
import warnings
warnings.filterwarnings("ignore")

# Passo 2

## importações necessárias

In [5]:
import pandas as pd
import numpy as np

import spacy
import nltk
from nltk.stem import WordNetLemmatizer
from spacy.lang.en.stop_words import STOP_WORDS
import string
import re
import contractions

## Leitura e análise dos dados de treinamento

In [6]:
message = pd.read_csv("data/texts.txt", header=None, names=["message"], sep="\r")
label = pd.read_csv("data/score.txt", header=None, names=["labels"], sep="\r")

topic_df = pd.concat([message, label], axis=1)
# topic_df

In [7]:
# Tamanho medio dos documentos antes do pré-processamento
sample_message = topic_df["message"].tolist()
num_words = [len(s.split()) for s in sample_message]
print(f"{np.median(num_words)} palavras/doc")

9.0 palavras/doc


### Pré-processamento

In [8]:
nlp = spacy.load("en_core_web_sm")
stop_words = STOP_WORDS

In [9]:
punctuation = string.punctuation

In [10]:
nltk.download('wordnet') # download para fazer a lematização do texto usando WordNetLematizer

[nltk_data] Downloading package wordnet to /home/valentim/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [11]:
def contraction_text(texts):
    ''' 
        This function is used to expand the possible hiring of some words in the text
    '''
    
    expanded_text = []
    for word in texts.split():
        expanded_text.append(contractions.fix(word))

    texts = ' '.join(expanded_text)
    
    return texts

def lemmatizer_text(texts):
    ''' 
    function responsible for lemmatizing the text
    '''
    
    lemmatizer = WordNetLemmatizer()
    
    word_token = texts.split()
    lemmas = [lemmatizer.lemmatize(word, pos='v') for word in word_token]

    texts = ' '.join(lemmas)

    return texts

def processing(texts):
    texts = contraction_text(texts)
    texts = lemmatizer_text(texts)
    
    result = []
    pos_tag = ['ADJ', 'NOUN', 'VERB', 'PROPN'] # vetor que permite escolher somente entre adjetivos, substantivos, verbos e pronomes
    text = re.sub(r'(https?://[^\s\n\r]+|www\.[^\s\n\r]+|[0-9@#&!?:,.\)\(;])', '', texts) # expressão regular que retira do texto todos os números, simbolos (&#;) e URL
    doc = nlp(text.lower())

    for token in doc:
        if(token.text in stop_words or token.text in punctuation or len(token.text) <= 3 or token.text == "ltgt"):
            continue
        
        if(token.pos_ in pos_tag):
            result.append(token.text)
        
        text = ' '.join(str(element) for element in result if not element.isdigit())
    
    return text

In [12]:
# Cria-se uma nova coluna que armazena as respectivas mensagens modeladas
topic_df.message = topic_df.message.astype(str)
topic_df["processed_message"] = topic_df["message"].apply(processing)

topic_df.head()

Unnamed: 0,message,labels,processed_message
0,How did serfdom develop in and then leave Russ...,0,serfdom develop leave russia
1,What films featured the character Popeye Doyle ?,1,film feature character popeye doyle
2,How can I find a list of celebrities ' real na...,0,find list celebrities real
3,What fowl grabs the spotlight after the Chines...,1,fowl grab spotlight chinese year monkey
4,What is the full form of .com ?,2,form


In [13]:
# Tamanho medio dos documentos depois do pré-processamento
sample_message = topic_df["processed_message"].tolist()
num_words = [len(s.split()) for s in sample_message]
print(f"{np.median(num_words)} palavras/doc")

3.0 palavras/doc


# Passo 3

## Importações necessárias

In [14]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

## NMF aplicada

In [15]:
# converting the given text term-document matrix
vectorizer = TfidfVectorizer(max_features=20, min_df=10)
X = vectorizer.fit_transform(topic_df["processed_message"])
vaocabulary = np.array(vectorizer.get_feature_names_out())

In [16]:
nmf = NMF(n_components=5, solver="mu")
W = nmf.fit_transform(X)
H = nmf.components_

## Resultado

In [17]:
for i, topic in enumerate(H):
    print(f"Topic: {i + 1} {",".join(str(x) for x in vaocabulary[topic.argsort()[-20:]])}")

Topic: 1 president,mean,time,color,word,state,find,play,fear,city,country,live,yous,year,write,know,people,american,largest,world
Topic: 2 word,find,mean,country,world,fear,write,city,know,american,time,play,color,live,people,largest,year,president,state,yous
Topic: 3 state,color,mean,write,find,yous,city,world,word,fear,play,people,year,live,president,know,american,time,largest,country
Topic: 4 color,state,largest,city,yous,fear,country,live,world,people,year,american,find,time,play,know,president,write,word,mean
Topic: 5 color,mean,state,fear,country,president,world,yous,people,word,play,time,write,know,live,year,largest,american,find,city


# Perguntas do passo 3


## Resultado com 5 tópicos
### Com 10 palavras por tópicos
Topic: 1 mean,president,state,city,country,yous,year,find,people,world<br>
Topic: 2 state,mean,yous,world,city,find,people,year,president,country<br>
Topic: 3 mean,find,country,world,city,people,year,president,state,yous<br>
Topic: 4 mean,people,state,president,country,world,yous,year,find,city<br>
Topic: 5 state,people,find,world,yous,country,year,city,president,mean<br>

### Com 20 palavras por tópicos
Topic: 1 president,mean,word,color,time,state,find,city,fear,country,play,yous,live,year,write,know,people,american,largest,world<br>
Topic: 2 word,find,mean,world,country,fear,write,city,know,american,time,play,color,live,people,largest,year,president,state,yous<br>
Topic: 3 state,color,mean,write,yous,find,city,world,word,fear,play,people,year,president,live,know,american,time,largest,country<br>
Topic: 4 color,state,largest,city,yous,people,country,world,live,year,fear,american,find,time,play,know,president,write,word,mean<br>
Topic: 5 color,fear,mean,state,country,president,world,yous,people,play,word,time,write,know,year,live,american,largest,find,city<br>

## Resultado com 10 tópicos
### Com 10 palavras por tópicos

Topic: 1 mean,president,state,year,find,people,yous,country,city,world<br>
Topic: 2 mean,state,find,people,yous,year,president,city,world,country<br>
Topic: 3 mean,find,country,people,year,world,city,president,state,yous<br>
Topic: 4 mean,people,state,year,president,country,yous,find,world,city<br>
Topic: 5 city,country,find,people,world,state,yous,year,president,mean<br>
Topic: 6 mean,yous,president,country,people,state,city,year,world,find<br>
Topic: 7 mean,find,country,city,yous,president,world,people,state,year<br>
Topic: 8 mean,state,city,president,country,find,year,yous,world,people<br>
Topic: 9 country,mean,president,people,city,world,find,year,yous,state<br>
Topic: 10 find,world,state,people,city,country,mean,year,yous,president<br>

### Com 20 palavras por tópicos
Topic: 1 fear,color,president,time,mean,play,word,state,live,year,city,find,country,people,yous,write,know,american,largest,world<br>
Topic: 2 word,fear,find,mean,know,world,country,year,play,live,people,city,write,time,american,state,color,largest,president,yous<br>
Topic: 3 color,fear,mean,play,state,write,yous,word,year,people,city,find,world,live,president,know,time,american,largest,country<br>
Topic: 4 color,largest,fear,state,play,yous,people,year,find,city,country,world,american,live,time,know,president,write,word,mean<br>
Topic: 5 fear,color,play,people,mean,time,word,state,president,year,country,world,yous,find,write,know,live,american,largest,city<br>
Topic: 6 fear,color,largest,live,play,mean,yous,president,city,country,year,people,state,world,american,word,know,time,write,find<br>
Topic: 7 fear,color,largest,mean,know,word,yous,find,country,write,city,play,world,people,state,live,president,american,time,year<br>
Topic: 8 fear,color,write,president,american,time,people,play,word,mean,country,world,city,find,year,yous,know,largest,live,state<br>
Topic: 9 largest,american,play,write,president,mean,state,city,word,country,find,year,world,yous,time,know,color,fear,live,people<br>
Topic: 10 fear,color,live,president,state,mean,city,find,people,country,world,year,yous,largest,time,word,write,american,know,play<br>

#### O que você percebeu sobre variar tanto a quantidade de tópicos quanto de palavras? O que isso muda nas matrizes do modelo? Quais resultados são mais interpretáveis qualitativamente?

Aumentar o número de tópicos (n_components) torna os temas mais específicos, mas pode gerar redundância. Já diminuir o número de tópicos resulta em temas mais amplos, dificultando a identificação de nuances. Quanto ao vocabulário (max_features), mais palavras trazem mais variações, mas podem incluir ruído; menos palavras tornam o modelo mais robusto, mas podem deixar tópicos muito gerais. Os resultados mais interpretáveis qualitativamente surgem quando há equilíbrio entre tópicos amplos e específicos e um vocabulário focado em palavras relevantes.


# Passo 4

# Importações necessárias

In [18]:
from bertopic import BERTopic

# Criação e treinamento do modelo

In [19]:
five_topic_model = BERTopic(calculate_probabilities=True, nr_topics=5)
topics, probs = five_topic_model.fit_transform(topic_df["processed_message"])

In [20]:
ten_topic_model = BERTopic(calculate_probabilities=True, nr_topics=10)
topics, probs = ten_topic_model.fit_transform(topic_df["processed_message"])

# Análise dos tópicos gerados

Foram treinados dois modelos, um considerando 5 tópicos e outro considerando 10 tópicos.
Abaixo é mostrado, utilizando a função get_topic_info(), quais palavras pertencem a cada
tópico. É possível perceber algumas diferenças entre os dois modelos. Principalmente na 
quantidade de diversidade entre os tópicos. Sendo o de 10 o que mais representa essa diversidade.

In [21]:
five_topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,2187,-1_country_yous_world_people,"[country, yous, world, people, time, kind, yea...","[time year travel, common cause death yous, or..."
1,0,3540,0_world_city_yous_state,"[world, city, yous, state, country, president,...","[yous president appear, south american city wo..."
2,1,117,1_mile_weigh_weight_temperature,"[mile, weigh, weight, temperature, measure, ea...","[weight teaspoon matter black hole, teaspoon m..."
3,2,95,2_tuberculosis_blood_transplant_disease,"[tuberculosis, blood, transplant, disease, inf...","[people tuberculosis year, people tuberculosis..."
4,3,13,3_life_expectancy_average_swim,"[life, expectancy, average, swim, fraction, be...","[life expectancy elephant, average life expect..."


In [22]:
ten_topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,2117,-1_country_world_yous_time,"[country, world, yous, time, state, year, kind...","[kind puzzle appear yous york world december, ..."
1,0,1349,0_fear_president_mean_stand,"[fear, president, mean, stand, word, origin, y...","[letter word english language, president, fear]"
2,1,877,1_state_color_city_river,"[state, color, city, river, baseball, world, l...","[city state live, largest city world, largest ..."
3,2,694,2_film_bear_movie_write,"[film, bear, movie, write, book, actress, play...","[portray title character film jackal, film fil..."
4,3,653,3_invent_computer_card_company,"[invent, computer, card, company, address, dri...","[code invent, telephone invent, invent machine]"
5,4,129,4_blood_tuberculosis_hand_transplant,"[blood, tuberculosis, hand, transplant, birth,...","[medical term cancer blood, people tuberculosi..."
6,5,57,5_temperature_rain_measure_degrees,"[temperature, rain, measure, degrees, earth, s...","[temperature today, indoor sport phoenix infer..."
7,6,51,6_population_life_water_expectancy,"[population, life, water, expectancy, weigh, a...","[population australia, population world, world..."
8,7,15,7_california_legal_year_county,"[california, legal, year, county, state, capit...","[california year, california year, california ..."
9,8,10,8_difference_radio_classical_condition,"[difference, radio, classical, condition, stat...","[difference classical acoustic guitar, differe..."


## Vizualização dos tópicos

Julguei o modelo que utilizou 10 tópicos como o mais representativo.
Logo, somente o utilizei, descartando o de 5 tópicos para vizualização.

In [23]:
ten_topic_model.visualize_topics()

In [24]:
ten_topic_model.visualize_hierarchy()

In [25]:
ten_topic_model.visualize_barchart()

# Perguntas do passo 4

### Em comparação com o modelo anterior, os tópicos são mais interpretáveis qualitativamente? O que significa o score associado a cada termo de um tópico em cada um dos casos?
Sim, são mais interpretáveis. É bem nítido principalmente com a ajuda da vizualização que o BerTopic permite.

O score em BERTopic indica o quão representativo e importante um termo é para um tópico específico. É calculado usando c-TF-IDF, que ajusta o peso dos termos considerando os tópicos como "classes". Um score alto significa que o termo é distintivo para aquele tópico, enquanto um score baixo indica que o termo é mais comum entre diferentes tópicos.

# Passo 5

## Importações necessárias

In [26]:
import numpy as np
from collections import Counter
from itertools import combinations

## Funções para o cáculo do NPMI

In [57]:
def get_word_cooccurrences(docs, topic_words):
    '''
    Calcula as concorrências de palavras em uma lista de documentos.

    Paramêtros:
    docs: lista de todos os documentos já pré-processados.
    topic_words: lista de palavras de um determinados tópico a ser analisado.

    Retorna:
    - cooc_counts: Contagem de coocorrências das palavras do tópico.
    - word_counts: Contagem de aparições individuais de cada palavra.
    '''

    cooc_counts = Counter()
    words_counts = Counter()

    for doc in docs:
        # Seleciono todas as palavras do tópico que aparece no documento da iteração atual
        filtered_words = [word for word in doc.split() if word in topic_words]

        # Façõ a contagem individual de cada palavra
        words_counts.update(filtered_words)

        for pair in combinations(filtered_words, 2):
            cooc_counts[tuple(sorted(pair))] += 1
    
    return cooc_counts, words_counts

def calculate_pmi(cooc_counts, word_counts, total_docs):
    '''
    Calcula o PMI (Pointwise Manual Information) entre pares de palavras.

    Parâmetro:
    cooc_counts: contagem de coocorrências das palavras do tópico.
    words_counts: contaegem de aparições individuais de cada palavra do tópico.
    total_docs: quantidade total de documentos.

    Retorna:
    pmi_scores: Dicionário com os pares de palavras e seus respectivos PMI valores. 
    '''

    pmi_scores = {}
    
    for (w1, w2), cooc in cooc_counts.items():
        p_w1 = word_counts[w1] / total_docs
        p_w2 = word_counts[w2] / total_docs

        p_w1_w2 = cooc / total_docs

        pmi = np.log(p_w1_w2 / (p_w1 * p_w2))
        pmi_scores[(w1, w2)] = pmi

    return pmi_scores


def normalize_pmi(pmi_scores):
    '''
    Normalizar os valores de PMI para obter o NPMI.

    Parâmetros:
    pmi_scores: Dicionário com os pares de palavras e seus respectivos valores PMI.

    Retorna:
    npmi_scores: Dicionario com os pares de palavras e seus respectivos valores npmi
    '''

    max_pmi = max(pmi_scores.values())
    min_pmi = min(pmi_scores.values())

    npmi_scores = {pair: (pmi - min_pmi) / (max_pmi - min_pmi) for pair, pmi in pmi_scores.items()}

    return npmi_scores

## Aplicações das funções

In [73]:
# Transformação da lista de documentos e tópicos para a utilização correta das funções
docs = topic_df["message"].tolist()
topics = ten_topic_model.get_topic_info()["Representation"].tolist()

In [None]:
print("===================================== Métrica NPMI dos 10 topicos =====================================")
print()

index_topic = 1
for topic in topics:
    cooc_counts, words_counts = get_word_cooccurrences(docs, topic)
    pmi_scores = calculate_pmi(cooc_counts, words_counts, len(docs))
    npmi_scores = normalize_pmi(pmi_scores)

    values_npmi_scores = np.array([values for values in npmi_scores.values()])
    mean_npmi_scores = values_npmi_scores.mean()
    print(f"Topico {index_topic}: {mean_npmi_scores}")
    
    index_topic += 1


Topico 1: 0.47820062897809895
Topico 2: 0.44639961409211804
Topico 3: 0.5538094827366032
Topico 4: 0.45150594546858275
Topico 5: 0.426454715012994
Topico 6: 0.6090930078631326
Topico 7: 0.4172103301435215
Topico 8: 0.5874515886800841
Topico 9: 0.3017020232583292
Topico 10: 0.45147541242140693
