In [28]:
import warnings
# Silence every warning raised from here on, to keep the cell outputs clean.
# NOTE(review): a blanket "ignore" also hides deprecation notices from the
# libraries used below; consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

# Passo 2

## importações necessárias

In [1]:
import pandas as pd
import numpy as np

import spacy
import nltk
from nltk.stem import WordNetLemmatizer
from spacy.lang.en.stop_words import STOP_WORDS
import string
import re
import contractions

## Leitura e análise dos dados de treinamento

In [2]:
# Read each input file as a one-column frame: the messages and their labels.
# sep="\r" is presumably chosen so that commas inside a message are not
# treated as field separators -- TODO confirm against the data files.
message = pd.read_csv("data/texts.txt", header=None, names=["message"], sep="\r")
label = pd.read_csv("data/score.txt", header=None, names=["labels"], sep="\r")

# Put messages and labels side by side (axis=1); rows are paired by index.
topic_df = pd.concat([message, label], axis=1)
# topic_df

In [3]:
# Median document length (whitespace-separated words) before preprocessing
sample_message = topic_df["message"].tolist()
num_words = [len(words) for words in (doc.split() for doc in sample_message)]
print(f"{np.median(num_words)} palavras/doc")

9.0 palavras/doc


### Pré-processamento

In [4]:
# Small English spaCy pipeline; `processing` uses it to tokenize and POS-tag text.
nlp = spacy.load("en_core_web_sm")
# spaCy's English stop-word collection, used by `processing` to drop function words.
stop_words = STOP_WORDS

In [5]:
# ASCII punctuation characters as a single string. `processing` tests tokens
# against it with `in`, which on a str is a substring check.
punctuation = string.punctuation

In [6]:
nltk.download('wordnet') # WordNet data needed by WordNetLemmatizer (used in lemmatizer_text)

[nltk_data] Downloading package wordnet to /home/valentim/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
def contraction_text(texts):
    '''
        Expand the contractions found in a text.

        The text is split on whitespace, each piece is passed on its own to
        `contractions.fix`, and the results are joined back with single
        spaces (so runs of whitespace are collapsed as a side effect).

        NOTE(review): the "yous" term seen in the topic outputs presumably
        originates here (e.g. "U.S." being rewritten) -- TODO confirm.
    '''
    return ' '.join(contractions.fix(piece) for piece in texts.split())

def lemmatizer_text(texts):
    '''
    Lemmatize every whitespace-separated word of a text.

    Each word is lemmatized with WordNetLemmatizer as a verb (pos='v') and
    the lemmas are returned joined by single spaces.
    '''
    wordnet = WordNetLemmatizer()
    return ' '.join(wordnet.lemmatize(token, pos='v') for token in texts.split())

def processing(texts):
    '''
    Reduce one raw message to a space-separated string of content words.

    Steps: expand contractions, lemmatize, strip URLs/digits/symbols,
    lowercase, tokenize and POS-tag with the spaCy pipeline `nlp`, then keep
    only tokens that are adjectives, nouns, verbs or proper nouns.

    A token is discarded when it is a stop word, is found in `punctuation`,
    has 3 characters or fewer, or is the literal "ltgt" (presumably what is
    left of "&lt;&gt;" once '&' and ';' are stripped -- TODO confirm).

    Returns an empty string when no token survives the filters.
    '''
    texts = contraction_text(texts)
    texts = lemmatizer_text(texts)

    # Universal POS tags to keep: adjectives, nouns, verbs and proper nouns
    pos_tag = ('ADJ', 'NOUN', 'VERB', 'PROPN')
    # Remove URLs, every digit and the listed symbols (@ # & ! ? : , . ) ( ;)
    text = re.sub(r'(https?://[^\s\n\r]+|www\.[^\s\n\r]+|[0-9@#&!?:,.\)\(;])', '', texts)
    doc = nlp(text.lower())

    result = []
    for token in doc:
        if (token.text in stop_words or token.text in punctuation or len(token.text) <= 3 or token.text == "ltgt"):
            continue

        # Digit-only tokens are never kept, whatever their POS tag
        if token.pos_ in pos_tag and not token.text.isdigit():
            result.append(token.text)

    # Join once, after the loop: the output is built only from kept tokens, so
    # a message whose tokens are all filtered out yields '' rather than the
    # regex-cleaned raw text.
    return ' '.join(result)

In [24]:
# Add a column holding the preprocessed version of every message.
# The messages are cast to str first so `processing` always receives text.
topic_df["message"] = topic_df["message"].astype(str)
topic_df["processed_message"] = topic_df["message"].apply(processing)

topic_df.head()

Unnamed: 0,message,labels,processed_message
0,How did serfdom develop in and then leave Russ...,0,serfdom develop leave russia
1,What films featured the character Popeye Doyle ?,1,film feature character popeye doyle
2,How can I find a list of celebrities ' real na...,0,find list celebrities real
3,What fowl grabs the spotlight after the Chines...,1,fowl grab spotlight chinese year monkey
4,What is the full form of .com ?,2,form


In [23]:
# Median document length (whitespace-separated words) after preprocessing
sample_message = topic_df["processed_message"].tolist()
num_words = [len(words) for words in (doc.split() for doc in sample_message)]
print(f"{np.median(num_words)} palavras/doc")

3.0 palavras/doc


# Passo 3

## Importações necessárias

In [10]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

## NMF aplicada

In [11]:
# Convert the preprocessed messages into a TF-IDF document-term matrix.
# min_df=10 ignores terms that occur in fewer than 10 documents, and
# max_features=20 keeps only the 20 most frequent remaining terms.
# NOTE(review): with a 20-term vocabulary, the result cell's "top 20 terms per
# topic" is the whole vocabulary for every topic; a larger max_features would
# let the topics differ in content, not only in term order.
vectorizer = TfidfVectorizer(max_features=20, min_df=10)
X = vectorizer.fit_transform(topic_df["processed_message"])
# Terms in the column order of X, as an array so argsort indices can index it
vaocabulary = np.array(vectorizer.get_feature_names_out())

In [12]:
# Factorize the TF-IDF matrix X into W (document-topic weights) and
# H (topic-term weights) with 5 topics, using the multiplicative-update solver.
# random_state pins the partly randomized initialization so that re-running
# the notebook reproduces the same topics; unseeded runs produced slightly
# different term orderings for the same settings.
nmf = NMF(n_components=5, solver="mu", random_state=42)
W = nmf.fit_transform(X)
H = nmf.components_

## Resultado

In [13]:
# Print, for every topic (row of H), its highest-weighted vocabulary terms.
for i, topic in enumerate(H):
    # argsort() is ascending, so the last 20 indices are the 20 heaviest terms,
    # listed from the weakest to the strongest of them.
    top_terms = vaocabulary[topic.argsort()[-20:]]
    # Single quotes inside the f-string: reusing the outer double quote only
    # parses on Python >= 3.12.
    print(f"Topic: {i + 1} {','.join(str(x) for x in top_terms)}")

Topic: 1 president,mean,time,color,word,state,find,fear,play,city,country,yous,live,year,write,know,people,american,largest,world
Topic: 2 word,find,mean,world,country,fear,write,know,city,american,time,play,color,live,people,largest,year,president,state,yous
Topic: 3 state,color,mean,find,write,yous,city,word,world,fear,play,people,year,live,president,know,american,time,largest,country
Topic: 4 color,state,largest,fear,city,yous,country,world,people,live,year,american,find,time,play,know,president,write,word,mean
Topic: 5 color,mean,fear,state,country,president,world,yous,people,play,word,time,write,know,year,live,american,largest,find,city


# Perguntas do passo 3


## Resultado com 5 tópicos
### Com 10 palavras por tópicos
Topic: 1 mean,president,state,city,country,yous,year,find,people,world<br>
Topic: 2 state,mean,yous,world,city,find,people,year,president,country<br>
Topic: 3 mean,find,country,world,city,people,year,president,state,yous<br>
Topic: 4 mean,people,state,president,country,world,yous,year,find,city<br>
Topic: 5 state,people,find,world,yous,country,year,city,president,mean<br>

### Com 20 palavras por tópicos
Topic: 1 president,mean,word,color,time,state,find,city,fear,country,play,yous,live,year,write,know,people,american,largest,world<br>
Topic: 2 word,find,mean,world,country,fear,write,city,know,american,time,play,color,live,people,largest,year,president,state,yous<br>
Topic: 3 state,color,mean,write,yous,find,city,world,word,fear,play,people,year,president,live,know,american,time,largest,country<br>
Topic: 4 color,state,largest,city,yous,people,country,world,live,year,fear,american,find,time,play,know,president,write,word,mean<br>
Topic: 5 color,fear,mean,state,country,president,world,yous,people,play,word,time,write,know,year,live,american,largest,find,city<br>

## Resultado com 10 tópicos
### Com 10 palavras por tópicos

Topic: 1 mean,president,state,year,find,people,yous,country,city,world<br>
Topic: 2 mean,state,find,people,yous,year,president,city,world,country<br>
Topic: 3 mean,find,country,people,year,world,city,president,state,yous<br>
Topic: 4 mean,people,state,year,president,country,yous,find,world,city<br>
Topic: 5 city,country,find,people,world,state,yous,year,president,mean<br>
Topic: 6 mean,yous,president,country,people,state,city,year,world,find<br>
Topic: 7 mean,find,country,city,yous,president,world,people,state,year<br>
Topic: 8 mean,state,city,president,country,find,year,yous,world,people<br>
Topic: 9 country,mean,president,people,city,world,find,year,yous,state<br>
Topic: 10 find,world,state,people,city,country,mean,year,yous,president<br>

### Com 20 palavras por tópicos
Topic: 1 fear,color,president,time,mean,play,word,state,live,year,city,find,country,people,yous,write,know,american,largest,world<br>
Topic: 2 word,fear,find,mean,know,world,country,year,play,live,people,city,write,time,american,state,color,largest,president,yous<br>
Topic: 3 color,fear,mean,play,state,write,yous,word,year,people,city,find,world,live,president,know,time,american,largest,country<br>
Topic: 4 color,largest,fear,state,play,yous,people,year,find,city,country,world,american,live,time,know,president,write,word,mean<br>
Topic: 5 fear,color,play,people,mean,time,word,state,president,year,country,world,yous,find,write,know,live,american,largest,city<br>
Topic: 6 fear,color,largest,live,play,mean,yous,president,city,country,year,people,state,world,american,word,know,time,write,find<br>
Topic: 7 fear,color,largest,mean,know,word,yous,find,country,write,city,play,world,people,state,live,president,american,time,year<br>
Topic: 8 fear,color,write,president,american,time,people,play,word,mean,country,world,city,find,year,yous,know,largest,live,state<br>
Topic: 9 largest,american,play,write,president,mean,state,city,word,country,find,year,world,yous,time,know,color,fear,live,people<br>
Topic: 10 fear,color,live,president,state,mean,city,find,people,country,world,year,yous,largest,time,word,write,american,know,play<br>

#### O que você percebeu sobre variar tanto a quantidade de tópicos quanto de palavras? O que isso muda nas matrizes do modelo? Quais resultados são mais interpretáveis qualitativamente?

Aumentar o número de tópicos (n_components) torna os temas mais específicos, mas pode gerar redundância. Já diminuir o número de tópicos resulta em temas mais amplos, dificultando a identificação de nuances. Quanto ao vocabulário (max_features), mais palavras trazem mais variações, mas podem incluir ruído; menos palavras tornam o modelo mais robusto, mas podem deixar tópicos muito gerais. Os resultados mais interpretáveis qualitativamente surgem quando há equilíbrio entre tópicos amplos e específicos e um vocabulário focado em palavras relevantes.


# Passo 4

# Importações necessárias

In [29]:
from bertopic import BERTopic

# Criação e treinamento do modelo

In [15]:
# BERTopic model asked to reduce its topics to 5 (nr_topics=5);
# calculate_probabilities=True requests topic probabilities for each document.
five_topic_model = BERTopic(calculate_probabilities=True, nr_topics=5)
# NOTE(review): `topics` and `probs` are overwritten by the 10-topic cell that
# follows, so the 5-topic assignments are not kept under these names.
topics, probs = five_topic_model.fit_transform(topic_df["processed_message"])

In [16]:
# Same setup as the 5-topic model, but reducing to 10 topics (nr_topics=10).
ten_topic_model = BERTopic(calculate_probabilities=True, nr_topics=10)
# Reuses the names `topics` and `probs`; from here on they refer to this model.
topics, probs = ten_topic_model.fit_transform(topic_df["processed_message"])

# Análise dos tópicos gerados

Foram treinados dois modelos, um com 5 tópicos e outro com 10 tópicos.
Abaixo, a função get_topic_info() mostra quais palavras pertencem a cada
tópico. É possível perceber algumas diferenças entre os dois modelos, principalmente na
diversidade dos tópicos: o modelo com 10 tópicos é o que melhor representa essa diversidade.

In [17]:
# One row per topic: id, document count, generated name, top terms and sample
# documents. Topic -1 is BERTopic's outlier bucket.
five_topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,2177,-1_world_yous_country_people,"[world, yous, country, people, play, write, ti...","[richest people world, english play write chil..."
1,0,2305,0_fear_mean_color_stand,"[fear, mean, color, stand, find, bear, word, o...","[color, mean, fear]"
2,1,1431,1_city_state_yous_world,"[city, state, yous, world, president, country,...","[largest country south america, president yous..."
3,2,23,2_tuberculosis_life_average_expectancy,"[tuberculosis, life, average, expectancy, cost...","[drug treat tuberculosis cost, people tubercul..."
4,3,16,3_temperature_degrees_cooler_cucumber,"[temperature, degrees, cooler, cucumber, heat,...","[temperature center earth, indoor sport phoeni..."


In [18]:
# Same summary table for the 10-topic model (topic -1 is the outlier bucket).
ten_topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,2144,-1_world_country_yous_write,"[world, country, yous, write, people, find, ki...","[people randy craft kill, country canada mexic..."
1,0,1937,0_mean_fear_president_word,"[mean, fear, president, word, film, bear, orig...","[mean, yous president, origin word]"
2,1,693,1_world_state_city_largest,"[world, state, city, largest, country, river, ...","[world country, largest city yous, largest cou..."
3,2,593,2_invent_color_card_drink,"[invent, color, card, drink, flag, company, pl...","[color need color pink, color, color]"
4,3,250,3_baseball_horse_team_sport,"[baseball, horse, team, sport, game, college, ...","[kentucky horse park, city kentucky horse park..."
5,4,91,4_blood_tuberculosis_hand_transplant,"[blood, tuberculosis, hand, transplant, finger...","[people tuberculosis, people tuberculosis, bod..."
6,5,90,5_computer_address_mail_internet,"[computer, address, mail, internet, answerscom...",[address find mail address member house repres...
7,6,75,6_temperature_water_weigh_measure,"[temperature, water, weigh, measure, earth, de...","[temperature surface, temperature center earth..."
8,7,67,7_bible_pope_vatican_cross,"[bible, pope, vatican, cross, prophet, islamic...","[pope, pope inaugurate vatican international r..."
9,8,12,8_life_expectancy_average_fraction,"[life, expectancy, average, fraction, beaver, ...","[life expectancy elephant, average life expect..."


## Visualização dos tópicos

Julguei o modelo que utilizou 10 tópicos como o mais representativo.
Logo, utilizei somente ele para a visualização, descartando o de 5 tópicos.

In [31]:
# Interactive 2-D map of the topics of the 10-topic model.
ten_topic_model.visualize_topics()

In [32]:
# Hierarchical clustering view showing how the topics relate to each other.
ten_topic_model.visualize_hierarchy()

In [33]:
# Bar charts of the top-scoring terms of each topic.
ten_topic_model.visualize_barchart()

# Perguntas do passo 4

### Em comparação com o modelo anterior, os tópicos são mais interpretáveis qualitativamente? O que significa o score associado a cada termo de um tópico em cada um dos casos?
Sim, são mais interpretáveis. Isso fica bem nítido, principalmente com a ajuda das visualizações que o BERTopic oferece.

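O score em BERTopic indica o quão representativo e importante um termo é para um tópico específico. É calculado usando c-TF-IDF, que ajusta o peso dos termos considerando os tópicos como "classes". Um score alto significa que o termo é distintivo para aquele tópico, enquanto um score baixo indica que o termo é mais comum entre diferentes tópicos. Já no NMF, o valor associado a cada termo é o peso não negativo desse termo na linha correspondente da matriz H (`nmf.components_`): quanto maior o peso, mais o termo contribui para a reconstrução dos documentos daquele tópico.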