# Passo 2

## importações necessárias

In [5]:
import pandas as pd
import numpy as np

import spacy
import nltk
from nltk.stem import WordNetLemmatizer
from spacy.lang.en.stop_words import STOP_WORDS
import string
import re
import contractions

In [6]:
message = pd.read_csv("data/texts.txt", header=None, names=["message"], sep="\r")
label = pd.read_csv("data/score.txt", header=None, names=["labels"], sep="\r")

topic_df = pd.concat([message, label], axis=1)
# topic_df

In [7]:
# Tamanho medio dos documentos antes do pré-processamento
sample_message = topic_df["message"].tolist()
num_words = [len(s.split()) for s in sample_message]
print(f"{np.median(num_words)} palavras/doc")

9.0 palavras/doc


In [8]:
nlp = spacy.load("en_core_web_sm")
stop_words = STOP_WORDS

In [9]:
punctuation = string.punctuation

In [10]:
nltk.download('wordnet') # download para fazer a lematização do texto usando WordNetLematizer

[nltk_data] Downloading package wordnet to /home/valentim/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [11]:
def contraction_text(texts):
    ''' 
        This function is used to expand the possible hiring of some words in the text
    '''
    
    expanded_text = []
    for word in texts.split():
        expanded_text.append(contractions.fix(word))

    texts = ' '.join(expanded_text)
    
    return texts

def lemmatizer_text(texts):
    ''' 
    function responsible for lemmatizing the text
    '''
    
    lemmatizer = WordNetLemmatizer()
    
    word_token = texts.split()
    lemmas = [lemmatizer.lemmatize(word, pos='v') for word in word_token]

    texts = ' '.join(lemmas)

    return texts

def processing(texts):
    texts = contraction_text(texts)
    texts = lemmatizer_text(texts)
    
    result = []
    pos_tag = ['ADJ', 'NOUN', 'VERB', 'PROPN'] # vetor que permite escolher somente entre adjetivos, substantivos, verbos e pronomes
    text = re.sub(r'(https?://[^\s\n\r]+|www\.[^\s\n\r]+|[0-9@#&!?:,.\)\(;])', '', texts) # expressão regular que retira do texto todos os números, simbolos (&#;) e URL
    doc = nlp(text.lower())

    for token in doc:
        if(token.text in stop_words or token.text in punctuation or len(token.text) <= 3 or token.text == "ltgt"):
            continue
        
        if(token.pos_ in pos_tag):
            result.append(token.text)
        
        text = ' '.join(str(element) for element in result if not element.isdigit())
    
    return text

In [12]:
topic_df.message = topic_df.message.astype(str)
topic_df["processed_message"] = topic_df["message"].apply(processing)

topic_df.head()

Unnamed: 0,message,labels,processed_message
0,How did serfdom develop in and then leave Russ...,0,serfdom develop leave russia
1,What films featured the character Popeye Doyle ?,1,film feature character popeye doyle
2,How can I find a list of celebrities ' real na...,0,find list celebrities real
3,What fowl grabs the spotlight after the Chines...,1,fowl grab spotlight chinese year monkey
4,What is the full form of .com ?,2,form


In [13]:
# Tamanho medio dos documentos antes do pré-processamento
sample_message = topic_df["processed_message"].tolist()
num_words = [len(s.split()) for s in sample_message]
print(f"{np.median(num_words)} palavras/doc")

3.0 palavras/doc


# Passo 3

## Importações necessárias

In [3]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

## NMF aplicada

In [17]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(topic_df["message"])
words = np.array(vectorizer.get_feature_names_out())

In [21]:
nmf = NMF(n_components=20, solver="mu")
W = nmf.fit_transform(X)
H = nmf.components_

## Resultado

In [22]:
for i, topic in enumerate(H):
    print(f"Topic: {i + 1} {",".join(str(x) for x in words[topic.argsort()[-10:]])}")

Topic: 1 meaning,united,states,is,population,word,what,origin,of,the
Topic: 2 old,died,were,long,much,have,people,there,many,how
Topic: 3 prime,minister,killed,created,won,president,wrote,the,invented,who
Topic: 4 long,word,take,an,have,it,stand,what,mean,does
Topic: 5 if,make,find,get,why,say,call,how,you,do
Topic: 6 new,used,movie,what,film,america,which,the,city,in
Topic: 7 state,day,made,known,color,capital,called,an,what,is
Topic: 8 about,the,located,information,get,come,from,find,can,where
Topic: 9 american,by,woman,space,year,born,president,what,first,was
Topic: 10 why,names,colors,cities,countries,some,two,there,what,are
Topic: 11 another,famous,named,known,term,the,abbreviation,what,stand,for
Topic: 12 way,used,the,first,long,from,take,be,it,to
Topic: 13 children,cold,what,glass,water,being,food,kind,of,fear
Topic: 14 begin,war,originate,become,come,play,what,die,year,did
Topic: 15 its,with,by,real,dog,lawyer,craft,randy,that,name
Topic: 16 wrote,as,children,by,two,with,what,d