In [12]:
import pandas as pd
import nltk
import unicodedata
import re        
from nltk.corpus import stopwords
from sklearn.decomposition import NMF
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
pd.options.display.max_colwidth = 100


In [14]:
def preprocess(text):
    new_text = text.lower()
    new_text = re.sub(r'http\S+', '', new_text)
    new_text = re.sub(r'www.\S+', '', new_text)
    new_text = re.sub("@[A-Za-z0-9_]+","", new_text)
    new_text = unicodedata.normalize('NFKD', new_text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    new_text = re.sub(r'[^\w\s]', '', new_text) # remove punc.
    new_text = re.sub(r'\d+','',new_text)# remove numbers
    return new_text

In [15]:
base = pd.read_csv("../dados/nlp/tweets_cleaned.csv")


In [16]:
# Sort by total of RTs
pd.set_option('display.max_colwidth', -1)
base.nlargest(50, columns='retweets_count')[['tweet','retweets_count']].head()

  pd.set_option('display.max_colwidth', -1)


Unnamed: 0,tweet,retweets_count
3455,"acontecimentos como o de hoje mostram, para quem insiste em não ver, que o tráfico de drogas e o crime organizado vão muito além da favela. disparar tiros a esmo, do alto de um helicóptero, contra a favela não tem nada de combate ao crime! é apenas crueldade, faxina social mesmo.",5357.0
1734,"essas crianças uniformizadas, dentro da escola, abaixadinhas com medo de tomar tiro de helicóptero devia ser uma dessas cenas que causa demissão de alto a baixo na cadeia de comando. devia virar escândalo, não feito político.",3630.0
277,morador da favela da cidade de deus flagra uma granada sendo jogada em solo do helicóptero águia da polícia do rio! pic.twitter.com/zu9dlpiq1z,2065.0
9366,professor: - o mandante - comeu a inspetora - além de ser um sapão é inteligente pra caralho - as vezes começa a tremer e parece q vai explodir mas logo passa - já se mijou - foge tão bem da policia tal qual eu da diretora quando peço pra tomar agua e fico caminhando pic.twitter.com/2ks8tifyxl,1967.0
4553,atenção! um helicóptero da polícia está sobrevoando agora a maré e causando um terror em moradores e moradoras. muitos tiros são ouvidos e recebemos relatos de que moradores estão sendo impedidos entrar na favela! isso tudo em horário escolar! atirar a esmo é crime! é terrorismo! pic.twitter.com/z0wiwqox4m,1681.0


In [17]:
base['msg'] = base['tweet'].apply(preprocess)

In [18]:
### STOPWORDS

stopwords = nltk.corpus.stopwords.words('portuguese')

stopwords_en = nltk.corpus.stopwords.words('english')

newStopWords = ['lula','http','3r6nsl','u200d','helicoptero','policia',
                'aeronave','ly','tb','tche', 'pra', '…','cv43sn','gate21','xa0','tinyurl',
                'helicóptero','durante','aguia','bit','https','fu','ahh','br','tiro',
                'bit','favela','sido','cara','cv43sn\'','enquanto', 'pic', 'twitter', 'www', 'ser'
               'ver','suzano','tipo','xa0\'','bom','bolivia','anti','diz','xa0helicoptero','apos','dois',
                'list','rio','da','em','quem','por','na','mas','ja','era','para','mais','se','nao','do',
                'que','de','ou','com','teria','sempre','outro','ao','os','duas','at','uma','um',
                'tiros','tiro','disparo','helicoptero','q','ta','pq','html','01','2011','tá','ig_twitter_share','uns']

stopwords.extend(newStopWords)

stopwords.extend(stopwords_en)

In [19]:
tfidf_vect = TfidfVectorizer(max_df=0.8, min_df=2, stop_words=stopwords)

doc_term_matrix = tfidf_vect.fit_transform(base.msg.values.astype('U'))

In [20]:
pd.DataFrame(doc_term_matrix.toarray(), columns=tfidf_vect.get_feature_names())

Unnamed: 0,aaa,aaaa,aaaaa,aaaaaaaaaa,aaah,aah,abafado,abafados,abafar,abaixa,...,zn,zodiaco,zona,zonas,zs,zuada,zuando,zuar,zumbi,zumbis
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10914,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10915,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10916,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10917,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Topic model w/ NMF
componentsnumber = 3
nmf = NMF(n_components=componentsnumber, random_state=42)
nmf.fit(doc_term_matrix )

for i,topic in enumerate(nmf.components_):
    print(f'Top 10 words for topic #{i}:')
    print([tfidf_vect.get_feature_names()[i] for i in topic.argsort()[-30:]])
    print('\n')



Top 10 words for topic #0:
['cana', 'mandado', 'gente', 'desse', 'dessa', 'voando', 'dia', 'tava', 'porra', 'hora', 'nada', 'caralho', 'baixinho', 'tao', 'acordei', 'morro', 'crlh', 'estao', 'mano', 'mane', 'rasante', 'mt', 'deus', 'passou', 'crl', 'varios', 'passando', 'cima', 'baixo', 'dando']


Top 10 words for topic #1:
['tava', 'ter', 'moradores', 'atirando', 'alemao', 'perto', 'deus', 'deu', 'governador', 'manha', 'sao', 'hoje', 'complexo', 'rua', 'pm', 'nada', 'ser', 'gente', 'ai', 'la', 'escola', 'ate', 'agora', 'dar', 'cima', 'sobrevoando', 'operacao', 'vai', 'casa', 'aqui']


Top 10 words for topic #2:
['assustada', 'hj', 'escutando', 'acordada', 'hoje', 'acordo', 'agr', 'ouvindo', 'escutei', 'sobrevoando', 'manha', 'nada', 'ouvi', 'dormir', 'susto', 'despertador', 'acorda', 'acordou', 'bala', 'morador', 'acerta', 'perdida', 'som', 'terror', 'botando', 'chega', 'acordar', 'dia', 'acordei', 'barulho']




In [11]:
# Topic model w/ LDA

LDA = LatentDirichletAllocation(n_components=componentsnumber, random_state=42)
LDA.fit(doc_term_matrix)

for i,topic in enumerate(LDA.components_):
    print(f'Top 10 words for topic #{i}:')
    print([tfidf_vect.get_feature_names()[i] for i in topic.argsort()[-30:]])
    print('\n')

Top 10 words for topic #0:
['crlh', 'mandando', 'baixinho', 'animais', 'porra', 'caralho', 'globo', 'ai', 'dia', 'ate', 'gente', 'deu', 'estao', 'casa', 'rua', 'vai', 'mt', 'passou', 'tao', 'la', 'medo', 'crl', 'passando', 'mano', 'deus', 'aqui', 'varios', 'cima', 'baixo', 'dando']


Top 10 words for topic #1:
['vc', 'caveirao', 'rj', 'contra', 'dando', 'video', 'atirando', 'bandidos', 'policiais', 'pessoas', 'criancas', 'pode', 'pm', 'ter', 'complexo', 'alemao', 'policial', 'alvo', 'moradores', 'witzel', 'sao', 'fuzil', 'escola', 'ser', 'civil', 'mare', 'operacao', 'governador', 'vai', 'dar']


Top 10 words for topic #2:
['dormir', 'gente', 'passando', 'rodando', 'ouvindo', 'chega', 'voando', 'caiu', 'hoje', 'operacao', 'escutei', 'tudo', 'regiao', 'baixo', 'tava', 'som', 'agora', 'despertador', 'acordar', 'nada', 'ouvi', 'hora', 'manha', 'casa', 'dando', 'dia', 'sobrevoando', 'aqui', 'acordei', 'barulho']


