# Política no Twitter

Projeto da disciplina de Introdução à Ciência de Dados.

## Configuração

+ Instale e configure sua versão do Anaconda. As orientações estão disponíveis no [site](https://docs.anaconda.com/anaconda/install/) do próprio produto.

+ Ative o conda, crie e utilize um novo ambiente:

```shell
source path_to_conda/activate
conda env create -f environment.yml
conda activate ptwitter
```

+ Inicie o `jupyter-lab`.


## Processo

+ Importando os arquivos _json_ disponíveis na pasta de dados e armazenando-os em um _dataframe_. 

In [36]:
import os
import pandas as pd
import numpy as np
import datetime as dt
import json
from pandas.api.types import CategoricalDtype


# Accumulators filled while the JSON files are read: one DataFrame per file
# and the profile name taken from each file name.
dfs = list()
profiles = list()

# Raw tweet fields removed from every per-profile DataFrame.
to_be_dropped = [
    'id',
    'display_text_range',
    'source',
    'in_reply_to_status_id',
    'in_reply_to_status_id_str',
    'in_reply_to_user_id',
    'in_reply_to_user_id_str',
    'in_reply_to_screen_name',
    'user',
    'geo',
    'coordinates',
    'place',
    'contributors',
    'is_quote_status',
    'lang',
    'extended_entities',
    'quoted_status_id',
    'quoted_status_id_str',
    'quoted_status_permalink',
]


# object_hook for json.load: normalises 'created_at' and flattens the
# hashtag / mention lists of every decoded JSON object.
def process_tweets(dct):
    """Post-process one decoded JSON object.

    Intended to be passed as ``object_hook`` to ``json.load``, so it is
    applied to every JSON object in the document, nested ones included.

    * ``created_at`` is parsed with the format ``'%a %b %d %H:%M:%S %z %Y'``
      and truncated to midnight (the timezone is kept). A value that cannot
      be parsed, or that is not a string (e.g. JSON ``null``), becomes NaN.
    * ``hashtags`` and ``user_mentions`` (lists of dicts) are replaced by the
      list of their ``text`` / ``screen_name`` values; an empty or missing
      list becomes NaN.

    Args:
        dct: the dict produced by the JSON decoder; it is modified in place.

    Returns:
        The same dict, after the replacements above.
    """
    if 'created_at' in dct:
        try:
            parsed = dt.datetime.strptime(dct['created_at'], '%a %b %d %H:%M:%S %z %Y')
        except (ValueError, TypeError):
            # ValueError: string in another format; TypeError: not a string.
            dct['created_at'] = np.nan
        else:
            # Only the day matters downstream, so the time of day is zeroed.
            dct['created_at'] = parsed.replace(hour=0, minute=0, second=0, microsecond=0)

    # Key in the JSON object -> sub-key whose values are collected from each
    # dict of the list stored under that key.
    meta = {'hashtags': 'text', 'user_mentions': 'screen_name'}
    for k, sk in meta.items():
        if k in dct:
            items = dct[k]
            dct[k] = [item[sk] for item in items if sk in item] if items else np.nan
    return dct

# Read every regular file found in 'dados' into a DataFrame, one file per
# profile, then stack them into a single `tweets_df`.
with os.scandir('dados') as lsit:
    for f in lsit:
        if not f.is_file():
            continue
        # The profile name is the file name up to its first dot.
        profile = f.name.split('.')[0]
        # JSON is UTF-8 text; naming the encoding keeps accents and emoji
        # readable regardless of the platform's default encoding.
        with open(f.path, mode='r', encoding='utf-8') as fp:
            tweets = json.load(fp, object_hook=process_tweets)
        profiles.append(profile)

        df = pd.DataFrame(tweets)
        # errors='ignore': optional fields (e.g. the quoted_status_* ones)
        # may be absent from a given file.
        df = df.drop(columns=to_be_dropped, errors='ignore')
        df['tweet_len'] = df['full_text'].str.len()
        df['profile'] = profile

        # Expand the 'entities' dicts into columns and keep only the ones not
        # listed below; 'media' in particular is not present in every tweet.
        entities = pd.json_normalize(df['entities']).drop(
            columns=['symbols', 'urls', 'media'], errors='ignore')
        df = df.join(entities)
        dfs.append(df)

tweets_df = pd.concat(dfs)
# Ordered categorical whose category order is the order the files were read in.
cat_type = CategoricalDtype(categories=profiles, ordered=True)
tweets_df['profile'] = tweets_df['profile'].astype(cat_type)

In [37]:
# Preview the first five rows of the combined DataFrame.
tweets_df.head(5)

Unnamed: 0,created_at,id_str,full_text,truncated,entities,retweet_count,favorite_count,favorited,retweeted,possibly_sensitive,quoted_status,tweet_len,profile,hashtags,user_mentions
0,2010-04-01 00:00:00+00:00,11402700435,"EM ""DITADURA"" SEM PAREDÃO, ATÉ CHICO ALENCAR É...",False,"{'hashtags': nan, 'symbols': [], 'user_mention...",30,15,False,False,,,137,jairbolsonaro,,
1,2019-01-10 00:00:00+00:00,1083318129135112192,"Bom dia! 🇧🇷 #tbt com o amigo ""Canguru"", que já...",False,"{'hashtags': ['tbt'], 'symbols': [], 'user_men...",1773,31618,False,False,False,,114,jairbolsonaro,[tbt],
2,2019-07-19 00:00:00+00:00,1152036400138579968,- Para descontrair. Proibido queimar ovo. (Kkk...,False,"{'hashtags': nan, 'symbols': [], 'user_mention...",5859,46637,False,False,False,,72,jairbolsonaro,,
3,2018-10-09 00:00:00+00:00,1049492883328380928,Trecho de entrevista ao vivo para o Jornal Nac...,False,"{'hashtags': nan, 'symbols': [], 'user_mention...",13915,56322,False,False,False,,89,jairbolsonaro,,
4,2017-08-23 00:00:00+00:00,900375277557215232,Querem criar o fundão bilionário na Reforma Po...,False,"{'hashtags': nan, 'symbols': [], 'user_mention...",2271,7147,False,False,,,138,jairbolsonaro,,


+ Criando um índice baseado no perfil do usuário, data de criação e identificação do tuíte.

In [38]:
# Index the tweets by (profile, creation date, tweet id), sort on that key
# and keep a handle on the resulting MultiIndex as `mindex`.
index_columns = ['profile', 'created_at', 'id_str']
tweets_df = tweets_df.set_index(index_columns).sort_index()
mindex = tweets_df.index

+ Utilizando a biblioteca spaCy para processar o texto dos tuítes

In [39]:
# Show the first entry of the sorted MultiIndex.
mindex[0:1]

MultiIndex([('jairbolsonaro', '2010-04-01 00:00:00+00:00', '11398556610')],
           names=['profile', 'created_at', 'id_str'])

In [43]:
# Look up a single tweet by its full index key: (profile, creation date, tweet id).
tweets_df.loc[('jairbolsonaro', '2010-04-01', '11402700435')]

full_text             EM "DITADURA" SEM PAREDÃO, ATÉ CHICO ALENCAR É...
truncated                                                         False
entities              {'hashtags': nan, 'symbols': [], 'user_mention...
retweet_count                                                        30
favorite_count                                                       15
favorited                                                         False
retweeted                                                         False
possibly_sensitive                                                  NaN
quoted_status                                                       NaN
tweet_len                                                           137
hashtags                                                            NaN
user_mentions                                                       NaN
Name: (jairbolsonaro, 2010-04-01 00:00:00+00:00, 11402700435), dtype: object

In [44]:
from collections import Counter
import pt_core_news_sm
import spacy
from spacy.tokens import Token
from spacymoji import Emoji

# Token predicates used to keep hashtags, mentions, the currency symbol and a
# few abbreviations out of the word count.
def is_hashtag_getter(token):
    """Return True when the token text starts with '#' and is longer than one character."""
    return len(token.text) > 1 and token.text.startswith('#')


def is_mention_getter(token):
    """Return True when the token text starts with '@' and is longer than one character."""
    return len(token.text) > 1 and token.text.startswith('@')


def is_currency_getter(token):
    """Return True when the token text is 'r$' (case-insensitive)."""
    return token.text.lower() == 'r$'


def is_abrev_getter(token):
    """Return True when the token text is one of 'c/', 'p/' or 'q' (case-insensitive)."""
    return len(token.text) <= 2 and token.text.lower() in ['c/', 'p/', 'q']


# force=True lets the cell be re-run without "extension already exists" errors.
Token.set_extension("is_hashtag", getter=is_hashtag_getter, force=True)
Token.set_extension("is_mention", getter=is_mention_getter, force=True)
Token.set_extension("is_currency", getter=is_currency_getter, force=True)
Token.set_extension("is_abrev", getter=is_abrev_getter, force=True)

# Extra stop words for Portuguese
custom_stop_words = ['a', 'e', 'o', 'n', 'd', 'A', 'E', 'O', 'N', 'D']

nlp = pt_core_news_sm.load()

# Add the spacymoji component at the start of the pipeline so emoji tokens
# can be recognised.
emoji = Emoji(nlp, merge_spans=False)
nlp.add_pipe(emoji, first=True)

# Flag the extra stop words in the model's vocabulary
for sw in custom_stop_words:
    nlp.vocab[sw].is_stop = True

# Remove '#' from the tokenizer prefixes so a hashtag stays a single token
prefixes = list(nlp.Defaults.prefixes)
prefixes.remove("#")
prefix_regex = spacy.util.compile_prefix_regex(prefixes)
nlp.tokenizer.prefix_search = prefix_regex.search

# Strip newline characters before tokenising. A real '\n' with regex=False is
# used because the pandas default for `regex` differs between versions, and
# the raw pattern r'\n' only matches a newline when treated as a regex.
texts = tweets_df['full_text'].str.replace('\n', '', regex=False).to_numpy()
docs = list(nlp.pipe(texts, disable=["tagger", "parser", "ner", "textcat"]))


# Filtrando somente os tokens que sao efetivamente palavras
# filt_docs = [list(filter(lambda token: not token.is_stop and not token.is_punct 
#                          and not token.like_url and not token._.is_hashtag 
#                          and not token._.is_mention and not token._.is_currency
#                          and not token._.is_abrev and not token._.is_emoji, doc)) for doc in docs]
# filt_docs
# pd.Series([list(map(lambda token: token.text.lower() , doc)) for doc in docs], index=index)

# print(len(docs))

# words = [token.text.lower() for doc in docs for token in doc if  not token.is_stop 
#          and not token.is_punct and not token.like_url and not token._.is_hashtag 
#          and not token._.is_mention and not token._.is_currency and not token._.is_abrev
#          and not token._.is_emoji]

# words_freq = Counter(words)

# print(words_freq.most_common(15))

In [45]:
def process_doc(doc):
    """Summarise one parsed tweet.

    Returns a dict with:
      * 'vocab_len': the total number of tokens in ``doc``;
      * 'palavras': the lower-cased text of the tokens that are not stop
        words, punctuation, URLs, hashtags, mentions, the currency symbol,
        abbreviations or emoji.
    """
    def _is_word(token):
        # Built-in token flags first, then the custom `token._` extensions.
        if token.is_stop or token.is_punct or token.like_url:
            return False
        ext = token._
        return not (ext.is_hashtag or ext.is_mention or ext.is_currency
                    or ext.is_abrev or ext.is_emoji)

    words = [token.text.lower() for token in doc if _is_word(token)]
    return {'vocab_len': len(doc), 'palavras': words}

# Run process_doc over every parsed tweet and attach the two resulting
# columns to tweets_df, aligned on the existing MultiIndex.
doc_rows = [process_doc(doc) for doc in docs]
doc_stats = pd.DataFrame(doc_rows, index=mindex, columns=['vocab_len', 'palavras'])
tweets_df = tweets_df.join(doc_stats)
tweets_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,full_text,truncated,entities,retweet_count,favorite_count,favorited,retweeted,possibly_sensitive,quoted_status,tweet_len,hashtags,user_mentions,vocab_len,palavras
profile,created_at,id_str,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
jairbolsonaro,2010-04-01 00:00:00+00:00,11398556610,Boa noite a todos @depchicolanecar @reportercrime,False,"{'hashtags': nan, 'symbols': [], 'user_mention...",18,19,False,False,,,49,,[reportercrime],6,[noite]
jairbolsonaro,2010-04-01 00:00:00+00:00,11398669496,"Me desculpem, boa noite @depchicoalencar @repo...",False,"{'hashtags': nan, 'symbols': [], 'user_mention...",2,2,False,False,,,55,,[reportercrime],7,"[desculpem, noite]"
jairbolsonaro,2010-04-01 00:00:00+00:00,11399072031,Os militares assumiram o governo em março de 6...,False,"{'hashtags': nan, 'symbols': [], 'user_mention...",9,1,False,False,,,134,,,26,"[militares, assumiram, governo, março, 64, ape..."
jairbolsonaro,2010-04-01 00:00:00+00:00,11399181472,"Em 64, o povo ñ suportava + greves, desordem s...",False,"{'hashtags': nan, 'symbols': [], 'user_mention...",6,1,False,False,,,79,,,18,"[64, ñ, suportava, +, greves, desordem, social..."
jairbolsonaro,2010-04-01 00:00:00+00:00,11399973381,#debate64 O Governo se fez enérgico. Os q pega...,False,"{'hashtags': ['debate64'], 'symbols': [], 'use...",3,7,False,False,,,138,[debate64],,25,"[governo, enérgico, pegaram, armas, acusavam, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LulaOficial,2020-01-08 00:00:00+00:00,1214911118357016578,Na relação internacional sempre são dois inter...,False,"{'hashtags': nan, 'symbols': [], 'user_mention...",3165,19479,False,False,,,194,,,43,"[internacional, interesses, outro, equilibrar,..."
LulaOficial,2020-01-08 00:00:00+00:00,1214913998346424323,Vamos ter que brigar muito para o Brasil recup...,False,"{'hashtags': nan, 'symbols': [], 'user_mention...",996,6428,False,False,,,155,,,32,"[vamos, brigar, brasil, recuperar, soberania, ..."
LulaOficial,2020-01-08 00:00:00+00:00,1214917269769347075,Agora inventaram uma palavra pra justificar o ...,False,"{'hashtags': nan, 'symbols': [], 'user_mention...",2428,12574,False,False,,,268,,,51,"[inventaram, palavra, pra, justificar, fazendo..."
LulaOficial,2020-01-08 00:00:00+00:00,1215010265638604800,"Bolsonaro, pensa no Brasil e pare de ser puxa ...",False,"{'hashtags': nan, 'symbols': [], 'user_mention...",8812,39979,False,False,False,,110,,,20,"[bolsonaro, pensa, brasil, pare, puxa, saco, e..."


In [None]:
from collections import Counter

import numpy as np
import matplotlib.pyplot as plt

# `words_freq` is only assigned in commented-out code, so it is rebuilt here
# from the 'palavras' column. The count is restricted to the LulaOficial
# profile to match the chart title.
lula_words = tweets_df.xs('LulaOficial', level='profile')['palavras']
words_freq = Counter(word for words in lula_words for word in words)

fig, ax = plt.subplots()
labels, data = zip(*words_freq.most_common(10))

x = np.arange(len(labels))  # the label locations

# Draw everything through the Axes object instead of mixing it with the
# implicit pyplot state.
ax.bar(x, list(data))
ax.set_xticks(x)
ax.set_xticklabels(labels, rotation=80)
ax.set_ylabel('Frequência')
ax.set_title('Palavras mais utilizadas pelo Lula')

fig.tight_layout()
plt.show()

In [None]:
# Scratch check of list truthiness: a non-empty list selects 'Cheio',
# an empty one would select 'Vazio'.
a = [1, 2]

if a:
    b = 'Cheio'
else:
    b = 'Vazio'
b

In [None]:
# Scratch cell: a 3x8 frame of random values with rows labelled A, B and C.
# NOTE(review): `index` is not defined by any visible code, so this line
# presumably raises NameError as written - confirm what was meant to supply
# the 8 column labels.
df = pd.DataFrame(np.random.randn(3, 8), index=['A', 'B', 'C'], columns=index)

In [8]:
# Scratch cell: print a (str, int) tuple.
print(('olá', 1))

('olá', 1)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,full_text,truncated,entities,retweet_count,favorite_count,favorited,retweeted,possibly_sensitive,quoted_status,tweet_len,hashtags,user_mentions
profile,created_at,id_str,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
jairbolsonaro,2010-04-01 00:00:00+00:00,11398556610,Boa noite a todos @depchicolanecar @reportercrime,False,"{'hashtags': nan, 'symbols': [], 'user_mention...",18,19,False,False,,,49,,[reportercrime]
jairbolsonaro,2010-04-01 00:00:00+00:00,11398669496,"Me desculpem, boa noite @depchicoalencar @repo...",False,"{'hashtags': nan, 'symbols': [], 'user_mention...",2,2,False,False,,,55,,[reportercrime]
jairbolsonaro,2010-04-01 00:00:00+00:00,11399072031,Os militares assumiram o governo em março de 6...,False,"{'hashtags': nan, 'symbols': [], 'user_mention...",9,1,False,False,,,134,,
jairbolsonaro,2010-04-01 00:00:00+00:00,11399181472,"Em 64, o povo ñ suportava + greves, desordem s...",False,"{'hashtags': nan, 'symbols': [], 'user_mention...",6,1,False,False,,,79,,
jairbolsonaro,2010-04-01 00:00:00+00:00,11399973381,#debate64 O Governo se fez enérgico. Os q pega...,False,"{'hashtags': ['debate64'], 'symbols': [], 'use...",3,7,False,False,,,138,[debate64],
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LulaOficial,2020-01-08 00:00:00+00:00,1214911118357016578,Na relação internacional sempre são dois inter...,False,"{'hashtags': nan, 'symbols': [], 'user_mention...",3165,19479,False,False,,,194,,
LulaOficial,2020-01-08 00:00:00+00:00,1214913998346424323,Vamos ter que brigar muito para o Brasil recup...,False,"{'hashtags': nan, 'symbols': [], 'user_mention...",996,6428,False,False,,,155,,
LulaOficial,2020-01-08 00:00:00+00:00,1214917269769347075,Agora inventaram uma palavra pra justificar o ...,False,"{'hashtags': nan, 'symbols': [], 'user_mention...",2428,12574,False,False,,,268,,
LulaOficial,2020-01-08 00:00:00+00:00,1215010265638604800,"Bolsonaro, pensa no Brasil e pare de ser puxa ...",False,"{'hashtags': nan, 'symbols': [], 'user_mention...",8812,39979,False,False,False,,110,,
