In [1]:
import string
from collections import Counter
from functools import lru_cache

import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
from polyglot.text import Text

In [2]:
# Load the tweet dump; pandas infers the xz compression from the file extension.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like an
# index written out at export time — confirm whether index_col=0 is wanted.
df = pd.read_csv(filepath_or_buffer='/mnt/data/tweets.csv.xz')
df.head(5)

Unnamed: 0,nbr_retweet,user_id,url,text,usernameTweet,datetime,is_reply,is_retweet,ID,nbr_reply,nbr_favorite,medias,has_media
0,0,67061352,/anaperugini/status/248235576548012032,Dep. @anaperugini no programa Notícias em Deb...,anaperugini,2012-09-18 22:42:28,False,False,248235576548012032,0,0,,
1,116,22864100,/DeputadoFederal/status/960864983881043968,"E agora, Moro ? pic.twitter.com/5t4sLmm6gc",DeputadoFederal,2018-02-06 11:17:18,False,False,960864983881043968,6,155,,
2,1,63020349,/deputadoariosto/status/512154450879074304,Não existe crise em 2014 para empresa maior em...,deputadoariosto,2014-09-17 05:21:51,False,False,512154450879074304,0,0,,
3,22,35805725,/marcofeliciano/status/522439346008621058,"""Tu, Senhor, guardarás em perfeita paz aquele ...",marcofeliciano,2014-10-15 14:30:21,False,False,522439346008621058,1,51,,
4,0,92033111,/depbulhoes/status/713733683065331714,@Marciabasto Eu que agradeço por sua compreens...,depbulhoes,2016-03-26 11:25:45,False,False,713733683065331714,0,0,,


In [3]:
# Sanity check: (number of rows, number of columns) of the loaded frame.
df.shape

(3018484, 13)

## Entidades

In [None]:
@lru_cache(maxsize=None)
def _stopword_set(language):
    """Return the NLTK stopword list for ``language`` as a frozenset.

    ``stopwords.words`` builds a new list on every call, and list membership
    is a linear scan. Caching a frozenset means the list is built once per
    language and each lookup is a hash probe.
    """
    return frozenset(stopwords.words(language))


def valid_token(token):
    """Decide whether a token should be kept for entity extraction.

    Parameters
    ----------
    token : str
        A single token; surrounding whitespace is ignored.

    Returns
    -------
    bool
        ``False`` for Portuguese stopwords and for anything that is not purely
        alphabetic (numbers, punctuation, URLs, the empty string);
        ``True`` otherwise.
    """
    token = token.strip()

    # NOTE(review): the comparison is case-sensitive, so a capitalised
    # stopword is kept if the corpus list is all lowercase — confirm whether
    # that is intended (capitalisation may matter for the NER step).
    if token in _stopword_set('portuguese'):
        return False

    return token.isalpha()

def pre_process(rows):
    """Concatenate the tweets in ``rows`` and strip out unwanted tokens.

    Parameters
    ----------
    rows : pandas.DataFrame
        Frame with a ``text`` column holding the tweet bodies.

    Returns
    -------
    str
        The tokens accepted by ``valid_token``, joined by single spaces.
        Empty string when there is no usable text.
    """
    # Missing tweet bodies come back from read_csv as float NaN, which makes
    # str.join raise TypeError; drop them and coerce the rest to str.
    texts = rows['text'].dropna().astype(str)
    contents = ' '.join(texts)
    # NOTE(review): word_tokenize is called with its default language —
    # confirm whether language='portuguese' should be passed.
    tokens = (token for token in word_tokenize(contents) if valid_token(token))
    return ' '.join(tokens)

def get_entities(rows, **kwargs):
    """Report the most frequent named entities mentioned in a set of tweets.

    Parameters
    ----------
    rows : pandas.DataFrame
        Tweets to analyse; passed to ``pre_process``, which reads the
        ``text`` column.
    **kwargs
        top : int, default 10
            Number of entities to report.
        min_rows : int, default 100
            Minimum number of rows required to run the extraction. Smaller
            inputs yield a Series whose values are all ``None``.

    Returns
    -------
    pandas.Series
        Keys ``top_entity_<i>`` and ``top_entity_<i>_count`` for ``i`` in
        ``1..top``. Slots with no entity are left as ``None``, so every call
        returns the same index regardless of how many entities were found.
    """
    top = kwargs.get('top', 10)
    min_rows = kwargs.get('min_rows', 100)

    # Pre-fill every slot so the result has a fixed set of keys.
    data = {}
    for rank in range(1, top + 1):
        data[f'top_entity_{rank}'] = None
        data[f'top_entity_{rank}_count'] = None

    if len(rows) < min_rows:
        return pd.Series(data)

    cleaned = pre_process(rows)
    if not cleaned:
        # Nothing survived preprocessing; skip the NER call entirely.
        return pd.Series(data)

    document = Text(cleaned, hint_language_code='pt')
    # Each entity is a sequence of words; join them into one display string.
    counter = Counter(' '.join(entity) for entity in document.entities)

    for rank, (entity_text, entity_count) in enumerate(counter.most_common(top), 1):
        data[f'top_entity_{rank}'] = entity_text
        data[f'top_entity_{rank}_count'] = entity_count

    return pd.Series(data)

# Top entities per account, ordered by how often each account's most frequent
# entity appears.
# dropna(how='all') removes only the all-None placeholder rows that
# get_entities returns for accounts below its row threshold. The default
# how='any' would also discard accounts that have fewer distinct entities
# than the number of slots requested.
grouped = (
    df.groupby('usernameTweet')
    .apply(get_entities)
    .dropna(how='all')
    .sort_values('top_entity_1_count', ascending=False)
    .reset_index()
)
grouped.head()

In [None]:
# Same entity ranking computed over every tweet at once, wrapped into a
# one-row frame so it can be displayed and exported like the per-account table.
total = pd.DataFrame(data=[get_entities(df)])
total.head(5)

In [None]:
# Persist both tables next to the source data.
# NOTE(review): to_csv writes the row index by default, which shows up as an
# 'Unnamed: 0' column when the file is read back — confirm whether
# index=False is wanted here.
grouped.to_csv(path_or_buf='/mnt/data/tweets_grouped_by_congressperson.csv')
total.to_csv(path_or_buf='/mnt/data/all_tweets_by_congresspeople.csv')