In [1]:
import sys
sys.path.insert(1, '../')

import pandas as pd
from source import preprocess as pp
import os

DATA_PATH = '../data/2022-brazilian-presidential/'

### __Files__

In [2]:
# loading file names
file_list = [file for file in os.listdir(DATA_PATH) if file.startswith('2022-')]
file_list.sort()
search_terms = pd.Series([file[11:-8] for file in file_list])
search_dates = pd.Series([file[0:10] for file in file_list])
# displaying the number of searched terms (total and unique)
print((
    'Number of search terms: {}\n'
    'Number of unique search terms: {}'
).format(
    search_terms.shape[0],
    search_terms.unique().shape[0]
))

Number of search terms: 104
Number of unique search terms: 91


In [3]:
dates_n_terms = pd.DataFrame({
    'date': search_dates,
    'search_terms': search_terms
})
# displaying search dates and terms
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(dates_n_terms)

Unnamed: 0,date,search_terms
0,2022-08-01,BOLSONARO NÃO TRABALHA
1,2022-08-09,NÃO VOTE EM RACISTA
2,2022-08-22,#BolsonaroNoJN
3,2022-08-22,#Eleicoes2022
4,2022-08-22,#ForaBolsonaro
5,2022-08-22,#GloboLixo
6,2022-08-25,#LADRAONOJN
7,2022-08-25,BOBO DA CORTE
8,2022-08-25,Faz o L
9,2022-08-28,#DebateNaBand


In [4]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(pd.DataFrame(dates_n_terms['search_terms'].value_counts()))

Unnamed: 0_level_0,count
search_terms,Unnamed: 1_level_1
ORÇAMENTO SECRETO NÃO,3
Eduardo Bolsonaro,3
BOLSONARO NÃO TRABALHA,2
PERDEU MANÉ,2
#PagueMinhaBolsa,2
CAPES,2
Infiltrados,2
#LulaPresidente1️⃣3️⃣,2
NÓS VENCEMOS,2
Faz o L,2


In [5]:
file_example = pd.read_parquet(os.path.join(DATA_PATH, file_list[0]))
file_example.head()

Unnamed: 0,created_at,tweet_id,tweet_content,user,user_info,has_mention,mentions,is_reply,reply_to,is_quote,quoted_from,is_retweet,retweeted_from,hashtags
0,2022-08-04 02:52:05,1555023739988971521,Passou 4 anos andando de moto e jetski enquant...,zitelli_carlos,"{'id': 721831120120188928, 'name': 'Carlos Zit...",False,,False,,False,,True,"{'user': 'tesoureiros', 'user_id': 10714084365...",
1,2022-08-04 02:48:40,1555022883138813952,Os assuntos do momento: \nCORNOS COM BOLSONARO...,MarcosTil,"{'id': 87421436, 'name': 'Marcos Til', 'descri...",False,,False,,False,,True,"{'user': 'detremura', 'user_id': 102826672, 't...",
2,2022-08-04 02:48:09,1555022751106228226,Em 3 anos e meio Bolsonaro trabalhou menos de ...,marialo71834768,"{'id': 1535023761241657351, 'name': 'Patriotis...",False,,False,,False,,True,"{'user': 'ThiagoResiste', 'user_id': 127077402...",
3,2022-08-04 02:41:05,1555020974847819778,"Ah, mas o problema da crise econômica é do 'fi...",desestressada1,"{'id': 1090045777395761154, 'name': 'desestres...",False,,False,,False,,True,"{'user': 'PedroRonchi2', 'user_id': 1097073004...",
4,2022-08-04 02:33:33,1555019075117621248,Em 3 anos e meio Bolsonaro trabalhou menos de ...,Edu64986869,"{'id': 1521688957720965120, 'name': 'Edu', 'de...",False,,False,,False,,True,"{'user': 'ThiagoResiste', 'user_id': 127077402...",


In [6]:
file_example.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21380 entries, 0 to 21379
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   created_at      21380 non-null  object
 1   tweet_id        21380 non-null  int64 
 2   tweet_content   21380 non-null  object
 3   user            21380 non-null  object
 4   user_info       21380 non-null  object
 5   has_mention     21380 non-null  bool  
 6   mentions        3477 non-null   object
 7   is_reply        21380 non-null  bool  
 8   reply_to        2686 non-null   object
 9   is_quote        21380 non-null  bool  
 10  quoted_from     1348 non-null   object
 11  is_retweet      21380 non-null  bool  
 12  retweeted_from  16312 non-null  object
 13  hashtags        1114 non-null   object
dtypes: bool(4), int64(1), object(9)
memory usage: 1.7+ MB


### __Quantitative Analysis__

In [7]:
tweets = pd.DataFrame()

for file in file_list:
    df = pd.read_parquet(os.path.join(DATA_PATH, file))
    tweets = pd.concat([tweets, df], ignore_index=True)

tweets.drop_duplicates(keep='first', inplace=True, ignore_index=True)

#### __Textual Preprocessing__



- URL removal
- Mention removal (@user)
- Hashtags removal (#removal)
- Punctuation removal
- Emoji removal
- Removing unwanted white spaces
    - Trailing spaces
    - Multiple spaces
    - Line breaks
- Text Normalization
    - Lowercase
    - Stopwords removal


In [8]:
tweets['preproc_tweet'] = tweets['tweet_content'].map(lambda x: pp.preprocess_text(x))
tweets.to_parquet(os.path.join(DATA_PATH, '2022_all_tweets'), index=False)
tweets_unique_preproc = tweets['preproc_tweet'].unique().shape[0]
tweets_unique = tweets['tweet_content'].unique().shape[0]

In [9]:
display(pd.DataFrame(
    index = ['tweets_total', 'tweets_unique', 'tweets_preproc_unique'],
    data = [tweets.shape[0], tweets_unique, tweets_unique_preproc],
    columns = ['amount']
))

Unnamed: 0,amount
tweets_total,6647017
tweets_unique,1262224
tweets_preproc_unique,920830


In [10]:
user_names = tweets['user'].unique()
print(f'Total number of users: {user_names.shape[0]}')

Total number of users: 951602


In [11]:
dates = dates_n_terms['date'].unique()
dates.shape

(36,)

In [12]:
days = {}
terms = {}

In [13]:
for day in dates:
    day_tweets = tweets.loc[tweets['created_at'].str.startswith(day)]
    day_terms = dates_n_terms.loc[dates_n_terms['date']==day]
    number_tweets = day_tweets.shape[0]
    numer_tweets_unique = day_tweets['preproc_tweet'].unique().shape[0]
    number_users = day_tweets['user'].unique().shape[0]
    number_terms = day_terms.shape[0]
    days[day] = {
        'number_of_tweets': number_tweets,
        'number_of_unique_tweets': numer_tweets_unique,
        'number_of_users': number_users,
        'number_of_terms': number_terms
    }

In [14]:
daily_status = pd.DataFrame.from_dict(days, orient='index')
daily_status

Unnamed: 0,number_of_tweets,number_of_unique_tweets,number_of_users,number_of_terms
2022-08-01,14455,1040,6294,1
2022-08-09,4010,429,2119,1
2022-08-22,101933,31118,44557,4
2022-08-25,26891,9424,19784,3
2022-08-28,25425,7518,12345,1
2022-09-23,117928,18471,47420,3
2022-09-24,70250,22612,27642,3
2022-09-25,29715,6633,16350,3
2022-09-26,10522,1774,6413,12
2022-09-28,20652,9552,8406,1


In [15]:
file_list

['2022-08-01-BOLSONARO NÃO TRABALHA.parquet',
 '2022-08-09-NÃO VOTE EM RACISTA.parquet',
 '2022-08-22-#BolsonaroNoJN.parquet',
 '2022-08-22-#Eleicoes2022.parquet',
 '2022-08-22-#ForaBolsonaro.parquet',
 '2022-08-22-#GloboLixo.parquet',
 '2022-08-25-#LADRAONOJN.parquet',
 '2022-08-25-BOBO DA CORTE.parquet',
 '2022-08-25-Faz o L.parquet',
 '2022-08-28-#DebateNaBand.parquet',
 '2022-09-23-DINHEIRO VIVO.parquet',
 '2022-09-23-Faltam 9.parquet',
 '2022-09-23-SÓ CIRO VENCE LULA.parquet',
 '2022-09-24-#DebateNoSBT.parquet',
 '2022-09-24-BOLSONARO CORRUPTO.parquet',
 '2022-09-24-MEU VOTO NÃO ANISTIA BANDIDO.parquet',
 '2022-09-25-#EsseÉMeuVoto.parquet',
 '2022-09-25-#LulaNo1ºTurno.parquet',
 '2022-09-25-Faltam 7.parquet',
 '2022-09-26-#BrasilDaEsperança.parquet',
 '2022-09-26-#BrasilVota22.parquet',
 '2022-09-26-#CiroManifestoANacao.parquet',
 '2022-09-26-#CiroNoFlow.parquet',
 '2022-09-26-#LulaNo1ºTurno.parquet',
 '2022-09-26-#TiraGomes.parquet',
 '2022-09-26-AGORA É LULA.parquet',
 '2022-09-

In [38]:
for term in dates_n_terms['search_terms']:
    print(f'Term: {term}')
    term_tweets = pd.DataFrame()
    days_count = 0
    for file in file_list:
        if file[11:-8] == term:
            print(f'-- File: {file}')
            days_count+=1
            df = pd.read_parquet(os.path.join(DATA_PATH,file))
            pd.concat([term_tweets, df], ignore_index=True)
    term_tweets.drop_duplicates(keep='first', inplace=True, ignore_index=True)
    term_tweets['preproc_tweet'] = term_tweets['tweet_content'].map(lambda x: pp.preprocess_text(x))
    number_tweets = term_tweets.shape[0]
    numer_tweets_unique = term_tweets['preproc_tweet'].unique().shape[0]
    number_users = term_tweets['user'].unique().shape[0]
    number_terms = term_terms.shape[0]
    terms[term] = {
        'number_of_tweets': number_tweets,
        'number_of_unique_tweets': numer_tweets_unique,
        'number_of_users': number_users,
        'number_of_days': days_count
    }

Term: BOLSONARO NÃO TRABALHA
-- File: 2022-08-01-BOLSONARO NÃO TRABALHA.parquet
-- File: 2022-11-14-BOLSONARO NÃO TRABALHA.parquet


KeyError: 'tweet_content'

In [20]:
pd.read_parquet(os.path.join(DATA_PATH,'2022-08-09-NÃO VOTE EM RACISTA.parquet')).head()

Unnamed: 0,created_at,tweet_id,tweet_content,user,user_info,has_mention,mentions,is_reply,reply_to,is_quote,quoted_from,is_retweet,retweeted_from,hashtags
0,2022-08-12 02:51:44,1557922754606907394,Intolerância religiosa contra religiões de mat...,SeibelArno,"{'id': 2933870393, 'name': 'Arno Llantada Seib...",False,,False,,False,,True,"{'user': 'tesoureiros', 'user_id': 10714084365...",
1,2022-08-12 02:50:35,1557922465577406464,@ThiagoResiste Nao vote em racista.,mario_sesta,"{'id': 1461091609785815048, 'name': 'Mario Ber...",True,"[{'id': '1270774020808626176', 'username': 'Th...",True,"{'user': 'ThiagoResiste', 'user_id': 127077402...",False,,False,,
2,2022-08-12 02:47:03,1557921579039920131,NÃO VOTE EM RACISTA! \n\n “Um povo que não con...,Soange20,"{'id': 2552611191, 'name': 'Ange 🆙8️⃣0️⃣ Bora ...",False,,False,,False,,True,"{'user': 'Willsoouz', 'user_id': 7851346928781...",
3,2022-08-12 02:41:36,1557920205183090691,As religiões africanas são parte fundamental d...,monicasouzal,"{'id': 1401986340586233857, 'name': 'mônica', ...",False,,False,,False,,True,"{'user': 'AnonNovidades', 'user_id': 122426591...",
4,2022-08-12 02:25:26,1557916137417019395,As religiões africanas são parte fundamental d...,poetadeboca,"{'id': 1110740422551355393, 'name': 'Revoltada...",False,,False,,False,,True,"{'user': 'AnonNovidades', 'user_id': 122426591...",


In [21]:
pd.read_parquet(os.path.join(DATA_PATH,'2022-08-09-NÃO VOTE EM RACISTA.parquet')).head()


Unnamed: 0,created_at,tweet_id,tweet_content,user,user_info,has_mention,mentions,is_reply,reply_to,is_quote,quoted_from,is_retweet,retweeted_from,hashtags
0,2022-08-12 02:51:44,1557922754606907394,Intolerância religiosa contra religiões de mat...,SeibelArno,"{'id': 2933870393, 'name': 'Arno Llantada Seib...",False,,False,,False,,True,"{'user': 'tesoureiros', 'user_id': 10714084365...",
1,2022-08-12 02:50:35,1557922465577406464,@ThiagoResiste Nao vote em racista.,mario_sesta,"{'id': 1461091609785815048, 'name': 'Mario Ber...",True,"[{'id': '1270774020808626176', 'username': 'Th...",True,"{'user': 'ThiagoResiste', 'user_id': 127077402...",False,,False,,
2,2022-08-12 02:47:03,1557921579039920131,NÃO VOTE EM RACISTA! \n\n “Um povo que não con...,Soange20,"{'id': 2552611191, 'name': 'Ange 🆙8️⃣0️⃣ Bora ...",False,,False,,False,,True,"{'user': 'Willsoouz', 'user_id': 7851346928781...",
3,2022-08-12 02:41:36,1557920205183090691,As religiões africanas são parte fundamental d...,monicasouzal,"{'id': 1401986340586233857, 'name': 'mônica', ...",False,,False,,False,,True,"{'user': 'AnonNovidades', 'user_id': 122426591...",
4,2022-08-12 02:25:26,1557916137417019395,As religiões africanas são parte fundamental d...,poetadeboca,"{'id': 1110740422551355393, 'name': 'Revoltada...",False,,False,,False,,True,"{'user': 'AnonNovidades', 'user_id': 122426591...",
