In [1]:
import os
import sys

import pandas as pd

# Put the parent directory on the module search path before importing from
# `source`, which is presumably located there.
sys.path.insert(1, '../')

from source import preprocess as pp

# Directory containing the collected tweet files read by this notebook.
DATA_PATH = '../data/2022-brazilian-presidential/'

### __Files__

In [2]:
# loading file names
# Only `.parquet` entries are kept: DATA_PATH can also hold other artifacts
# (this notebook itself writes a combined dump into it), and those must not be
# treated as per-term tweet files.
file_list = sorted(
    name for name in os.listdir(DATA_PATH) if name.endswith('.parquet')
)

# File names are sliced positionally: characters 0-9 hold the collection date,
# character 10 is a separator, and the last 8 characters are the '.parquet'
# extension. Whatever sits in between is the search term.
search_dates = pd.Series([name[:10] for name in file_list])
search_terms = pd.Series([name[11:-8] for name in file_list])

# displaying the number of searched terms (total and unique)
print((
    'Number of search terms: {}\n'
    'Number of unique search terms: {}'
).format(
    search_terms.shape[0],
    search_terms.nunique(dropna=False)
))

Number of search terms: 104
Number of unique search terms: 91


In [3]:
# One row per data file: the collection date next to the term searched.
dates_n_terms = pd.DataFrame(dict(date=search_dates, search_terms=search_terms))

# displaying search dates and terms (display limits lifted only inside the
# `with` block, so the notebook-wide pandas options are left untouched)
show_everything = ('display.max_rows', None, 'display.max_columns', None)
with pd.option_context(*show_everything):
    display(dates_n_terms)

Unnamed: 0,date,search_terms
0,2022-08-01,BOLSONARO NÃO TRABALHA
1,2022-08-09,NÃO VOTE EM RACISTA
2,2022-08-22,#BolsonaroNoJN
3,2022-08-22,#Eleicoes2022
4,2022-08-22,#ForaBolsonaro
5,2022-08-22,#GloboLixo
6,2022-08-25,#LADRAONOJN
7,2022-08-25,BOBO DA CORTE
8,2022-08-25,Faz o L
9,2022-08-28,#DebateNaBand


In [4]:
# Number of data files per search term, most frequent first.
term_counts = dates_n_terms['search_terms'].value_counts()
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(pd.DataFrame(term_counts))

Unnamed: 0_level_0,count
search_terms,Unnamed: 1_level_1
ORÇAMENTO SECRETO NÃO,3
Eduardo Bolsonaro,3
BOLSONARO NÃO TRABALHA,2
Faz o L,2
#LulaNo1ºTurno,2
#LulaPresidente1️⃣3️⃣,2
NÓS VENCEMOS,2
PERDEU MANÉ,2
#PagueMinhaBolsa,2
CAPES,2


In [5]:
# Load the first data file as a sample to inspect the column layout.
# os.path.join is used instead of string concatenation so the path stays
# valid whether or not DATA_PATH ends with a separator.
file_example = pd.read_parquet(os.path.join(DATA_PATH, file_list[0]))
file_example.head()

Unnamed: 0,created_at,tweet_id,tweet_content,user,user_info,has_mention,mentions,is_reply,reply_to,is_quote,quoted_from,is_retweet,retweeted_from,hashtags
0,2022-08-04 02:52:05,1555023739988971521,Passou 4 anos andando de moto e jetski enquant...,zitelli_carlos,"{'id': 721831120120188928, 'name': 'Carlos Zit...",False,,False,,False,,True,"{'user': 'tesoureiros', 'user_id': 10714084365...",
1,2022-08-04 02:48:40,1555022883138813952,Os assuntos do momento: \nCORNOS COM BOLSONARO...,MarcosTil,"{'id': 87421436, 'name': 'Marcos Til', 'descri...",False,,False,,False,,True,"{'user': 'detremura', 'user_id': 102826672, 't...",
2,2022-08-04 02:48:09,1555022751106228226,Em 3 anos e meio Bolsonaro trabalhou menos de ...,marialo71834768,"{'id': 1535023761241657351, 'name': 'Patriotis...",False,,False,,False,,True,"{'user': 'ThiagoResiste', 'user_id': 127077402...",
3,2022-08-04 02:41:05,1555020974847819778,"Ah, mas o problema da crise econômica é do 'fi...",desestressada1,"{'id': 1090045777395761154, 'name': 'desestres...",False,,False,,False,,True,"{'user': 'PedroRonchi2', 'user_id': 1097073004...",
4,2022-08-04 02:33:33,1555019075117621248,Em 3 anos e meio Bolsonaro trabalhou menos de ...,Edu64986869,"{'id': 1521688957720965120, 'name': 'Edu', 'de...",False,,False,,False,,True,"{'user': 'ThiagoResiste', 'user_id': 127077402...",


In [6]:
# Per-column non-null counts and dtypes of the sample file, plus memory usage.
file_example.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21380 entries, 0 to 21379
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   created_at      21380 non-null  object
 1   tweet_id        21380 non-null  int64 
 2   tweet_content   21380 non-null  object
 3   user            21380 non-null  object
 4   user_info       21380 non-null  object
 5   has_mention     21380 non-null  bool  
 6   mentions        3477 non-null   object
 7   is_reply        21380 non-null  bool  
 8   reply_to        2686 non-null   object
 9   is_quote        21380 non-null  bool  
 10  quoted_from     1348 non-null   object
 11  is_retweet      21380 non-null  bool  
 12  retweeted_from  16312 non-null  object
 13  hashtags        1114 non-null   object
dtypes: bool(4), int64(1), object(9)
memory usage: 1.7+ MB


### __Quantitative Analysis__

In [3]:
# Read every data file, then stack them with a single concat. Concatenating
# inside the loop would re-copy all previously loaded rows on each iteration.
frames = [pd.read_parquet(os.path.join(DATA_PATH, file)) for file in file_list]

# pd.concat raises on an empty list, so fall back to an empty frame when no
# files were found.
tweets = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# Drop rows that are exact duplicates, keeping the first occurrence, and
# renumber the index. Rebinding (instead of inplace=True) keeps the cell
# safe to re-run.
tweets = tweets.drop_duplicates(keep='first', ignore_index=True)

#### __Textual Preprocessing__



- URL removal
- Mention removal (@user)
- Hashtag removal (#hashtag)
- Punctuation removal
- Emoji removal
- Unwanted whitespace removal
    - Trailing spaces
    - Multiple spaces
    - Line breaks
- Text Normalization
    - Lowercase
    - Stopwords removal


In [9]:
# Run the project's text preprocessing on every tweet and keep the result in
# a new column next to the raw text.
tweets['preproc_tweets'] = tweets['tweet_content'].map(pp.preprocess_text)

# Persist the combined frame, including the preprocessed column.
# NOTE(review): the dump is written inside DATA_PATH with no file extension,
# so a later os.listdir(DATA_PATH) will return it alongside the per-term
# files - confirm whether it should live in a separate directory.
tweets.to_parquet(os.path.join(DATA_PATH, '2022_all_tweets'), index=False)

# Distinct texts after and before preprocessing. The preprocessed count must
# read the column created above ('preproc_tweets'); the previous lookup of
# 'preproc_tweet' raised a KeyError on a fresh kernel.
tweets_unique_preproc = tweets['preproc_tweets'].unique().shape[0]
tweets_unique = tweets['tweet_content'].unique().shape[0]

In [10]:
# Dataset-level counts: all rows, distinct raw texts, distinct preprocessed
# texts. Insertion order of the dict fixes the row order of the table.
summary_counts = {
    'tweets_total': tweets.shape[0],
    'tweets_unique': tweets_unique,
    'tweets_preproc_unique': tweets_unique_preproc,
}
display(pd.DataFrame(
    data=list(summary_counts.values()),
    index=list(summary_counts.keys()),
))

Unnamed: 0,0
tweets_total,6676993
tweets_unique,1262224
tweets_preproc_unique,920830


In [None]:
#users

In [3]:
# Empty accumulators filled by the loading loop that follows.
# The Series dtype is given explicitly: a bare pd.Series() has a
# version-dependent default dtype and emits a warning on older pandas.
users = pd.Series(dtype='object')
tweets = pd.DataFrame()
# Keyed by the date / search-term parts of the data file names.
days = {}
terms = {}

In [7]:
# Load every file once and bucket its frame by collection day and by search
# term. Frames are gathered in lists and concatenated once at the end, since
# concatenating inside the loop re-copies all earlier rows on every pass.
#
# Each bucket receives the frame of the *current file*. Storing the running
# `tweets` total there (as this cell used to) would put the rows of every
# earlier file into each day/term entry.
frames_by_day = {day_key: [frame] for day_key, frame in days.items()}
frames_by_term = {term_key: [frame] for term_key, frame in terms.items()}
loaded_frames = []

for file in file_list:
    df = pd.read_parquet(os.path.join(DATA_PATH, file))
    loaded_frames.append(df)
    # File names are sliced positionally: chars 0-9 are the date, char 10 a
    # separator and the last 8 chars the file extension.
    day = file[:10]
    term = file[11:-8]
    frames_by_day.setdefault(day, []).append(df)
    frames_by_term.setdefault(term, []).append(df)

# Extend the existing accumulators. `tweets` is intentionally left with
# duplicates here; they are dropped in a later cell.
users = pd.concat([users] + [frame['user'] for frame in loaded_frames])
tweets = pd.concat([tweets] + loaded_frames)

# One de-duplicated frame per day and per term.
for day_key, frames in frames_by_day.items():
    days[day_key] = pd.concat(frames).drop_duplicates(ignore_index=True)
for term_key, frames in frames_by_term.items():
    terms[term_key] = pd.concat(frames).drop_duplicates(ignore_index=True)

In [None]:
# (rows, columns) of the stacked frame, shown before duplicates are dropped.
tweets.shape

In [None]:
# Drop exact duplicate rows and renumber the index. Rebinding instead of
# inplace=True keeps the cell re-runnable and avoids mutating any other
# reference to the same frame.
tweets = tweets.drop_duplicates(ignore_index=True)

# `tweets` was rebuilt from the raw files, so the preprocessed column may not
# exist yet; create it on demand instead of failing with a KeyError.
# NOTE(review): the column name follows the one created by the preprocessing
# cell ('preproc_tweets') - confirm this is the intended spelling.
if 'preproc_tweets' not in tweets.columns:
    tweets['preproc_tweets'] = tweets['tweet_content'].map(pp.preprocess_text)

# Number of distinct preprocessed texts.
tweets_unique = tweets['preproc_tweets'].unique().shape[0]

In [None]:
# Replace each day's tweet frame with a small dict of counts.
# NOTE(review): the original cell was unfinished (empty column name, empty
# dict, `keys` used instead of `key`). The fields below mirror the
# dataset-level counts shown earlier in the notebook - confirm they are the
# intended per-day metrics.
for key in list(days.keys()):
    day_tweets = days[key]
    if isinstance(day_tweets, dict):
        # Already summarised by a previous run of this cell.
        continue
    # Local name on purpose: the global `tweets` frame must not be overwritten.
    day_tweets = day_tweets.drop_duplicates()
    days[key] = {
        'tweets_total': day_tweets.shape[0],
        'tweets_unique': day_tweets['tweet_content'].nunique(dropna=False),
    }