In [1]:
import sys
sys.path.insert(1, '../')
import pandas as pd
from source import preprocess as pp
import os

DATA_PATH = '../data/2023-brazilian-early-political-events/'

### __Files__

In [2]:
# Search-result files are named '<YYYY-MM-DD>-<search term>.parquet'; the
# constants below replace the bare slice offsets used to split that name.
DATE_LENGTH = len('2023-01-01')
FILE_SUFFIX = '.parquet'

# loading file names
# Requiring the suffix as well as the prefix keeps stray files out of the list:
# the term slice and every later `pd.read_parquet` call assume parquet files.
file_list = sorted(
    file for file in os.listdir(DATA_PATH)
    if file.startswith('2023-') and file.endswith(FILE_SUFFIX)
)
# The term sits between the date (plus its trailing '-') and the extension.
search_terms = pd.Series([file[DATE_LENGTH + 1:-len(FILE_SUFFIX)] for file in file_list])
search_dates = pd.Series([file[:DATE_LENGTH] for file in file_list])
# displaying the number of searched terms (total and unique)
print(
    f'Number of search terms: {search_terms.shape[0]}\n'
    f'Number of unique search terms: {search_terms.unique().shape[0]}'
)

Number of search terms: 223
Number of unique search terms: 157


In [3]:
# One row per file: the date taken from the file name and its search term.
dates_n_terms = pd.DataFrame(dict(date=search_dates, search_terms=search_terms))

# displaying search dates and terms (row/column display limits lifted so the
# whole table is rendered)
show_everything = ('display.max_rows', None, 'display.max_columns', None)
with pd.option_context(*show_everything):
    display(dates_n_terms)

Unnamed: 0,date,search_terms
0,2023-01-01,#LulaPresidente
1,2023-01-01,Alckmin
2,2023-01-01,Eduardo Bolsonaro
3,2023-01-01,Faixa Presidencial
4,2023-01-01,Itamaraty
5,2023-01-01,Janja
6,2023-01-01,Planalto
7,2023-01-01,SEM ANISTIA
8,2023-01-01,Sigilo de 100
9,2023-01-08,#GolpeDeEstado


In [4]:
# How many files (i.e. search dates) each term appears in, most frequent first.
term_frequency = dates_n_terms['search_terms'].value_counts().to_frame()
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(term_frequency)

Unnamed: 0_level_0,count
search_terms,Unnamed: 1_level_1
CPMI,8
#PL2630Nao,5
#PLdaCensuraNao,5
Fake News,5
PL 2630 JÁ,5
Telegram,5
Xandão,4
Monark,4
PL 2630,3
GRANDE DIA,3


In [5]:
# Load the first entry of `file_list` as a sample of the per-file schema.
example_path = os.path.join(DATA_PATH, file_list[0])
file_example = pd.read_parquet(example_path)
file_example.head()

Unnamed: 0,created_at,tweet_id,tweet_content,user,user_info,has_mention,mentions,is_reply,reply_to,is_quote,quoted_from,is_retweet,retweeted_from,hashtags
0,2023-01-02 02:58:57,1609746038943879173,Pra comemorar \n\n#LulaPresidente #DotadosComL...,pedro09232434,"{'id': 1337611906920681473, 'name': 'pedro', '...",False,,False,,False,,True,"{'user': 'jrdotadao', 'user_id': 1353322535086...","['#LulaPresidente', '#DotadosComLula']"
1,2023-01-02 02:58:57,1609746037308014592,WILLIAM BONNER E RENATA LO PRETE\nO encerramen...,missnancynunes,"{'id': 115765976, 'name': '🇧🇷🚩Nancy Nunes🚩🇨🇱',...",False,,False,,False,,True,"{'user': 'excentricko', 'user_id': 59014640, '...","['#PossePresidencial', '#LulaPresidente']"
2,2023-01-02 02:58:55,1609746028336672768,Quem é quem: os 37 ministros empossados por Lu...,Nobrega9Nobrega,"{'id': 1599603460802060290, 'name': 'Regina No...",False,,False,,False,,True,"{'user': 'MarcianoBrito13', 'user_id': 2960449...","['#PosseDoLula', '#LulaPresidente', '#LulaEoBr..."
3,2023-01-02 02:58:54,1609746024012124160,Abertura histórica do Fantástico. 2023 não can...,Tokyouir__,"{'id': 1499200767277871105, 'name': 'Kas⚖️🌶🦋',...",False,,False,,False,,True,"{'user': 'brunoGUZZO', 'user_id': 40162286, 't...",['#LulaPresidente']
4,2023-01-02 02:58:52,1609746014142881793,"O sol voltou a brilhar, Brasil!! @LulaOficial ...",ThirzziaGC,"{'id': 185431184, 'name': 'Thi', 'description'...",True,"[{'id': '2670726740', 'username': 'LulaOficial'}]",False,,False,,False,,['#LulaPresidente']


In [6]:
# Column names, non-null counts and dtypes of the sample file.
file_example.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18842 entries, 0 to 18841
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   created_at      18842 non-null  object
 1   tweet_id        18842 non-null  int64 
 2   tweet_content   18842 non-null  object
 3   user            18842 non-null  object
 4   user_info       18842 non-null  object
 5   has_mention     18842 non-null  bool  
 6   mentions        3078 non-null   object
 7   is_reply        18842 non-null  bool  
 8   reply_to        990 non-null    object
 9   is_quote        18842 non-null  bool  
 10  quoted_from     677 non-null    object
 11  is_retweet      18842 non-null  bool  
 12  retweeted_from  6023 non-null   object
 13  hashtags        18842 non-null  object
dtypes: bool(4), int64(1), object(9)
memory usage: 1.5+ MB


### __Quantitative Analysis__

#### __Textual Preprocessing__



- URL removal
- Mention removal (@user)
- Hashtag removal (#hashtag)
- Punctuation removal
- Emoji removal
- Unwanted whitespace removal
    - Trailing spaces
    - Multiple spaces
    - Line breaks
- Text Normalization
    - Lowercase
    - Stopwords removal


In [7]:
import warnings

# NOTE(review): this silences *every* warning for the rest of the session.
# It is kept because later cells may rely on it, but a narrower filter
# (or none) would be safer.
warnings.simplefilter('ignore')

# Read each search-result file once. `created_at` is normalised to a
# fixed-width 'YYYY-MM-DD HH:MM:SS' string so it can be matched by day prefix
# in later cells.
frames = []
for file in file_list:
    df = pd.read_parquet(os.path.join(DATA_PATH, file))
    df['created_at'] = pd.to_datetime(df['created_at']).dt.strftime('%Y-%m-%d %H:%M:%S')
    frames.append(df)

# A single concat avoids re-copying the accumulated frame on every iteration,
# which is what growing `tweets` inside the loop did.
tweets = pd.concat(frames, ignore_index=True)
del frames
tweets_total = tweets.shape[0]

# Distinct user handles in order of first appearance, on a fresh 0..n-1 index.
users = pd.Series(tweets['user'].unique())

tweets['preproc_tweet'] = tweets['tweet_content'].map(pp.preprocess_text)
# "Unique" means distinct after text preprocessing, not distinct tweet ids.
tweets_unique = tweets['preproc_tweet'].unique().shape[0]
users_unique = users.shape[0]

display(pd.DataFrame(
    index=['tweets_total', 'tweets_unique'],
    data=[tweets_total, tweets_unique],
    columns=['amount']
))

Unnamed: 0,amount
tweets_total,13910048
tweets_unique,1877897


In [8]:
# Number of distinct user handles across all loaded files.
print('Total number of users: {}'.format(len(users)))

Total number of users: 1346340


In [9]:
# Distinct search dates, in order of first appearance in `dates_n_terms`.
dates = dates_n_terms.loc[:, 'date'].unique()
dates.shape

(56,)

In [10]:
# Accumulators for the per-day and per-term summaries, filled by later cells.
days, terms = {}, {}

In [11]:
# `created_at` was normalised to 'YYYY-MM-DD HH:MM:SS' when the files were
# loaded, so its first 10 characters are the calendar day. Extracting that key
# once avoids re-scanning the whole column with `str.startswith` for every day.
created_day = tweets['created_at'].str[:10]

# NOTE(review): the sample file for 2023-01-01 shown earlier contains tweets
# whose `created_at` is 2023-01-02, so a tweet's creation day can differ from
# the search date in its file name. Such tweets are not counted under their
# search date here - confirm this is intended (possibly a time-zone offset).
for day in dates:
    day_tweets = tweets.loc[created_day == day]
    day_terms = dates_n_terms.loc[dates_n_terms['date'] == day]
    number_tweets = day_tweets.shape[0]
    number_tweets_unique = day_tweets['preproc_tweet'].unique().shape[0]
    number_users = day_tweets['user'].unique().shape[0]
    number_terms = day_terms.shape[0]
    days[day] = {
        'number_of_tweets': number_tweets,
        'number_of_unique_tweets': number_tweets_unique,
        'number_of_users': number_users,
        'number_of_terms': number_terms
    }

In [12]:
# One row per search date, one column per statistic collected in `days`.
daily_status = pd.DataFrame(list(days.values()), index=list(days.keys()))
daily_status

Unnamed: 0,number_of_tweets,number_of_unique_tweets,number_of_users,number_of_terms
2023-01-01,351845,71509,158536,9
2023-01-08,4133333,569285,619205,39
2023-01-12,24556,7083,15891,1
2023-01-13,72052,23271,42544,4
2023-01-17,23401,5884,18987,1
2023-01-18,33154,3176,16027,1
2023-01-21,269745,26222,91065,4
2023-01-23,260687,32746,94470,3
2023-01-24,157034,25041,65165,5
2023-01-25,138274,21224,61539,2


In [13]:
# List every file name under analysis ('<date>-<search term>.parquet').
file_list

['2023-01-01-#LulaPresidente.parquet',
 '2023-01-01-Alckmin.parquet',
 '2023-01-01-Eduardo Bolsonaro.parquet',
 '2023-01-01-Faixa Presidencial.parquet',
 '2023-01-01-Itamaraty.parquet',
 '2023-01-01-Janja.parquet',
 '2023-01-01-Planalto.parquet',
 '2023-01-01-SEM ANISTIA.parquet',
 '2023-01-01-Sigilo de 100.parquet',
 '2023-01-08-#GolpeDeEstado.parquet',
 '2023-01-08-Alexandre de Moraes.parquet',
 '2023-01-08-Anderson Torres.parquet',
 '2023-01-08-Bandidos.parquet',
 '2023-01-08-Bolsonaristas.parquet',
 '2023-01-08-Brasília-first_half.parquet',
 '2023-01-08-Capitólio.parquet',
 '2023-01-08-Ciro.parquet',
 '2023-01-08-Congresso Nacional.parquet',
 '2023-01-08-Conivente.parquet',
 '2023-01-08-Criminosos.parquet',
 '2023-01-08-Democracia.parquet',
 '2023-01-08-Distrito Federal.parquet',
 '2023-01-08-Esplanada.parquet',
 '2023-01-08-Estado Democrático de Direito.parquet',
 '2023-01-08-Flávio Dino.parquet',
 '2023-01-08-Força Nacional.parquet',
 '2023-01-08-Galoucura.parquet',
 '2023-0

In [14]:
# Group the file names by search term in one pass. Dict insertion order
# follows `file_list`, so terms appear in the same first-seen order as in
# `dates_n_terms['search_terms']`.
files_by_term = {}
for file in file_list:
    # file name layout: '<YYYY-MM-DD>-<term>.parquet'
    files_by_term.setdefault(file[11:-8], []).append(file)

terms = {}

# Each term is processed exactly once. Looping over the raw `search_terms`
# column repeated the disk reads and the text preprocessing for every extra
# date a term was searched on, only to overwrite the same dict entry.
for term, term_files in files_by_term.items():
    # one file per (date, term) pair, so the file count is the number of days
    days_count = len(term_files)
    term_tweets = pd.concat(
        [pd.read_parquet(os.path.join(DATA_PATH, file)) for file in term_files],
        ignore_index=True,
    ).drop_duplicates(keep='first', ignore_index=True)
    term_tweets['preproc_tweet'] = term_tweets['tweet_content'].map(pp.preprocess_text)
    number_tweets = term_tweets.shape[0]
    # distinct tweets after preprocessing, not distinct tweet ids
    number_tweets_unique = term_tweets['preproc_tweet'].unique().shape[0]
    number_users = term_tweets['user'].unique().shape[0]
    terms[term] = {
        'number_of_tweets': number_tweets,
        'number_of_unique_tweets': number_tweets_unique,
        'number_of_users': number_users,
        'number_of_days': days_count
    }

In [15]:
# One row per search term, one column per statistic collected in `terms`.
term_status = pd.DataFrame.from_dict(terms, orient='index')
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(term_status)

Unnamed: 0,number_of_tweets,number_of_unique_tweets,number_of_users,number_of_days
#LulaPresidente,18842,11846,13481,1
Alckmin,90419,19955,55990,1
Eduardo Bolsonaro,11163,1325,8508,1
Faixa Presidencial,53683,6468,40150,1
Itamaraty,1888,545,1484,1
Janja,126101,30889,79473,1
Planalto,270178,32059,138600,2
SEM ANISTIA,139742,15977,67676,2
Sigilo de 100,37902,4400,29302,1
#GolpeDeEstado,16493,3111,12134,1
