# __Extracted Tweets Data Analysis__

In [1]:
import matplotlib.pyplot as plt
import os, json, datetime
import seaborn as sns
import pandas as pd
%matplotlib widget

In [2]:
# Root folder that holds one sub-directory per extracted week.
DATA_PATH = 'data/tweets/'

# Collect the week folders, skipping Jupyter's checkpoint directory and
# anything that is not a directory.
week_list = []
for entry in os.listdir(DATA_PATH):
    if entry.endswith('.ipynb_checkpoints'):
        continue
    if os.path.isdir(DATA_PATH + entry):
        week_list.append(entry)
print('Amount of extracted weeks: ', len(week_list))

Amount of extracted weeks:  21


In [3]:
# Alphabetical order puts every 'week_pr_*' folder after the numbered weeks,
# so each one is moved to sit right after its anchor entry.
# The pairs are applied in order: 'week_pr_02' relies on 'week_pr_01'
# already being in its final position.
week_list.sort()
for pr_week, anchor in (
    ('week_pr_01', 'week_12'),
    ('week_pr_02', 'week_pr_01'),
    ('week_pr_03', 'week_17'),
):
    week_list.remove(pr_week)
    week_list.insert(week_list.index(anchor) + 1, pr_week)
week_list

['week_01',
 'week_02',
 'week_03',
 'week_04',
 'week_05',
 'week_06',
 'week_07',
 'week_08',
 'week_09',
 'week_10',
 'week_11',
 'week_12',
 'week_pr_01',
 'week_pr_02',
 'week_13',
 'week_14',
 'week_15',
 'week_16',
 'week_17',
 'week_pr_03',
 'week_18']

In [4]:
# Accumulators filled from the JSON summary file(s) found in each week folder.
# days: '<week>_<day>' labels, in the order the days are read.
days = []
# tts_amount[granularity][dataset]: tweet counts per week / per day.
tts_amount = {
    granularity: {'query_ext': [], 'hashtag_ext': []}
    for granularity in ('week', 'day')
}
# Every hashtag listed under 'top_10_hashtags', concatenated over all files.
frequent_top_10_hashtags = {'query_ext': [], 'hashtag_ext': []}

for week in week_list:
    week_dir = DATA_PATH + week
    for file in os.listdir(week_dir):
        if not file.endswith('.json'):
            continue
        with open(f'{week_dir}/{file}', encoding='utf-8') as week_info:
            data = json.load(week_info)

        # Week-level totals and the hashtags of this summary's Top 10.
        for dataset in ('query_ext', 'hashtag_ext'):
            tts_amount['week'][dataset].append(data['tweets_amount'][dataset])
            frequent_top_10_hashtags[dataset].extend(data['top_10_hashtags'][dataset])

        # Day-level totals, labelled with the week they belong to.
        for day, day_info in data['days_info'].items():
            days.append(f'{week}_{day}')
            for dataset in ('query_ext', 'hashtag_ext'):
                tts_amount['day'][dataset].append(day_info['tweets_amount'][dataset])

## __Quantitative Analysis__

### __Tweets Amount__

In [5]:
# One row per week: tweet counts of both datasets plus the week label.
weekly_tts_amount = pd.DataFrame(tts_amount['week']).assign(week=week_list)

# Long format (week, dataset, value) so seaborn draws one line per dataset.
weekly_long = weekly_tts_amount.melt(id_vars=['week'], var_name='dataset')

plt.figure(figsize=(14, 6))
sns.set_style("whitegrid")
ax = sns.lineplot(data=weekly_long, x='week', y='value', hue='dataset')
ax.set_xlabel('week number')
ax.set_ylabel('amount of tweets')
ax.set_title('Tweets amount per week')
plt.xticks(rotation=15)
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [6]:
# One row per day: tweet counts of both datasets plus the '<week>_<day>' label.
daily_tts_amount = pd.DataFrame(tts_amount['day']).assign(day=days)

# Long format (day, dataset, value) so seaborn draws one line per dataset.
daily_long = daily_tts_amount.melt(id_vars=['day'], var_name='dataset')

plt.figure(figsize=(14, 6))
sns.set_style("whitegrid")
ax = sns.lineplot(data=daily_long, x='day', y='value', hue='dataset')
ax.set_xlabel('days')
ax.set_ylabel('amount of tweets')
ax.set_title('Tweets amount per day')
# Hide x ticks and their labels entirely; only the curve shape is shown.
ax.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [7]:
def _top_five(frame, label_col, value_col):
    """Return [label, value] pairs for the five largest `value_col` rows of `frame`."""
    ranked = frame.sort_values(by=value_col, ascending=False)
    return ranked.head(5)[[label_col, value_col]].values.tolist()


# Totals over all weeks, per dataset.
total_query = sum(tts_amount['week']['query_ext'])
total_hashtag = sum(tts_amount['week']['hashtag_ext'])

# Five busiest weeks / days, per dataset.
top_weeks_query = _top_five(weekly_tts_amount, 'week', 'query_ext')
top_weeks_hashtag = _top_five(weekly_tts_amount, 'week', 'hashtag_ext')
top_days_query = _top_five(daily_tts_amount, 'day', 'query_ext')
top_days_hashtag = _top_five(daily_tts_amount, 'day', 'hashtag_ext')

print(f'''
Total number of weeks: {len(week_list)}

Total number of tweets:
- Query dataset: {total_query}
- Hashtag dataset: {total_hashtag}

Weeks with the highest amount of tweets:
- Query dataset: 
{top_weeks_query}
- Hashtag dataset: 
{top_weeks_hashtag}

Days with the highest amount of tweets:
- Query dataset:
{top_days_query}
- Hashtag dataset:
{top_days_hashtag}
''')


Total number of weeks: 21

Total number of tweets:
- Query dataset: 350135
- Hashtag dataset: 2736534

Weeks with the highest amount of tweets:
- Query dataset: 
[['week_02', 37097], ['week_04', 37037], ['week_03', 36058], ['week_01', 29618], ['week_05', 26749]]
- Hashtag dataset: 
[['week_10', 255016], ['week_09', 246175], ['week_05', 230436], ['week_13', 197249], ['week_11', 192390]]

Days with the highest amount of tweets:
- Query dataset:
[['week_03_day_4', 14333], ['week_02_day_3', 11035], ['week_01_day_3', 10495], ['week_04_day_4', 10262], ['week_04_day_3', 9957]]
- Hashtag dataset:
[['week_05_day_7', 131549], ['week_10_day_2', 87000], ['week_03_day_5', 83904], ['week_pr_03_day_3', 82177], ['week_08_day_7', 60863]]



### __User Amount__

In [8]:
# Filename suffix that identifies each dataset's parquet files in a week folder.
PARQUET_SUFFIXES = {
    'query_ext': 'query_ext.parquet',
    'hashtag_ext': 'hashtags_ext.parquet',
}

# week -> dataset -> sorted parquet filenames; later cells reuse this index.
file_lists = {}
# dataset -> number of distinct usernames per week, in week_list order.
user_count = {
    'query_ext': [],
    'hashtag_ext': []
}

for week in week_list:
    week_dir = os.path.join(DATA_PATH, week)
    week_files = os.listdir(week_dir)
    file_lists[week] = {
        dataset: sorted(name for name in week_files if name.endswith(suffix))
        for dataset, suffix in PARQUET_SUFFIXES.items()
    }

    for dataset, parquet_files in file_lists[week].items():
        # A set deduplicates on insert, so usernames repeated across tweets
        # are never accumulated in memory.
        usernames = set()
        for file in parquet_files:
            # Only the 'user' column is needed; skip loading the rest of the file.
            df = pd.read_parquet(os.path.join(week_dir, file), columns=['user'])
            for user in df['user']:
                # SECURITY: eval() executes whatever expression is stored in the
                # file, so only run this on parquet files you produced yourself.
                # NOTE(review): presumably the stored user repr contains
                # datetime.datetime(...) calls (which would explain the otherwise
                # unused `datetime` import), so ast.literal_eval is likely not a
                # drop-in replacement -- confirm before switching.
                usernames.add(eval(user)['username'])
        user_count[dataset].append(len(usernames))

In [10]:
# One row per week: distinct-user counts of both datasets plus the week label.
user_count_df = pd.DataFrame(user_count).assign(week=week_list)

# Long format (week, dataset, value) so seaborn draws one line per dataset.
user_count_long = user_count_df.melt(id_vars=['week'], var_name='dataset')

plt.figure(figsize=(14, 6))
sns.set_style("whitegrid")
ax = sns.lineplot(data=user_count_long, x='week', y='value', hue='dataset')
ax.set_xlabel('week number')
ax.set_ylabel('amount of users')
ax.set_title('Users amount per week')
plt.xticks(rotation=15)
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

## __Qualitative Analysis__

### __Hashtag Analysis__

#### Most frequent hashtags in the Top 10

In [11]:
# How often each hashtag shows up in the loaded Top 10 lists (query dataset);
# display the ten most frequent.
top10_query_counts = pd.Series(frequent_top_10_hashtags['query_ext']).value_counts()
top10_query_counts.head(10)

#forabolsonaro            17
#cpi                      16
#cpidapandemia            15
#cpidacovid               13
#forabolsonarogenocida    13
#pandemia                 12
#covid                    10
#brasil                   10
#covid19                   9
#cpidocirco                9
dtype: int64

In [12]:
# How often each hashtag shows up in the loaded Top 10 lists (hashtag dataset);
# display the ten most frequent.
top10_hashtag_counts = pd.Series(frequent_top_10_hashtags['hashtag_ext']).value_counts()
top10_hashtag_counts.head(10)

#cpidapandemia            4
#forabol卐onarogenocida    4
#cpidacovid19             4
#fakenews                 3
#forabolsonaro            3
#coronavac                3
#elenao                   3
#bolsonaronacadeia        3
#12setforabolsonaro       3
#impeachmentja            3
dtype: int64

#### Most frequent hashtags throughout the period

In [13]:
# dataset -> every hashtag occurrence (lower-cased) found in the parquet files,
# accumulated over all weeks.
hashtags = {
    'query_ext': [],
    'hashtag_ext': []
}

for week in week_list:
    for dataset in hashtags:
        for file in file_lists[week][dataset]:
            # Only the 'hashtags' column is needed; skip loading the rest of the file.
            df = pd.read_parquet(f'{DATA_PATH+week}/{file}', columns=['hashtags'])
            for hashtag_list in df['hashtags']:
                # SECURITY: eval() executes whatever expression is stored in the
                # file, so only run this on parquet files you produced yourself.
                # NOTE(review): if the column only ever holds list/None reprs,
                # ast.literal_eval would be a safer parser -- confirm the format.
                parsed_hashtags = eval(hashtag_list)
                # A falsy value (e.g. None or an empty list) means the tweet
                # has no hashtags.
                for hashtag in parsed_hashtags or []:
                    hashtags[dataset].append(hashtag.lower())

In [14]:
# Occurrences of each lower-cased hashtag in the query dataset;
# display the ten most frequent.
query_hashtag_counts = pd.Series(hashtags['query_ext']).value_counts()
query_hashtag_counts.head(10)

cpidacovid        8984
cpidapandemia     3763
cpi               2616
forabolsonaro     1908
globonews         1176
cpidocirco        1140
cpidogenocidio    1066
covid             1032
pandemia          1008
covid19            941
dtype: int64

In [15]:
# Occurrences of each lower-cased hashtag in the hashtag dataset;
# display the ten most frequent.
hashtag_hashtag_counts = pd.Series(hashtags['hashtag_ext']).value_counts()
hashtag_hashtag_counts.head(10)

forabolsonaro            536287
cpidacovid               529892
forabolsonarogenocida    140463
cpidapandemia            125809
renanvagabundo           101326
renansabiadetudo          88846
cpidocirco                82118
29mforabolsonaro          79660
euautorizopresidente      79109
cpidotse                  56164
dtype: int64

### __Topic Analysis__

In [16]:
from bertopic import BERTopic

In [17]:
# dataset -> tweet texts from every parquet file, in week_list / file order.
# These lists are the corpora handed to the topic model.
docs = {
    'query_ext': [],
    'hashtag_ext': []
}

for week in week_list:
    for dataset in docs:
        for file in file_lists[week][dataset]:
            # Only the 'content' column is needed; skip loading the rest of the file.
            df = pd.read_parquet(f'{DATA_PATH+week}/{file}', columns=['content'])
            docs[dataset].extend(df['content'])

In [None]:
# Fit one model per dataset. Calling fit_transform twice on the same BERTopic
# instance refits it, so a shared model would only describe the corpus fitted
# last and the query topic ids could no longer be looked up.
# NOTE(review): calculate_probabilities=True is reported to be slow and
# memory-hungry on large corpora; confirm it is needed for the hashtag dataset.
topic_model_q = BERTopic(language='multilingual', calculate_probabilities=True, verbose=True)
topics_q, probs_q = topic_model_q.fit_transform(docs['query_ext'])

topic_model_h = BERTopic(language='multilingual', calculate_probabilities=True, verbose=True)
topics_h, probs_h = topic_model_h.fit_transform(docs['hashtag_ext'])

# Kept for backward compatibility: as before, `topic_model` refers to the
# model fitted last (the hashtag dataset).
topic_model = topic_model_h

Batches:   0%|          | 0/10942 [00:00<?, ?it/s]

2021-09-30 12:45:22,028 - BERTopic - Transformed documents to Embeddings


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
