# __Extracted Tweets Data Analyis__

In [1]:
import matplotlib.pyplot as plt
import os, json, datetime
import seaborn as sns
import pandas as pd
%matplotlib widget

In [2]:
DATA_PATH = 'data/tweets/'

week_list = [week_dir for week_dir in os.listdir(DATA_PATH) if os.path.isdir(DATA_PATH+week_dir) and not week_dir.endswith('.ipynb_checkpoints')]
print('Amount of extracted weeks: ', len(week_list))

Amount of extracted weeks:  24


In [3]:
week_list.sort()
week_list.remove('week_pr_01')
week_list.insert(week_list.index('week_12')+1, 'week_pr_01')
week_list.remove('week_pr_02')
week_list.insert(week_list.index('week_pr_01')+1, 'week_pr_02')
week_list.remove('week_pr_03')
week_list.insert(week_list.index('week_17')+1, 'week_pr_03')
week_list

['week_01',
 'week_02',
 'week_03',
 'week_04',
 'week_05',
 'week_06',
 'week_07',
 'week_08',
 'week_09',
 'week_10',
 'week_11',
 'week_12',
 'week_pr_01',
 'week_pr_02',
 'week_13',
 'week_14',
 'week_15',
 'week_16',
 'week_17',
 'week_pr_03',
 'week_18',
 'week_19',
 'week_20',
 'week_21']

In [4]:
days = []
tts_amount = {
    'week': {
        'query_ext': [],
        'hashtag_ext': []
    },
    'day': {
        'query_ext': [],
        'hashtag_ext': []
    }
}
frequent_top_10_hashtags = {
    'query_ext': [],
    'hashtag_ext': []
}

for week in week_list:
    for file in os.listdir(DATA_PATH+week):
        if file.endswith('.json'):
            with open(f'{DATA_PATH+week}/{file}', encoding='utf-8') as week_info:
                data = json.load(week_info)
                tts_amount['week']['query_ext'].append(data['tweets_amount']['query_ext'])
                tts_amount['week']['hashtag_ext'].append(data['tweets_amount']['hashtag_ext'])
                for hashtag in data['top_10_hashtags']['query_ext']:
                    frequent_top_10_hashtags['query_ext'].append(hashtag)
                for hashtag in data['top_10_hashtags']['hashtag_ext']:
                    frequent_top_10_hashtags['hashtag_ext'].append(hashtag)
                for day in data['days_info']:
                    days.append(f'{week}_{day}')
                    tts_amount['day']['query_ext'].append(data['days_info'][day]['tweets_amount']['query_ext'])
                    tts_amount['day']['hashtag_ext'].append(data['days_info'][day]['tweets_amount']['hashtag_ext'])

## __Quantitative Analysis__

### __Tweets Amount__

In [5]:
weekly_tts_amount = pd.DataFrame(tts_amount['week'])
weekly_tts_amount['week'] = week_list

plt.figure(figsize=(14,6))
sns.set_style("whitegrid")
ax = sns.lineplot(x='week', y='value', hue='dataset', data=pd.melt(weekly_tts_amount, ['week']).rename(columns={'variable':'dataset'}))
ax.set(xlabel='week number', ylabel='amount of tweets')
plt.title('Tweets amount per week')
plt.xticks(rotation=15)
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [6]:
daily_tts_amount = pd.DataFrame(tts_amount['day'])
daily_tts_amount['day'] = days

plt.figure(figsize=(14,6))
sns.set_style("whitegrid")
ax = sns.lineplot(x='day', y='value', hue='dataset', data=pd.melt(daily_tts_amount, ['day']).rename(columns={'variable':'dataset'}))
ax.set(xlabel='days', ylabel='amount of tweets')
plt.title('Tweets amount per day')
plt.tick_params(
    axis='x',
    which='both',
    bottom=False,
    top=False,
    labelbottom=False)
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [7]:
print(f'''
Total number of weeks: {len(week_list)}

Total number of tweets:
- Query dataset: {sum(tts_amount['week']['query_ext'])}
- Hashtag dataset: {sum(tts_amount['week']['hashtag_ext'])}

Weeks with the highest amount of tweets:
- Query dataset: 
{weekly_tts_amount.sort_values(by='query_ext', ascending=False)[:5][['week','query_ext']].values.tolist()}
- Hashtag dataset: 
{weekly_tts_amount.sort_values(by='hashtag_ext', ascending=False)[:5][['week','hashtag_ext']].values.tolist()}

Days with the highest amount of tweets:
- Query dataset:
{daily_tts_amount.sort_values(by='query_ext', ascending=False)[:5][['day', 'query_ext']].values.tolist()}
- Hashtag dataset:
{daily_tts_amount.sort_values(by='hashtag_ext', ascending=False)[:5][['day', 'hashtag_ext']].values.tolist()}
''')


Total number of weeks: 24

Total number of tweets:
- Query dataset: 387377
- Hashtag dataset: 2958486

Weeks with the highest amount of tweets:
- Query dataset: 
[['week_02', 37097], ['week_04', 37037], ['week_03', 36058], ['week_01', 29618], ['week_05', 26749]]
- Hashtag dataset: 
[['week_10', 255016], ['week_09', 246175], ['week_05', 230436], ['week_13', 197249], ['week_11', 192390]]

Days with the highest amount of tweets:
- Query dataset:
[['week_03_day_4', 14333], ['week_02_day_3', 11035], ['week_01_day_3', 10495], ['week_04_day_4', 10262], ['week_04_day_3', 9957]]
- Hashtag dataset:
[['week_05_day_7', 131549], ['week_10_day_2', 87000], ['week_03_day_5', 83904], ['week_pr_03_day_3', 82177], ['week_08_day_7', 60863]]



### __User Amount__

In [8]:
file_lists = {}
user_count = {
    'query_ext': [],
    'hashtag_ext': []    
}

for week in week_list:
    query_users = []
    hashtag_users = []
    file_lists[week] = {
        'query_ext': [],
        'hashtag_ext': []
    }
    for file in os.listdir(DATA_PATH+week):
        if file.endswith('query_ext.parquet'):
            file_lists[week]['query_ext'].append(file)
        if file.endswith('hashtags_ext.parquet'):
            file_lists[week]['hashtag_ext'].append(file)
    file_lists[week]['query_ext'].sort()
    file_lists[week]['hashtag_ext'].sort()
    
    for file in file_lists[week]['query_ext']:
        df = pd.read_parquet(f'{DATA_PATH+week}/{file}')
        for user in df['user']:
            query_users.append(eval(user)['username'])
    user_count['query_ext'].append(len(set(query_users)))
    for file in file_lists[week]['hashtag_ext']:
        df = pd.read_parquet(f'{DATA_PATH+week}/{file}')
        for user in df['user']:
            hashtag_users.append(eval(user)['username'])
    user_count['hashtag_ext'].append(len(set(hashtag_users)))

In [9]:
user_count_df = pd.DataFrame(user_count)
user_count_df['week'] = week_list

plt.figure(figsize=(14,6))
sns.set_style("whitegrid")
ax = sns.lineplot(x='week', y='value', hue='dataset', data=pd.melt(user_count_df, ['week']).rename(columns={'variable':'dataset'}))
ax.set(xlabel='week number', ylabel='amount of users')
plt.title('Users amount per week')
plt.xticks(rotation=15)
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

## __Qualitative Analysis__

### __Hashtag Analysis__

#### Most frequent hashtags in the Top 10

In [10]:
pd.Series(frequent_top_10_hashtags['query_ext']).value_counts()[:10]

#forabolsonaro            20
#cpi                      18
#cpidapandemia            18
#cpidacovid               16
#forabolsonarogenocida    15
#pandemia                 13
#covid                    12
#cpidocirco               12
#covid19                  10
#brasil                   10
dtype: int64

In [11]:
pd.Series(frequent_top_10_hashtags['hashtag_ext']).value_counts()[:10]

#forabolsonaro            6
#cpidapandemia            5
#cpidacovid19             4
#impeachmentja            4
#vacinassalvamvidas       4
#fakenews                 4
#forabol卐onarogenocida    4
#bolsonaro2022            4
#12setforabolsonaro       3
#vacina                   3
dtype: int64

#### Most frequent hashtags throughout the period

In [12]:
hashtags = {
    'query_ext': [],
    'hashtag_ext': []
}

for week in week_list:
    for file in file_lists[week]['query_ext']:
        df = pd.read_parquet(f'{DATA_PATH+week}/{file}')
        for hashtag_list in df['hashtags']:
            hashtag_list = eval(hashtag_list)
            if hashtag_list:
                for hashtag in hashtag_list:
                    hashtags['query_ext'].append(hashtag.lower())
    for file in file_lists[week]['hashtag_ext']:
        df = pd.read_parquet(f'{DATA_PATH+week}/{file}')
        for hashtag_list in df['hashtags']:
            hashtag_list = eval(hashtag_list)
            if hashtag_list:
                for hashtag in hashtag_list:
                    hashtags['hashtag_ext'].append(hashtag.lower())

In [13]:
pd.Series(hashtags['query_ext']).value_counts()[:10]

cpidacovid        9839
cpidapandemia     4109
cpi               2928
forabolsonaro     2047
cpidocirco        1460
globonews         1350
covid             1113
cpidogenocidio    1084
pandemia          1078
covid19           1021
dtype: int64

In [14]:
pd.Series(hashtags['hashtag_ext']).value_counts()[:10]

cpidacovid               597552
forabolsonaro            580075
forabolsonarogenocida    147935
cpidapandemia            140279
renanvagabundo           115554
cpidocirco               107150
renansabiadetudo          88848
29mforabolsonaro          79660
euautorizopresidente      79111
cpidotse                  56165
dtype: int64

### __Topic Analysis__

In [15]:
import re, string
import nltk

In [16]:
docs = {
    'query_ext': [],
    'hashtag_ext': []
}

for week in week_list:
    for file in file_lists[week]['query_ext']:
        df = pd.read_parquet(f'{DATA_PATH+week}/{file}')
        for tweet in df['content']:
            docs['query_ext'].append(tweet)
    for file in file_lists[week]['hashtag_ext']:
        df = pd.read_parquet(f'{DATA_PATH+week}/{file}')
        for tweet in df['content']:
            docs['hashtag_ext'].append(tweet)
   
# removing duplicated tweets
for key in docs:
    tweets_series = pd.Series(docs[key])
    tweets_series.drop_duplicates(inplace=True)
    docs[key] = tweets_series.tolist()

#### Preprocessing Pipeline

In [17]:
stop_words = nltk.corpus.stopwords.words('portuguese')
stop_words.append('pra')
stop_words.append('tá')
stop_words.append('sobre')

def remove_emoji(tweet):
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
                      "]+", re.UNICODE)
    
    return emoji_pattern.sub(r'', tweet)

def remove_link(tweet):
    tweet = re.sub(r'http\S+', '', tweet)
    tweet = re.sub(r'bit.ly/\S+', '', tweet)
    tweet = tweet.strip('[link]')
    tweet = re.sub(r'pic.twitter\S+', '', tweet)
    return tweet

def remove_users(tweet):
    tweet = re.sub('(RT\s@[A-Za-z]+[A-Za-z0-9-_]+)', '', tweet)
    tweet = re.sub('(@[A-Za-z]+[A-Za-z0-9-_]+)', '', tweet)
    return tweet

def remove_hashtags(tweet):
    tweet = re.sub('(#[A-Za-z]+[A-Za-z0-9-_]+)', '', tweet)
    return tweet

def remove_av(tweet):
    tweet = re.sub(r'VIDEO:', '', tweet)
    tweet = re.sub(r'AUDIO:', '', tweet)
    return tweet

def tweet_preproc(tweet):
    '''
    Remove: @mentions, #hashtags, URL
    links, punctuation and emojis,
    and multiple white spaces
    '''
    tweet = remove_emoji(tweet)
    tweet = remove_link(tweet)
    tweet = remove_users(tweet)
    tweet = remove_hashtags(tweet)
    tweet = remove_av(tweet)
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))
    tweet = re.sub(r'\d', '', tweet)
    tweet = tweet.lower()
    tweet = [word for word in tweet.split() if len(word)>2 and word not in stop_words]
    
    return tweet


In [18]:
for key in docs:
    doc = []
    for tweet in docs[key]:
        doc.append(tweet_preproc(tweet))
    docs[key] = doc

In [19]:
 print(f'''
Amount of tweets
- Query dataset: {len(docs['query_ext'])}
- Hashtag dataset: {len(docs['hashtag_ext'])}
''')


Amount of tweets
- Query dataset: 384798
- Hashtag dataset: 2917027



In [20]:
docs['query_ext'][0]

['cpi',
 'covid',
 'faz',
 'rir',
 'jovem',
 'bando',
 'corrupto',
 'vai',
 'fazer',
 'alguma',
 'coisa',
 'além',
 'proselitismo',
 'político',
 'estude',
 'definição',
 'genocídio',
 'pare',
 'passar',
 'vergonha',
 'cairão',
 'lado',
 'mil',
 'direita',
 'serás',
 'atingindo']

In [21]:
docs['hashtag_ext'][0]

['fique', 'certo', 'extrema', 'esquerda', 'vai', 'ser', 'contra']

#### Train STTM Model

In [22]:
from gsdmm import MovieGroupProcess # loading gsdmm model

### __QUERY DATASET__

In [23]:
vocab = set(x for doc in docs['query_ext'] for x in doc)
n_terms = len(vocab)

print(f'''
QUERY DATASET
Vocabulary size: {n_terms}
Number of documents: {len(docs['query_ext'])}
''')


QUERY DATASET
Vocabulary size: 126808
Number of documents: 384798



In [24]:
mgp = MovieGroupProcess(K=15, alpha=0.10, beta=0.10, n_iters=30)

y = mgp.fit(docs['query_ext'], n_terms)

In stage 0: transferred 345091 clusters with 15 clusters populated
In stage 1: transferred 246673 clusters with 15 clusters populated
In stage 2: transferred 152354 clusters with 15 clusters populated
In stage 3: transferred 112733 clusters with 15 clusters populated
In stage 4: transferred 90144 clusters with 15 clusters populated
In stage 5: transferred 78185 clusters with 15 clusters populated
In stage 6: transferred 74144 clusters with 15 clusters populated
In stage 7: transferred 71833 clusters with 15 clusters populated
In stage 8: transferred 70263 clusters with 15 clusters populated
In stage 9: transferred 69260 clusters with 15 clusters populated
In stage 10: transferred 68107 clusters with 15 clusters populated
In stage 11: transferred 67391 clusters with 15 clusters populated
In stage 12: transferred 67265 clusters with 15 clusters populated
In stage 13: transferred 67129 clusters with 15 clusters populated
In stage 14: transferred 66674 clusters with 15 clusters populated
I

In [27]:
import numpy as np

doc_count = np.array(mgp.cluster_doc_count)

In [28]:
print('Number of documents per topic: ', doc_count)

Number of documents per topic:  [51568 14770  2105 29863 49413 24082 29603 17869 32305 31697 30679  5934
 22207 11176 31527]


In [29]:
top_index = doc_count.argsort()[-15:][::-1]
print('Most important clusters (by number of docs inside):', top_index)

Most important clusters (by number of docs inside): [ 0  4  8  9 14 10  3  6  5 12  7  1 13 11  2]


In [30]:
# define function to get top words per topic
def top_words(cluster_word_distribution, top_cluster, values):
    for cluster in top_cluster:
        sort_dicts = sorted(cluster_word_distribution[cluster].items(), key=lambda k: k[1], reverse=True)[:values]
        print("\nCluster %s : %s"%(cluster, sort_dicts))

# get top words in topics
top_words(mgp.cluster_word_distribution, top_index, 15)


Cluster 0 : [('cpi', 55228), ('covid', 41397), ('pandemia', 11449), ('vai', 10150), ('bolsonaro', 6946), ('ser', 4782), ('nada', 4482), ('presidente', 4031), ('cara', 3861), ('governo', 3622), ('agora', 3389), ('tudo', 3310), ('gente', 3294), ('ver', 3135), ('brasil', 2928)]

Cluster 4 : [('cpi', 50455), ('covid', 43434), ('pandemia', 6268), ('hoje', 4243), ('dia', 3637), ('assistir', 3093), ('bbb', 2872), ('ver', 2700), ('vai', 2698), ('melhor', 2666), ('gente', 2276), ('assistindo', 2251), ('entretenimento', 2082), ('brasil', 1713), ('vendo', 1673)]

Cluster 8 : [('cpi', 36071), ('covid', 17342), ('pandemia', 16478), ('governadores', 7808), ('dinheiro', 6883), ('prefeitos', 4581), ('governo', 4476), ('investigar', 4454), ('presidente', 4413), ('federal', 4029), ('circo', 4000), ('stf', 3314), ('combate', 3300), ('povo', 3130), ('ser', 2982)]

Cluster 9 : [('cpi', 34124), ('covid', 22220), ('pandemia', 11076), ('bolsonaro', 7954), ('governo', 5311), ('presidente', 3668), ('contra', 3

### __HASHTAG DATASET__

In [35]:
vocab = set(x for doc in docs['hashtag_ext'] for x in doc)
n_terms = len(vocab)

print(f'''
HASHTAG DATASET
Vocabulary size: {n_terms}
Number of documents: {len(docs['hashtag_ext'])}
''')


HASHTAG DATASET
Vocabulary size: 388370
Number of documents: 2917027



In [38]:
mgp = MovieGroupProcess(K=15, alpha=0.10, beta=0.10, n_iters=30)

y = mgp.fit(docs['hashtag_ext'], n_terms)

In stage 0: transferred 2675100 clusters with 15 clusters populated
In stage 1: transferred 2407687 clusters with 15 clusters populated
In stage 2: transferred 1861902 clusters with 15 clusters populated
In stage 3: transferred 1411733 clusters with 15 clusters populated
In stage 4: transferred 1209460 clusters with 15 clusters populated
In stage 5: transferred 1127918 clusters with 15 clusters populated
In stage 6: transferred 1088881 clusters with 15 clusters populated
In stage 7: transferred 1065805 clusters with 15 clusters populated
In stage 8: transferred 1049648 clusters with 15 clusters populated
In stage 9: transferred 1037996 clusters with 15 clusters populated
In stage 10: transferred 1026816 clusters with 15 clusters populated
In stage 11: transferred 1016918 clusters with 15 clusters populated
In stage 12: transferred 1006848 clusters with 15 clusters populated
In stage 13: transferred 997350 clusters with 15 clusters populated
In stage 14: transferred 989140 clusters with

In [39]:
import numpy as np

doc_count = np.array(mgp.cluster_doc_count)

In [40]:
print('Number of documents per topic: ', doc_count)

Number of documents per topic:  [228136 140719 150552 237787 119179 234885 291932 326260 390294  79393
  56777 377937  18568 191352  73256]


In [41]:
top_index = doc_count.argsort()[-15:][::-1]
print('Most important clusters (by number of docs inside):', top_index)

Most important clusters (by number of docs inside): [ 8 11  7  6  3  5  0 13  2  1  4  9 14 10 12]


In [42]:
# define function to get top words per topic
def top_words(cluster_word_distribution, top_cluster, values):
    for cluster in top_cluster:
        sort_dicts = sorted(cluster_word_distribution[cluster].items(), key=lambda k: k[1], reverse=True)[:values]
        print("\nCluster %s : %s"%(cluster, sort_dicts))

# get top words in topics
top_words(mgp.cluster_word_distribution, top_index, 15)


Cluster 8 : [('senador', 36434), ('cpi', 30336), ('vai', 30135), ('marcos', 25608), ('rogério', 23999), ('cara', 22924), ('hoje', 21359), ('omar', 15818), ('falar', 14535), ('agora', 14159), ('ser', 13478), ('gente', 13426), ('fala', 12947), ('renan', 11604), ('heinze', 10882)]

Cluster 11 : [('vai', 45936), ('bolsonaro', 44919), ('vagabundo', 21019), ('ser', 20202), ('genocida', 18383), ('ladrão', 17004), ('presidente', 15387), ('brasil', 12316), ('agora', 12014), ('corrupto', 12010), ('lula', 11832), ('cadeia', 11390), ('cara', 11097), ('pode', 10785), ('renan', 10561)]

Cluster 7 : [('vai', 18528), ('hoje', 17046), ('gente', 16781), ('aqui', 14909), ('brasil', 14652), ('bolsonaro', 14131), ('dia', 12621), ('agora', 12447), ('vou', 11434), ('ser', 11167), ('ver', 10929), ('ainda', 8510), ('fazer', 8355), ('tudo', 8300), ('bem', 8201)]

Cluster 6 : [('bolsonaro', 39459), ('brasil', 35725), ('governo', 30356), ('povo', 23235), ('mil', 23061), ('país', 22456), ('genocida', 20821), ('pr