# __Tweets Extraction: Brazilian Covid-19 CPI (Parliamentary Commission of Inquiry)__

In [1]:
from modules.hashtag_media import hashtag_media
import snscrape.modules.twitter as sntwitter
from modules.week import Week
import pandas as pd
import os, datetime

In [4]:
COLUMNS = ['id', 'url', 'date', 'content', 'rendered_content', 'user_id', 'username', 'user_display_name', 'user_description',
          'user_description_urls', 'user_verified', 'user_created', 'user_followers_count', 'user_friends_counts', 'user_status_count', 'user_favourites_count',
          'user_listed_count', 'user_media_count', 'user_location', 'user_protected', 'user_url', 'user_profile_pic', 'user_profile_banner',
          'reply_count', 'retweet_count', 'like_count', 'quote_count', 'conversation_id', 'media', 'retweeted_tweet', 'quoted_tweet',
          'in_reply_to_tweet_id', 'in_reply_to_user', 'mentioned_users', 'coordinates', 'place', 'hashtags', 'cashtags']

def get_hashtags(hashtag_series):
    hashtag_list = []
    for hashtag_group in hashtag_series:
        if hashtag_group:
            for hashtag in hashtag_group:
                if hashtag.lower() not in hashtag_media:
                    hashtag_list.append('#'+hashtag.lower())
    return hashtag_list

def get_unique_hashtags(hashtag_list):
    hashtag_series = pd.Series(hashtag_list)
    return hashtag_series.unique()

def get_hashtag_count(hashtag_list):
    return get_unique_hashtags(hashtag_list).size

def get_top10(hashtag_list):
    hashtag_series = pd.Series(hashtag_list)
    return hashtag_series.value_counts().index[:10].tolist()

def day_tweet_extract(day):
    until = (datetime.datetime.strptime(day, '%Y-%m-%d') + datetime.timedelta(days=1)).strftime('%Y-%m-%d')
    query = f'cpi AND covid OR pandemia lang:pt since:{day} until:{until}'
    print('- Cpi covid|pandemia query extraction:')
    tweets_list = []
    start_time = datetime.datetime.now()
    for idx, tweet in enumerate(sntwitter.TwitterSearchScraper(query).get_items()):
        tweets_list.append([tweet.id, tweet.url, tweet.date, tweet.content, tweet.renderedContent, tweet.user.id, tweet.user.username, tweet.user.displayname, tweet.user.description,
          tweet.user.descriptionUrls, tweet.user.verified, tweet.user.created, tweet.user.followersCount, tweet.user.friendsCount, tweet.user.statusesCount, tweet.user.favouritesCount,
          tweet.user.listedCount, tweet.user.mediaCount, tweet.user.location, tweet.user.protected, tweet.user.linkUrl, tweet.user.profileImageUrl, tweet.user.profileBannerUrl,
          tweet.replyCount, tweet.retweetCount, tweet.likeCount, tweet.quoteCount, tweet.conversationId, tweet.media, tweet.retweetedTweet, tweet.quotedTweet,
          tweet.inReplyToTweetId, tweet.inReplyToUser, tweet.mentionedUsers, tweet.coordinates, tweet.place, tweet.hashtags, tweet.cashtags])
    tweets_q = pd.DataFrame(tweets_list, columns=COLUMNS)
    runtime = datetime.datetime.now() - start_time
    print(f'-- Runtime: {runtime}'
         f'\n-- Tweets amount: {tweets_q.shape[0]}')
    hashtag_list_q = get_hashtags(tweets_q['hashtags'])
    hashtag_count_q = get_hashtag_count(hashtag_list_q)
    unique_hashtags_q = get_unique_hashtags(hashtag_list_q)
    hashtag_top10_q = get_top10(hashtag_list_q)
    print(f'-- Unique hashtags amount: {hashtag_count_q}'
         f'\n-- Top 10 Hashtags: {hashtag_top10_q}'
         '\n- Hashtag query extraction:')
    hashtag_query = ' OR '.join(hashtag_top10_q)
    query = f'({hashtag_query}) lang:pt since:{day} until:{until}'
    start_time = datetime.datetime.now()
    tweets_list = []
    for idx, tweet in enumerate(sntwitter.TwitterSearchScraper(query).get_items()):
        tweets_list.append([tweet.id, tweet.url, tweet.date, tweet.content, tweet.renderedContent, tweet.user.id, tweet.user.username, tweet.user.displayname, tweet.user.description,
          tweet.user.descriptionUrls, tweet.user.verified, tweet.user.created, tweet.user.followersCount, tweet.user.friendsCount, tweet.user.statusesCount, tweet.user.favouritesCount,
          tweet.user.listedCount, tweet.user.mediaCount, tweet.user.location, tweet.user.protected, tweet.user.linkUrl, tweet.user.profileImageUrl, tweet.user.profileBannerUrl,
          tweet.replyCount, tweet.retweetCount, tweet.likeCount, tweet.quoteCount, tweet.conversationId, tweet.media, tweet.retweetedTweet, tweet.quotedTweet,
          tweet.inReplyToTweetId, tweet.inReplyToUser, tweet.mentionedUsers, tweet.coordinates, tweet.place, tweet.hashtags, tweet.cashtags])
    tweets_h = pd.DataFrame(tweets_list, columns=COLUMNS)
    runtime = datetime.datetime.now() - start_time
    print(f'-- Runtime: {runtime}'
         f'\n-- Tweets amount: {tweets_h.shape[0]}')
    hashtag_list_h = get_hashtags(tweets_h['hashtags'])
    hashtag_count_h = get_hashtag_count(hashtag_list_h)
    unique_hashtags_h = get_unique_hashtags(hashtag_list_h)
    hashtag_top10_h = get_top10(hashtag_list_h)
    print(f'-- Unique hashtags amount: {hashtag_count_h}'
         f'\n-- Top 10 Hashtags: {hashtag_top10_h}\n')
    return {
        'query_ext': {
            'hashtag_count': hashtag_count_q,
            'hashtag_top_10': hashtag_top10_q,
            'unique_hashtags': unique_hashtags_q,
            'tweets': tweets_q
        },
        'hashtag_ext': {
            'hashtag_count': hashtag_count_h,
            'hashtag_top_10': hashtag_top10_h,
            'unique_hashtags': unique_hashtags_h,
            'tweets': tweets_h
        }
    }

def week_tweet_extract(week):
    week_dir = f'data/tweets/week_{week.number}'
    if not os.path.exists(week_dir):
        os.mkdir(week_dir)
    print(f'== EXTRACTING FROM WEEK {week.number}: SINCE {week.start} UNTIL {week.end} =='
         f'\n\nData is being stored in the following directory: {week_dir}\n')
    week_info = week.info
    week_hashtags = {
        'query_ext': [],
        'hashtag_ext': []
    }
    week_tweets_amount = {
        'query_ext': 0,
        'hashtag_ext': 0
    }
    for idx, day in enumerate(week.days):
        _day = f'day_{idx+1}'
        deponents = week_info['days_info'][_day]['deponents']
        print(f'Extracting tweets from Day: {idx+1}: {day}'
             f'\n- Deponents of the day: {deponents}')
        day_ext = day_tweet_extract(day)
        week_info['days_info'][_day]['tweets_amount'] = {
            'query_ext': day_ext['query_ext']['tweets'].shape[0],
            'hashtag_ext': day_ext['hashtag_ext']['tweets'].shape[0]
        }
        week_info['days_info'][_day]['hashtags_amount'] = {
            'query_ext': day_ext['query_ext']['hashtag_count'],
            'hashtag_ext': day_ext['hashtag_ext']['hashtag_count']
        }
        week_info['days_info'][_day]['top_10_hashtags'] = {
            'query_ext': day_ext['query_ext']['hashtag_top_10'],
            'hashtag_ext': day_ext['hashtag_ext']['hashtag_top_10']
        }
        week_tweets_amount = {
            'query_ext': week_tweets_amount['query_ext'] + day_ext['query_ext']['tweets'].shape[0],
            'hashtag_ext': week_tweets_amount['hashtag_ext'] + day_ext['hashtag_ext']['tweets'].shape[0]
        }
        for hashtag in day_ext['query_ext']['unique_hashtags']:
            week_hashtags['query_ext'].append(hashtag)
        for hashtag in day_ext['hashtag_ext']['unique_hashtags']:
            week_hashtags['hashtag_ext'].append(hashtag)
        day_ext['query_ext']['tweets'].to_csv(f'{week_dir}/{_day}_{day}_query_extraction.csv')
        #day_ext['hashtag_ext']['tweets'].to_parquet(f'{week_dir}/{_day}_{day}_hashtags_extraction.parquet', engine='pyarrow')
    week_info['tweets_amount'] = week_tweets_amount
    week_info['hashtags_amount'] = {
        'query_ext': get_hashtag_count(week_hashtags['query_ext']),
        'hashtag_ext': get_hashtag_count(week_hashtags['hashtag_ext'])
    }
    week_info['top_1o_hashtags'] = {
        'query_ext': get_top10(week_hashtags['query_ext']),
        'hashtag_ext': get_top10(week_hashtags['hashtag_ext'])
    }
    print('Generating week info json file...\n')
    with open(week_di + f'/week_{week.number}_info.json', 'w') as json_file:
        json.dump(week_info, json_file, indent=4)
        

In [7]:
df=pd.read_csv('data/tweets/week_1/day_1_2021-04-25_query_extraction.csv')

In [8]:
df.head()

Unnamed: 0.1,Unnamed: 0,id,url,date,content,rendered_content,user_id,username,user_display_name,user_description,...,media,retweeted_tweet,quoted_tweet,in_reply_to_tweet_id,in_reply_to_user,mentioned_users,coordinates,place,hashtags,cashtags
0,0,1386470045711282176,https://twitter.com/ivermectinaveia/status/138...,2021-04-25 23:59:52+00:00,@nadaadeclarar21 @Simonescbrasil CPI da Covid ...,@nadaadeclarar21 @Simonescbrasil CPI da Covid ...,1383975050227830797,ivermectinaveia,Xerox Holmes 🇧🇷🇮🇱🇮🇹,Quem poupa o Lobo sacrifica as ovelhas,...,[Photo(previewUrl='https://pbs.twimg.com/media...,,,1.386469e+18,https://twitter.com/draMilenaAdv,"[User(username='Simonescbrasil', id=1256337486...",,,,
1,1,1386470045371576322,https://twitter.com/Douglas12626875/status/138...,2021-04-25 23:59:52+00:00,Bolsonaro que se prepare pois o depoimento do ...,Bolsonaro que se prepare pois o depoimento do ...,874381124243525633,Douglas12626875,Douglas Santos ⚖🇧🇷🇺🇸🇾🇪,Eu sou da Direita Conservadora!\nProfissional ...,...,,,,,,,"Coordinates(longitude=-46.826039, latitude=-24...","Place(fullName='Sao Paulo, Brazil', name='Sao ...",,
2,2,1386469944376840195,https://twitter.com/AfonsoRaimundo7/status/138...,2021-04-25 23:59:28+00:00,Se a CPI do Covid quer resultados basta \nSegu...,Se a CPI do Covid quer resultados basta \nSegu...,878304762525036545,AfonsoRaimundo7,Afonso Raimundo,Aposentado e agricultor,...,,,,,,,,,,
3,3,1386469891348344835,https://twitter.com/F_Coratti_S/status/1386469...,2021-04-25 23:59:16+00:00,No Direto ao Ponto com Augusto Nunes sobre a C...,No Direto ao Ponto com Augusto Nunes sobre a C...,935521517726846982,F_Coratti_S,Fernando Coratti Silva,"Advogado, contador, escritor - ""Verdade, Justi...",...,,,,,,"[User(username='YouTube', id=10228272, display...",,,,
4,4,1386469827209007110,https://twitter.com/nitro08_/status/1386469827...,2021-04-25 23:59:00+00:00,"Pra ""condenar"" Bolsonaro na CPI da Covid, bast...","Pra ""condenar"" Bolsonaro na CPI da Covid, bast...",51510363,nitro08_,ⁿᵃ ˡᵘᵃ 🏳️‍🌈,,...,,,,,,,,,"['edicao18', 'fantastico']",


In [11]:
df['coordinates']

0                                                     NaN
1       Coordinates(longitude=-46.826039, latitude=-24...
2                                                     NaN
3                                                     NaN
4                                                     NaN
                              ...                        
1212                                                  NaN
1213                                                  NaN
1214                                                  NaN
1215                                                  NaN
1216                                                  NaN
Name: coordinates, Length: 1217, dtype: object

In [5]:
deponents = {
    #'day_3': [],
    #'day_4': [],
    #'day_5': []
}

week = Week(week_number=1,
           week_start='2021-04-25',
           deponents=deponents)

week_tweet_extract(week)

== EXTRACTING FROM WEEK 1: SINCE 2021-04-25 UNTIL 2021-05-01 ==

Data is being stored in the following directory: data/tweets/week_1

Extracting tweets from Day: 1: 2021-04-25
- Deponents of the day: []
- Cpi covid|pandemia query extraction:
-- Runtime: 0:00:52.621169
-- Tweets amount: 1217
-- Unique hashtags amount: 101
-- Top 10 Hashtags: ['#caixadepandoria', '#cpidacovid', '#morosuspeitosim', '#lulalivre', '#cpidapandemia', '#renansuspeito', '#bolsonarogenocida', '#lulabolso', '#jairbolsonaro', '#bolsolula']
- Hashtag query extraction:
-- Runtime: 0:06:01.822387
-- Tweets amount: 8615
-- Unique hashtags amount: 796
-- Top 10 Hashtags: ['#caixadepandoria', '#bolsonarogenocida', '#lulalivre', '#morosuspeitosim', '#renansuspeito', '#forabolsonaro', '#cpidacovid', '#bolsonaroorgulhodobrasil', '#impeachmentdebolsonarourgente', '#bolsonaroate2026']

Extracting tweets from Day: 2: 2021-04-26
- Deponents of the day: []
- Cpi covid|pandemia query extraction:


KeyboardInterrupt: 