# __Tweets Extraction: Brazil's Covid CPI (Parliamentary Commission of Inquiry)__

In [1]:
import itertools, datetime, os, itertools, json
from modules.hashtag_media import hashtag_media
import snscrape.modules.twitter as sntwitter
from modules.week import Week
import pandas as pd

In [2]:
# Lower-cased entries of ``hashtag_media`` (imported from
# modules.hashtag_media); hashtags matching one of them, ignoring case,
# are dropped by get_hashtags().
_hashtag_media = [hashtag.lower() for hashtag in hashtag_media]
# Same entries as a frozenset: get_hashtags() runs one membership test per
# hashtag of every tweet, so use an O(1) lookup instead of a list scan.
_hashtag_media_lookup = frozenset(_hashtag_media)


def get_hashtags(hashtag_series):
    """Flatten per-tweet hashtag groups into one list of '#'-prefixed tags.

    Args:
        hashtag_series: Iterable whose items are either falsy (``None`` or
            an empty group, which are skipped) or an iterable of hashtag
            strings without the leading '#'.

    Returns:
        A list with every hashtag, in encounter order and with duplicates
        kept, prefixed with '#'. Hashtags whose lower-cased form appears in
        ``_hashtag_media`` are left out; the original casing of the kept
        hashtags is preserved.
    """
    hashtag_list = []
    for hashtag_group in hashtag_series:
        if not hashtag_group:
            continue
        for hashtag in hashtag_group:
            if hashtag.lower() not in _hashtag_media_lookup:
                hashtag_list.append('#' + hashtag)
    return hashtag_list

def get_hashtag_count(hashtag_list):
    """Return how many distinct values *hashtag_list* contains.

    Comparison is exact (case-sensitive); missing values, if any, count as
    a value of their own.
    """
    return pd.Series(hashtag_list).nunique(dropna=False)

def get_unique_hashtags(hashtag_list):
    """Return the distinct values of *hashtag_list* in first-seen order.

    The result is the array produced by pandas, not a Python list.
    """
    as_series = pd.Series(hashtag_list)
    return pd.unique(as_series)

def get_top_10(hashtag_list):
    """Return up to ten values of *hashtag_list*, most frequent first.

    The result is a plain Python list; it is shorter than ten when the
    input holds fewer than ten distinct values.
    """
    frequencies = pd.Series(hashtag_list).value_counts()
    return frequencies.index[:10].tolist()


def _scrape_tweets(query, max_tweets):
    """Scrape at most *max_tweets* results of *query* into a DataFrame.

    Prints the elapsed time and the number of tweets retrieved.
    """
    start_time = datetime.datetime.now()
    tweets = pd.DataFrame(itertools.islice(
        sntwitter.TwitterSearchScraper(query).get_items(), max_tweets))
    runtime = datetime.datetime.now() - start_time
    print(f'--Runtime: {runtime}'
          f'\n--Tweets amount: {tweets.shape[0]}')
    return tweets


def _collect_hashtags(tweets):
    """Return the filtered hashtag list of *tweets* ([] without that column)."""
    # A scrape that yields no items produces a DataFrame without columns,
    # so indexing tweets['hashtags'] directly would raise KeyError.
    if 'hashtags' not in tweets.columns:
        return []
    return get_hashtags(tweets['hashtags'])


def day_tweet_extract(day, max_tweets=100000):
    """Extract one day of tweets in two passes and summarise their hashtags.

    The first pass searches Portuguese, non-retweet tweets matching
    'cpi covid' for the given day and takes its ten most frequent hashtags.
    The second pass searches Portuguese tweets of the same day containing
    any of those hashtags. Progress and statistics are printed along the way.

    Args:
        day: Date string in '%Y-%m-%d' format; the search covers
            ``since:day until:day+1``.
        max_tweets: Upper bound on the tweets read from each of the two
            queries.

    Returns:
        A tuple ``(hashtag_count, hashtag_top_10, unique_hashtags, tweets)``
        describing the second pass: number of distinct hashtags, the ten
        most frequent ones, the array of distinct hashtags and the tweets
        DataFrame. When the first pass finds no hashtags the second pass is
        skipped and ``(0, [], <empty array>, <empty DataFrame>)`` is
        returned.

    Raises:
        ValueError: If *day* does not match '%Y-%m-%d'.
    """
    until = (datetime.datetime.strptime(day, '%Y-%m-%d')
             + datetime.timedelta(days=1)).strftime('%Y-%m-%d')

    print('-Preliminary query extraction:')
    query = f'cpi covid lang:pt since:{day} until:{until} -filter:nativeretweets -filter:retweets'
    tweets = _scrape_tweets(query, max_tweets)
    hashtag_list = _collect_hashtags(tweets)
    hashtag_count = get_hashtag_count(hashtag_list)
    hashtag_top_10 = get_top_10(hashtag_list)
    print(f'--Unique hashtags amount: {hashtag_count}'
          f'\n--Hashtag Top 10: {hashtag_top_10}')

    if not hashtag_top_10:
        # An empty top 10 would build the query '() lang:pt ...', which no
        # longer restricts the search to any hashtag.
        print('-No hashtags found; skipping hashtag query extraction.\n')
        return 0, [], get_unique_hashtags([]), pd.DataFrame()

    print('-Hashtag query extraction:')
    hashtag_query = ' OR '.join(hashtag_top_10)
    query = f'({hashtag_query}) lang:pt since:{day} until:{until}'
    tweets = _scrape_tweets(query, max_tweets)
    hashtag_list = _collect_hashtags(tweets)
    hashtag_count = get_hashtag_count(hashtag_list)
    unique_hashtags = get_unique_hashtags(hashtag_list)
    hashtag_top_10 = get_top_10(hashtag_list)
    print(f'--Unique hashtags amount: {hashtag_count}'
          f'\n--Hashtag Top 10: {hashtag_top_10}\n')
    return hashtag_count, hashtag_top_10, unique_hashtags, tweets

def week_tweet_extract(week):
    """Extract and store the tweets of every not-yet-extracted day of *week*.

    For each day in ``week.days`` without a CSV file in the week directory,
    runs ``day_tweet_extract``, records the day's statistics in
    ``week.info['days_info']`` and writes the tweets to
    ``<week_dir>/day_<n>_<date>.csv``. Afterwards the week-level statistics
    are stored in ``week.info`` and dumped to
    ``<week_dir>/week_<number>_info.json``.

    Args:
        week: Object exposing ``number``, ``start``, ``end``, ``days`` (an
            iterable of '%Y-%m-%d' date strings) and ``info`` (a dict with
            a ``'days_info'`` mapping keyed by ``'day_<n>'``). ``week.info``
            is updated in place.
    """
    week_dir = f'data/tweets/week_{week.number}'
    # makedirs also creates missing parent directories and, with
    # exist_ok=True, does not fail when the directory is already there.
    os.makedirs(week_dir, exist_ok=True)
    print(f'== EXTRACTING FROM WEEK {week.number}: SINCE {week.start} UNTIL {week.end} =='
          f'\n\nData is being stored in the following directory: {week_dir}\n')
    week_info = week.info
    week_hashtags = []
    week_tweets_amount = 0
    for idx, day in enumerate(week.days, start=1):
        _day = f'day_{idx}'
        # Check for the file exactly where it is written below; checking a
        # different directory would make every run re-extract every day.
        day_file = f'{week_dir}/{_day}_{day}.csv'
        if os.path.isfile(day_file):
            # NOTE: days skipped here do not contribute to the week totals
            # computed after the loop.
            continue
        deponents = week_info['days_info'][_day]['deponents']
        print(f'Extracting tweets from Day {idx}: {day}'
              f'\n-Deponents of the day: {deponents}')
        hashtag_count, hashtag_top_10, unique_hashtags, tweets = day_tweet_extract(day)
        day_info = week_info['days_info'][_day]
        day_info['tweets_amount'] = tweets.shape[0]
        day_info['hashtags_amount'] = hashtag_count
        day_info['top_10_hashtags'] = hashtag_top_10
        week_tweets_amount += tweets.shape[0]
        # Each day contributes its distinct hashtags once, so the weekly
        # top 10 ranks hashtags by the number of days they appeared on.
        week_hashtags.extend(unique_hashtags)
        tweets.to_csv(day_file)
    week_info['tweets_amount'] = week_tweets_amount
    week_info['hashtags_amount'] = get_hashtag_count(week_hashtags)
    week_info['top_10_hashtags'] = get_top_10(week_hashtags)
    print('Generating week info json file...\n')
    with open(week_dir + f'/week_{week.number}_info.json', 'w') as json_file:
        json.dump(week_info, json_file, indent=4)
        

In [3]:
# Deponents per day of the week being extracted, keyed by 'day_<n>'.
deponents = {
    'day_3': ['Mayra Pinheiro'],
    # 'day_4': ['Eduardo Pazuello'],
    'day_5': ['Dimas Covas'],
}

# Week 5, starting on 2021-05-23.
week = Week(week_number=5, week_start='2021-05-23', deponents=deponents)

# Scrape the week and write its CSV and JSON files.
week_tweet_extract(week)

== EXTRACTING FROM WEEK 5: SINCE 2021-05-23 UNTIL 2021-05-29 ==

Data is being stored in the following directory: data/tweets_data/week_5

Extracting tweets from Day 1: 2021-05-23
-Deponents of the day: []
-Preliminary query extraction:
--Runtime: 0:01:16.766288
--Tweets amount: 1458
--Unique hashtags amount: 144
--Hashtag Top 10: ['#ForaBolsonaro', '#CPIdaCovid', '#RenanVagabundo', '#ForaBolsonaroGenocida', '#BolsoInRio', '#BolsonaroGenocida', '#CPIdoCirco', '#CPIdaPandemia', '#CPICovid', '#COVID19']
-Hashtag query extraction:
--Runtime: 0:13:53.021021
--Tweets amount: 18139
--Unique hashtags amount: 2778
--Hashtag Top 10: ['#BolsoInRio', '#ForaBolsonaro', '#BolsonaroGenocida', '#ForaBolsonaroGenocida', '#BolsonaroOrgulhoDoBrasil', '#CPIdaCovid', '#Bolsonaro2022', '#CPIdoCirco', '#COVID19', '#RenanVagabundo']

Extracting tweets from Day 2: 2021-05-24
-Deponents of the day: []
-Preliminary query extraction:
--Runtime: 0:01:17.347152
--Tweets amount: 1679
--Unique hashtags amount: 195
-

KeyboardInterrupt: 