# __Tweets Extraction: Brazilian Covid-19 CPI (Parliamentary Commission of Inquiry)__

In [1]:
from modules.variables import hashtag_media, week_list
import snscrape.modules.twitter as sntwitter
import os, itertools, datetime, json
from modules.week import Week
import pandas as pd

In [2]:
def get_hashtags(hashtag_series):
    """Flatten per-tweet hashtag groups into one list of '#'-prefixed tags.

    Each element of ``hashtag_series`` is either a falsy value (skipped) or
    an iterable of hashtag strings. Tags are lower-cased, and any tag whose
    lower-cased form is in ``hashtag_media`` is dropped. Duplicates are kept
    so callers can count frequencies.
    """
    collected = []
    for group in hashtag_series:
        # Tweets without hashtags carry a falsy value (e.g. None or []).
        if not group:
            continue
        lowered = (tag.lower() for tag in group)
        collected.extend('#' + tag for tag in lowered if tag not in hashtag_media)
    return collected

def get_unique_hashtags(hashtag_list):
    """Return the distinct hashtags of ``hashtag_list`` in first-seen order.

    The result is whatever ``pandas.Series.unique`` returns for the list
    (an array, not a Python list).
    """
    return pd.Series(hashtag_list).unique()

def get_hashtag_count(hashtag_list):
    """Return how many distinct hashtags occur in ``hashtag_list``."""
    distinct = get_unique_hashtags(hashtag_list)
    return distinct.size

def get_top10(hashtag_list):
    """Return up to ten of the most frequent hashtags, most frequent first."""
    frequencies = pd.Series(hashtag_list).value_counts()
    return frequencies.head(10).index.tolist()

def _scrape_tweets(query):
    """Run one snscrape Twitter search and return every result as a DataFrame.

    Prints the elapsed time and the number of tweets collected.
    """
    start_time = datetime.datetime.now()
    tweets = pd.DataFrame(list(sntwitter.TwitterSearchScraper(query).get_items()))
    runtime = datetime.datetime.now() - start_time
    print(f'-- Runtime: {runtime}'
          f'\n-- Tweets amount: {tweets.shape[0]}')
    return tweets


def _summarize_hashtags(tweets):
    """Build the per-extraction result dict for a DataFrame of tweets."""
    # A scrape that finds nothing yields a DataFrame with no columns at all,
    # so guard the lookup instead of raising KeyError on 'hashtags'.
    hashtag_groups = tweets['hashtags'] if 'hashtags' in tweets.columns else []
    hashtag_list = get_hashtags(hashtag_groups)
    return {
        'hashtag_count': get_hashtag_count(hashtag_list),
        'hashtag_top_10': get_top10(hashtag_list),
        'unique_hashtags': get_unique_hashtags(hashtag_list),
        'tweets': tweets
    }


def day_tweet_extract(day):
    """Extract one day's tweets in two passes and summarise their hashtags.

    Pass 1 ("query_ext") searches Portuguese tweets with the fixed keyword
    query. Pass 2 ("hashtag_ext") searches Portuguese tweets containing any
    of the top-10 hashtags found in pass 1. Both searches use the window
    ``since:{day} until:{day + 1 day}``. Progress is printed along the way.

    Args:
        day: Date string in ``YYYY-MM-DD`` format.

    Returns:
        A dict with keys ``'query_ext'`` and ``'hashtag_ext'``. Each value is
        a dict with ``'hashtag_count'``, ``'hashtag_top_10'``,
        ``'unique_hashtags'`` and ``'tweets'`` (the scraped DataFrame; an
        empty DataFrame when nothing was scraped).

    Raises:
        ValueError: If ``day`` does not match ``YYYY-MM-DD``.
    """
    next_day = datetime.datetime.strptime(day, '%Y-%m-%d') + datetime.timedelta(days=1)
    until = next_day.strftime('%Y-%m-%d')

    # NOTE(review): the label below suggests the intent is
    # cpi AND (covid OR pandemia); how the search backend groups
    # "AND"/"OR" here is not verifiable from this file -- confirm.
    query = f'cpi AND covid OR pandemia lang:pt since:{day} until:{until}'
    print('- Cpi covid|pandemia query extraction:')
    query_ext = _summarize_hashtags(_scrape_tweets(query))
    hashtag_top10_q = query_ext['hashtag_top_10']
    print(f"-- Unique hashtags amount: {query_ext['hashtag_count']}"
          f'\n-- Top 10 Hashtags: {hashtag_top10_q}'
          '\n- Hashtag query extraction:')

    if hashtag_top10_q:
        hashtag_query = ' OR '.join(hashtag_top10_q)
        query = f'({hashtag_query}) lang:pt since:{day} until:{until}'
        tweets_h = _scrape_tweets(query)
    else:
        # Without hashtags the query would degenerate to "() lang:pt ...",
        # so skip the scrape and report an empty result instead.
        print('-- No hashtags found in the keyword extraction; skipping hashtag query')
        tweets_h = pd.DataFrame()
    hashtag_ext = _summarize_hashtags(tweets_h)
    print(f"-- Unique hashtags amount: {hashtag_ext['hashtag_count']}"
          f"\n-- Top 10 Hashtags: {hashtag_ext['hashtag_top_10']}\n")

    return {
        'query_ext': query_ext,
        'hashtag_ext': hashtag_ext
    }

def week_tweet_extract(week):
    """Extract and store the tweets of every day in ``week``.

    For each day this runs ``day_tweet_extract``, writes both resulting
    tweet DataFrames (cast to ``str``) as parquet files under
    ``data/tweets/week_{week.number}``, and records per-day statistics in
    the object returned by ``week.info``. Week-level totals are then added
    and the whole info object is dumped to ``week_{week.number}_info.json``
    in the same directory.

    Note: the week-level hashtag lists are built from each day's *unique*
    hashtags, so the week-level top 10 ranks hashtags by the number of days
    on which they appeared, not by total tweet frequency.

    Args:
        week: Object exposing ``number``, ``start``, ``end``, ``days``
            (iterable of ``YYYY-MM-DD`` strings) and ``info`` (a mapping
            with a ``'days_info'`` entry keyed ``'day_1'``, ``'day_2'``, ...,
            each holding at least ``'deponents'``).
    """
    week_dir = f'data/tweets/week_{week.number}'
    # makedirs also creates missing parent directories and tolerates the
    # directory already existing, unlike a bare exists()/mkdir() pair.
    os.makedirs(week_dir, exist_ok=True)
    print(f'====== EXTRACTING FROM WEEK {week.number}: SINCE {week.start} UNTIL {week.end} ======'
          f'\n\nData is being stored in the following directory: {week_dir}\n')

    week_info = week.info
    kinds = ('query_ext', 'hashtag_ext')
    # Parquet file name suffix per extraction kind (note the plural
    # "hashtags" used in the existing file naming).
    file_suffix = {'query_ext': 'query_ext', 'hashtag_ext': 'hashtags_ext'}
    week_hashtags = {kind: [] for kind in kinds}
    week_tweets_amount = {kind: 0 for kind in kinds}

    for idx, day in enumerate(week.days, start=1):
        _day = f'day_{idx}'
        day_info = week_info['days_info'][_day]
        deponents = day_info['deponents']
        print(f'Extracting tweets from Day: {idx}: {day}'
              f'\n- Deponents of the day: {deponents}')
        day_ext = day_tweet_extract(day)

        day_info['tweets_amount'] = {
            kind: day_ext[kind]['tweets'].shape[0] for kind in kinds
        }
        day_info['hashtags_amount'] = {
            kind: day_ext[kind]['hashtag_count'] for kind in kinds
        }
        day_info['top_10_hashtags'] = {
            kind: day_ext[kind]['hashtag_top_10'] for kind in kinds
        }

        for kind in kinds:
            tweets = day_ext[kind]['tweets']
            week_tweets_amount[kind] += tweets.shape[0]
            week_hashtags[kind].extend(day_ext[kind]['unique_hashtags'])
            tweets.astype(str).to_parquet(
                f'{week_dir}/{_day}_{day}_{file_suffix[kind]}.parquet'
            )

    week_info['tweets_amount'] = week_tweets_amount
    week_info['hashtags_amount'] = {
        kind: get_hashtag_count(week_hashtags[kind]) for kind in kinds
    }
    week_info['top_10_hashtags'] = {
        kind: get_top10(week_hashtags[kind]) for kind in kinds
    }

    print('Generating week info json file...\n')
    with open(week_dir + f'/week_{week.number}_info.json', 'w', encoding='utf-8') as json_file:
        json.dump(week_info, json_file, indent=4)
        

In [3]:
# deponents = {
#     #'day_3': [],
#     #'day_4': [],
#     #'day_5': []
# }

# week = Week(week_number=1,
#            week_start='2021-04-25',
#            deponents=deponents)

# week_tweet_extract(week)

In [5]:
# Run the full extraction for every configured week, in list order.
# Each entry of week_list supplies the keyword arguments for Week.
for week in week_list:
    _week = Week(**{
        field: week[field]
        for field in ('week_number', 'week_start', 'deponents')
    })
    week_tweet_extract(_week)


Data is being stored in the following directory: data/tweets/week_pr_04

Extracting tweets from Day: 1: 2021-10-10
- Deponents of the day: []
- Cpi covid|pandemia query extraction:
-- Runtime: 0:00:23.391077
-- Tweets amount: 485
-- Unique hashtags amount: 44
-- Top 10 Hashtags: ['#cpidacovid', '#cpi', '#covid', '#forabolsonaro', '#ls', '#polícia', '#cpidocirco', '#puggina', '#senado', '#letinhodeminas']
- Hashtag query extraction:
-- Runtime: 0:01:24.225824
-- Tweets amount: 2079
-- Unique hashtags amount: 789
-- Top 10 Hashtags: ['#forabolsonaro', '#cpidocirco', '#forabolsonarogenocida', '#cpidacovid', '#covid', '#forabolsonaroesuaquadrilha', '#impeachmentbolsonarourgente', '#lulapresidente2022', '#bolsonarogenocida', '#fechadoscombolsonaro']

Extracting tweets from Day: 2: 2021-10-11
- Deponents of the day: []
- Cpi covid|pandemia query extraction:
-- Runtime: 0:00:31.574317
-- Tweets amount: 786
-- Unique hashtags amount: 90
-- Top 10 Hashtags: ['#cpidacovid', '#cpi', '#tácaroculp

ScraperException: Unable to find guest token