Functions

In [1]:
from credentials import *
import pandas as pd
import tweepy, json

In [2]:
def api_setup():
    """
    Build an authenticated tweepy client for the Twitter API.

    The four credential constants (CONSUMER_KEY, CONSUMER_SECRET,
    ACCESS_TOKEN, ACCESS_SECRET) are read from the module namespace;
    presumably they are supplied by `from credentials import *`.

    Returns a `tweepy.API` object created with both `wait_on_rate_limit`
    and `wait_on_rate_limit_notify` switched on.
    """
    oauth_handler = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
    oauth_handler.set_access_token(ACCESS_TOKEN, ACCESS_SECRET)
    client_options = {
        'wait_on_rate_limit': True,
        'wait_on_rate_limit_notify': True,
    }
    return tweepy.API(oauth_handler, **client_options)


def return_tweets(query, since_d, until_d, lang):
    """
    Run a tweet search and return every result as a raw JSON dictionary.

    The search goes through the module-level `api` client, so `api` must be
    defined before this function is called. `query`, `since_d`, `until_d`
    and `lang` are forwarded to `api.search` as `q`, `since`, `until` and
    `lang`; `tweet_mode='extended'` is always requested.

    Returns a list holding the `_json` attribute of each item yielded by the
    cursor, in the order the cursor produced them.
    """
    search_params = dict(
        q=query,
        since=since_d,
        until=until_d,
        lang=lang,
        tweet_mode='extended',
    )
    cursor = tweepy.Cursor(api.search, **search_params)
    return [status._json for status in cursor.items()]


def generate_json(tweet_list):
    """
    Serialize `tweet_list` to a JSON string.

    Uses `json.dumps` with its default options and returns the resulting str.
    """
    serialized = json.dumps(tweet_list)
    return serialized


def generate_json_file(tweet_list, file_path, file_name):
    """
    Serialize `tweet_list` to JSON and write it to disk.

    The destination is `file_path + file_name + '.json'`:
    - `file_path` is prepended verbatim, so it must already end with a path
      separator (e.g. 'retrieved_tweets/') when it names a directory;
    - `file_name` must NOT include the extension, '.json' is always appended.

    The parent directory of the destination is created when it does not
    exist yet. An existing file at the destination is overwritten.

    Raises whatever `json.dumps` raises for non-serializable input (before
    any file is touched) and OSError if the file cannot be written.
    Returns None.
    """
    import os  # local import: the notebook's import cell does not provide `os`

    # Serialize first so a serialization failure never leaves an empty file behind.
    json_str = json.dumps(tweet_list)

    full_path = file_path + file_name + '.json'
    parent_dir = os.path.dirname(full_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # The context manager closes the handle even if write() raises.
    with open(full_path, 'w') as json_file:
        json_file.write(json_str)
    
    
def return_hashtag_list(tweet_list):
    """
    Collect the lower-cased text of every hashtag found in `tweet_list`.

    Each tweet is expected to be a dict exposing `tweet['entities']['hashtags']`,
    a sequence of dicts with a 'text' key. Tweets whose hashtag entry is
    empty (or otherwise falsy) contribute nothing. Duplicates are kept and
    the order follows the tweets, then the hashtags inside each tweet.
    """
    return [
        tag['text'].lower()
        for tweet in tweet_list
        for tag in (tweet['entities']['hashtags'] or ())
    ]


def return_hashtag_count(hashtag_list):
    """
    Count how many times each hashtag occurs in `hashtag_list`.

    Returns the pandas Series produced by `Series.value_counts()`: the index
    holds the distinct hashtags and the values hold their counts.
    """
    hashtags = pd.Series(hashtag_list)
    return hashtags.value_counts()


### __Automatically retrieving tweets from 2021-06-27 to 2021-07-03__

In [3]:
# Twitter client; return_tweets reads it through the module-level name `api`.
api = api_setup()
# Prefix prepended verbatim to every output file name (keep the trailing slash).
PATH = 'retrieved_tweets/'

# One entry per day of the collection week; each value is the
# [since, until] date pair handed to the search for that day.
week = dict(
    day_1=['2021-06-27', '2021-06-28'],
    day_2=['2021-06-28', '2021-06-29'],
    day_3=['2021-06-29', '2021-06-30'],
    day_4=['2021-06-30', '2021-07-01'],
    day_5=['2021-07-01', '2021-07-02'],
    day_6=['2021-07-02', '2021-07-03'],
    day_7=['2021-07-03', '2021-07-04'],
)


In [4]:
# For every [since, until] window in `week`:
#   1. fetch the tweets matching the base query and save them,
#   2. derive the ten most frequent hashtags of those tweets,
#   3. fetch the tweets matching any of those hashtags and save them too.
for since_date, until_date in week.values():
    # generate_json_file appends '.json' itself, so these names carry no
    # extension (otherwise the files end up as '*.json.json').
    file_name_1 = since_date + '_query'
    file_name_2 = since_date + '_hasht'

    print('Fetching tweets from day ', since_date)
    tweet_list = return_tweets('cpi AND covid', since_date, until_date, 'pt')
    print('Number of retrieved by cpi and covid query tweets: ', len(tweet_list))
    generate_json_file(tweet_list, PATH, file_name_1)
    print('JSON file created: ' + PATH + file_name_1 + '.json')

    unique_hashtags = return_hashtag_count(return_hashtag_list(tweet_list))
    print('Number of unique hashtags: ', unique_hashtags.shape[0])
    # A pandas Index is immutable, so build a new list instead of assigning
    # into `unique_hashtags.head(10).index` item by item.
    hashtag_top10 = ['#' + hashtag for hashtag in unique_hashtags.head(10).index]
    print('The Top10 hashtags are: ', hashtag_top10)

    hashtag_query = ' OR '.join(hashtag_top10)
    if hashtag_query:
        tweet_list = return_tweets(hashtag_query, since_date, until_date, 'pt')
    else:
        # No hashtags that day: skip the search rather than send an empty
        # query, but still write a (empty) file so every day has both outputs.
        tweet_list = []
    print('Number of retrieved by top 10 hashtags query tweets: ', len(tweet_list))
    generate_json_file(tweet_list, PATH, file_name_2)
    print('JSON file created: ' + PATH + file_name_2 + '.json')

    # Inside the loop so it is reported once per day, not once at the very end.
    print('Finished day ', since_date)


Fetching tweets from day  2021-06-27
Fetching tweets from day  2021-06-28
Fetching tweets from day  2021-06-29
Fetching tweets from day  2021-06-30
Fetching tweets from day  2021-07-01
Fetching tweets from day  2021-07-02
Fetching tweets from day  2021-07-03
