In [None]:
import snscrape.modules.twitter as sntwitter
import pandas as pd
import datetime as dt
from datetime import datetime, timedelta
import pytz
from tqdm.notebook import tqdm
from knockknock import slack_sender
import os
from dateutil.relativedelta import relativedelta

# Webhook URL handed to the slack_sender decorator further down. It is read
# from the environment rather than hardcoded; a missing KNOCKKNOCK_WEBHOOK
# variable raises KeyError right here, before any scraping starts.
webhook_url = os.environ['KNOCKKNOCK_WEBHOOK']

In [None]:
def scrap_twitter_single_search(bank, search_term, start_date, end_date):
    """Scrape English tweets matching one search term and return them as a DataFrame.

    Args:
        bank: Label written to the ``Bank`` column of every returned row.
        search_term: Text placed at the front of the search query.
        start_date: Window start as a ``"YYYY-MM-DD"`` string (``since:`` operator).
        end_date: Window end as a ``"YYYY-MM-DD"`` string (``until:`` operator).

    Returns:
        A DataFrame with the columns ``Datetime``, ``Tweet_Id``, ``Tweet``,
        ``Username``, ``Reply_Count``, ``Retweet_Count``, ``Like_Count`` and
        ``Bank``; it has no rows when nothing was collected.

    Raises:
        ValueError: If either date string does not match ``%Y-%m-%d``.
    """
    column_names = ['Datetime', 'Tweet_Id', 'Tweet', 'Username', 'Reply_Count', 'Retweet_Count', 'Like_Count']

    # Parse both bounds into timezone-aware (UTC) datetimes so they can be
    # compared against tweet.date below.
    window_start = pytz.UTC.localize(datetime.strptime(start_date, "%Y-%m-%d"))
    window_end = pytz.UTC.localize(datetime.strptime(end_date, "%Y-%m-%d"))

    query = f'{search_term} since:{window_start:%Y-%m-%d} until:{window_end:%Y-%m-%d} lang:en'

    def _collect_rows():
        # Yield one row per tweet, stopping for good at the first tweet dated
        # before the window start.
        # NOTE(review): stopping (rather than skipping) presumes the scraper
        # yields tweets newest-first -- confirm.
        for tweet in sntwitter.TwitterSearchScraper(query).get_items():
            if tweet.date < window_start:
                return
            yield [
                tweet.date,
                tweet.id,
                tweet.rawContent,
                tweet.user.username,
                tweet.replyCount,
                tweet.retweetCount,
                tweet.likeCount,
            ]

    # Materialise the rows once, then tag every row with the bank label.
    return pd.DataFrame(list(_collect_rows()), columns=column_names).assign(Bank=bank)

@slack_sender(webhook_url=webhook_url, channel="#general")
def scrap_twitter_multiple_search(bank_dict, start_date, end_date):
    """Run every search term of every bank and merge the results into one frame.

    The function is wrapped by knockknock's ``slack_sender``, configured with
    the module-level webhook URL and the ``#general`` channel.

    Args:
        bank_dict: Mapping of bank label -> iterable of search terms.
        start_date: Window start, ``"YYYY-MM-DD"``; forwarded unchanged to
            ``scrap_twitter_single_search``.
        end_date: Window end, ``"YYYY-MM-DD"``; forwarded unchanged as well.

    Returns:
        The concatenation of all per-term DataFrames with fully identical rows
        removed. The index of each per-term frame is carried over as-is, so
        index labels may repeat in the result.
    """
    frames = []

    # Outer progress bar tracks banks, inner one tracks that bank's terms.
    for bank, search_terms in tqdm(bank_dict.items(), total=len(bank_dict)):
        frames.extend(
            scrap_twitter_single_search(bank, term, start_date, end_date)
            for term in tqdm(search_terms, total=len(search_terms))
        )

    # Stack everything, then drop rows that are exact duplicates across all columns.
    combined = pd.concat(frames)
    return combined.drop_duplicates()

In [None]:
# Inclusive range of calendar years to scrape.
start_year = '2019'
end_year = '2020'

# Alternative, broader search-term set (currently disabled):
# bank_dict = {'fnb': ['fnb', 'FNBSA', 'fnbSouthAfrica'],
#              'absa': ['absa', 'absaSA', 'ABSASouthAfrica'],
#              'nedbank': ['nedbank', 'NEDBANKSA', 'nedbankSouthAfrica'],
#              'capitec': ['capitec', 'CapitecBank', 'capitecSA'],
#              'standard_bank': ['standard bank','standardbank', 'StandardbankSA']}


# Bank label -> search terms that are scraped for that bank.
bank_dict = {'capitec': ['capitecbankSA'],
             'standard_bank': ['StandardbankZA', 'standardbankSouthAfrica']}


# pytz to localize the date
utc=pytz.UTC

start_date = f'{start_year}-01-01'
end_date = f'{end_year}-12-31'


# Converting start_date and end_date to timezone-aware datetime objects
start_date = utc.localize(datetime.strptime(start_date, "%Y-%m-%d"))
end_date = utc.localize(datetime.strptime(end_date, "%Y-%m-%d"))
current_date = start_date

# Walk the range in one-year windows, writing one parquet file per window.
# The last window is not clamped to end_date: with the values above it runs
# from 2020-01-01 to 2021-01-01.
while current_date < end_date:
    next_date = current_date + relativedelta(years=1)

    # Format the window bounds once; they are reused for the log line, the
    # scrape call and the output file name.
    window_start = current_date.strftime("%Y-%m-%d")
    window_end = next_date.strftime("%Y-%m-%d")

    print(f'Scrapping data from {window_start} to {window_end}')
    df = scrap_twitter_multiple_search(bank_dict, window_start, window_end)

    # Use an explicit 64-bit integer dtype. The builtin `int` maps to the
    # platform default integer, which is 32-bit on Windows with NumPy < 2 and
    # cannot hold tweet ids; 'int64' gives the same result on every platform.
    df = df.astype({'Tweet_Id': 'int64', 'Tweet':'string', 'Username': 'string', 'Reply_Count':'int64', 'Retweet_Count':'int64', 'Like_Count':'int64', 'Bank':'string'})
    df.to_parquet(f"updated_search_terms_tweets_from_{window_start}_to_{window_end}.parquet")
    current_date = next_date