In [1]:
import snscrape.modules.twitter as sntwitter
import pandas as pd
import datetime as dt
from datetime import datetime, timedelta
import pytz
from tqdm.notebook import tqdm
from knockknock import slack_sender
import os
from dateutil.relativedelta import relativedelta

# Slack incoming-webhook URL passed to the `slack_sender` decorator further down.
# It is read from the environment so the secret is not stored in the notebook;
# if KNOCKKNOCK_WEBHOOK is not set, this line raises KeyError.
webhook_url = os.environ['KNOCKKNOCK_WEBHOOK']
# Disabled: optional dataset location taken from the environment (currently unused).
# external_storage_path = os.environ['DATASET_PATH']

In [2]:
def scrap_twitter_single_search(bank, search_term, start_date, end_date):
    """Scrape English-language tweets matching one search term in a date window.

    Parameters
    ----------
    bank
        Label written to the ``Bank`` column of every returned row.
    search_term
        Text placed at the front of the Twitter search query.
    start_date, end_date : str
        Window bounds formatted as ``YYYY-MM-DD``. They are inserted into the
        query's ``since:`` and ``until:`` operators respectively.

    Returns
    -------
    pandas.DataFrame
        One row per collected tweet with the columns ``Datetime``,
        ``Tweet_Id``, ``Tweet``, ``Username``, ``Reply_Count``,
        ``Retweet_Count``, ``Like_Count`` and ``Bank``.
    """
    day_format = "%Y-%m-%d"

    # Parse both bounds and attach UTC so `start_date` can be compared with
    # `tweet.date` inside the loop.
    start_date = pytz.UTC.localize(datetime.strptime(start_date, day_format))
    end_date = pytz.UTC.localize(datetime.strptime(end_date, day_format))

    query = f'{search_term} since:{start_date:%Y-%m-%d} until:{end_date:%Y-%m-%d} lang:en'
    scraper = sntwitter.TwitterSearchScraper(query)

    rows = []
    for tweet in scraper.get_items():
        # Stop consuming results at the first tweet dated before the window.
        if tweet.date < start_date:
            break

        rows.append((
            tweet.date,
            tweet.id,
            tweet.rawContent,
            tweet.user.username,
            tweet.replyCount,
            tweet.retweetCount,
            tweet.likeCount,
        ))

    column_names = [
        'Datetime',
        'Tweet_Id',
        'Tweet',
        'Username',
        'Reply_Count',
        'Retweet_Count',
        'Like_Count',
    ]
    frame = pd.DataFrame(rows, columns=column_names)

    # Tag every row with the bank this search belongs to.
    frame["Bank"] = bank
    return frame

@slack_sender(webhook_url=webhook_url, channel="#general")
def scrap_twitter_multiple_search(bank_dict, start_date, end_date):
    """Scrape every search term of every bank and merge the results.

    The function is wrapped with knockknock's ``slack_sender`` using the
    configured webhook and the ``#general`` channel (expected to post run
    notifications to Slack; see the knockknock documentation for details).

    Parameters
    ----------
    bank_dict : dict
        Maps a bank label to an iterable of search terms for that bank.
    start_date, end_date : str
        Window bounds formatted as ``YYYY-MM-DD``; forwarded unchanged to
        ``scrap_twitter_single_search``.

    Returns
    -------
    pandas.DataFrame
        All scraped tweets with one row per ``(Tweet_Id, Bank)`` pair and a
        fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``bank_dict`` yields no search term at all, so there is nothing to
        concatenate.
    """
    dfs = []

    # Outer bar: banks; inner bar: the search terms of the current bank.
    for bank, search_terms in tqdm(bank_dict.items(), total=len(bank_dict)):
        for search_term in tqdm(search_terms, total=len(search_terms)):
            dfs.append(scrap_twitter_single_search(bank, search_term, start_date, end_date))

    # pd.concat([]) fails with an opaque "No objects to concatenate"; raise the
    # same exception type with a message that points at the real cause.
    if not dfs:
        raise ValueError("bank_dict contains no search terms; nothing to scrape")

    # Deduplicate on the tweet identity per bank rather than on the full row:
    # the same tweet returned by two search terms can carry different
    # reply/retweet/like counts if they changed between the two requests, and a
    # full-row comparison would then keep both copies. The first occurrence wins.
    result = (
        pd.concat(dfs, ignore_index=True)
        .drop_duplicates(subset=["Tweet_Id", "Bank"])
        .reset_index(drop=True)
    )

    return result

In [3]:
# First and last calendar year to scrape (both inclusive).
start_year = '2011'
end_year = '2022'

# Search terms issued for each bank; the dict key becomes the `Bank` column value.
bank_dict = {'fnb': ['fnb', 'FNBSA', 'fnbSouthAfrica'],
             'absa': ['absa', 'absaSA', 'ABSASouthAfrica'],
             'nedbank': ['nedbank', 'NEDBANKSA', 'nedbankSouthAfrica'],
             'capitec': ['capitec', 'CapitecBank', 'capitecSA'],
             'standard_bank': ['standard bank','standardbank', 'StandardbankSA']}

# pytz to localize the date
utc = pytz.UTC

start_date = f'{start_year}-01-01'
end_date = f'{end_year}-12-31'

# Converting start_date and end_date to timezone-aware (UTC) datetime objects
start_date = utc.localize(datetime.strptime(start_date, "%Y-%m-%d"))
end_date = utc.localize(datetime.strptime(end_date, "%Y-%m-%d"))
current_date = start_date

# Column dtypes applied before writing each yearly file.
# Integer columns use an explicit 64-bit dtype: the builtin `int` resolves to
# the platform's C long, which is only 32 bits on Windows with NumPy < 2 and
# cannot represent tweet IDs (e.g. 1577658815415214086).
# NOTE(review): this cast assumes the count columns never contain missing
# values — confirm, or switch them to the nullable 'Int64' dtype.
tweet_dtypes = {
    'Tweet_Id': 'int64',
    'Tweet': 'string',
    'Username': 'string',
    'Reply_Count': 'int64',
    'Retweet_Count': 'int64',
    'Like_Count': 'int64',
    'Bank': 'string',
}

# Walk the range one year at a time and write one parquet file per year.
# The final window ends on 1 January of the year after `end_year`.
# NOTE(review): this relies on Twitter's `until:` operator being exclusive so
# that consecutive windows do not overlap — confirm.
while current_date < end_date:
    next_date = current_date + relativedelta(years=1)

    # Format the window bounds once; they feed the log line, the scraper call
    # and the output file name.
    window_start = current_date.strftime("%Y-%m-%d")
    window_end = next_date.strftime("%Y-%m-%d")

    print(f'Scrapping data from {window_start} to {window_end}')
    df = scrap_twitter_multiple_search(bank_dict, window_start, window_end)
    df = df.astype(tweet_dtypes)
    df.to_parquet(f"tweets_from_{window_start}_to_{window_end}.parquet")
    current_date = next_date

Scrapping data from 2011-01-01 to 2012-01-01


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Scrapping data from 2012-01-01 to 2013-01-01


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Scrapping data from 2013-01-01 to 2014-01-01


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Scrapping data from 2014-01-01 to 2015-01-01


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Scrapping data from 2015-01-01 to 2016-01-01


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Could not translate t.co card URL on tweet 665244425757532160
Could not translate t.co card URL on tweet 664248739675176960


  0%|          | 0/3 [00:00<?, ?it/s]

Could not translate t.co card URL on tweet 671446426577162242


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Scrapping data from 2016-01-01 to 2017-01-01


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Scrapping data from 2017-01-01 to 2018-01-01


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Scrapping data from 2018-01-01 to 2019-01-01


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Unsupported card type on tweet 1034375408744058880: '2586390716:image_direct_message'
Unsupported card type on tweet 1034375408744058880: '2586390716:image_direct_message'
Unsupported card type on tweet 1034375408744058880: '2586390716:image_direct_message'
Unsupported card type on tweet 1034375408744058880: '2586390716:image_direct_message'


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Scrapping data from 2019-01-01 to 2020-01-01


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Unsupported card type on tweet 1170023857433919488: '2586390716:image_direct_message'


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Unsupported card type on tweet 1116228090861170688: '2586390716:image_direct_message'


  0%|          | 0/3 [00:00<?, ?it/s]

Scrapping data from 2020-01-01 to 2021-01-01


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Tweet 1319192647043612672 contains an app icon medium key '4_1615720470338277381' on app 'android_app'/'za.co.nedbank.avoafrica', but the corresponding medium is missing; dropping
Tweet 1336922580733792257 contains an app icon medium key '4_1615723213161648134' on app 'iphone_app'/'1260981758', but the corresponding medium is missing; dropping
Tweet 1336922580733792257 contains an app icon medium key '4_1615723213161648134' on app 'ipad_app'/'1260981758', but the corresponding medium is missing; dropping
Tweet 1331525223573807104 contains an app icon medium key '4_1615720470338277381' on app 'android_app'/'za.co.nedbank.avoafrica', but the corresponding medium is missing; dropping
Tweet 1319192647043612672 contains an app icon medium key '4_1615720470338277381' on app 'android_app'/'za.co.nedbank.avoafrica', but the corresponding medium is missing; dropping


  0%|          | 0/3 [00:00<?, ?it/s]

Tweet 1218265622058094594 contains an app icon medium key '4_1615813497215979538' on app 'android_app'/'com.productmadness.cashmancasino', but the corresponding medium is missing; dropping
Tweet 1234149243264393218 contains an app icon medium key '4_1615814181776723968' on app 'iphone_app'/'417962622', but the corresponding medium is missing; dropping
Tweet 1234149243264393218 contains an app icon medium key '4_1615814181776723968' on app 'ipad_app'/'417962622', but the corresponding medium is missing; dropping


  0%|          | 0/3 [00:00<?, ?it/s]

Scrapping data from 2021-01-01 to 2022-01-01


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Tweet 1384438775116451841 contains an app icon medium key '4_1615723213161648134' on app 'iphone_app'/'1260981758', but the corresponding medium is missing; dropping
Tweet 1384438775116451841 contains an app icon medium key '4_1615723213161648134' on app 'ipad_app'/'1260981758', but the corresponding medium is missing; dropping
Tweet 1384438775116451841 contains an app icon medium key '4_1615723213161648134' on app 'iphone_app'/'1260981758', but the corresponding medium is missing; dropping
Tweet 1384438775116451841 contains an app icon medium key '4_1615723213161648134' on app 'ipad_app'/'1260981758', but the corresponding medium is missing; dropping
Tweet 1384438775116451841 contains an app icon medium key '4_1615723213161648134' on app 'iphone_app'/'1260981758', but the corresponding medium is missing; dropping
Tweet 1384438775116451841 contains an app icon medium key '4_1615723213161648134' on app 'ipad_app'/'1260981758', but the corresponding medium is missing; dropping


  0%|          | 0/3 [00:00<?, ?it/s]

Tweet 1444965651496456196 contains an app icon medium key '4_1615882568007618560' on app 'android_app'/'com.seamonster.capitecfinancegame', but the corresponding medium is missing; dropping
Tweet 1410581063626264581 contains an app icon medium key '4_1615882568007618560' on app 'android_app'/'com.seamonster.capitecfinancegame', but the corresponding medium is missing; dropping
Tweet 1367883497705271296 contains an app icon medium key '4_1615882568007618560' on app 'android_app'/'com.seamonster.capitecfinancegame', but the corresponding medium is missing; dropping


  0%|          | 0/3 [00:00<?, ?it/s]

Scrapping data from 2022-01-01 to 2023-01-01


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Tweet 1577658815415214086 contains an app icon medium key '4_1615723213161648134' on app 'iphone_app'/'1260981758', but the corresponding medium is missing; dropping
Tweet 1577658815415214086 contains an app icon medium key '4_1615723213161648134' on app 'ipad_app'/'1260981758', but the corresponding medium is missing; dropping
Tweet 1497194934239174665 contains an app icon medium key '4_1615720470338277381' on app 'android_app'/'za.co.nedbank.avoafrica', but the corresponding medium is missing; dropping


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]