In [16]:
import snscrape.modules.twitter as sntwitter
import pandas as pd
import datetime as dt
from datetime import datetime, timedelta
import pytz
from tqdm.notebook import tqdm
from knockknock import slack_sender
import os

webhook_url = os.environ['KNOCKKNOCK_WEBHOOK']
external_storage_path = os.environ['DATASET_PATH']

In [2]:
def scrap_twitter_single_search(bank, search_term, start_date, end_date):
    # pytz to localize the date
    utc=pytz.UTC
    
    # Converting start_date and end_date to datetime objects
    start_date = utc.localize(datetime.strptime(start_date, "%Y-%m-%d"))
    end_date = utc.localize(datetime.strptime(end_date, "%Y-%m-%d"))

    # Creating list to append tweet data to
    tweets_list = []

    # Using TwitterSearchScraper to scrape data and append tweets to list
    for i,tweet in enumerate(sntwitter.TwitterSearchScraper(f'{search_term} since:{start_date:%Y-%m-%d} until:{end_date:%Y-%m-%d} lang:en').get_items()):
        
        # Checking if tweet date is before start_date 
        if tweet.date < start_date:
            break

        tweets_list.append([tweet.date, tweet.id, tweet.rawContent, tweet.user.username, tweet.replyCount, tweet.retweetCount, tweet.likeCount])

    # Creating a dataframe from the tweets list above
    tweets_df = pd.DataFrame(tweets_list, columns=['Datetime', 'Tweet_Id', 'Tweet', 'Username', 'Reply_Count', 'Retweet_Count', 'Like_Count'])
    
    # Adding Bank column to the dataframe
    tweets_df["Bank"] = bank
    
    return tweets_df

@slack_sender(webhook_url=webhook_url, channel="#general")
def scrap_twitter_multiple_search(bank_dict, start_date, end_date):
    
    dfs = []
    
    # Loop through banks
    for bank in tqdm(bank_dict.keys(), total=len(bank_dict)):
 
        # Loop through each search term per bank
        for search_term in tqdm(bank_dict[bank], total=len(bank_dict[bank])):
            dfs.append(scrap_twitter_single_search(bank, search_term, start_date, end_date))

    # Concatenate multiple dataframes and drop duplicate tweets
    result = pd.concat(dfs).drop_duplicates()
    
    return result

In [4]:

df = scrap_twitter_multiple_search(bank_dict, start_date, end_date)

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
# Inspect tweets and order by the most liked tweets
df.head(5).sort_values(by='Like_Count', ascending=False)

Unnamed: 0,Datetime,Tweet_Id,Tweet,Username,Reply_Count,Retweet_Count,Like_Count,Bank
0,2010-12-30 22:54:54+00:00,20613831155785728,RT @RosesR_Red: RT @DAY_BOOGIE: U.N.I.T.Y ( Da...,DAY_STAY_FNB,0,0,0,fnb
1,2010-12-30 22:51:13+00:00,20612902956306432,U.N.I.T.Y,DAY_STAY_FNB,0,0,0,fnb
2,2010-12-30 22:13:44+00:00,20603469635256320,@20Ls_January6th so save me some,DAY_STAY_FNB,0,0,0,fnb
3,2010-12-30 21:30:31+00:00,20592592139853825,RT @20Ls_January6th: Bout Too Makee ShrimpFRiE...,DAY_STAY_FNB,0,0,0,fnb
4,2010-12-30 21:25:10+00:00,20591248762998784,@fnb_Saturn let's go the week of myy birthday ...,_dannnr,0,0,0,fnb


In [6]:
# Check current dtypes of the columns
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 75037 entries, 0 to 1
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype              
---  ------         --------------  -----              
 0   Datetime       75037 non-null  datetime64[ns, UTC]
 1   Tweet_Id       75037 non-null  object             
 2   Tweet          75037 non-null  object             
 3   Username       75037 non-null  object             
 4   Reply_Count    75037 non-null  object             
 5   Retweet_Count  75037 non-null  object             
 6   Like_Count     75037 non-null  object             
 7   Bank           75037 non-null  object             
dtypes: datetime64[ns, UTC](1), object(7)
memory usage: 5.2+ MB


In [7]:
# Set dtypes of columns of dataframe
df = df.astype({'Tweet_Id': int, 'Tweet':'string', 'Username': 'string', 'Reply_Count':int, 'Retweet_Count':int, 'Like_Count':int, 'Bank':'string'})
df.dtypes

Datetime         datetime64[ns, UTC]
Tweet_Id                       int64
Tweet                         string
Username                      string
Reply_Count                    int64
Retweet_Count                  int64
Like_Count                     int64
Bank                          string
dtype: object

In [8]:
# Check updated column dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 75037 entries, 0 to 1
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype              
---  ------         --------------  -----              
 0   Datetime       75037 non-null  datetime64[ns, UTC]
 1   Tweet_Id       75037 non-null  int64              
 2   Tweet          75037 non-null  string             
 3   Username       75037 non-null  string             
 4   Reply_Count    75037 non-null  int64              
 5   Retweet_Count  75037 non-null  int64              
 6   Like_Count     75037 non-null  int64              
 7   Bank           75037 non-null  string             
dtypes: datetime64[ns, UTC](1), int64(4), string(3)
memory usage: 5.2 MB


In [11]:
# Save tweets to parquet form. Note, parquet is a faster and more efficient form of stores larger files compared to csv.
df.to_parquet(f"tweets_from_{start_date}_to_{end_date}.parquet")

In [28]:
# start_year = '2011'
# end_year = '2022'

# bank_dict = {'fnb': ['fnb', 'FNBSA', 'fnbSouthAfrica'],
#              'absa': ['absa', 'absaSA', 'ABSASouthAfrica'],
#              'nedbank': ['nedbank', 'NEDBANKSA', 'nedbankSouthAfrica'],
#              'capitec': ['capitec', 'CapitecBank', 'capitecSA'],
#              'standard_bank': ['standard bank','standardbank', 'StandardbankSA']}



# # pytz to localize the date
# utc=pytz.UTC

# start_date = f'{start_year}-01-01'
# end_date = f'{end_year}-12-31'


# # Converting start_date and end_date to datetime objects
# start_date = utc.localize(datetime.strptime(start_date, "%Y-%m-%d"))
# end_date = utc.localize(datetime.strptime(end_date, "%Y-%m-%d"))

# select_start_date = start_date
# select_end_date = utc.localize(datetime(start_date.year, 12, 31))
# while select_start_date.year < end_date.year:
#     print(select_start_date, select_end_date)
#     select_start_date += timedelta(days=365.24)
#     select_end_date += timedelta(days=365.24)
    
    


2011-01-01 00:00:00+00:00 2022-12-31 00:00:00+00:00
2011-01-01 00:00:00+00:00 2011-12-31 00:00:00+00:00
2011-01-01 00:00:00+00:00 2011-12-31 00:00:00+00:00
2012-01-01 05:45:36+00:00 2012-12-30 05:45:36+00:00
2012-12-31 11:31:12+00:00 2013-12-30 11:31:12+00:00
2013-12-31 17:16:48+00:00 2014-12-30 17:16:48+00:00
2014-12-31 23:02:24+00:00 2015-12-30 23:02:24+00:00
2016-01-01 04:48:00+00:00 2016-12-30 04:48:00+00:00
2016-12-31 10:33:36+00:00 2017-12-30 10:33:36+00:00
2017-12-31 16:19:12+00:00 2018-12-30 16:19:12+00:00
2018-12-31 22:04:48+00:00 2019-12-30 22:04:48+00:00
2020-01-01 03:50:24+00:00 2020-12-30 03:50:24+00:00
2020-12-31 09:36:00+00:00 2021-12-30 09:36:00+00:00
2021-12-31 15:21:36+00:00 2022-12-30 15:21:36+00:00
