# Twitter - Historical Data Extraction

    
1. Get Python >= 3.8.0
2. Install snscrape, using either the development version from GitHub or the release from PyPI:

```$:> pip install git+https://github.com/JustAnotherArchivist/snscrape.git```

```$:> pip install snscrape```

3. Set: `start_date` = "YYYY-MM-DD", `end_date` = "YYYY-MM-DD"
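4. Run `ExtractTweets(...).getTweets(start_date, end_date, keywords)`.
5. The output DataFrame is saved in CSV format (`save_<latest date>_local.csv` in the working directory).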

## Imports

In [1]:
import os
import time
from ast import literal_eval
from datetime import datetime
import pandas as pd
import snscrape.modules.twitter as sntwitter # the magic
import warnings
warnings.filterwarnings('ignore')

## Directory Setup

In [2]:
# Absolute path of the current working directory, used as the notebook's root.
root_dir = os.path.abspath(os.curdir)

## Extractor Module

In [5]:
class ExtractTweets:
    """
    Collects historical tweets, one day at a time, through snscrape's
    ``TwitterSearchScraper`` and accumulates them in ``self.tweets_df``.

    Filtering: a scanned tweet is kept when ANY of the configured thresholds
    is met (likes, retweets, author followers, or the verified-author match).
    """

    # Column order of ``tweets_df``; rows built by ``_tweet_to_row`` must match it.
    COLUMNS = ['date', 'tweet', 'lang', 'retweetCount', 'likeCount', 'replyCount',
               'username', 'user_followersCount', 'user_friendsCount', 'verifiedStatus',
               'tweet_url', 'hastags', 'chr_count', 'topic']

    def __init__(self,
                 minTweetCountPerDay=10,
                 minRetweetCount=0,
                 minLikeCount=0,
                 minFollowersCount=0,
                 VerifiedStatus=None,
                 saveBufferDuration=3600):
        """
        Accepts basic input params each of integer datatype, except for VerifiedStatus which accepts boolean or None.

        :params:
        minTweetCountPerDay - number of search results scanned per day and topic
            (scanning stops after this many results, whether or not they were kept)
        minRetweetCount - retweet threshold of the filter
        minLikeCount - like threshold of the filter
        minFollowersCount - author follower threshold of the filter
        VerifiedStatus - when True, tweets of verified authors also pass the filter
        saveBufferDuration - seconds between two intermediate snapshots
        """
        self.start_timer = datetime.now()
        self.min_tweet_count_perDay = minTweetCountPerDay
        self.minRetweetCount = minRetweetCount
        self.minLikeCount = minLikeCount
        self.minFollowersCount = minFollowersCount
        self.VerifiedStatus = VerifiedStatus
        self.tweets_df = pd.DataFrame(columns=self.COLUMNS)
        self.save_buffer_duration = saveBufferDuration

    def save_copy(self):
        """
        Saves a temp copy for restoration and prevent API time limit exceed error.

        The snapshot is written to ``./save_<latest date>_local.csv``. Older
        ``*local.csv`` files in the working directory are removed only after the
        new file has been written, so a failed write never leaves the run
        without a snapshot. When nothing has been collected yet, no file is
        written or removed.

        :return:
        pandas dataframe containing twitter record-data.
        """
        data = self.tweets_df.reset_index(drop=True)
        if data.empty:
            # Nothing to persist; keep any earlier snapshot untouched.
            return data

        data['date'] = data['date'].apply(lambda x: pd.to_datetime(x).strftime('%Y-%m-%d'))
        snapshot = "save_{}_local.csv".format(data['date'].max())
        data.to_csv(os.path.join(".", snapshot), index=False)

        for filename in os.listdir("."):
            if filename.endswith('local.csv') and filename != snapshot:
                os.remove(filename)
        return data

    def getTweets(self, start_date, end_date, keywords):
        """
        Extracts historical twitter data.

        :params:
        start_date - str in "YYYY-MM-DD" format (inclusive)
        end_date - str in "YYYY-MM-DD" format (inclusive)
        keywords - list of topics; each topic is a str or a tuple/list of str
            that is joined with spaces into one search query,
            e.g. [('recession',), ('football', 'worldcup', 'fifa'), ('war', 'ukraine')]
            e.g. ['recession']

        :return:
        pandas dataframe with features as:
         date: Tweet Timestamp
         tweet: tweet content
         lang: language reported by the scraper for the tweet
         retweetCount: tweet retweeted count
         likeCount: tweet like count
         replyCount: number of replies to original tweet
         username: user who tweeted
         user_followersCount: number of followers user has
         user_friendsCount: number of friends user has
         verifiedStatus: whether the user is verified
         tweet_url: Link of original tweet
         hastags: hashtags used in the tweet, if any
         chr_count: length of the tweet text after stripping surrounding whitespace
         topic: search query the tweet was found with (kind of labels)

        :raises:
        TypeError - if keywords is neither a list nor a tuple
        ValueError - if start_date is later than end_date
        """
        if not isinstance(keywords, (list, tuple)):
            raise TypeError("Incorrect Input Format! Please pass a list")

        one_day = pd.to_timedelta(1, unit='d')
        first_day = pd.to_datetime(start_date, format='%Y-%m-%d')
        last_day = pd.to_datetime(end_date, format='%Y-%m-%d')
        if first_day > last_day:
            # The day loop below would otherwise never reach its stop date.
            raise ValueError("start_date must not be later than end_date")
        stop_day = last_day + one_day  # exclusive upper bound, so end_date is included

        for topic in keywords:
            # for saving local copies every buffer_hour
            last_save = time.monotonic()
            if isinstance(topic, (tuple, list)):
                topic = " ".join(topic)
            search_query = topic
            print("search_query:", search_query)

            date = first_day
            while date < stop_day:
                nxt_date = date + one_day
                content = '{} since:{} until:{}'.format(search_query, date.strftime('%Y-%m-%d'), nxt_date.strftime('%Y-%m-%d'))
                print(content)

                # check for save buffer duration (set to 1 Hr by default)
                if time.monotonic() - last_save >= self.save_buffer_duration:
                    self.save_copy()
                    # reset buffer
                    last_save = time.monotonic()

                rows = self._scrape_day(content, topic)
                if rows:
                    day_df = pd.DataFrame(rows, columns=self.tweets_df.columns)
                    if self.tweets_df.empty:
                        self.tweets_df = day_df
                    else:
                        self.tweets_df = pd.concat([self.tweets_df, day_df], ignore_index=True)
                date = nxt_date

        # total_seconds() keeps counting past 24 hours, unlike timedelta.seconds
        elapsed_minutes = (datetime.now() - self.start_timer).total_seconds() / 60.0
        print("\n\nTOTAL TIME TAKEN {} minutes".format(elapsed_minutes))
        return self.save_copy()

    def _scrape_day(self, query, topic):
        """Runs one search query and returns the rows of the scanned tweets that pass the filter."""
        rows = []
        for counter, tweet in enumerate(sntwitter.TwitterSearchScraper(query).get_items()):
            # cap on scanned results, not on kept tweets
            if counter >= self.min_tweet_count_perDay:
                break
            if self._passes_filters(tweet):
                # ----------------------------------------------------------------
                # Potential custom preprocessing module here:
                # 1. Simple and short: https://www.kaggle.com/code/zenbird01/pranjalpathak-semantic-clustering-v1-0/notebook
                # 2. Advanced: ./NLP_basics_preprocessing_vectorization_similarity.ipynb
                # 3. Best: Check github - https://github.com/pranzell/NLP_Tools
                # ----------------------------------------------------------------
                rows.append(self._tweet_to_row(tweet, topic))
        return rows

    def _passes_filters(self, tweet):
        """Returns True when the tweet meets at least one of the configured thresholds."""
        user = tweet.user
        return bool(
            tweet.likeCount >= self.minLikeCount
            or tweet.retweetCount >= self.minRetweetCount
            or user.followersCount >= self.minFollowersCount
            or (user.verified is True and user.verified == self.VerifiedStatus)
        )

    def _tweet_to_row(self, tweet, topic):
        """Flattens one tweet into a list ordered like ``COLUMNS``."""
        return [
            tweet.date,
            tweet.content,
            tweet.lang,
            tweet.retweetCount,
            tweet.likeCount,
            tweet.replyCount,
            tweet.user.username,
            tweet.user.followersCount,
            tweet.user.friendsCount,
            tweet.user.verified,
            tweet.url,
            tweet.hashtags,
            len(str(tweet.content).strip()),
            topic,
        ]

    def preprocess_shortText(self, text_col):
        """Placeholder for text preprocessing; not implemented (returns None)."""
        # refer to Preprocessing ipynb file
        # https://github.com/pranzell/NLP_Tools
        pass

## <ins>Execute</ins>

### Configuration

In [9]:
# Settings passed to the ExtractTweets constructor in the run cell.
minTweetCountPerDay = 50
minRetweetCount = 100
minLikeCount = 100
minFollowersCount = 200
VerifiedStatus = None
saveBufferDuration = 60 * 60  # in seconds

# Date range in "YYYY-MM-DD" format.
start_date = "2022-04-01"
end_date = "2022-12-01"

# One entry per topic: either a single str, or a tuple of str.
# See the `getTweets()` definition for the accepted formats.
keywords = ['russia ukrain war', 'inflation', 'gas prices']

### Run

In [10]:
# Build the extractor from the configuration values, then scrape every topic
# over the configured date range.
et = ExtractTweets(
    minTweetCountPerDay=minTweetCountPerDay,
    minRetweetCount=minRetweetCount,
    minLikeCount=minLikeCount,
    minFollowersCount=minFollowersCount,
    VerifiedStatus=VerifiedStatus,
    saveBufferDuration=saveBufferDuration,
)
twitter_data = et.getTweets(start_date=start_date, end_date=end_date, keywords=keywords)

search_query: russia ukrain war
russia ukrain war since:2022-04-01 until:2022-04-02
russia ukrain war since:2022-04-02 until:2022-04-03
russia ukrain war since:2022-04-03 until:2022-04-04
russia ukrain war since:2022-04-04 until:2022-04-05
russia ukrain war since:2022-04-05 until:2022-04-06
russia ukrain war since:2022-04-06 until:2022-04-07
russia ukrain war since:2022-04-07 until:2022-04-08
russia ukrain war since:2022-04-08 until:2022-04-09
russia ukrain war since:2022-04-09 until:2022-04-10
russia ukrain war since:2022-04-10 until:2022-04-11
russia ukrain war since:2022-04-11 until:2022-04-12
russia ukrain war since:2022-04-12 until:2022-04-13
russia ukrain war since:2022-04-13 until:2022-04-14
russia ukrain war since:2022-04-14 until:2022-04-15
russia ukrain war since:2022-04-15 until:2022-04-16
russia ukrain war since:2022-04-16 until:2022-04-17
russia ukrain war since:2022-04-17 until:2022-04-18
russia ukrain war since:2022-04-18 until:2022-04-19
russia ukrain war since:2022-04-

russia ukrain war since:2022-09-05 until:2022-09-06
russia ukrain war since:2022-09-06 until:2022-09-07
russia ukrain war since:2022-09-07 until:2022-09-08
russia ukrain war since:2022-09-08 until:2022-09-09
russia ukrain war since:2022-09-09 until:2022-09-10
russia ukrain war since:2022-09-10 until:2022-09-11
russia ukrain war since:2022-09-11 until:2022-09-12
russia ukrain war since:2022-09-12 until:2022-09-13
russia ukrain war since:2022-09-13 until:2022-09-14
russia ukrain war since:2022-09-14 until:2022-09-15
russia ukrain war since:2022-09-15 until:2022-09-16
russia ukrain war since:2022-09-16 until:2022-09-17
russia ukrain war since:2022-09-17 until:2022-09-18
russia ukrain war since:2022-09-18 until:2022-09-19
russia ukrain war since:2022-09-19 until:2022-09-20
russia ukrain war since:2022-09-20 until:2022-09-21
russia ukrain war since:2022-09-21 until:2022-09-22
russia ukrain war since:2022-09-22 until:2022-09-23
russia ukrain war since:2022-09-23 until:2022-09-24
russia ukrai

inflation since:2022-06-22 until:2022-06-23
inflation since:2022-06-23 until:2022-06-24
inflation since:2022-06-24 until:2022-06-25
inflation since:2022-06-25 until:2022-06-26
inflation since:2022-06-26 until:2022-06-27
inflation since:2022-06-27 until:2022-06-28
inflation since:2022-06-28 until:2022-06-29
inflation since:2022-06-29 until:2022-06-30
inflation since:2022-06-30 until:2022-07-01
inflation since:2022-07-01 until:2022-07-02
inflation since:2022-07-02 until:2022-07-03
inflation since:2022-07-03 until:2022-07-04
inflation since:2022-07-04 until:2022-07-05
inflation since:2022-07-05 until:2022-07-06
inflation since:2022-07-06 until:2022-07-07
inflation since:2022-07-07 until:2022-07-08
inflation since:2022-07-08 until:2022-07-09
inflation since:2022-07-09 until:2022-07-10
inflation since:2022-07-10 until:2022-07-11
inflation since:2022-07-11 until:2022-07-12
inflation since:2022-07-12 until:2022-07-13
inflation since:2022-07-13 until:2022-07-14
inflation since:2022-07-14 until

gas prices since:2022-04-24 until:2022-04-25
gas prices since:2022-04-25 until:2022-04-26
gas prices since:2022-04-26 until:2022-04-27
gas prices since:2022-04-27 until:2022-04-28
gas prices since:2022-04-28 until:2022-04-29
gas prices since:2022-04-29 until:2022-04-30
gas prices since:2022-04-30 until:2022-05-01
gas prices since:2022-05-01 until:2022-05-02
gas prices since:2022-05-02 until:2022-05-03
gas prices since:2022-05-03 until:2022-05-04
gas prices since:2022-05-04 until:2022-05-05
gas prices since:2022-05-05 until:2022-05-06
gas prices since:2022-05-06 until:2022-05-07
gas prices since:2022-05-07 until:2022-05-08
gas prices since:2022-05-08 until:2022-05-09
gas prices since:2022-05-09 until:2022-05-10
gas prices since:2022-05-10 until:2022-05-11
gas prices since:2022-05-11 until:2022-05-12
gas prices since:2022-05-12 until:2022-05-13
gas prices since:2022-05-13 until:2022-05-14
gas prices since:2022-05-14 until:2022-05-15
gas prices since:2022-05-15 until:2022-05-16
gas prices

gas prices since:2022-10-24 until:2022-10-25
gas prices since:2022-10-25 until:2022-10-26
gas prices since:2022-10-26 until:2022-10-27
gas prices since:2022-10-27 until:2022-10-28
gas prices since:2022-10-28 until:2022-10-29
gas prices since:2022-10-29 until:2022-10-30
gas prices since:2022-10-30 until:2022-10-31
gas prices since:2022-10-31 until:2022-11-01
gas prices since:2022-11-01 until:2022-11-02
gas prices since:2022-11-02 until:2022-11-03
gas prices since:2022-11-03 until:2022-11-04
gas prices since:2022-11-04 until:2022-11-05
gas prices since:2022-11-05 until:2022-11-06
gas prices since:2022-11-06 until:2022-11-07
gas prices since:2022-11-07 until:2022-11-08
gas prices since:2022-11-08 until:2022-11-09
gas prices since:2022-11-09 until:2022-11-10
gas prices since:2022-11-10 until:2022-11-11
gas prices since:2022-11-11 until:2022-11-12
gas prices since:2022-11-12 until:2022-11-13
gas prices since:2022-11-13 until:2022-11-14
gas prices since:2022-11-14 until:2022-11-15
gas prices

In [11]:
# Print the (rows, columns) shape, then preview the first rows of the result.
print(twitter_data.shape)
twitter_data.head()

(16720, 14)


Unnamed: 0,date,tweet,lang,retweetCount,likeCount,replyCount,username,user_followersCount,user_friendsCount,verifiedStatus,tweet_url,hastags,chr_count,topic
0,2022-04-01,DON'T let the life of this earth delude you.\n...,en,0,1,0,ciira_cyrus,51358,49095,False,https://twitter.com/ciira_cyrus/status/1510024...,[MessageOfResurrection],288,russia ukrain war
1,2022-04-01,@louvelune @KyivIndependent The communist Sovi...,en,0,0,1,CatManDoo18,964,229,False,https://twitter.com/CatManDoo18/status/1510008...,[RussiaUkrainWar],309,russia ukrain war
2,2022-04-01,NEW BLOG: US Crises... Domestic Challenges &a...,en,0,0,0,IanRMackintosh,12067,9248,False,https://twitter.com/IanRMackintosh/status/1509...,"[Coronavirus, learn, pandemic, worldwvents, ru...",173,russia ukrain war
3,2022-04-01,جایگاه ورزش در دنیا می تونه ترویج گفتمان صلح ب...,fa,0,0,0,neshan_eftekhar,237,347,False,https://twitter.com/neshan_eftekhar/status/150...,"[جام_جهانی2022, جنگ_روسیه_اکراین, russia_ukrai...",226,russia ukrain war
4,2022-04-01,@WeHearPodcast @HarryPotterMAGE How about Ur o...,en,0,0,0,swithtalker,906,4761,False,https://twitter.com/swithtalker/status/1509954...,,281,russia ukrain war


## Output

In [12]:
def read_copy(path="."):
    """
    Loads a saved snapshot (a file whose name ends in ``local.csv``) from ``path``.

    The first matching file returned by ``os.listdir`` is used; the listing
    order is arbitrary. The ``hastags`` column is stored in the CSV as text,
    so each value is parsed back into a Python object with ``literal_eval``;
    missing-value markers become ``None``.

    :params:
    path - directory to search for the snapshot (defaults to the working directory)

    :return:
    pandas dataframe with the snapshot contents, or None if no snapshot is found.
    """
    missing_markers = ('none', 'nan', 'np.nan', 'null', '')
    for f in os.listdir(path):
        if f.endswith('local.csv'):
            # os.listdir yields bare file names, so anchor them to ``path``
            df = pd.read_csv(os.path.join(path, f), lineterminator='\n')
            df['hastags'] = df['hastags'].apply(
                lambda x: None if str(x) in missing_markers else literal_eval(x))
            return df
    return None

In [13]:
# Reload the saved snapshot from root_dir; the bare `df` shows it as the cell output.
df = read_copy(root_dir)
df

Unnamed: 0,date,tweet,lang,retweetCount,likeCount,replyCount,username,user_followersCount,user_friendsCount,verifiedStatus,tweet_url,hastags,chr_count,topic
0,2022-04-01,DON'T let the life of this earth delude you.\n...,en,0,1,0,ciira_cyrus,51358,49095,False,https://twitter.com/ciira_cyrus/status/1510024...,[MessageOfResurrection],288,russia ukrain war
1,2022-04-01,@louvelune @KyivIndependent The communist Sovi...,en,0,0,1,CatManDoo18,964,229,False,https://twitter.com/CatManDoo18/status/1510008...,[RussiaUkrainWar],309,russia ukrain war
2,2022-04-01,NEW BLOG: US Crises... Domestic Challenges &a...,en,0,0,0,IanRMackintosh,12067,9248,False,https://twitter.com/IanRMackintosh/status/1509...,"[Coronavirus, learn, pandemic, worldwvents, ru...",173,russia ukrain war
3,2022-04-01,جایگاه ورزش در دنیا می تونه ترویج گفتمان صلح ب...,fa,0,0,0,neshan_eftekhar,237,347,False,https://twitter.com/neshan_eftekhar/status/150...,"[جام_جهانی2022, جنگ_روسیه_اکراین, russia_ukrai...",226,russia ukrain war
4,2022-04-01,@WeHearPodcast @HarryPotterMAGE How about Ur o...,en,0,0,0,swithtalker,906,4761,False,https://twitter.com/swithtalker/status/1509954...,,281,russia ukrain war
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16715,2022-12-01,Current Gas prices post-black Friday+Cyber Mon...,en,0,1,0,Aru762mmR,703,1368,False,https://twitter.com/Aru762mmR/status/159846436...,,147,gas prices
16716,2022-12-01,@RonnyJacksonTX Do they have a plan for inflat...,en,0,0,0,CoachPete2323,1045,3752,False,https://twitter.com/CoachPete2323/status/15984...,,89,gas prices
16717,2022-12-01,@Jason27614314 @KrissyLUnited @TierraHenson @S...,en,1,3,2,johnnysunset287,1539,4993,False,https://twitter.com/johnnysunset287/status/159...,"[PrayTogether, peace, Ukraine, Trump]",1002,gas prices
16718,2022-12-01,"@dbongino Gas prices are down, stock market is...",en,0,5,4,glr4cblaw,2046,3716,False,https://twitter.com/glr4cblaw/status/159846416...,,290,gas prices


### Citations

Credits to the awesome social media mining tool SNScrape (https://github.com/JustAnotherArchivist/snscrape)

---