In [4]:
!pip install snscrape
!pip install emoji

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting emoji
  Downloading emoji-2.0.0.tar.gz (197 kB)
[K     |████████████████████████████████| 197 kB 5.0 MB/s 
[?25hBuilding wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-2.0.0-py3-none-any.whl size=193022 sha256=54cb9cc780cc1c931fd47804b746dcbca326429be65af3621c2fe4ccd7658cf5
  Stored in directory: /root/.cache/pip/wheels/ec/29/4d/3cfe7452ac7d8d83b1930f8a6205c3c9649b24e80f9029fc38
Successfully built emoji
Installing collected packages: emoji
Successfully installed emoji-2.0.0


In [5]:
import itertools
import pandas as pd
import snscrape.modules.twitter as sntwitter
from datetime import date, timedelta
import re
import string
from emoji import replace_emoji

In [7]:


#https://www.kaggle.com/code/ludovicocuoghi/twitter-sentiment-analysis-with-bert-roberta/notebook
def strip_all_entities(text):
    """
    Normalise a raw tweet: flatten whitespace control characters, lower-case,
    and drop links, mentions, non-ASCII characters and ASCII punctuation.

    Input: text (str)
    Output: stripped text (str)

    Carriage returns are deleted; newlines and tabs are each replaced by a
    single space. Runs of spaces are NOT collapsed here.
    """
    # '\r' is dropped (so '\r\n' becomes one space); '\n' and '\t' become spaces.
    text = text.replace('\r', '').replace('\n', ' ').replace('\t', ' ').lower()
    # A mention ('@...') or an http(s) link runs up to the next whitespace.
    text = re.sub(r"(?:@|https?://)\S+", "", text)
    # Drop everything outside the 7-bit ASCII range (e.g. mojibake such as 'Ã', '±').
    text = re.sub(r'[^\x00-\x7f]', '', text)
    # Only ASCII punctuation can still be present after the previous step.
    table = str.maketrans('', '', string.punctuation)
    return text.translate(table)

#clean hashtags at the end of the sentence, and keep those in the middle of the sentence by removing just the # symbol
def clean_hashtags(tweet):
    """
    Remove the run of hashtags at the end of a tweet and un-tag the others.

    Hashtags that form the trailing run of the tweet are removed entirely,
    except the literal tag '#hashtag', which the pattern deliberately skips.
    Every remaining '#' and every '_' is then treated as a separator and
    replaced by a single space, so '#word' in the middle of a sentence
    becomes 'word'.

    Input: tweet (str)
    Output: cleaned tweet (str)
    """
    # Raw strings are required: in a normal string literal '\b' is a backspace
    # character, which silently disables the '(?!(?:hashtag)\b)' lookahead.
    trailing_hashtag = r'#(?!(?:hashtag)\b)[\w-]+(?=(?:\s+#[\w-]+)*\s*$)'
    new_tweet = " ".join(word.strip() for word in re.split(trailing_hashtag, tweet))  # remove last hashtags
    # Drop the '#' symbol (and underscores) from words that are kept.
    new_tweet2 = " ".join(word.strip() for word in re.split(r'#|_', new_tweet))
    return new_tweet2

#Filter special characters such as & and $ present in some words
def filter_chars(a):
    """
    Blank out every space-separated word that contains '$' or '&'.

    The offending word is replaced by an empty string (not deleted from the
    word list), so the spaces that surrounded it are preserved in the result.
    """
    return ' '.join(
        '' if ('$' in token or '&' in token) else token
        for token in a.split(' ')
    )

def remove_mult_spaces(text):
    """
    Collapse every run of two or more whitespace characters into one space.

    A single whitespace character is left untouched, whatever it is.
    """
    # Raw string: "\s" in a normal literal is an invalid escape sequence.
    return re.sub(r"\s\s+", " ", text)

##### https://zhuanlan.zhihu.com/p/342441381
def scrape_tweets(words, start, end, clean=True, save=True):
    """
    Scrapes Twitter and returns the collected tweets.

    words: list of strings; the search terms to scrape tweets for
    start: string start date for the scraping; needs to be in YYYY-MM-DD format
    end: string end date for the scraping; needs to be in YYYY-MM-DD format
    clean: if True, run the text-cleaning helpers over the "content" column
    save: if True, append the result to "NLP_test_file.csv"

    Returns a DataFrame with the columns "id", "content" and "date", with
    duplicate tweet ids removed (first occurrence kept). The frame is empty
    when nothing was found.
    """
    import os  # local import: only needed to decide whether the CSV needs a header

    columns = ["id", "content", "date"]
    frames = []
    for word in words:
        query = f"{word} since:{start} until:{end}"
        tweets_df = pd.DataFrame(sntwitter.TwitterSearchScraper(query).get_items())
        # A term without any hit yields a frame with no columns; selecting
        # the columns from it would raise KeyError.
        if tweets_df.empty:
            continue
        frames.append(tweets_df[columns])

    if frames:
        # Concatenate once instead of growing the frame inside the loop; the
        # same tweet can match several search terms, hence the de-duplication.
        df = pd.concat(frames).drop_duplicates(subset="id")
    else:
        df = pd.DataFrame(columns=columns)

    if clean:
        # NOTE(review): strip_all_entities deletes all of string.punctuation,
        # which includes '#', '_', '$' and '&', so the two filters after it
        # never see those characters. Confirm whether this order is intended.
        cleaning_steps = (
            replace_emoji,
            strip_all_entities,
            clean_hashtags,
            filter_chars,
            remove_mult_spaces,
        )
        for step in cleaning_steps:
            df["content"] = df["content"].apply(step)

    if save:
        csv_path = "NLP_test_file.csv"
        # Append mode would otherwise repeat the header row on every call.
        needs_header = not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0
        df.to_csv(csv_path, mode="a", header=needs_header)  # a for append

    return df

In [12]:
def scraper(words, start, end, number_of_days=False, clean=True, save=True):
    """
    Scrape Twitter one day at a time over a date range.

    words: list of strings to scrape Twitter for
    start: string start date for the scraping; needs to be in YYYY-MM-DD format
    end: string end date for the scraping; needs to be in YYYY-MM-DD format,
         not inclusive of the last day
    number_of_days: alternative to end
                    int, the number of days beginning from start one wants to scrape;
                    takes precedence over end when truthy
    clean, save: forwarded unchanged to scrape_tweets for every day

    Returns a DataFrame holding the tweets of all scraped days (empty if the
    range contains no day).

    Raises ValueError when neither end nor number_of_days is given.
    """
    start_date = date.fromisoformat(start)
    if number_of_days:
        end_date = start_date + timedelta(number_of_days)
    elif end:
        end_date = date.fromisoformat(end)
    else:
        # Without an end there is no range to iterate over; fail explicitly
        # instead of hitting an unbound end_date further down.
        raise ValueError("You must provide at least a valid end date or the number of days you want to scrape twitter")
    if len(words) == 0:
        print(f"You are not searching for any words, current words input is:\n{words}")

    daily_frames = []
    current_day = start_date
    while current_day < end_date:
        print(f"scraping day {current_day}")
        next_day = current_day + timedelta(1)
        daily_frames.append(
            scrape_tweets(words, current_day.isoformat(), next_day.isoformat(), clean=clean, save=save)
        )
        current_day = next_day

    # Return every scraped day, not just the last one; an empty range
    # (end on or before start) yields an empty frame.
    if not daily_frames:
        return pd.DataFrame()
    return pd.concat(daily_frames)
        

In [13]:
# Search terms: each term is scraped as a separate query.
words = [
    "refugee",
    "asylum",
    "escapee",
    "stateless person",
    "fugitive",
    "UNHCR",
    "migrant",
    "civil conflict",
    "war",
    "hunger",
]


In [18]:
# Scrape Twitter for the defined words from 2020-06-01 up to, but not including, 2020-06-02
d = scraper(words, "2020-06-01", "2020-06-02", clean=True, save=True)

scraping day 2020-06-01
