In [None]:

import pandas as pd 
import twint 
import datetime
import re

In [None]:
def url_filter(text: str) -> str:
    """Flag a tweet as spam when its first URL makes up most of the text.

    Args:
        text: Tweet text to inspect.

    Returns:
        The literal string "SPAM" when the first URL found covers more than
        half of the characters in ``text``; otherwise ``text`` unchanged.
    """
    url_pattern = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
    first_url = re.search(url_pattern, text)

    # Without a match there is nothing to measure. This also covers the empty
    # string, so the division below never sees a zero-length text.
    if first_url is None:
        return text

    url_share = len(first_url[0]) / len(text)
    return "SPAM" if url_share > 0.50 else text

In [None]:

# Compiled once when the cell runs instead of on every call.
#
# The previous character class contained the range U+24C2..U+1F251, which
# spans almost the whole Basic Multilingual Plane above U+24C2 and therefore
# deleted ordinary CJK, Hangul and full-width text as if it were emoji.
# U+24C2 is now listed as a single code point, and the few BMP emoji that
# were only caught by that oversized range (U+303D, U+3297, U+3299) are
# listed explicitly.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U00002500-\U00002BEF"  # box drawing .. misc symbols and arrows
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2"             # circled M
    "\U0001f926-\U0001f937"
    "\U00010000-\U0010ffff"  # every supplementary-plane code point
    "\u2640-\u2642"          # gender signs
    "\u2600-\u2B55"
    "\u200d"                 # zero-width joiner
    "\u23cf"
    "\u23e9"
    "\u231a"
    "\ufe0f"                 # variation selector-16
    "\u3030"                 # wavy dash
    "\u303d"                 # part alternation mark
    "\u3297"                 # circled ideograph "congratulation"
    "\u3299"                 # circled ideograph "secret"
    "]+",
    re.UNICODE,
)


def remove_emojis(text: str) -> str:
    """Strip emoji and pictographic symbols from ``text``.

    Characters outside the listed symbol ranges (including non-Latin scripts
    in the Basic Multilingual Plane such as CJK or Hangul) are left intact.

    Args:
        text: String to clean.

    Returns:
        ``text`` with every run of matching characters removed. Surrounding
        whitespace is not collapsed.
    """
    return _EMOJI_PATTERN.sub("", text)


In [None]:
def filter_scam_tweets(text: str) -> str:
    """Flag tweets that contain typical giveaway / engagement-bait phrases.

    Matching is by substring. All phrases except "LIKE" are compared
    case-insensitively, so mixed-case spellings such as "Giving Away",
    "GiveAway" or "-RT" are caught as well as the lower/Title/UPPER forms.
    "LIKE" stays case-sensitive because the lowercase word is too common in
    ordinary tweets.

    Args:
        text: Tweet text to inspect.

    Returns:
        The literal string "SPAM" if any blacklisted phrase occurs in
        ``text``; otherwise ``text`` unchanged.
    """
    # Lowercase phrases, matched against the lowercased tweet.
    phrases_any_case = (
        "giving away",
        "giveaway",  # also covers "pre-giveaway"
        "follow me",
        "retweet",
        "airdrop",
        "free",
        "-follow",
        "-rt",
        "requesting faucet funds",
    )
    # Matched verbatim against the original tweet.
    phrases_exact_case = ("LIKE",)

    lowered = text.lower()
    if any(phrase in lowered for phrase in phrases_any_case):
        return "SPAM"
    if any(phrase in text for phrase in phrases_exact_case):
        return "SPAM"
    return text


In [None]:

def clean_text(text: str) -> str:
    """Normalise one tweet, or mark it as spam.

    Steps, in order: coerce to ``str``, turn newlines into spaces, run the
    URL-ratio and phrase spam filters, then strip emoji, @mentions, ``#``
    characters and URLs.

    Args:
        text: Raw tweet text. Non-string values are converted with ``str()``.

    Returns:
        The literal string "SPAM" if either spam filter flagged the tweet;
        otherwise the cleaned text with leading/trailing whitespace removed.
    """
    text = str(text)

    # Newlines become a space rather than nothing. Deleting them glued the
    # last token of one line to the first word of the next, so a URL or
    # @mention at a line end absorbed the following word and that word was
    # removed together with it further down.
    text = text.replace("\n", " ")

    text = url_filter(text)
    text = filter_scam_tweets(text)
    if text == "SPAM":
        # The sentinel passes through the remaining steps unchanged, so
        # return it directly.
        return text

    text = remove_emojis(text)

    text = re.sub("@[A-Za-z0-9_]+", "", text)  # remove @mentions
    text = text.replace("#", "")  # drop the hash sign, keep the tag word

    url_regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
    text = re.sub(url_regex, "", text)  # remove URLs

    return text.strip()


In [None]:
def create_date_list(start_year: int, start_month: int, start_day: int, number_of_days: int) -> list[str]:
    """Build a run of consecutive calendar days as ISO ``YYYY-MM-DD`` strings.

    Args:
        start_year: Year of the first day.
        start_month: Month of the first day.
        start_day: Day of month of the first day.
        number_of_days: How many consecutive days to produce, starting with
            the first day itself.

    Returns:
        A list of ``number_of_days`` date strings in ascending order.
    """
    first_day = datetime.date(start_year, start_month, start_day)
    one_day = datetime.timedelta(days=1)
    return [(first_day + offset * one_day).isoformat() for offset in range(number_of_days)]


In [None]:
def scrape_tweets(day: str, topic: str, num_of_tweets: int) -> pd.DataFrame:
    """Run a twint search restricted to one day and return its pandas result.

    Args:
        day: Date string that is prefixed to the time-of-day bounds of the
            search window (this notebook passes ISO ``YYYY-MM-DD`` values).
        topic: Search term the tweets should be about.
        num_of_tweets: Value assigned to twint's ``Limit`` setting.

    Returns:
        Whatever ``twint.storage.panda.Tweets_df`` refers to after the search
        has run.
    """
    cfg = twint.Config()

    # Result handling: have twint keep the tweets and expose them via pandas.
    cfg.Store_object = True
    cfg.Pandas = True

    # Query definition.
    cfg.Lang = "en"
    cfg.Limit = num_of_tweets
    cfg.Search = topic

    # Search window: the whole of the requested day.
    cfg.Until = f"{day} 23:59:59"
    cfg.Since = f"{day} 00:00:00"

    twint.run.Search(cfg)

    # twint publishes the scraped tweets through this module-level attribute.
    return twint.storage.panda.Tweets_df


In [None]:
# Scrape one week of "bitcoin" tweets day by day, clean them, drop spam and
# write the combined result to bitcoin.csv.
#
# The former warm-up scrape for 2022-01-25 was removed: its result was
# overwritten by the first loop iteration, so it only cost a network request.
date_list = create_date_list(start_year=2022, start_month=1, start_day=18, number_of_days=7)
df_list = []

for day in date_list:
    df = scrape_tweets(day=day, topic="bitcoin", num_of_tweets=400)

    # A frame without columns has no "tweet" column to clean (presumably what
    # twint returns for a day with no results — confirm).
    if len(df.columns) == 0:
        continue

    # Drop missing tweets *before* cleaning: clean_text() coerces its input
    # with str(), so a NaN would otherwise survive as the text "nan".
    df = df.dropna(subset=["tweet"])
    # assign() builds a new frame instead of writing into the one returned
    # by scrape_tweets or into a filtered slice.
    df = df.assign(tweet=df["tweet"].apply(clean_text))
    df = df[df["tweet"] != "SPAM"]
    df_list.append(df)


if df_list:
    merged_df = pd.concat(df_list)
    merged_df.to_csv("bitcoin.csv", index=False)
else:
    # pd.concat raises on an empty list; keep the notebook runnable and leave
    # any existing CSV untouched.
    merged_df = pd.DataFrame()
    print("No tweets were scraped; bitcoin.csv was not written.")