# Tweets Retriever

Tweet IDs: [COVID19_Tweets_dataset GitHub repository](https://github.com/lopezbec/COVID19_Tweets_Dataset)

Follow this notebook to download tweets. You must first apply for a Twitter developer account to obtain the API keys used below.

In [1]:
# # from IPython.display import clear_output

# !pip install pandas
# !pip install twarc
# !pip install jsonlines
# !pip install wget
# !pip install python-dotenv
# !pip install tqdm

# # clear_output()

In [2]:
import os
import wget
from datetime import datetime
import itertools
import pandas as pd
import numpy as np
import math
import jsonlines, json, csv
import sys
sys.path.append("../")
import tqdm
from twarc import Twarc
import glob

# These keys are received after applying for a twitter developer account
import jsonlines, json, csv
# Credentials are read from the environment so they never have to be typed
# into (and accidentally committed with) the notebook. Each value falls back
# to an empty string when its variable is not set.
consumer_key = os.environ.get("TWITTER_CONSUMER_KEY", "")
consumer_secret = os.environ.get("TWITTER_CONSUMER_SECRET", "")
access_token = os.environ.get("TWITTER_ACCESS_TOKEN", "")
access_token_secret = os.environ.get("TWITTER_ACCESS_TOKEN_SECRET", "")

# Shared Twarc client used by the hydration cell further down.
t = Twarc(consumer_key, consumer_secret, access_token, access_token_secret)

In [42]:
## Paths

# Remote folder holding the hourly "Summary_Details" CSV files.
data_url = "https://raw.githubusercontent.com/lopezbec/COVID19_Tweets_Dataset/master/Summary_Details/"
# Local folders (relative to the notebook) for the tweet-ID lists and the
# downloaded summary CSVs.
tweet_ID_dir = "Tweet_IDs/"
tweet_summary_dir = "Tweet_Summary/"

# Make sure both local folders exist before anything is written to them.
for local_dir in (tweet_summary_dir, tweet_ID_dir):
    os.makedirs(local_dir, exist_ok=True)

## Months to study: key is YYYYMM, value is the inclusive date range.
## Un-comment further entries to process more months.
data_month_dict = {
    "202201": {"start_date": "2022-1-01", "end_date": "2022-1-31"},
    # "202202": {"start_date": "2022-2-01", "end_date": "2022-2-28"},
    # "202203": {"start_date": "2022-3-01", "end_date": "2022-3-31"},
    # "202204": {"start_date": "2022-4-01", "end_date": "2022-4-30"},
    # "202205": {"start_date": "2022-5-01", "end_date": "2022-5-31"},
}

# Zero-padded hour strings "00" .. "23", one per hourly summary file.
data_hours = [f"{hour:02d}" for hour in range(24)]

## Download COVID-19 Tweet IDs from GitHub

In [43]:
# Download the hourly "Summary_Details" CSV of every day in each configured
# month into <tweet_summary_dir>/<YYYYMM>/.
for data_month, date_range in data_month_dict.items():
    start_date = date_range["start_date"]
    end_date = date_range["end_date"]

    dates_list = pd.date_range(start_date, end_date).tolist()
    # The remote month folder is named YYYY_MM, the files YYYY_MM_DD_HH_...
    month_str = dates_list[0].strftime("%Y_%m")
    dates_list = [d.strftime("%Y_%m_%d") for d in dates_list]

    files_list = [
        f"{data_url}{month_str}/{date_str}_{hour_str}_Summary_Details.csv"
        for date_str, hour_str
        in itertools.product(dates_list, data_hours)
    ]

    month_directory = f"{tweet_summary_dir}{data_month}"
    os.makedirs(month_directory, exist_ok=True)
    for file in files_list:
        # `file` is a URL, so the "already downloaded?" check has to look at
        # the local copy. Checking the URL itself is always False and makes
        # every re-run download (and duplicate) all files again.
        local_path = os.path.join(month_directory, os.path.basename(file))
        if os.path.exists(local_path):
            continue
        try:
            wget.download(file, out=local_path)
        except Exception as err:
            # There are some known gaps with no data collected:
            # https://github.com/lopezbec/COVID19_Tweets_Dataset#data-collection-process-inconsistencies
            # Report which file failed and carry on. Catching Exception
            # (not a bare except) keeps Ctrl-C / kernel interrupt working.
            print(f"something went wrong downloading {file}: {err}")
            

In [None]:
# Build one text file of tweet IDs per month from the downloaded summary CSVs.

# create a folder to store tweet IDs if not exists
os.makedirs(tweet_ID_dir, exist_ok=True)

for data_month in data_month_dict:
    print(data_month)
    files = glob.glob(f"{tweet_summary_dir}{data_month}/*.csv")
    tweet_ids = []
    for file in tqdm.tqdm(files):
        # Read the IDs as strings: if a file has a missing Tweet_ID, pandas
        # would otherwise infer float64 for the column and the 19-digit IDs
        # would lose precision.
        data = pd.read_csv(file, dtype={"Tweet_ID": str})

        # only keep English tweets that are not retweets
        keep = (data["Language"] == "en") & (data["RT"] == "NO")
        tweet_ids.extend(data.loc[keep, "Tweet_ID"].dropna())

    # write Tweet IDs to a file for hydration later
    # (tweet_ID_dir already ends with a path separator)
    tweet_ids_filename = f"{tweet_ID_dir}{data_month}.txt"
    with open(tweet_ids_filename, "w") as f:
        f.writelines(f"{tweet_id}\n" for tweet_id in tweet_ids)

In [45]:
# check total tweet IDs for each month
for data_month in data_month_dict.keys():
    !wc -l "Tweet_IDs/{data_month}.txt"

23605674 Tweet_IDs/202201.txt


In [46]:
n_split = 10  # number of pieces each month's tweet-ID file is split into, to keep the files small

In [None]:
# Split each month's tweet-ID file into exactly n_split smaller files named
# <month>_<i>.txt (no sampling is done; every ID ends up in one split).
for data_month in data_month_dict.keys():
    filename = f"{tweet_ID_dir}{data_month}.txt"
    print(filename)
    # read monthly tweet IDs (as strings, so the IDs are written back verbatim)
    try:
        tweet_ids = pd.read_csv(filename, header=None, dtype=str)
    except pd.errors.EmptyDataError:
        # A month with no IDs still gets its (empty) split files.
        tweet_ids = pd.DataFrame({0: pd.Series([], dtype=str)})

    # size of each chunk; the trailing chunk(s) may be shorter or empty
    end_i = tweet_ids.shape[0]
    chunk_size = math.ceil(end_i / n_split)

    # Always write n_split files. Stepping through range(0, end_i, chunk_size)
    # can produce fewer chunks than n_split (e.g. 11 IDs / 10 splits gives 6
    # chunks), and the cells below expect all n_split files to exist.
    for i in range(n_split):
        start_i = i * chunk_size
        tweet_split_i = tweet_ids[0].iloc[start_i:start_i + chunk_size]

        # output to a file for each split
        tweet_sample_ids_filename = f"{tweet_ID_dir}{data_month}_{i}.txt"
        with open(tweet_sample_ids_filename, "w") as f:
            f.writelines(f"{tweet_id}\n" for tweet_id in tweet_split_i)

In [None]:
# check total tweet IDs for each month
for data_month in data_month_dict.keys():
    for i in range(n_split):
        !wc -l "Tweet_IDs/{data_month}_{i}.txt"

# Hydrate

### Set up output file

In [49]:
# Root folder for the hydrated tweets. Set the TWEETS_OUTPUT_DIR environment
# variable to override it; the original location stays the default.
# os.path.join(..., "") guarantees a trailing separator, which the cells
# below rely on when they build f"{output_dir}{data_month}".
output_dir = os.path.join(os.environ.get("TWEETS_OUTPUT_DIR", "/mnt/hdd/ningh/Dropbox/"), "")
# The per-month sub-folders are created with os.makedirs(..., exist_ok=True)
# by the hydration cell, which also creates this folder if needed.

In [None]:
# Hydrate every split file of every month and append the tweets, one JSON
# object per line, to <output_dir>/<month>/<month>_<i>.txt. The cell can be
# re-run: IDs already present in the output file are not requested again.

num_save = 10000  # write to disk every num_save newly hydrated tweets

for data_month in data_month_dict.keys():
    os.makedirs(f"{output_dir}{data_month}", exist_ok=True)
    for i in range(0, n_split):
        tweet_ids_filename = f"{tweet_ID_dir}{data_month}_{i}.txt"
        output_filename = f"{output_dir}{data_month}/{data_month}_{i}.txt"
        print("On file %s"%output_filename)

        with open(tweet_ids_filename, "r") as ids_file:
            ids = ids_file.read().split()
        ids_to_hydrate = set(ids)

        # Check hydrated tweets: only their number and their IDs are needed,
        # so the saved tweets are not kept in memory.
        hydrated_count = 0
        if os.path.isfile(output_filename):
            with jsonlines.open(output_filename, "r") as reader:
                for saved_tweet in reader.iter(type=dict, skip_invalid=True):
                    hydrated_count += 1
                    # discard() instead of remove(): a duplicated or unknown
                    # ID in the output file must not abort the run.
                    ids_to_hydrate.discard(saved_tweet.get("id_str"))

        # An empty set means nothing is left to do. (Comparing the set to 0
        # is never true, so the skip could not trigger.)
        if not ids_to_hydrate:
            print("Finished downloading. Skipping.")
            continue

        print("Total IDs: " + str(len(ids)) + ", IDs to hydrate: " + str(len(ids_to_hydrate)))
        print("Hydrated: " + str(hydrated_count))

        # start hydrating; tweets are buffered and appended in batches
        pending = []
        with tqdm.tqdm(total=len(ids_to_hydrate)) as pbar:
            for tweet in t.hydrate(ids_to_hydrate):
                pending.append(tweet)
                if len(pending) >= num_save:
                    with jsonlines.open(output_filename, "a") as writer:
                        writer.write_all(pending)
                    pending = []
                pbar.update(1)

        # write whatever is left over from the last, incomplete batch
        if pending:
            with jsonlines.open(output_filename, "a") as writer:
                writer.write_all(pending)

#### This takes a long time (several days per month). Because a Jupyter notebook can disconnect during such a long run, consider converting this step to a Python script.

## Convert jsonl files that are stored in .txt to csv

In [None]:
import glob

# Collect the hydrated jsonl files (stored with a .txt extension) of every
# configured month. The hydration cell writes them to
# <output_dir>/<YYYYMM>/, so the folders are derived from data_month_dict.
# Filtering on "-" in the path never matches these YYYYMM folder names and
# leaves the list empty.
folders = sorted(
    folder
    for folder in (f"{output_dir}{data_month}" for data_month in data_month_dict)
    if os.path.isdir(folder)
)
## jsonl files to convert to csv
files = []
for folder in folders:
    files += glob.glob(os.path.join(folder, "*.txt"))
files = sorted(files)
files

In [None]:
# Convert every hydrated jsonl file (stored as .txt) to a .csv next to it.

# These are the column names that are selected to be stored in the csv
keyset = ["created_at", "id", "id_str", "full_text", "source", "truncated", "in_reply_to_status_id",
  "in_reply_to_status_id_str", "in_reply_to_user_id", "in_reply_to_user_id_str",
  "in_reply_to_screen_name", "user", "coordinates", "place", "quoted_status_id",
  "quoted_status_id_str", "is_quote_status", "quoted_status", "retweeted_status",
  "quote_count", "reply_count", "retweet_count", "favorite_count", "entities",
  "extended_entities", "favorited", "retweeted", "possibly_sensitive", "filter_level",
  "lang", "matching_rules", "current_user_retweet", "scopes", "withheld_copyright",
  "withheld_in_countries", "withheld_scope", "geo", "contributors", "display_text_range",
  "quoted_status_permalink"]

for file in tqdm.tqdm(files):
    print("On file %s"%(file))
    # Swap only the extension; str.replace("txt", "csv") would also rewrite
    # any other "txt" occurring in the path.
    root, ext = os.path.splitext(file)
    if ext != ".txt":
        continue
    output_filename = root + ".csv"
    if os.path.exists(output_filename):
        continue

    # Write to a temporary file and rename it once complete, so that an
    # interrupted run does not leave a partial csv that is skipped next time.
    partial_filename = output_filename + ".tmp"
    # newline="" is what the csv module requires for files it writes to.
    with jsonlines.open(file, "r") as reader, \
            open(partial_filename, "w", newline="", encoding="utf-8") as output_file:
        # extrasaction="ignore": keys outside keyset are left out of the csv
        # rather than raising ValueError.
        d = csv.DictWriter(output_file, keyset, extrasaction="ignore")
        d.writeheader()
        # Stream the tweets straight from the reader into the csv.
        d.writerows(reader.iter(type=dict, skip_invalid=True))
    os.replace(partial_filename, output_filename)
# clear_output()