### Purpose of script

In this script, I'll hydrate all the tweets from https://ieee-dataport.org/open-access/coronavirus-covid-19-geo-tagged-tweets-dataset from March 20th 2020 to January 9th 2021


In [23]:
import numpy as np
import pandas as pd
import os
import json
import datetime as datetime
import re
import nltk 
from nltk.corpus import stopwords
import emoji
import time
from selenium import webdriver
from selenium.webdriver.common.by import By

pd.set_option('display.max_columns', None) # show all columns

#### 1. Hydrate tweets

In [2]:
TWEET_ID_DIR = "../../data/tweets/tweet_ids/"

In [3]:
def get_tweets_to_hydrate(link):
    
    """
        Takes the links to both of the csv files for the given date, as well as name of export file
        
        Assumes that directory for tweet IDs is specified
        
    """
    
    df = pd.read_csv(link, names=["tweet_id", "sentiment_score"])
    
    df.drop_duplicates(inplace=True)
    
    tweet_ids = list(df["tweet_id"])
    
    return tweet_ids

In [4]:
def save_tweet_IDs(tweet_ids, filepath):
    """
        Takes list of tweet IDs, exports as .csv
    """
    
    with open(filepath, "a+") as f:
        for idx, tweet in enumerate(tweet_ids):
            if idx != len(tweet_ids) - 1:
                f.write(f"{tweet}, \n")
            else:
                f.write(f"{tweet}")
                
    print(f"CSV file successfully exported")
    

Let's get all the .csv links

In [38]:
links_list = []

In [30]:
PATH = "/Users/mark/Documents/research/gersteinLab/TextMining-master/chromedriver"
driver = webdriver.Chrome(PATH)

In [67]:
username=""
password=""

In [68]:
with open("../../ieee_creds.txt") as f:
    username=f.readline().strip("\n")
    password=f.readline().strip("\n")

In [42]:
def get_links_from_website():
    
    try: 
        
        PATH = "/Users/mark/Documents/research/gersteinLab/TextMining-master/chromedriver"
        driver = webdriver.Chrome(PATH)
        
        driver.get("https://ieee-dataport.org/open-access/coronavirus-covid-19-geo-tagged-tweets-dataset")
        
        # click on button to login
        login_button = driver.find_element(By.XPATH, "/html/body/div[3]/div/div[2]/nav/div/ul/li[2]/a")
        login_button.click()
        
        time.sleep(1)

        # enter login info
        username_field=driver.find_element(By.ID, "username")
        username_field.clear()
        password_field=driver.find_element(By.ID, "password")
        password_field.clear()

        username_field.send_keys(username)
        password_field.send_keys(password)

        driver.find_element(By.ID, "modalWindowRegisterSignInBtn").click()
        
        # get links for datasets
        has_link = True
        idx = 1
        
        while has_link:
            
            try:
                row = driver.find_element(By.XPATH, f"/html/body/div[5]/div/div/main/div/div[2]/div[1]/div/div/div/table[2]/tbody/tr[{idx}]/td[1]/span/a")
                link = row.get_attribute("href")
                text = row.text
                
                links_list.append(link)
                print(f"Added csv link for file: {text}")
                
                idx = idx + 1
                
            except Exception as e:
                print("Finished loading all csv file links")
                print(e)
                has_link = False
                
        # close session       
        driver.close()
        
    except Exception as e:
        print(e)


In [43]:
#get_links_from_website()

Added csv link for file: march20_march21.csv
Added csv link for file: march21_march22.csv
Added csv link for file: march22_march23.csv
Added csv link for file: march23_march24.csv
Added csv link for file: march24_march25.csv
Added csv link for file: march25_march26.csv
Added csv link for file: march26_march27.csv
Added csv link for file: march27_march28.csv
Added csv link for file: march28_march29.csv
Added csv link for file: march30_march31.csv
Added csv link for file: march31_april1.csv
Added csv link for file: april1_april2.csv
Added csv link for file: april2_april3.csv
Added csv link for file: april3_april4.csv
Added csv link for file: april4_april5.csv
Added csv link for file: april5_april6.csv
Added csv link for file: april6_april7.csv
Added csv link for file: april7_april8.csv
Added csv link for file: april8_april9.csv
Added csv link for file: april9_april10.csv
Added csv link for file: april10_april11.csv
Added csv link for file: april11_april12.csv
Added csv link for file: apr

Added csv link for file: september22_september23.csv
Added csv link for file: september23_september24.csv
Added csv link for file: september24_september25.csv
Added csv link for file: september25_september26.csv
Added csv link for file: september26_september27.csv
Added csv link for file: september27_september28.csv
Added csv link for file: september28_september29.csv
Added csv link for file: september29_september30.csv
Added csv link for file: september30_october1.csv
Added csv link for file: october1_october2.csv
Added csv link for file: october2_october3.csv
Added csv link for file: october3_october4.csv
Added csv link for file: october4_october5.csv
Added csv link for file: october5_october6.csv
Added csv link for file: october6_october7.csv
Added csv link for file: october7_october8.csv
Added csv link for file: october8_october9.csv
Added csv link for file: october9_october10.csv
Added csv link for file: october10_october11.csv
Added csv link for file: october11_october12.csv
Adde

Now that we have all the .csv files, let's get all the IDs

In [46]:
links_list

['https://ieee-dataport.s3.amazonaws.com/open/14206/march20_march21.csv?response-content-disposition=attachment%3B%20filename%3D%22march20_march21.csv%22&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAJOHYI4KJCE6Q7MIQ%2F20210111%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20210111T163523Z&X-Amz-SignedHeaders=Host&X-Amz-Expires=86400&X-Amz-Signature=116bdff0ed55fd19639758c4a3232dd0491d04142220a59f491051c42fb7243f',
 'https://ieee-dataport.s3.amazonaws.com/open/14206/march21_march22.csv?response-content-disposition=attachment%3B%20filename%3D%22march21_march22.csv%22&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAJOHYI4KJCE6Q7MIQ%2F20210111%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20210111T163523Z&X-Amz-SignedHeaders=Host&X-Amz-Expires=86400&X-Amz-Signature=2186539f78fd2b5bfb435bd29a59e1ad9cd250de4a5912dbacb0322cfe14cd5f',
 'https://ieee-dataport.s3.amazonaws.com/open/14206/march22_march23.csv?response-content-disposition=attachment%3B%20filename%3D%22march22_march23.csv%2

In [47]:
IDs_list = []

In [48]:
for lst in links_list:
    ids_to_add = get_tweets_to_hydrate(lst)
    for elem in ids_to_add:
        IDs_list.append(elem)

Now, let's export the IDs

In [49]:
save_tweet_IDs(IDs_list, TWEET_ID_DIR + "tweet_IDs_2020-03-20_2021-01-09.csv")

CSV file successfully exported


Now, using these tweet IDs, let's hydrate them to recover the original tweets

First, you have to confirm your credentials. 

`twarc configure`

Then, submit the creds. After doing so successfully, you should get a message like this: 

`The credentials for default have been saved to your configuration file at /Users/mark/.twarc`

Afterwards, you can start hydrating the tweets. 

This can be done in the command line

You'd run something like this:

`twarc hydrate ids.txt > tweets.jsonl`

In my case, running the command from the root directory of this project, it looks something like this:

`twarc hydrate data/tweets/tweet_ids/tweet_IDs_2020-03-20_2021-01-09.csv > data/tweets/hydrated_tweets/2020-03-20_2021-01-09_tweets.jsonl`

#### 2. Preprocess tweets

We likely only care about the following columns:

    • created_at
    • id
    • full_text
    • geo
    • coordinates
    • place (this has the city + state location, as a field called "full_name")
    • retweet_count
    • favorite_count
    
We also want to parse the "created_at" column (we can perhaps create 2 columns, one with the date and one with the hour)

First, we'll define some helper functions

In [50]:
def get_state_from_location(place):
    """
    Gets state info from place field
    Assumes dict input
    """
    
    if place is None:
        state = "NA"  
    elif place["country_code"] != "US":
        state = "NA"
    else:
        state = place["full_name"].split(",")[1].strip() # e.g., "Los Angeles, CA" --> "CA"
        
    return state
        

In [51]:
PUNCTUATION ='''!()-[]{};:'"\,<>./?@$%^&*_~''' # keep hashtags
STOPWORDS = stopwords.words("english")

In [70]:
def remove_emoji(string):
    """
        Removes emojis
    """
    text = string.encode("utf-8")
    allchars = [str for str in text.decode('utf-8')]
    emoji_list = [c for c in allchars if c in emoji.UNICODE_EMOJI]
    clean_text = ' '.join([str for str in text.decode('utf-8').split() if not any(i in str for i in emoji_list)])
    return clean_text

In [88]:
def clean_text(text):
    """
        Removes punctuation, does string split, and removes links
        
        Works on individual tweets
    """
    
    return_arr = []
    
    # remove punctuation
    text_no_punctuation = ""
    
    for char in text:
        if char not in PUNCTUATION:
            text_no_punctuation = text_no_punctuation + char
            
    # remove emojis
    text_no_punctuation = remove_emoji(text_no_punctuation)
    text_no_punctuation = re.sub(r'\\U[a-zA-Z0-9]{8}', '', text_no_punctuation)
    
    # remove \n and \t
    text_no_punctuation = re.sub(r'\n', '', text_no_punctuation)
    text_no_punctuation = re.sub(r'\t', '', text_no_punctuation)
    
    # remove escape sequences
    text_no_escape = ""
    
    for char in text_no_punctuation:
        try:
            char.encode('ascii')
            text_no_escape = text_no_escape + char # this'll catch chars that don't have an ascii equivalent (e.g., emojis)
        except:
            pass
    
    # add space between # and another char before it (e.g., split yes#baseball into yes #baseball)
    text_no_escape = re.sub(r"([a-zA-Z0-9]){1}#", r"\1 #", text_no_escape)
    
    # other preprocessing
    text_arr = text_no_escape.split(' ')
    
    for word in text_arr:
        
        # clean words
        word = word.lower()
        
        if "http" not in word and word.strip() != '' and word not in STOPWORDS:
            return_arr.append(word)
            
    return return_arr


In [83]:
def clean_hydrated_tweets(tweet_jsonl_path):
    
    """
    
        Takes .jsonl from Twitter, returns the cleaned df
        
    """
    
    # get uncleaned df from json
    df = pd.read_json(path_or_buf=tweet_jsonl_path, lines=True)
    df = df[["user","created_at", "id", "full_text", "geo", "coordinates", "place", "retweet_count", "favorite_count"]]
    
    # get state
    print("Getting states for each tweet...")
    states = []
    
    for location_dict in df["place"]:
        try:
            states.append(get_state_from_location(location_dict))
        except Exception as e:
            states.append("NA")
            print(location_dict)
            print(e)
            
    df["US_state"] = states
    
    # get dates of tweets
    print("Getting dates of tweets...")
    dates = []
    months = []
    days = []
    hours = []

    for timestamp in df["created_at"]:
        hour = pd.to_datetime(timestamp).hour
        dt_obj = pd.to_datetime(timestamp).date()
        year = dt_obj.year
        month = dt_obj.month
        day = dt_obj.day

        hours.append(hour)
        months.append(month)
        days.append(day)

        if month < 10:
            month = f"0{month}"

        dates.append(f"{year}-{month}-{day}")

    df["date_of_tweet"] = dates
    df["month_of_tweet"] = months
    df["day_of_tweet"] = days
    df["hour_of_tweet"] = hours

    # clean the text
    print("Cleaning text...")
    df["cleaned_text"] = df["full_text"].apply(clean_text)

    # work with hashtags
    hashtags_arr = []
    num_hashtags_arr = []
    text_no_hashtags_arr = []

    print("Getting the hashtags that we care about...")
    for tokenized_text in df["cleaned_text"]:
        hashtag_lst = []
        text_no_hashtags_lst = []

        for word in tokenized_text:
            if '#' in word:
                hashtag_lst.append(word)
            else:
                text_no_hashtags_lst.append(word)

        hashtags_arr.append(hashtag_lst)
        num_hashtags_arr.append(len(hashtag_lst))
        text_no_hashtags_arr.append(text_no_hashtags_lst)

    df["hashtags"] = hashtags_arr
    df["hashtags_count"] = num_hashtags_arr
    df["cleaned_text_no_hashtags"] = text_no_hashtags_arr

    # get only cols that we care about
    df_small = df[["user", "id", "full_text", "retweet_count", "favorite_count", "place", 
                   "US_state", "date_of_tweet", "month_of_tweet", "day_of_tweet", 
                   "hour_of_tweet", "cleaned_text", "hashtags", "hashtags_count", "cleaned_text_no_hashtags"]]

    print("Finished preprocessing.")
    return df_small

Now, let's get the df that we need

In [84]:
HYDRATED_TWEETS_DIR = "../../data/tweets/hydrated_tweets/"

In [85]:
tweets_df = clean_hydrated_tweets(HYDRATED_TWEETS_DIR + "2020-03-20_2021-01-09_tweets.jsonl")

Getting states for each tweet...
{'id': '96683cc9126741d1', 'url': 'https://api.twitter.com/1.1/geo/id/96683cc9126741d1.json', 'place_type': 'country', 'name': 'United States', 'full_name': 'United States', 'country_code': 'US', 'country': 'United States', 'contained_within': [], 'bounding_box': {'type': 'Polygon', 'coordinates': [[[-179.231086, 13.182335], [179.859685, 13.182335], [179.859685, 71.434357], [-179.231086, 71.434357]]]}, 'attributes': {}}
list index out of range
{'id': '07d9d25c10081003', 'url': 'https://api.twitter.com/1.1/geo/id/07d9d25c10081003.json', 'place_type': 'poi', 'name': 'Winston Trails Golf Club', 'full_name': 'Winston Trails Golf Club', 'country_code': 'US', 'country': 'United States', 'contained_within': [], 'bounding_box': {'type': 'Polygon', 'coordinates': [[[-80.14255034134065, 26.58003454682702], [-80.14255034134065, 26.58003454682702], [-80.14255034134065, 26.58003454682702], [-80.14255034134065, 26.58003454682702]]]}, 'attributes': {}}
list index out 

#### 3. Export tweets

Now that we've cleaned our tweets, let's export them

In [86]:
EXPORT_DIR = "../../data/tweets/"

In [89]:
tweets_df.to_csv(EXPORT_DIR + "tweets_2020-03-20_2021-01-09_with_locations.csv")

In [91]:
tweets_df.shape

(278339, 15)

In [97]:
tweets_df["US_state"].value_counts().head(10)

NA     152733
CA      23347
NY      20843
USA     12535
TX       7910
FL       7811
GA       3880
IL       3274
DC       3145
NJ       2858
Name: US_state, dtype: int64

In [94]:
tweets_df.head()

Unnamed: 0,user,id,full_text,retweet_count,favorite_count,place,US_state,date_of_tweet,month_of_tweet,day_of_tweet,hour_of_tweet,cleaned_text,hashtags,hashtags_count,cleaned_text_no_hashtags
0,"{'id': 169711005, 'id_str': '169711005', 'name...",1240728065983959040,#statewaterheaters #getitin #corona #keepingpe...,0,1,"{'id': '3995cc1483801d24', 'url': 'https://api...",OH,2020-03-19,3,19,19,"[#statewaterheaters, #getitin, #corona, #keepi...","[#statewaterheaters, #getitin, #corona, #keepi...",6,"[new, water, heater, swap, pickerington, ohio]"
1,"{'id': 29464656, 'id_str': '29464656', 'name':...",1240728187136610306,"""ain't no humans outside! (corona!)"" 😂😂😂🤣 @ Cl...",0,0,"{'id': '0eb9676d24b211f1', 'url': 'https://api...",OH,2020-03-19,3,19,19,"[aint, humans, outside, corona, cleveland, ohio]",[],0,"[aint, humans, outside, corona, cleveland, ohio]"
2,"{'id': 3180254803, 'id_str': '3180254803', 'na...",1240728221986906113,Salam Friends\nLooking at the grave financial ...,0,0,"{'id': '01aadce76841e2c5', 'url': 'https://api...",,2020-03-19,3,19,19,"[salam, friends, looking, grave, financial, si...",[],0,"[salam, friends, looking, grave, financial, si..."
3,"{'id': 39557009, 'id_str': '39557009', 'name':...",1240728361556750338,Thanks to COVID19 we are under unprecedented l...,0,0,"{'id': '7db53e74bdc6007e', 'url': 'https://api...",,2020-03-19,3,19,19,"[thanks, covid19, unprecedented, levels, stres...",[],0,"[thanks, covid19, unprecedented, levels, stres..."
4,"{'id': 2923230219, 'id_str': '2923230219', 'na...",1240728639358017536,#tbt to the current #anime that I'm rewatching...,0,0,"{'id': '00893962665a2284', 'url': 'https://api...",,2020-03-19,3,19,19,"[#tbt, current, #anime, im, rewatching, #coron...","[#tbt, #anime, #corona, #codegeass, #lelouchof...",8,"[current, im, rewatching, threat, keeping, us,..."
