### Purpose of script

In this notebook, I'll build on `hydrate_tweets_2020-12-17.ipynb` and hydrate all the tweets from https://ieee-dataport.org/open-access/coronavirus-covid-19-geo-tagged-tweets-dataset from March 20th to December 22nd, 2020.


In [1]:
import numpy as np
import pandas as pd
import os
import json
import datetime as datetime
import re
import nltk 
from nltk.corpus import stopwords
import emoji

# Show every column when a DataFrame is displayed (None removes pandas' column limit).
pd.set_option('display.max_columns', None)

#### 1. Hydrate tweets

In this part of the code, we'll read the .csv files from the website above and extract the tweet IDs from each one. We'll do this for every file, then export all the IDs from March 20th to December 22nd as a .csv file.

In [3]:
# Relative path of the tweet-ID directory; it is the default value of the
# `tweet_id_dir` argument of get_tweets_to_hydrate.
TWEET_ID_DIR = "../../data/tweets/tweet_ids/"

In [4]:
def get_tweets_to_hydrate(link, csv_name, tweet_id_dir = TWEET_ID_DIR):
    """
        Read one tweet-ID CSV and return the unique tweet IDs it contains.

        Parameters
        ----------
        link : str, path or file-like object
            Location of a header-less CSV, handed straight to `pd.read_csv`.
            Its columns are labelled "tweet_id" and "sentiment_score".
        csv_name : str
            Not used by this function; kept so existing calls keep working.
        tweet_id_dir : str
            Not used by this function; kept so existing calls keep working.

        Returns
        -------
        list
            The tweet IDs in first-seen order, each appearing exactly once.
    """

    df = pd.read_csv(link, names=["tweet_id", "sentiment_score"])

    # De-duplicate on the ID column itself. Dropping duplicate *rows* would keep
    # an ID more than once whenever it is listed with different sentiment scores,
    # and the same tweet would then be requested several times during hydration.
    # Working on a new Series also avoids mutating `df` in place.
    unique_ids = df["tweet_id"].drop_duplicates()

    return unique_ids.tolist()

In [5]:
def save_tweet_IDs(tweet_ids, filepath):
    """
        Append tweet IDs to a text/.csv file and report the export.

        IDs are written one per line, separated by ", " + newline; no separator
        is written after the last ID.

        Parameters
        ----------
        tweet_ids : iterable
            Tweet IDs to write; each one is formatted with an f-string.
        filepath : str
            Destination file. It is opened in append mode, so existing content
            is kept and the file is created if it does not exist.

        Returns
        -------
        None
    """

    # Materialise once so any iterable (list, Series, generator) is accepted.
    ids = list(tweet_ids)

    # A previous call leaves the file without a trailing newline. When appending
    # to such a file, a separator must be written first; otherwise the first new
    # ID is glued onto the last existing one and both are corrupted.
    needs_separator = False
    if ids and os.path.isfile(filepath) and os.path.getsize(filepath) > 0:
        with open(filepath, "rb") as existing:
            existing.seek(-1, os.SEEK_END)
            needs_separator = existing.read(1) != b"\n"

    with open(filepath, "a+") as f:
        if needs_separator:
            f.write(", \n")
        f.write(", \n".join(f"{tweet}" for tweet in ids))

    # Report the path that was actually written. The earlier message used
    # `csv_name`, which is not defined inside this function.
    print(f"CSV file {filepath} successfully exported")
    

Let's get all the .csv links

In [None]:
# Download links for the daily tweet-ID CSVs (one file per day pair, e.g. march20_march21.csv).
# NOTE: these are signed S3 URLs carrying X-Amz-Date=20201222T180633Z and X-Amz-Expires=86400,
# so they were presumably only valid for a limited time after that timestamp. Fresh links
# probably have to be obtained from the dataset page before re-running this notebook (TODO confirm).
links_list = [
    "https://ieee-dataport.s3.amazonaws.com/open/14206/march20_march21.csv?response-content-disposition=attachment%3B%20filename%3D%22march20_march21.csv%22&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAJOHYI4KJCE6Q7MIQ%2F20201222%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20201222T180633Z&X-Amz-SignedHeaders=Host&X-Amz-Expires=86400&X-Amz-Signature=dbc018fbf9afe4b94d0c65019ffdc4de7f2072787dd2747acebfff07ba218d39", 
    "https://ieee-dataport.s3.amazonaws.com/open/14206/march21_march22.csv?response-content-disposition=attachment%3B%20filename%3D%22march21_march22.csv%22&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAJOHYI4KJCE6Q7MIQ%2F20201222%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20201222T180633Z&X-Amz-SignedHeaders=Host&X-Amz-Expires=86400&X-Amz-Signature=e1908ab5e5ab3b8734ea6e6f1cd15df90c348c4e3734f4bc044cdd9b0536fd03", 
    "https://ieee-dataport.s3.amazonaws.com/open/14206/march22_march23.csv?response-content-disposition=attachment%3B%20filename%3D%22march22_march23.csv%22&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAJOHYI4KJCE6Q7MIQ%2F20201222%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20201222T180633Z&X-Amz-SignedHeaders=Host&X-Amz-Expires=86400&X-Amz-Signature=eba874ef352f5d374e949ed583c32c227549c3c672ef016bfe8831fcb173979b", 
    "https://ieee-dataport.s3.amazonaws.com/open/14206/march23_march24.csv?response-content-disposition=attachment%3B%20filename%3D%22march23_march24.csv%22&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAJOHYI4KJCE6Q7MIQ%2F20201222%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20201222T180633Z&X-Amz-SignedHeaders=Host&X-Amz-Expires=86400&X-Amz-Signature=5a1225643892cbf405ca2964f130a447f99b30c48ba3f14787213cafe41db3cc",
    
]