In [1]:
import matplotlib.pyplot as plt
import numpy as np
from numpy.polynomial.polynomial import polyfit
import scipy.stats
import glob
from dateutils import timedelta
import pandas as pd
from datetime import datetime
import tqdm

In [2]:
from pandarallel import pandarallel as pdrl
pdrl.initialize()

INFO: Pandarallel will run on 48 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [3]:
drug_common = "../data/final/"
drug_dict = {"hcq": f"{drug_common}hcq.csv",
              "ivermectin": f"{drug_common}ivermectin.csv",
              "remdesivir": f"{drug_common}remdesivir.csv",
              "molnupiravir": f"{drug_common}molnupiravir.csv"}

In [4]:
# set the first date of the datasets: 2020-01-22
start_date = datetime.strptime("01/22/2020", "%m/%d/%Y").date()
print(f"Start date: {start_date}")

# set the end date 2020-12-31
end_date = datetime.strptime("11/30/2021", "%m/%d/%Y").date()
print(f"End date: {end_date}")

Start date: 2020-01-22
End date: 2021-11-30


In [5]:
import math
def to_week(x):
    return math.ceil((x + timedelta(days=1) - start_date) / timedelta(weeks=1))
def to_date(week):
    return start_date + timedelta(weeks=week-1)

## Get tweet numbers 

In [82]:
# files = glob.glob(f"{drug_common}raw/*.csv")
# with open("tweet_amount.log","w") as log_file:
#     for file in tqdm.tqdm(files):
#         print(file)
#         df = pd.read_csv(file,lineterminator="\n")
#         # convert date strings to dates
#         df = df[df.full_text.notna()]
#         df["date"] = df["created_at"].parallel_apply(lambda x: datetime.strptime(x, "%a %b %d %H:%M:%S %z %Y").date())
#         df["week"] = df["date"].parallel_apply(lambda x: to_week(x))
# #         df.to_csv(file, index=False)
#         # Split data into weekly data files
#         start_week = df["week"].min()
#         end_week = df["week"].max()

#         for i in range(start_week, end_week + 1):
#             week_start = (start_date + timedelta(weeks=i - 1))
#             week_i_df = df[df.week == i].sort_values("date")
#             print("Week %i have %i raw tweets\n"%(i, len(week_i_df)))
#             log_file.write("Week %i have %i raw tweets\n"%(i, len(week_i_df)))

In [83]:
# with open("tweet_amount.log","r") as file:
#     log = [l for l in file.read().split("\n") if "raw" in l]

# log = np.unique(np.array(log))
# print("\n".join(list(log)))

### Get drug tweet number distribution

In [None]:
with open("drug_tweet_amount.log","w") as log_file:
    for drug, drug_path in drug_dict.items():
        df = pd.read_csv(drug_path,lineterminator="\n", low_memory=False)
        # convert date strings to dates
        if "date" not in df.columns:
            df["date"] = df["created_at"].apply(lambda x: datetime.strptime(x, "%a %b %d %H:%M:%S %z %Y").date())
            df["week"] = df["date"].apply(lambda x: to_week(x))
#                 df.to_csv(file, index=False)
            df = df.sort_values("week")
        start_week = df["week"].min()
        end_week = df["week"].max()

        for i in range(start_week, end_week + 1):
            week_start = (start_date + timedelta(weeks=i - 1))
            week_i_df = df[df.week == i].sort_values("date")
            print("Week %i have %i %s tweets"%(i, len(week_i_df), drug))
            log_file.write("Week %i have %i %s tweets\n"%(i, len(week_i_df), drug))

In [39]:
dfs = []
with open("drug_tweet_amount.log", 'r') as file:
    log = [l for l in file.read().split("\n") if l]
    weeks = sorted(list(map(int,np.unique(np.array([l.split(" ")[1] for l in log])).tolist())))
    total_d = dict.fromkeys(weeks, 0)
    for drug in drug_dict.keys():
        drug_d = dict.fromkeys(weeks, 0)
        drug_ratio_d = dict.fromkeys(weeks, 0)
        drug_log = [l for l in log if l and drug in l]
        for line in drug_log:
            drug_d[int(line.split(" ")[1])] += int(line.split(" ")[3])/1000
        drug_df = pd.DataFrame.from_dict(drug_d, orient='index', columns=[drug]).reset_index()
        drug_df.insert(0,"week",drug_df.pop("index"))
        dfs.append(drug_df)

df = pd.concat(dfs,axis=1).T.drop_duplicates().T

In [42]:
df["start_date"] = drug_df.week.apply(lambda x:to_date(int(x)))

In [44]:
df

Unnamed: 0,week,hcq,ivermectin,remdesivir,molnupiravir,start_date
0,2.0,0.000,0.000,0.046,0.000,2020-01-29
1,3.0,0.000,0.000,0.174,0.000,2020-02-05
2,4.0,0.000,0.000,0.122,0.000,2020-02-12
3,5.0,0.000,0.000,0.087,0.000,2020-02-19
4,6.0,0.003,0.004,0.169,0.000,2020-02-26
...,...,...,...,...,...,...
91,93.0,0.355,4.398,0.211,0.183,2021-10-27
92,94.0,0.356,9.785,0.183,0.925,2021-11-03
93,95.0,0.213,4.762,0.234,0.140,2021-11-10
94,96.0,0.276,4.838,0.227,0.200,2021-11-17


In [45]:
import datetime
def get_str(x):
    return x.strftime("%b %d, %Y")

drug_df["week_str"] = drug_df["start_date"].apply(lambda x: get_str(x))
drug_df

Unnamed: 0,week,molnupiravir,start_date,week_str
0,2,0.000,2020-01-29,"Jan 29, 2020"
1,3,0.000,2020-02-05,"Feb 05, 2020"
2,4,0.000,2020-02-12,"Feb 12, 2020"
3,5,0.000,2020-02-19,"Feb 19, 2020"
4,6,0.000,2020-02-26,"Feb 26, 2020"
...,...,...,...,...
91,93,0.183,2021-10-27,"Oct 27, 2021"
92,94,0.925,2021-11-03,"Nov 03, 2021"
93,95,0.140,2021-11-10,"Nov 10, 2021"
94,96,0.200,2021-11-17,"Nov 17, 2021"


In [46]:
df.hcq.sum()  ## check

192.625

In [47]:
# df.to_csv("tweet_fraction.csv",index=False)
drug_df = df

## COVID data (new cases)

In [49]:
## Get new cases 
files = sorted(glob.glob("../COVID-19/csse_covid_19_data/csse_covid_19_daily_reports_us/*.csv"))
c_19_summary_us = {file.split("/")[-1].split(".csv")[0]:
                sum(pd.read_csv(file)["Confirmed"].astype(int)) for file in files}

US only contains info from 2020-04-011

In [51]:
## cite this: https://www.sheffield.ac.uk/international/english-speaking-countries
# English_speaking_countries = ['Antigua and Barbuda',
#                                  'Australia',
#                                  'Bahamas',
#                                  'Barbados',
#                                  'Belize',
#                                  'Canada',
#                                  'Dominica',
#                                  'Grenada',
#                                  'Guyana',
#                                  'Ireland',
#                                  'Jamaica',
#                                  'Malta',
#                                  'New Zealand',
#                                  'Saint Kitts and Nevis',
#                                  'Saint Lucia',
#                                  'Saint Vincent and the Grenadines',
#                                  'Trinidad and Tobago',
#                                  'United Kingdom',
#                                  'US']

# English_speaking_countries = ["United Kingdom","US","Canada","Philippines","India"]

In [52]:
## Get new cases 
files = sorted(glob.glob("../COVID-19/csse_covid_19_data/csse_covid_19_daily_reports/*.csv"))
# files = sorted([f for f in files if "2020" in f])
# files = files[:files.index('COVID-19/csse_covid_19_data/csse_covid_19_daily_reports/04-11-2020.csv')]'

In [53]:
for file in files:
    df = pd.read_csv(file,header=0)
    if "Country/Region" in df.columns:
        df[df["Country/Region"]=='US']["Confirmed"]
        g = df.groupby("Country/Region").agg("sum")
    else:
        df[df["Country_Region"]=="US"]["Confirmed"]
        g = df.groupby("Country_Region").agg("sum")    
    c_19_summary_us[file.split("/")[-1].split(".csv")[0]] = g["Confirmed"]["US"]


In [54]:
# C19_df = pd.DataFrame.from_dict(c_19_summary, orient='index',columns=["aggregated cases"]).reset_index()
def get_new_cases(C19_df):
    C19_df.insert(0,"date",C19_df.pop("index"))
    C19_df["week"] = C19_df.date.apply(lambda x: to_week(datetime.datetime.strptime(x,'%m-%d-%Y').date()))
    C19_df = C19_df.T[1:].T.groupby("week").agg("sum")
    C19_df["new"] = C19_df["aggregated cases"] - ([0]+list(C19_df["aggregated cases"]))[:-1]
    C19_df = C19_df.reset_index()
    C19_df = C19_df[C19_df.week<=97]
    C19_df = C19_df[1:]
    C19_df = C19_df.drop("aggregated cases",axis=1)
    return C19_df

In [55]:
us = get_new_cases(pd.DataFrame.from_dict(c_19_summary_us, orient='index',columns=["aggregated cases"]).reset_index())
us

Unnamed: 0,week,new
1,2,33.0
2,3,31.0
3,4,18.0
4,5,138.0
5,6,309.0
...,...,...
92,93,3583776.0
93,94,3595808.0
94,95,3907836.0
95,96,4562487.0


In [56]:
# drug_df = pd.read_csv("tweet_fraction.csv")
# drug_df

Unnamed: 0,week,hcq,ivermectin,remdesivir,molnupiravir,start_date
0,2.0,0.000,0.000,0.046,0.000,2020-01-29
1,3.0,0.000,0.000,0.174,0.000,2020-02-05
2,4.0,0.000,0.000,0.122,0.000,2020-02-12
3,5.0,0.000,0.000,0.087,0.000,2020-02-19
4,6.0,0.003,0.004,0.169,0.000,2020-02-26
...,...,...,...,...,...,...
91,93.0,0.355,4.398,0.211,0.183,2021-10-27
92,94.0,0.356,9.785,0.183,0.925,2021-11-03
93,95.0,0.213,4.762,0.234,0.140,2021-11-10
94,96.0,0.276,4.838,0.227,0.200,2021-11-17


In [57]:
df = pd.merge(us,drug_df,on="week",how="outer")
df

Unnamed: 0,week,new,hcq,ivermectin,remdesivir,molnupiravir,start_date
0,2,33.0,0.000,0.000,0.046,0.000,2020-01-29
1,3,31.0,0.000,0.000,0.174,0.000,2020-02-05
2,4,18.0,0.000,0.000,0.122,0.000,2020-02-12
3,5,138.0,0.000,0.000,0.087,0.000,2020-02-19
4,6,309.0,0.003,0.004,0.169,0.000,2020-02-26
...,...,...,...,...,...,...,...
91,93,3583776.0,0.355,4.398,0.211,0.183,2021-10-27
92,94,3595808.0,0.356,9.785,0.183,0.925,2021-11-03
93,95,3907836.0,0.213,4.762,0.234,0.140,2021-11-10
94,96,4562487.0,0.276,4.838,0.227,0.200,2021-11-17


In [7]:
def w(x):
    if x<35: return 1
    elif x<77 and x>=35: return 2
    else: return 3
df["wave"] = df["week"].apply(lambda x: w(x))

In [62]:
df.to_csv("final_stats.csv",index=False)

In [59]:
df = pd.read_csv("final_stats.csv")
df

Unnamed: 0,week,new,hcq,ivermectin,remdesivir,molnupiravir,start_date
0,2,33.0,0.000,0.000,0.046,0.000,2020-01-29
1,3,31.0,0.000,0.000,0.174,0.000,2020-02-05
2,4,18.0,0.000,0.000,0.122,0.000,2020-02-12
3,5,138.0,0.000,0.000,0.087,0.000,2020-02-19
4,6,309.0,0.003,0.004,0.169,0.000,2020-02-26
...,...,...,...,...,...,...,...
91,93,3583776.0,0.355,4.398,0.211,0.183,2021-10-27
92,94,3595808.0,0.356,9.785,0.183,0.925,2021-11-03
93,95,3907836.0,0.213,4.762,0.234,0.140,2021-11-10
94,96,4562487.0,0.276,4.838,0.227,0.200,2021-11-17
