In [1]:
import pandas as pd
from pandarallel import pandarallel
pandarallel.initialize()

INFO: Pandarallel will run on 48 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


### Concat all data 

In [3]:
drugs = ["hcq","ivermectin","molnupiravir","remdesivir"]
data_dir = "../data/final/"
dfs = []
for drug in drugs:
    df = pd.read_csv(f"{data_dir}{drug}.csv",header=0,lineterminator='\n')
    if "Unnamed: 0" in df.columns:
        df = df.drop("Unnamed: 0",axis=1)
    df["drug"] = drug
    dfs.append(df)
df = pd.concat(dfs)
# df = df[df["created_at"].str.contains("\+0000")]

In [4]:
len(df.screen_name.unique())

127491

### convert date to week and wave

In [7]:
from datetime import datetime

# set the first date of the datasets: 2020-01-22
start_date = datetime.strptime("01/22/2020", "%m/%d/%Y").date()
print(f"Start date: {start_date}")
# set the end date 2020-12-31
end_date = datetime.strptime("11/30/2021", "%m/%d/%Y").date()
print(f"End date: {end_date}")


import math
from dateutils import timedelta

def to_week(x):
    if x:
        return math.ceil((x + timedelta(days=1) - start_date) / timedelta(weeks=1))
    return np.nan
    
def to_date(week):
    return start_date + timedelta(weeks=week-1)

import numpy as np
import tqdm

def map_date(x):
    try:
        return datetime.strptime(x, "%a %b %d %H:%M:%S %z %Y").date()
    except:
        return np.nan

def map_wave(x):
    if x<35:
        return 1
    elif x>=77:
        return 3
    else:
        return 2

df["date"] = df["created_at"].parallel_apply(lambda x: map_date(x))
df = df.dropna(subset=["date","state","full_text"])
df["week"] = df["date"].parallel_apply(lambda x: to_week(x))

df["wave"] = df.week.apply(lambda x: map_wave(x))


### Map state

In [None]:
len(df)

549388

In [8]:
df.columns

Index(['id', 'full_text', 'screen_name', 'state', 'stance', 'med', 'date',
       'week', 'wave', 'desensitized_text', 'drug'],
      dtype='object')

In [9]:
## get rid of tweets with more than one state
df = df[~df.state.str.contains("_")]
len(df)

484461

In [10]:
geo_reference = pd.read_csv("us_states.csv")
geo_reference[:5]

Unnamed: 0,State,Abbreviation,Alpha code
0,Alabama,Ala.,AL
1,Alaska,,AK
2,Arizona,Ariz.,AZ
3,Arkansas,Ark.,AR
4,California,Calif.,CA


In [11]:
states = geo_reference.State.str.lower().tolist()
abbs = [a.replace(".","") for a in geo_reference.Abbreviation.dropna().str.lower().tolist()]
alpha = geo_reference["Alpha code"].str.lower().tolist()
dest = geo_reference.State.tolist()

geo_reference = dict(zip(states,dest))
geo_reference.update(dict(zip(abbs,dest)))
geo_reference.update(dict(zip(alpha,dest)))


In [12]:
def find_us_state(x):
    '''Unifies the state names'''
    if "_" in x:
        final = []
        for s in x.split("_"):
            if s.lower() in states+abbs+alpha:
               final.append(geo_reference[s.lower()])
        return "_".join(final)
    for s in x.split(", "):
        if s.lower() in states+abbs+alpha:
            return geo_reference[s.lower()]
            
    return x
            

In [13]:
df.state = df.state.parallel_apply(lambda x: find_us_state(x))
df.state.unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


array(['Colorado', 'Georgia', 'California', 'Arizona',
       'District of Columbia', 'New Jersey', 'South Carolina',
       'Washington', 'Maryland', 'North Carolina', 'Oklahoma',
       'Massachusetts', 'Pennsylvania', 'Mississippi', 'Kansas',
       'Florida', 'Michigan', 'New York', 'Tennessee', 'Texas',
       'Rhode Island', 'Nevada', 'Minnesota', 'Oregon', 'Indiana',
       'Virginia', 'Illinois', 'Delaware', 'Wisconsin', 'Kentucky',
       'Arkansas', 'Ohio', 'Montana', 'Connecticut', 'New Hampshire',
       'Alaska', 'Louisiana', 'Missouri', 'Wyoming', 'New Mexico',
       'Alabama', 'Idaho', 'Maine', 'Hawaii', 'West Virginia', 'Utah',
       'Iowa', 'Vermont', 'Nebraska', 'South Dakota', 'North Dakota'],
      dtype=object)

In [38]:
import numpy as np
df["count"] = df.stance
average_stance_dist = df[["drug","wave","state","stance","count"]].groupby(["drug","wave","state"]).agg({"stance":np.sum,"count":np.size}).reset_index()
average_stance_dist

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["count"] = df.stance


Unnamed: 0,drug,wave,state,stance,count
0,hcq,1,Alabama,512,1426
1,hcq,1,Alaska,107,334
2,hcq,1,Arizona,1005,3879
3,hcq,1,Arkansas,115,640
4,hcq,1,California,757,18104
...,...,...,...,...,...
702,remdesivir,3,Virginia,-7,100
703,remdesivir,3,Washington,96,208
704,remdesivir,3,West Virginia,-3,31
705,remdesivir,3,Wisconsin,15,57


In [39]:
average_stance_dist["average_stance"] = round(average_stance_dist.stance/average_stance_dist["count"], 4)
average_stance_dist.pop("stance")
average_stance_dist.pop("count")
average_stance_dist


Unnamed: 0,drug,wave,state,average_stance
0,hcq,1,Alabama,0.3590
1,hcq,1,Alaska,0.3204
2,hcq,1,Arizona,0.2591
3,hcq,1,Arkansas,0.1797
4,hcq,1,California,0.0418
...,...,...,...,...
702,remdesivir,3,Virginia,-0.0700
703,remdesivir,3,Washington,0.4615
704,remdesivir,3,West Virginia,-0.0968
705,remdesivir,3,Wisconsin,0.2632


In [40]:
average_stance_dist.to_csv("stance_geo_average.csv",index=False)