We ran NER using [Twitter-Stanza](https://github.com/social-machines/TweebankNLP). Follow their README for more detailed introduction

In [2]:
import pandas as pd
import os
import numpy as np
import tqdm
import glob
from pandarallel import pandarallel 
pandarallel.initialize()

INFO: Pandarallel will run on 48 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [1]:
import torch
torch.cuda.empty_cache()

In [3]:
import stanza

# config for the `en_tweet` models (models trained only on Tweebank)
config = {
          'processors': 'tokenize,lemma,pos,depparse,ner',
          'lang': 'en',
          'tokenize_pretokenized': True, # disable tokenization
          'tokenize_model_path': '../../TweebankNLP/twitter-stanza/saved_models/tokenize/en_tweet_tokenizer.pt',
          'lemma_model_path': '../../TweebankNLP/twitter-stanza/saved_models/lemma/en_tweet_lemmatizer.pt',
          "pos_model_path": '../../TweebankNLP/twitter-stanza/saved_models/pos/en_tweetewt_tagger.pt',
          "depparse_model_path": '../../TweebankNLP/twitter-stanza/saved_models/depparse/en_tweetewt_parser.pt',
          "ner_model_path": '../../TweebankNLP/twitter-stanza/saved_models/ner/en_tweetwnut17_nertagger.pt',
}

# Initialize the pipeline using a configuration dict
# stanza.download("en")
nlp = stanza.Pipeline(**config) 

2022-04-06 12:01:36 INFO: Loading these models for language: en (English):
| Processor | Package                 |
---------------------------------------
| tokenize  | ../../Twee...kenizer.pt |
| pos       | ../../Twee..._tagger.pt |
| lemma     | ../../Twee...matizer.pt |
| depparse  | ../../Twee..._parser.pt |
| ner       | ../../Twee...rtagger.pt |

2022-04-06 12:01:36 INFO: Use device: gpu
2022-04-06 12:01:36 INFO: Loading: tokenize
2022-04-06 12:01:36 INFO: Loading: pos
2022-04-06 12:01:43 INFO: Loading: lemma
2022-04-06 12:01:43 INFO: Loading: depparse
2022-04-06 12:01:44 INFO: Loading: ner
2022-04-06 12:01:45 INFO: Done loading processors!


In [4]:
def find_NER(x):
    ners = []
    types = []

    doc = nlp(x)
    for sent in doc.sentences:
        if sent.ents:
            for ent in sent.ents:
                ners.append(ent.text)
                types.append(ent.type)
    return "|".join(ners) + "%" + "|".join(types)

In [5]:
data_dir = "../data/final/"
files = sorted(glob.glob(f"{data_dir}*.csv"), reverse=True)

In [6]:
import os 

data_dir = "../data/ner/"
os.makedirs(data_dir,exist_ok=True)

In [8]:
files

['../data/final/remdesivir.csv',
 '../data/final/molnupiravir.csv',
 '../data/final/ivermectin.csv',
 '../data/final/hcq.csv']

In [9]:
from tqdm import tqdm
tqdm.pandas()

for file in files[3:]:
    drug = file.split("/")[-1].split(".")[0]
    print(drug)
    df = pd.read_csv(file, lineterminator="\n", low_memory=False)
    df = df[df.stance!=0]
    df["ner"] = df.full_text.progress_apply(lambda x: find_NER(x))
    df = df[["stance","wave","ner"]]
    df[["ner","type"]] = df.ner.str.split('%', expand=True)
    df.to_csv(f"{data_dir}{drug}")


hcq


100%|██████████| 252198/252198 [7:13:38<00:00,  9.69it/s]  


In [3]:
from wordcloud import WordCloud, STOPWORDS
import random
import string
import glob
import tqdm
def grey_color_func(word, font_size, position, orientation, random_state=None,
                    **kwargs):
    return "hsl(0, 0%%, %d%%)" % random.randint(0, 50)

STOPWORDS = ["covid",'america',"ncov","covid-19","covid19","coronavirus","cov","us","americans","fda","dr","corona","north","south","state","university","merck merck"]

def generate_wordcloud(df, stopwords):
    words = [w.replace("#","").replace("@","").lower() for w 
                                in "|".join(df.ner.values.tolist()).split("|")]
    words = [w for w in words if len(w)>=2]
    all_word_string = ",".join(words).lower().replace("food and","FDA").replace("and drug","FDA").replace("president trump","trump").replace("university of","university").replace("merck merck","merck")
    # stopwords = set(STOPWORDS)
    wordcloud = WordCloud(random_state=1, 
                        stopwords=stopwords,
                        background_color="white",
                        max_words=80, 
                        contour_width=3, 
                        min_word_length = 2,
                        width=600, height=400,
                        # min_font_size = 20,
                        max_font_size = 100,
                        color_func=grey_color_func).generate(all_word_string)

    return wordcloud

In [4]:
import pandas as pd
keywords_dict = {"hcq":"Hydroxychloroquine|hcq|plaque|plaquenil|hydroquine|axemal",
                    "ivermectin": "ivermectin|stromectol|soolantra|sklice|ivm",
                    "remdesivir": "remdesivir|veklury|rem",
                    "molnupiravir": "molnupiravir|merck's drug|merck's pill|merck's antiviral|merck's"}

files = sorted(glob.glob("../data/ner/*"))

all_wcs = []
for file in tqdm.tqdm(files):
    # pos, neg = [], [] 
    drug_wcs = []
    drug = file.split("/")[-1]
    stopwords = keywords_dict[drug].split("|")
    stopwords.extend(STOPWORDS)
    df = pd.read_csv(file)
    for i in [1,2,3]:
        # print("On wave %i"%i)
        df = df[~df.ner.isna()]
        w = df[df.wave==i]
        # print(Counter(words).most_common()[:20])
        drug_wcs.append(generate_wordcloud(w,stopwords))
    all_wcs.append(drug_wcs)


100%|██████████| 4/4 [00:04<00:00,  1.06s/it]


In [5]:
import numpy as np
all_wcs = np.array(all_wcs).T

  all_wcs = np.array(all_wcs).T
  all_wcs = np.array(all_wcs).T


In [6]:
import matplotlib.pyplot as plt
import numpy as np

fig, axs = plt.subplots(3, 4,  figsize=(140, 70), sharex=True, sharey=True)
for i in range(3):
    for j in range(4):
        axs[i][j].imshow(all_wcs[i][j])
        axs[i][j].axis("off")



In [7]:
plt.savefig("wordcloud.jpg")