### Remove tweets with more than one drug

The model was trained on HCQ data, and we use the unmasked model for inference, so we need to remove tweets that mention more than one drug.

In [3]:
# Dataset name; combined with `data_dir` to build the CSV path "{data_dir}{DRUG}.csv".
DRUG = "hcq"
# Directory prefix of the per-drug CSV files (must end with a path separator,
# because it is concatenated directly with the file name).
data_dir = "../data/final/"

In [None]:
# This cell sits above the notebook's import cell, so import pandas here to keep
# "Restart Kernel -> Run All" working.
import pandas as pd

# Regex alternations (consumed by Series.str.contains) identifying each drug.
keywords_dict = {"hcq":"ydroxych| hcq |plaqu |plaquenil|hydroquin|axemal",
                    "ivermectin": "ivermectin|stromectol|soolantra|sklice",
                    "remdesivir": "remdesivir|veklury",
                    "molnupiravir": "molnupiravir|merck's drug|merck's pill|merck's antiviral"}

# For every drug, drop the tweets that also match a keyword of any *other* drug
# and overwrite the drug's CSV with the remaining rows.
for drug in keywords_dict:
    # NOTE(review): this path differs from `data_dir` ("../data/final/") used by
    # the other cells -- confirm which one is intended.
    csv_path = f"../../data/final/{drug}.csv"
    df = pd.read_csv(csv_path)

    # Single pattern made of the keywords of every drug except the current one.
    keys_for_other_drugs = "|".join(
        pattern for name, pattern in keywords_dict.items() if name != drug
    )

    # na=False: a missing `full_text` yields NaN from str.contains, which cannot
    # be negated or used as a boolean mask. Such rows mention no other drug, so
    # they are treated as non-matching and kept.
    mentions_other_drug = df.full_text.str.contains(
        keys_for_other_drugs, case=False, na=False
    )
    df = df[~mentions_other_drug]

    if not df.empty:
        df.to_csv(csv_path, index=False)
        message = "%i tweets written to %s\n"%(len(df), csv_path)
        print(message)
    else:
        # The existing file is left untouched when every row was filtered out.
        print("Nothing left. Skipping...")

In [4]:
import pandas as pd
from pandarallel import pandarallel
pandarallel.initialize()
import re

def clean_url(x):
    """Return `x` with every URL-like token removed.

    A token is anything starting with "http" and running up to the next
    whitespace character; it is replaced by the empty string, so the
    whitespace around it is left as is.
    """
    url_pattern = r"http\S+"
    without_urls = re.sub(url_pattern, "", x)
    return without_urls

# Load the CSV of the dataset selected by `DRUG` from `data_dir`.
df = pd.read_csv(f"{data_dir}{DRUG}.csv")

INFO: Pandarallel will run on 48 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [8]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# The tokenizer is taken from the base checkpoint named by MODEL; both
# classifiers are loaded from local directories and moved to GPU "cuda:1".
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Classifier stored in the "-masked" model directory.
model_masked = AutoModelForSequenceClassification.from_pretrained(
    "model/Twitter-drug-stance-bert-masked"
)
model_masked = model_masked.to("cuda:1")
model_masked.eval()

# Classifier stored in the model directory without the "-masked" suffix.
model = AutoModelForSequenceClassification.from_pretrained(
    "model/Twitter-drug-stance-bert"
)
model = model.to("cuda:1")
# Kept as the cell's last expression so the module structure is displayed.
model.eval()



RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [15]:
def _predict_label(text, classifier):
    """Tokenize `text`, run `classifier` on it and return the arg-max class index.

    Args:
        text: a single string to classify.
        classifier: a sequence-classification model whose output exposes `.logits`.

    Returns:
        int: index of the highest logit for the (single) input.
    """
    # Truncate to 512 tokens: the loaded RoBERTa models have a 514-row position
    # embedding table, so longer inputs would fail inside the model.
    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    # Follow the classifier's own device instead of hard-coding a GPU id.
    encoded = encoded.to(classifier.device)

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        logits = classifier(**encoded).logits

    predictions = torch.argmax(logits, dim=-1)
    return int(predictions[0])


def infer(x):
    """Return the class index predicted for text `x` by the global `model`."""
    return _predict_label(x, model)


def masked_infer(x):
    """Return the class index predicted for text `x` by the global `model_masked`."""
    return _predict_label(x, model_masked)

In [16]:

# def re_map(x):
#     if x==1: return -1
#     elif x==2: return 1
#     else: return 0
# df=df[:200]  ## test test
# df.stance = df.stance.parallel_apply(lambda x: re_map(x))
# df.stance.value_counts()  



In [17]:
# df.to_csv(f"{data_dir}{DRUG}.csv",index=False)

## Evaluation


In [18]:
import glob
import pandas as pd
from tqdm import tqdm
import os

# Register `progress_apply` on pandas objects.
tqdm.pandas()
evaluation_dir = "evaluation/"
os.makedirs(evaluation_dir, exist_ok=True)

# Number of tweets drawn per drug for evaluation.
EVAL_SAMPLE_SIZE = 100

# For each drug: draw a fixed-seed sample, label it with both models and write
# the result to `evaluation_dir`.
for drug in ["hcq","ivermectin","molnupiravir","remdesivir"]:
    drug_df = pd.read_csv(f"{data_dir}{drug}.csv")
    # DataFrame.sample raises ValueError when asked for more rows than exist
    # (sampling without replacement), so cap the request at the file size.
    df = drug_df.sample(min(EVAL_SAMPLE_SIZE, len(drug_df)), random_state=42)

    # Run the unmasked model once and reuse the result for both columns that
    # previously each triggered their own inference pass.
    unmasked_predictions = df.full_text.progress_apply(infer)

    # Assignment order is kept so the CSV column order stays the same.
    df["stance"] = unmasked_predictions
    df["masked_prediction"] = df.full_text.progress_apply(masked_infer)
    df["prediction"] = unmasked_predictions
    df.to_csv(f"{evaluation_dir}{drug}.csv",index=False)

100%|██████████| 100/100 [00:01<00:00, 95.25it/s]
100%|██████████| 100/100 [00:01<00:00, 97.42it/s]
100%|██████████| 100/100 [00:01<00:00, 98.72it/s]
100%|██████████| 100/100 [00:01<00:00, 98.22it/s]
100%|██████████| 100/100 [00:01<00:00, 90.24it/s]
100%|██████████| 100/100 [00:01<00:00, 90.03it/s]
100%|██████████| 100/100 [00:01<00:00, 98.86it/s]
100%|██████████| 100/100 [00:01<00:00, 93.95it/s]
