In [33]:
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
import torch
import numpy as np
import json
from tqdm import tqdm
import pandas as pd

In [34]:
tokenizer = BertTokenizer.from_pretrained('dmis-lab/biobert-base-cased-v1.2')
model = BertModel.from_pretrained('dmis-lab/biobert-base-cased-v1.2')

In [35]:
with open('extracted_symptoms_list.json','r') as extract_file:
    extracted_symptoms_list = json.load(extract_file)
    
with open('standard_symptoms_list.json','r') as standard_file:
    standard_symptoms_list = json.load(standard_file)
    
with open('top_100_list.json','r') as top:
    top_100 = json.load(top)
    
with open('ext_top.json','r') as top_ext:
    ext_top = json.load(top_ext)

In [36]:
len(ext_top)

1000

In [37]:
len(top_100)

100

In [38]:
100*(len(set(extracted_symptoms_list)))

1914200

In [39]:
path = "dataset/"

In [40]:
pd.set_option('display.max_columns',None)

In [None]:
vax_vaers_data = pd.read_csv("VAX_VAERS_DATA.csv")

In [None]:
symptoms_in_reports = []
for i in vax_vaers_data["SYMPTOM_TEXT"].dropna():
    symptoms_in_reports.append(i)

In [None]:
symptoms_frequency = dict()
for i in extracted_symptoms_list:
    symptoms_frequency[i] = sum(entry.count(i) for entry in symptoms_in_reports)

In [None]:
symptoms_frequency

In [None]:
ext_top = dict(sorted(symptoms_frequency.items(), key=lambda x: x[1], reverse=True)[:1000])

In [None]:
ext_top

In [None]:
import json
with open("ext_top.json",'w') as op_file:
    json.dump(list(ext_top),op_file)

In [None]:
import json
with open("symptoms_frequency.json",'w') as op_file:
    json.dump(symptoms_frequency,op_file)

In [5]:
similarity_scores = dict()
for symp in tqdm(set(ext_top)):
    scores = dict()
    for std_symp in top_100:
        std_symp_tokens = tokenizer(std_symp, return_tensors='pt')
        symp_tokens = tokenizer(symp, return_tensors='pt')

        with torch.no_grad():
            std_symp_emb = model(**std_symp_tokens).last_hidden_state.mean(dim=1)
            symp_emb = model(**symp_tokens).last_hidden_state.mean(dim=1)

        std_symp_emb = std_symp_emb.detach().numpy()
        symp_emb = symp_emb.detach().numpy()

        scores[std_symp] = cosine_similarity([np.ravel(std_symp_emb)],[np.ravel(symp_emb)])[0][0]
        
    max_key, max_val = max(scores.items(), key= lambda x: x[1])
    similarity_scores[symp] = [max_key, max_val]
#     break

100%|██████████| 1000/1000 [3:38:45<00:00, 13.13s/it] 


In [32]:
import csv
csv_file = 'similarity_scores.csv'

# Specify the CSV file name
csv_file = 'similarity_scores.csv'

# Reshape the dictionary into a list of tuples (key, value)
data = [(key, value) for key, value in similarity_scores.items()]

# Writing the reshaped data to a CSV file
with open(csv_file, 'w', newline='') as file:
    writer = csv.writer(file)
    
    # Write header
    writer.writerow(['Key', 'Value'])
    
    # Write data
    writer.writerows(data)

In [7]:
similarity_scores

{'stool': ['Swelling', 0.9072527],
 'lightheadedness': ['Dizziness', 0.9160143],
 'fainted': ['Vomiting', 0.8686165],
 'itchiness': ['Pyrexia', 0.8744232],
 'MYALGIA': ['Myalgia', 1.0000002],
 'Swollen': ['Swelling', 0.9506608],
 'flu like symptom': ['Pyrexia', 0.82842076],
 'CKD': ['SARS-CoV-2 test', 0.82007635],
 'pericarditis': ['Pericarditis', 1.0],
 'an infection': ['Infection', 0.8943497],
 'these symptoms': ['Tenderness', 0.87117016],
 'PYREXIA (fever)': ['Pyrexia', 0.9441159],
 'Throat': ['Cough', 0.9340446],
 'tingling sensation': ['Burning sensation', 0.9725992],
 'breakthrough': ['Swelling', 0.92750686],
 'TENDERNESS': ['Tenderness', 0.99999994],
 'sick': ['Seizure', 0.938822],
 'a side effect': ['Pyrexia', 0.86037886],
 'anaphylactic': ['Tachycardia', 0.8214239],
 'DISCOMFORT': ['Discomfort', 1.0],
 'covid': ['COVID-19', 0.85596645],
 'HEADACHE': ['Headache', 1.0],
 'concurrent conditions': ['Drug ineffective', 0.8543272],
 'cramping': ['Pyrexia', 0.8581922],
 'nervous': ['

In [43]:
df = pd.DataFrame(similarity_scores.items(), columns=['key','values'])

In [44]:
df

Unnamed: 0,key,values
0,stool,"[Swelling, 0.9072527]"
1,lightheadedness,"[Dizziness, 0.9160143]"
2,fainted,"[Vomiting, 0.8686165]"
3,itchiness,"[Pyrexia, 0.8744232]"
4,MYALGIA,"[Myalgia, 1.0000002]"
...,...,...
995,Shortness of breath,"[Dizziness, 0.8203939]"
996,vaccination,"[Vaccination failure, 0.9691125]"
997,lump,"[Seizure, 0.92915714]"
998,PRURITUS (itchiness,"[Pruritus, 0.91945404]"


In [None]:
# # Example tokenization and embedding
# symptom_text = "fever"
# standard_symptom_text = "migraine"

# symptom_tokens = tokenizer(symptom_text, return_tensors='pt')
# standard_symptom_tokens = tokenizer(standard_symptom_text, return_tensors='pt')

In [None]:
# with torch.no_grad():
#     # Convert PyTorch tensors to NumPy arrays
#     symptom_embedding = model(**symptom_tokens).last_hidden_state.mean(dim=1)
#     standard_symptom_embedding = model(**standard_symptom_tokens).last_hidden_state.mean(dim=1)

# # Convert PyTorch tensors to NumPy arrays    
# symptom_embedding_np = symptom_embedding.detach().numpy()
# standard_symptom_embedding_np = standard_symptom_embedding.detach().numpy()

# # Flatten the embeddings to 1D arrays
# flat_symptom_embedding = np.ravel(symptom_embedding_np)
# flat_standard_symptom_embedding = np.ravel(standard_symptom_embedding_np)

# # Calculate cosine similarity
# similarity_score = cosine_similarity([flat_symptom_embedding], [flat_standard_symptom_embedding])[0][0]
# print(f"Similarity Score: {similarity_score:.2f}")