In [1]:
import pandas as pd

# Load the two annotation sheets of the DRUG-AE workbook.
drugae = pd.read_excel("DRUG-AE.xls", sheet_name="DRUG-AE")
drugdos = pd.read_excel("DRUG-AE.xls", sheet_name="DRUG-DOSE")

# Left join: every drugae row is kept, and the DRUG-DOSE columns are filled in
# wherever all four key columns match. A key that matches more than one
# DRUG-DOSE row produces more than one output row.
s1 = drugae.merge(
    drugdos,
    how="left",
    on=["PMID", "Drug", "Drug from", "Drug to"],
)

The drugae dataset contains both positive and negative sentences. The merge should have taken effect only for the positive sentences; however, the number of lines after the merge is not equal to the number of lines before the merge. The difference is due to multiple adverse event entities within some phrases, which leads to additional lines. Manual inspection confirmed this.

# Randomization #
The target dataset will consist of three groups of lines:
- 100 lines containing annotated doses
- 100 lines containing only a drug and an AE, without dose information
- 100 lines from the negative dataset (which was already manually imported into the Excel sheet and labeled as negative)

In [2]:
from numpy.random import default_rng

# Select random 100 negative #

In [3]:
# Negative sentences: rows whose HasAE label is exactly "Negative".
negative = s1[s1["HasAE"] == "Negative"]

# Draw 100 distinct index labels (replace=False). The generator is seeded with a
# fixed value so that a fresh kernel run selects the same rows every time;
# change the seed to obtain a different, but still reproducible, sample.
# Raises ValueError if fewer than 100 negative rows are available.
arr_indices_random_negative = default_rng(42).choice(
    negative.index, size=100, replace=False
)

# Select random 100 with Dose #

In [5]:
# Rows that carry a dose annotation, i.e. a non-null Dose value after the merge.
withDoseMask = pd.notna(s1["Dose"])
doseRows = s1[withDoseMask]

# Draw 100 distinct index labels (replace=False). The generator is seeded with a
# fixed value so that a fresh kernel run selects the same rows every time;
# change the seed to obtain a different, but still reproducible, sample.
# Raises ValueError if fewer than 100 rows with a dose are available.
arr_indices_random_withDose = default_rng(42).choice(
    doseRows.index, size=100, replace=False
)

# Select random 100 positive without Dose #

In [6]:
# Positive sentences: rows whose HasAE label is exactly "Positive".
positive = s1[s1["HasAE"] == "Positive"]
# Of those, keep only the rows for which no dose was merged in (Dose is null).
withOutDoseMask = pd.isna(positive["Dose"])
positiveWithoutDoseRows = positive[withOutDoseMask]

# Draw 100 distinct index labels (replace=False). The generator is seeded with a
# fixed value so that a fresh kernel run selects the same rows every time;
# change the seed to obtain a different, but still reproducible, sample.
# Raises ValueError if fewer than 100 such rows are available.
arr_indices_random_withOutDose = default_rng(42).choice(
    positiveWithoutDoseRows.index, size=100, replace=False
)

# Create a new dataframe with the rows from the previously calculated indices #

In [25]:
# Look up the sampled index labels in the merged frame, one frame per group.
N = s1.loc[arr_indices_random_negative]    # negative sentences
D = s1.loc[arr_indices_random_withDose]    # rows with a dose
P = s1.loc[arr_indices_random_withOutDose]  # positive rows without a dose

# Stack the three groups in the order negative, with dose, without dose;
# the original s1 index labels are preserved.
frames = [N, D, P]
result = pd.concat(frames)

# Remove the identifier and offset columns from the sampled frame.
final = result.drop(["PMID", "Drug from", "Drug to"], axis="columns")

In [26]:
# Display the sampled frame (pandas truncates the middle rows of long frames).
final

Unnamed: 0,Phrase,Adverse Effect,Drug,HasAE,Dose
10103,RESULTS: [1] Mucin secretion was increased by ...,,,Negative,
22493,While more studies are needed to better unders...,,,Negative,
14170,Primary spontaneous coronary artery dissection...,,,Negative,
22175,"Liver function continued to deteriorate, and t...",,,Negative,
8118,The patient's methemoglobin level was 63%.,,,Negative,
...,...,...,...,...,...
1312,Risperidone-induced obsessive-compulsive sympt...,obsessive-compulsive symptoms,Risperidone,Positive,
3525,Drug rash with eosinophilia and systemic sympt...,Drug rash with eosinophilia and systemic symptoms,chlorambucil,Positive,
2563,"To our knowledge, this is the first case of iv...",severe liver disease,ivermectin,Positive,
5894,Nephrogenic diabetes insipidus and renal tubul...,Nephrogenic diabetes insipidus,foscarnet,Positive,


In [27]:
def _golden_truth(row):
    """Build the pipe-separated label string for one row.

    The fields are taken in the order HasAE, Drug, Dose, Adverse Effect.
    Each non-null value is converted to an upper-case string; a null value
    becomes an empty field, so the result always contains three "|".
    """
    fields = []
    for col in ["HasAE", "Drug", "Dose", "Adverse Effect"]:
        value = row[col]
        if pd.notnull(value):
            fields.append(str(value).upper())
        else:
            fields.append("")
    return "|".join(fields)


# Adds the label column to `final` row by row.
final["GoldenTruth"] = final.apply(_golden_truth, axis=1)

In [28]:
# Display the frame again to inspect the newly added GoldenTruth column.
final

Unnamed: 0,Phrase,Adverse Effect,Drug,HasAE,Dose,GoldenTruth
10103,RESULTS: [1] Mucin secretion was increased by ...,,,Negative,,NEGATIVE|||
22493,While more studies are needed to better unders...,,,Negative,,NEGATIVE|||
14170,Primary spontaneous coronary artery dissection...,,,Negative,,NEGATIVE|||
22175,"Liver function continued to deteriorate, and t...",,,Negative,,NEGATIVE|||
8118,The patient's methemoglobin level was 63%.,,,Negative,,NEGATIVE|||
...,...,...,...,...,...,...
1312,Risperidone-induced obsessive-compulsive sympt...,obsessive-compulsive symptoms,Risperidone,Positive,,POSITIVE|RISPERIDONE||OBSESSIVE-COMPULSIVE SYM...
3525,Drug rash with eosinophilia and systemic sympt...,Drug rash with eosinophilia and systemic symptoms,chlorambucil,Positive,,POSITIVE|CHLORAMBUCIL||DRUG RASH WITH EOSINOPH...
2563,"To our knowledge, this is the first case of iv...",severe liver disease,ivermectin,Positive,,POSITIVE|IVERMECTIN||SEVERE LIVER DISEASE
5894,Nephrogenic diabetes insipidus and renal tubul...,Nephrogenic diabetes insipidus,foscarnet,Positive,,POSITIVE|FOSCARNET||NEPHROGENIC DIABETES INSIP...


# Generate JSON output to be used as input for the AI Launchpad

In [31]:
import json

In [32]:
# One record per sampled row, in frame order: the sentence text under "Phrase"
# and its label string under "goldenTruth".
json_data = []
for phrase, golden_truth in zip(final["Phrase"], final["GoldenTruth"]):
    json_data.append({"Phrase": phrase, "goldenTruth": golden_truth})

In [33]:
# Serialise the records as indented UTF-8 JSON; ensure_ascii=False keeps
# non-ASCII characters as-is instead of \u escapes.
serialized = json.dumps(json_data, ensure_ascii=False, indent=2)
with open("sample-dataset.json", mode="w", encoding="utf-8") as f:
    f.write(serialized)