# Automatic Comparison of Two Datasets

## Load Data

In [1]:
import pandas as pd
import os
import en_ner_bc5cdr_md
from spacy.language import Language
from scispacy.linking import EntityLinker
from tqdm import tqdm
tqdm.pandas()

from src.data import files_to_df
from src.scispacy_ie import entity_linker

# Corpus locations. `gen_path` points at the `lang` sub-folder of the
# generated corpus.
orig_path = "data/1_original/txt"
lang = "en"
gen_path = os.path.join("data/2_generated/2step_transformation_dt4h_GPT4omini", lang)


def _strip_generation_wrapper(raw):
    """Return the text that follows the last "'text_to_transform': " marker.

    The final character of that remainder is dropped and every single quote
    is removed.
    """
    # NOTE(review): the blanket quote removal also strips apostrophes that are
    # part of the text itself - confirm this is intended.
    payload = raw.split("'text_to_transform': ")[-1]
    return payload[:-1].replace("'", "")


df_orig = files_to_df(orig_path)

# Overwrite the raw generated text with its unwrapped form.
df_gen = files_to_df(gen_path)
df_gen["text"] = df_gen["text"].apply(_strip_generation_wrapper)

print("There are {} original and {} generated samples".format(df_orig.shape[0], df_gen.shape[0]))

Your CPU supports instructions that this binary was not compiled to use: SSE3 SSE4.1 SSE4.2 AVX AVX2
For maximum performance, you can install NMSLIB from sources 
pip install --no-binary :all: nmslib


There are 1000 original and 1000 generated samples


## Extract Entities and Normalize

In [2]:
# Display the generated-samples frame (columns: filenameid, text) as a sanity
# check before running entity extraction on it.
df_gen

Unnamed: 0,filenameid,text
0,36951253_transformed_step1,**Discharge Summary**\n\n**Patient Information...
1,33892935_transformed_step1,**Discharge Summary**\n\n**Patient Information...
2,28960930_transformed_step1,**Discharge Summary**\n\n**Patient Information...
3,29538200_transformed_step1,**Discharge Summary**\n\n**Patient Information...
4,30103814_transformed_step1,**Discharge Summary**\n\n**Patient Information...
...,...,...
995,30762762_transformed_step1,**Discharge Summary**\n\n**Patient Information...
996,27980283_transformed_step1,**Discharge Summary**\n\n**Patient Information...
997,34012203_transformed_step1,**Discharge Summary**\n\n**Patient Information...
998,37828735_4_transformed_step1,**Discharge Summary**\n\n**Patient Information...


In [4]:
# Load a fresh NER pipeline every time this cell runs.
nlp = en_ner_bc5cdr_md.load()

# Factories are registered globally on `Language`, so on a cell re-run the
# name may already be taken. Registering only when it is missing avoids the
# ValueError that would otherwise skip `add_pipe` and leave the freshly loaded
# pipeline without a linker.
if not Language.has_factory("umls_linker"):
    @Language.factory("umls_linker")
    def create_umls_linker(nlp, name):
        """Factory for the "umls_linker" pipe.

        Args:
            nlp: The pipeline the component is being added to (unused).
            name: The instance name given to the component (unused).

        Returns:
            An ``EntityLinker`` built with ``k=10``,
            ``max_entities_per_mention=5`` and ``name="umls"``.
        """
        return EntityLinker(k=10, max_entities_per_mention=5, name="umls")

# Attach the linker to this pipeline unless it is already part of it.
if "umls_linker" in nlp.pipe_names:
    print("Entity linker already exists")
else:
    nlp.add_pipe("umls_linker")

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [None]:
import swifter  # importing provides the `.swifter` accessor used below

# Run entity extraction + linking over the original texts, `step` rows at a
# time. Chunks are collected in a list and concatenated once: growing a
# DataFrame with `pd.concat` inside the loop re-copies all previous rows on
# every iteration and seeds the result with an empty frame.
step = 10
ents_orig_chunks = []
for i in range(0, df_orig.shape[0], step):
    chunk_texts = df_orig.iloc[i:i+step].set_index("filenameid")["text"]
    # presumably `entity_linker` returns, per text, a list of
    # (span, mention_class, code, term) records - confirm in src.scispacy_ie
    chunk_ents = chunk_texts.swifter.apply(lambda x: entity_linker(nlp, x))
    # One row per entity, one column per record field.
    ents_orig_chunks.append(chunk_ents.explode().apply(pd.Series))

df_ents_orig = pd.concat(ents_orig_chunks, axis=0)
df_ents_orig.columns = ["span", "mention_class", "code", "term"]
# Turn the "filenameid" index back into a regular column.
df_ents_orig = df_ents_orig.reset_index()

  from .autonotebook import tqdm as notebook_tqdm
Pandas Apply: 100%|██████████| 400/400 [00:36<00:00, 10.93it/s]
Pandas Apply: 100%|██████████| 400/400 [00:34<00:00, 11.47it/s]
Pandas Apply: 100%|██████████| 200/200 [00:17<00:00, 11.58it/s]


: 

In [None]:
# Run entity extraction + linking over the generated texts, `step` rows at a
# time (relies on `swifter` having been imported in an earlier cell). Chunks
# are collected in a list and concatenated once instead of growing a DataFrame
# inside the loop.
step = 100
ents_gen_chunks = []
for i in range(0, df_gen.shape[0], step):
    chunk_texts = df_gen.iloc[i:i+step].set_index("filenameid")["text"]
    # presumably `entity_linker` returns, per text, a list of
    # (span, mention_class, code, term) records - confirm in src.scispacy_ie
    chunk_ents = chunk_texts.swifter.apply(lambda x: entity_linker(nlp, x))
    # One row per entity, one column per record field.
    ents_gen_chunks.append(chunk_ents.explode().apply(pd.Series))

df_ents_gen = pd.concat(ents_gen_chunks, axis=0)
# Rename/reset the *entity* frame. The previous code targeted `df_gen`, whose
# two columns cannot take four names, and left `df_ents_gen` unfinished.
df_ents_gen.columns = ["span", "mention_class", "code", "term"]
# Turn the "filenameid" index back into a regular column.
df_ents_gen = df_ents_gen.reset_index()

Pandas Apply: 100%|██████████| 100/100 [00:08<00:00, 12.48it/s]
Pandas Apply:  80%|████████  | 80/100 [00:05<00:01, 14.82it/s]