### Join all evaluation datasets into a single dataframe

Output in `out/eval_resources/sim_rel_all.csv`

Authors: 
- F.A. Cardillo, francoalberto.cardillo@cnr.it
- F. Debole, franca.debole@isti.cnr.it

Date: 22 March 2024

__If you use this notebook or the resources it builds, please cite:__

__"Italian Word Embeddings for the Medical Domain", F.A. Cardillo, F. Debole. Proc. of the 2024 Joint Int. Conf. on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024), Turin, Italy, May 20-25, 2024.__


The MIT License

Copyright 2024, Franco Alberto Cardillo, Franca Debole

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

<hr>

In [1]:
import os
from posixpath import join
import pandas as pd
import numpy as np

# Folder that is scanned for the per-dataset CSV files.
fld = join("out", "eval_resources")

# Keep only the CSV files whose name (without extension) ends with "_it".
# The underscore is part of the check: testing for a bare "it" suffix would
# also accept unrelated names such as "audit.csv".
csvs = sorted(
    fn
    for fn in os.listdir(fld)
    if fn.endswith(".csv") and fn.split(".")[0].endswith("_it")
)

print("INPUT FILES:")
for i, fn in enumerate(csvs):
    print(f"{i+1}. {fn}")

# Header normalisation applied while reading, renaming and merging the files.
# Headers found in the inputs:
#   mayo:       Mean,CUI1,CUI2,TERM1,TERM2,term1_it,term2_it
#   mini-mayo:  Physicians,Coders,CUI1,CUI2,TERM1,TERM2,term1_it,term2_it
#   all UMNSRS: Mean,Stdev,Term1,Term2,CUI1,CUI2,term1_it,term2_it
# Each entry below pairs a normalised name with the original headers that
# are renamed to it; column_map is the resulting {original: normalised} dict.
column_map = {
    original: normalised
    for normalised, originals in (
        ("score", ("Mean",)),
        ("term1", ("Term1", "TERM1")),
        ("term2", ("Term2", "TERM2")),
        ("cui1", ("CUI1",)),
        ("cui2", ("CUI2",)),
        ("term1_it", ("term1_it",)),
        ("term2_it", ("term2_it",)),
    )
    for original in originals
}

# Accumulates one dataframe per input file.
dfs = []

def process_mini_mayo(df):
    """Reshape the MiniMayoSRS table into one row per (pair, annotator group).

    The input has two score columns, "Physicians" and "Coders". For each
    group a copy of the table is made in which that group's column is renamed
    to "score", the other group's column is removed, and an "annotator"
    column holds the group name. The two copies are stacked (physicians
    first, original row labels kept), a "filename" column set to
    "MiniMayoSRS_it.csv" is added, and the headers are renamed through
    ``column_map``; headers missing from the map are left unchanged.

    A preview of each intermediate table and of the result is shown.

    Returns the stacked dataframe. The dataframe passed in is not modified.
    """
    # (score column, column to discard, annotator label, preview banner)
    groups = (
        ("Physicians", "Coders", "physicians", "DFA"),
        ("Coders", "Physicians", "coders", "DFB"),
    )

    parts = []
    for score_col, other_col, label, banner in groups:
        # rename() returns a new frame, so the caller's data stays untouched
        part = df.rename(columns={score_col: "score"})
        part["annotator"] = label
        part = part.drop(columns=[other_col])
        print(banner)
        display(part.head())
        parts.append(part)

    merged = pd.concat(parts)
    merged["filename"] = "MiniMayoSRS_it.csv"
    merged.columns = merged.columns.map(lambda name: column_map.get(name, name))
    print("MiniMayoSRS_it.csv")
    display(merged.head())
    return merged
#<

# Load every input file, normalise its columns and collect the result in dfs.
for fn in csvs:
    df = pd.read_csv(join(fld, fn)).reset_index(drop=True)

    # MiniMayoSRS_it.csv has two annotators (Physicians and Coders): it goes
    # through a dedicated procedure and is appended as returned by it.
    if fn == "MiniMayoSRS_it.csv":
        df = process_mini_mayo(df)
        print(df.columns)
        dfs.append(df)
        continue
    #<

    is_umnsrs = "UMNSRS" in fn
    if is_umnsrs:
        # the Stdev column is discarded
        df = df.drop(columns=["Stdev"])

    # rename() leaves headers that are missing from column_map untouched,
    # whereas Index.map(dict) silently replaces them with NaN.
    df = df.rename(columns=column_map)
    df["annotator"] = "umnsrs" if is_umnsrs else "coders"
    df["filename"] = fn
    df = df.set_index(["filename", "cui1", "cui2"])
    print(fn)
    display(df.head())
    dfs.append(df)
#<

# Report the columns of each collected dataframe next to its file name.
for fn, df in zip(csvs, dfs):
    print(fn, ":", df.columns)

def reorder_columns(df):
    """Return ``df`` limited to the merged-output columns, in output order.

    Columns that are not listed below are left out of the result; a listed
    column that is absent from ``df`` raises ``KeyError``.
    """
    ordered = (
        "filename", "cui1", "cui2", "annotator",
        "term1", "term2", "score",
        "term1_it", "term2_it",
    )
    return df.loc[:, list(ordered)]

# reset_index() turns each dataframe's index into regular columns, then
# reorder_columns() gives every dataframe the same column layout so that
# they can be stacked.
dfs = [reorder_columns(frame.reset_index()) for frame in dfs]
out_df = pd.concat(dfs)

# Preview: one randomly sampled row per (filename, annotator) pair ...
print("OUT [filename,annotator]")
by_source = out_df.groupby(["filename", "annotator"])
display(by_source.sample(1))

# ... and the per-column counts for each pair.
print("OUT stats")
display(by_source.count())

# Write the merged table next to the input files, without the row index.
fn = join(fld, "sim_rel_all.csv")
out_df.to_csv(fn, index=False)
print("save:", fn)
print("done")


INPUT FILES:
1. MayoSRS_it.csv
2. MiniMayoSRS_it.csv
3. UMNSRS_relatedness_it.csv
4. UMNSRS_relatedness_mod458_word2vec_it.csv
5. UMNSRS_similarity_it.csv
6. UMNSRS_similarity_mod449_word2vec_it.csv
MayoSRS_it.csv


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,score,term1,term2,term1_it,term2_it,annotator
filename,cui1,cui2,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
MayoSRS_it.csv,C0311394,C0231685,6.69,difficulty walking,antalgic gait,difficoltà nella deambulazione,andatura antalgica,coders
MayoSRS_it.csv,C0035450,C0034079,2.38,rheumatoid nodule,lung nodule,nodulo reumatoide,nodulo polmonare,coders
MayoSRS_it.csv,C0409162,C0333286,1.0,hand splint,splinter hemorrhage,steccatura della mano,emorragie a scheggia,coders
MayoSRS_it.csv,C0011849,C0032584,1.0,diabetes,polyp,diabete mellito,polipi,coders
MayoSRS_it.csv,C0020541,C0027962,1.0,portal hypertension,nevus,ipertensione portale,nevo pigmentato,coders


DFA


Unnamed: 0,score,CUI1,CUI2,TERM1,TERM2,term1_it,term2_it,annotator
0,4.0,C0035078,C0035078,Renal failure,Kidney failure,insufficienza renale,insufficienza renale,physicians
1,3.0,C0156543,C0000786,Abortion,Miscarriage,aborto,aborto spontaneo,physicians
2,3.3,C0018787,C0027061,Heart,Myocardium,cuore,miocardio,physicians
3,3.0,C0038454,C0021308,Stroke,Infarct,ictus cerebrale,infarto,physicians
4,3.0,C0011253,C0036341,Delusion,Schizophrenia,delusioni,schizofrenia,physicians


DFB


Unnamed: 0,score,CUI1,CUI2,TERM1,TERM2,term1_it,term2_it,annotator
0,4.0,C0035078,C0035078,Renal failure,Kidney failure,insufficienza renale,insufficienza renale,coders
1,3.3,C0156543,C0000786,Abortion,Miscarriage,aborto,aborto spontaneo,coders
2,3.0,C0018787,C0027061,Heart,Myocardium,cuore,miocardio,coders
3,2.8,C0038454,C0021308,Stroke,Infarct,ictus cerebrale,infarto,coders
4,2.2,C0011253,C0036341,Delusion,Schizophrenia,delusioni,schizofrenia,coders


MiniMayoSRS_it.csv


Unnamed: 0,score,cui1,cui2,term1,term2,term1_it,term2_it,annotator,filename
0,4.0,C0035078,C0035078,Renal failure,Kidney failure,insufficienza renale,insufficienza renale,physicians,MiniMayoSRS_it.csv
1,3.0,C0156543,C0000786,Abortion,Miscarriage,aborto,aborto spontaneo,physicians,MiniMayoSRS_it.csv
2,3.3,C0018787,C0027061,Heart,Myocardium,cuore,miocardio,physicians,MiniMayoSRS_it.csv
3,3.0,C0038454,C0021308,Stroke,Infarct,ictus cerebrale,infarto,physicians,MiniMayoSRS_it.csv
4,3.0,C0011253,C0036341,Delusion,Schizophrenia,delusioni,schizofrenia,physicians,MiniMayoSRS_it.csv


Index(['score', 'cui1', 'cui2', 'term1', 'term2', 'term1_it', 'term2_it',
       'annotator', 'filename'],
      dtype='object')
UMNSRS_relatedness_it.csv


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,score,term1,term2,term1_it,term2_it,annotator
filename,cui1,cui2,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
UMNSRS_relatedness_it.csv,C0006949,C0031507,797.5,Carbatrol,Dilantin,carbamazepina,fenitoina,umnsrs
UMNSRS_relatedness_it.csv,C0878544,C0000970,417.25,Cardiomyopathy,Tylenol,malattie del miocardio,acetaminofene,umnsrs
UMNSRS_relatedness_it.csv,C0009186,C0019655,1273.0,Coccidioidomycosis,Histoplasmosis,coccidioidomicosi,istoplasmosi,umnsrs
UMNSRS_relatedness_it.csv,C0023413,C0025677,1111.25,Leucovorin,Methotrexate,leucovorina,metotressato,umnsrs
UMNSRS_relatedness_it.csv,C0002962,C0070166,1294.75,Angina,Plavix,angina pectoris,clopidogrel,umnsrs


UMNSRS_relatedness_mod458_word2vec_it.csv


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,score,term1,term2,term1_it,term2_it,annotator
filename,cui1,cui2,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
UMNSRS_relatedness_mod458_word2vec_it.csv,C0001047,C0000970,1109.0,acetylcysteine,tylenol,acetilcisteina,acetaminofene,umnsrs
UMNSRS_relatedness_mod458_word2vec_it.csv,C0001047,C0001443,586.5,acetylcysteine,adenosine,acetilcisteina,adenosina,umnsrs
UMNSRS_relatedness_mod458_word2vec_it.csv,C0001367,C0070122,604.75,zovirax,paxil,acyclovir,paroxetina,umnsrs
UMNSRS_relatedness_mod458_word2vec_it.csv,C0001416,C0038450,680.5,adenitis,stridor,adenite,stridore,umnsrs
UMNSRS_relatedness_mod458_word2vec_it.csv,C0001824,C0002871,1359.5,agranulocytosis,anemia,agranulocitosi,anemia,umnsrs


UMNSRS_similarity_it.csv


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,score,term1,term2,term1_it,term2_it,annotator
filename,cui1,cui2,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
UMNSRS_similarity_it.csv,C0017601,C0232197,279.0,Glaucoma,Fibrillation,glaucoma,fibrillazione cardiaca,umnsrs
UMNSRS_similarity_it.csv,C0006949,C0031507,370.5,Carbatrol,Dilantin,carbamazepina,fenitoina,umnsrs
UMNSRS_similarity_it.csv,C0878544,C0000970,241.0,Cardiomyopathy,Tylenol,malattie del miocardio,acetaminofene,umnsrs
UMNSRS_similarity_it.csv,C0019340,C0020550,142.5,Herpes,Hyperthyroidism,herpes nas,ipertiroidismo,umnsrs
UMNSRS_similarity_it.csv,C0036494,C0027497,870.75,Seasickness,Nausea,mal di mare,nausea,umnsrs


UMNSRS_similarity_mod449_word2vec_it.csv


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,score,term1,term2,term1_it,term2_it,annotator
filename,cui1,cui2,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
UMNSRS_similarity_mod449_word2vec_it.csv,C0000970,C0020740,1067.0,tylenol,motrin,acetaminofene,ibuprofene,umnsrs
UMNSRS_similarity_mod449_word2vec_it.csv,C0001047,C0001443,256.25,acetylcysteine,adenosine,acetilcisteina,adenosina,umnsrs
UMNSRS_similarity_mod449_word2vec_it.csv,C0001416,C0038450,666.25,adenitis,stridor,adenite,stridore,umnsrs
UMNSRS_similarity_mod449_word2vec_it.csv,C0001824,C0002871,933.25,agranulocytosis,anemia,agranulocitosi,anemia,umnsrs
UMNSRS_similarity_mod449_word2vec_it.csv,C0001924,C0019134,377.5,albumin,heparin,albumine,eparina,umnsrs


MayoSRS_it.csv : Index(['score', 'term1', 'term2', 'term1_it', 'term2_it', 'annotator'], dtype='object')
MiniMayoSRS_it.csv : Index(['score', 'cui1', 'cui2', 'term1', 'term2', 'term1_it', 'term2_it',
       'annotator', 'filename'],
      dtype='object')
UMNSRS_relatedness_it.csv : Index(['score', 'term1', 'term2', 'term1_it', 'term2_it', 'annotator'], dtype='object')
UMNSRS_relatedness_mod458_word2vec_it.csv : Index(['score', 'term1', 'term2', 'term1_it', 'term2_it', 'annotator'], dtype='object')
UMNSRS_similarity_it.csv : Index(['score', 'term1', 'term2', 'term1_it', 'term2_it', 'annotator'], dtype='object')
UMNSRS_similarity_mod449_word2vec_it.csv : Index(['score', 'term1', 'term2', 'term1_it', 'term2_it', 'annotator'], dtype='object')
OUT [filename,annotator]


Unnamed: 0,filename,cui1,cui2,annotator,term1,term2,score,term1_it,term2_it
89,MayoSRS_it.csv,C0241910,C0022876,coders,autoimmune hepatitis,premature labor,1.0,epatite autoimmune,parto prematuro
56,MiniMayoSRS_it.csv,C0042345,C0224701,coders,Varicose vein,Entire knee meniscus,1.0,vene varicose,menisco del ginocchio intero
5,MiniMayoSRS_it.csv,C0175895,C0009814,physicians,Calcification,Stenosis,2.7,calcificazione fisiologica,stenosi acquisita
81,UMNSRS_relatedness_it.csv,C0036396,C0246719,umnsrs,Sciatica,Actonel,473.5,sciatica,risedronato
417,UMNSRS_relatedness_mod458_word2vec_it.csv,C0497327,C0995182,umnsrs,dementia,aloe,215.5,demenza,aloe vera
405,UMNSRS_similarity_it.csv,C0553735,C0027697,umnsrs,Pyorrhea,Nephritis,777.5,piorrea,nefrite
343,UMNSRS_similarity_mod449_word2vec_it.csv,C0070166,C0286651,umnsrs,plavix,lipitor,895.5,clopidogrel,atorvastatina


OUT stats


Unnamed: 0_level_0,Unnamed: 1_level_0,cui1,cui2,term1,term2,score,term1_it,term2_it
filename,annotator,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
MayoSRS_it.csv,coders,101,101,101,101,101,101,101
MiniMayoSRS_it.csv,coders,29,29,29,29,29,29,29
MiniMayoSRS_it.csv,physicians,29,29,29,29,29,29,29
UMNSRS_relatedness_it.csv,umnsrs,587,587,587,587,587,587,587
UMNSRS_relatedness_mod458_word2vec_it.csv,umnsrs,458,458,458,458,458,458,458
UMNSRS_similarity_it.csv,umnsrs,566,566,566,566,566,566,566
UMNSRS_similarity_mod449_word2vec_it.csv,umnsrs,449,449,449,449,449,449,449


save: out/eval_resources/sim_rel_all.csv
done
