## Baseline, using wiki2vec


Authors: 
- F.A. Cardillo, francoalberto.cardillo@cnr.it
- F. Debole, franca.debole@isti.cnr.it

Date: 22 March 2024


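Notice: this repository does not contain the wiki2vec word embeddings, which can be downloaded from the Wikipedia2Vec pretrained-embeddings page: https://wikipedia2vec.github.io/wikipedia2vec/pretrained/ (this notebook expects the file `itwiki_20180420_100d.pkl` in the `external_resources` folder).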


__If you use this notebook or the resources it builds, please cite:__

__"Italian Word Embeddings for the Medical Domain", F.A. Cardillo, F. Debole. Proc. of the 2024 Joint Int. Conf. on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024), Turin, Italy, May 20-25, 2024.__


The MIT License

Copyright 2024, Franco Alberto Cardillo, Franca Debole

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

<hr>

In [2]:
import numpy as np
import pandas as pd
from posixpath import join
from wikipedia2vec import Wikipedia2Vec

# Location of the pretrained model file, relative to the working directory.
# NOTE(review): the file name suggests 100-dimensional Italian Wikipedia embeddings — confirm.
fn = join("external_resources", "itwiki_20180420_100d.pkl")
# NOTE(review): a .pkl model is presumably unpickled on load — only load files
# obtained from a trusted source.
wiki2vec = Wikipedia2Vec.load(fn)
# Print the class of the loaded object as a quick check that loading worked.
print( type(wiki2vec) )

def similarity(v1, v2):
    """Return the cosine similarity of ``v1`` and ``v2``.

    The dot product of the two inputs is divided by the product of their
    norms. If any element of the result is NaN (for instance because one of
    the inputs is NaN), a plain ``np.nan`` is returned instead.
    """
    dot_product = np.dot(v1, v2)
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    cosine = dot_product / norm_product

    # Collapse any NaN-containing result to a single scalar NaN.
    if np.isnan(cosine).any():
        return np.nan
    return cosine
#<


def get_vector1(mdl, term, agg=np.mean):
    """Verbose variant of ``get_vector``: embed ``term`` and print what was found.

    ``term`` is split on whitespace and each token is looked up with
    ``mdl.get_word_vector``. Tokens whose lookup raises ``KeyError`` are
    reported on stdout and skipped; the number of vectors found is printed.

    Parameters
    ----------
    mdl : object exposing ``get_word_vector(token)``.
    term : str, possibly made of several whitespace-separated tokens.
    agg : callable invoked as ``agg(vectors, axis=0)`` (default ``np.mean``).

    Returns
    -------
    The aggregated vector, or ``np.nan`` when no token has a vector.
    """
    vectors = []
    for t in term.split():
        try:
            # The original never filled `vectors`, so the aggregation always
            # ran on an empty list; collect the vectors here.
            vectors.append(np.array(mdl.get_word_vector(t)))
        except KeyError:
            print(f"{t} not in dictionary")

    print("got vectors:", len(vectors))
    if not vectors:
        # Explicit NaN instead of aggregating an empty list (which only
        # yields NaN after a RuntimeWarning).
        return np.nan
    return agg(vectors, axis=0)
#<


def get_vector(mdl, term, agg=np.mean):
    """Embed ``term`` by aggregating the vectors of its whitespace-separated tokens.

    Each token is looked up with ``mdl.get_word_vector``. A token whose lookup
    raises ``KeyError`` is skipped and counted in the module-level counter
    ``missed``.

    Parameters
    ----------
    mdl : object exposing ``get_word_vector(token)``.
    term : str, possibly made of several whitespace-separated tokens.
    agg : callable invoked as ``agg(vectors, axis=0)`` (default ``np.mean``).

    Returns
    -------
    The aggregated vector, or ``np.nan`` when no token has a vector.

    Raises
    ------
    AssertionError
        If the aggregated array contains NaN values.
    """
    global missed
    vectors = []

    for t in term.split():
        try:
            vectors.append(np.array(mdl.get_word_vector(t)))
        except KeyError:
            # Start the counter at 0 when the caller has not initialised it,
            # instead of failing with NameError on the first missing token.
            missed = globals().get("missed", 0) + 1

    if not vectors:
        return np.nan

    res = agg(vectors, axis=0)

    # Sanity check: a vector built from found tokens must not contain NaN.
    if isinstance(res, np.ndarray):
        assert not np.isnan(res).any(), f"NaN in aggregated vector for term {term!r}"

    return res
#<

<class 'wikipedia2vec.wikipedia2vec.Wikipedia2Vec'>


 [124 112]
 [122 108]
 ...
 [  0   0]
 [  0   0]
 [ 134782   62437]
 [6811053  887006]
 ...
 [      0       0]
 [      0       0]
   0.18417557]
 [-0.12081806  0.2413752   0.45221782 ... -0.39314067 -0.43245047
  -0.1734238 ]
 [-0.1158172   0.20283589  0.20232022 ... -0.05498212 -0.4147939
  -0.08980408]
 ...
 [-0.29182723 -0.1575879   0.55301744 ...  0.69867665  0.0148048
   0.58166534]
 [-1.0185058   0.03632973  0.7033741  ... -0.15651195  0.29880068
   0.49029368]
 [-0.45643398  0.19100761  0.22013728 ... -0.12237632 -0.32557508
  -3.18702877e-01  2.30105937e-01]
 [ 3.79299223e-01  1.33697405e-01  3.38798203e-02 ... -4.89570469e-01
  -1.96655288e-01 -1.73741922e-01]
 [ 1.50949836e-01 -4.13462482e-02 -1.25802420e-02 ...  5.20190820e-02
   2.45902948e-02  1.92391455e-01]
 ...
 [ 1.50965106e+00 -1.11557744e-01  9.46417823e-02 ...  1.50800335e+00
   9.85529721e-01  1.76642880e-01]
 [ 1.03813648e+00 -4.69308347e-01 -9.72151935e-01 ...  9.27031279e-01
   1.55203617e+00  9.50994909e-01]
 [

In [7]:

# Load the evaluation pairs and compute, for each pair, the similarity of the
# two Italian terms under the wiki2vec model.
print("SIM_REL_ALL.CSV")
df = pd.read_csv(join("out/eval_resources/sim_rel_all.csv"))
print("ALL EVAL RESOURCES, sample")
display(df.sample(10))

# Global counter incremented by get_vector() for every token whose lookup
# raises KeyError; reset here so the count printed below covers only this run.
missed = 0
# One aggregated vector per term (np.nan when no token of the term has a vector).
vectors1 = df.term1_it.apply(lambda x: get_vector(wiki2vec, x))
vectors2 = df.term2_it.apply(lambda x: get_vector(wiki2vec, x))
df["v1"] = vectors1
df["v2"] = vectors2

# Similarity of each (term1, term2) pair; a pair for which similarity()
# raises ValueError gets NaN.
sim = []
for v1, v2 in zip(vectors1, vectors2):
    try:
        s = similarity(v1, v2)
    except ValueError:
        s = np.nan
    sim.append(s)

print("missing in baseline, tokens:", missed)

# baseline could contain NaNs (terms not in dictionary of the base model)
df["computed_score"] = sim
# Per row: count of non-null term1_it values in its (filename, annotator) group.
df["size"] = df.groupby(["filename", "annotator"])["term1_it"].transform('count')

# ------------------------------------------

from scipy.stats import pearsonr, spearmanr

# A row is valid when neither of its two term vectors contains NaN.
df["valid"] = df.v1.apply(lambda x: not np.isnan(x).any()) & df.v2.apply(lambda x: not np.isnan(x).any())
# display(df)

# NOTE(review): `iii` is not used in the visible part of this cell.
iii = df.valid
# Per row: number of valid rows in its (filename, annotator) group.
df["n_valid"] = df.groupby(["filename", "annotator"])["valid"].transform('sum')

def compute_correlation(group):
    """Attach Pearson and Spearman correlations to one group of evaluation rows.

    Rows with a missing ``computed_score`` or ``score`` are ignored when the
    correlations are computed, but they are kept in the returned frame.

    Parameters
    ----------
    group : pandas.DataFrame with at least the columns ``computed_score``
        and ``score``.

    Returns
    -------
    A copy of ``group`` with the constant columns ``r``, ``p`` (Pearson),
    ``spearman``, ``spearman_p`` (Spearman) and ``model`` ("wiki2vec") added.
    The four statistics are NaN when fewer than two complete pairs remain.
    """
    # Work on a copy: the frame handed over by groupby.apply must not be
    # mutated in place.
    group = group.copy()

    paired = group.dropna(subset=["computed_score", "score"], how="any")
    values1 = np.array(paired.computed_score)
    values2 = np.array(paired.score)

    if len(paired) < 2:
        # pearsonr needs at least two observations; report NaN instead of
        # aborting the whole evaluation for one tiny group.
        r = p = sr = sp = np.nan
    else:
        r, p = pearsonr(values1, values2)
        sr, sp = spearmanr(values1, values2)

    group["r"] = r
    group["p"] = p
    group["spearman"] = sr
    group["spearman_p"] = sp
    group["model"] = "wiki2vec"
    return group
#<


# Compute the correlations separately for each (filename, annotator) group and
# stack the per-group frames back into a single frame with a fresh index.
out = df.groupby(["filename", "annotator"]).apply(lambda g: compute_correlation(g)).reset_index(drop=True)
# Flag correlations whose p-value is below 0.05 (Pearson, then Spearman).
out["stat_significant"] = out.p < 0.05
out["spear_stat_significant"] = out.spearman_p < 0.05

# Visual check: show two random rows from every group.
print('CHECK: groupby(["filename", "annotator"]).sample(2))')
display(out.groupby(["filename", "annotator"]).sample(2))


# Save the full per-row table (including the correlation columns) as CSV.
# NOTE(review): to_csv presumably requires the out/results folder to exist — confirm.
fld = join("out", "results")
out.to_csv(join(fld, "baseline.csv"), index=False)
print("saved:", join(fld, "baseline.csv"))

# Columns shown in the summary table.
cols = ["filename", "annotator", "model", "stat_significant", "r", "p", "spearman", "spearman_p", "spear_stat_significant", "size", "n_valid"]

# The correlation columns are constant within a group, so the first row of
# each group is enough to summarise it.
print("RESULT")
display(out[cols].groupby(["filename", "annotator"]).head(1))
# latex = out[cols].groupby(["filename", "annotator"]).head(1).to_latex()
# print(latex)

print("all done")


SIM_REL_ALL.CSV
ALL EVAL RESOURCES, sample


Unnamed: 0,filename,cui1,cui2,annotator,term1,term2,score,term1_it,term2_it
129,MiniMayoSRS_it.csv,C0043352,C0023891,physicians,Xerostomia,Alcoholic cirrhosis,1.0,xerostomia,cirrosi epatica alcolica
1881,UMNSRS_similarity_mod449_word2vec_it.csv,C0013404,C0027358,umnsrs,dyspnea,narcan,578.25,dispnea,nalossone
1055,UMNSRS_relatedness_mod458_word2vec_it.csv,C0040038,C0039070,umnsrs,thromboembolism,syncope,1226.75,tromboembolia,sincope
2,MayoSRS_it.csv,C0409162,C0333286,coders,hand splint,splinter hemorrhage,1.0,steccatura della mano,emorragie a scheggia
1801,UMNSRS_similarity_mod449_word2vec_it.csv,C0003564,C0152149,umnsrs,aphonia,mittelschmerz,214.25,afonia,dolore dell'ovulazione
1575,UMNSRS_similarity_it.csv,C0152447,C0003564,umnsrs,Urethrorrhea,Aphonia,233.25,secrezione uretrale,afonia
664,UMNSRS_relatedness_it.csv,C0034642,C0034880,umnsrs,Rales,Hyperacusis,409.5,rantoli,iperacusia
1621,UMNSRS_similarity_it.csv,C0009421,C0003129,umnsrs,comatose,Anoxemia,668.0,coma,anossemia
982,UMNSRS_relatedness_mod458_word2vec_it.csv,C0029877,C0025287,umnsrs,otitis,meningism,848.0,otite,meningismo
604,UMNSRS_relatedness_it.csv,C0018081,C0076275,umnsrs,Gonorrhea,Xenical,404.75,gonorrea,orlistat


missing in baseline, tokens: 376
CHECK: groupby(["filename", "annotator"]).sample(2))


Unnamed: 0,filename,cui1,cui2,annotator,term1,term2,score,term1_it,term2_it,v1,...,size,valid,n_valid,r,p,spearman,spearman_p,model,stat_significant,spear_stat_significant
14,MayoSRS_it.csv,C2267026,C0020473,coders,HMG Co A reductase inhibitor,hyperlipidemia,2.23,inibitore dell'hmg coa reduttasi,iperlipidemie,"[0.17372788, 0.23538442, -0.6930311, 0.7691941...",...,101,True,95,0.081131,0.4344587,0.103644,0.3175431,wiki2vec,False,False
27,MayoSRS_it.csv,C0429103,C0027051,coders,T wave,myocaridal infarction,5.23,onda t,infarto del miocardio,"[0.14858682, -0.06748229, 0.055117816, 0.04985...",...,101,True,95,0.081131,0.4344587,0.103644,0.3175431,wiki2vec,False,False
114,MiniMayoSRS_it.csv,C0034065,C0027051,coders,Pulmonary embolus,Myocardial infarction,1.2,embolia polmonare,infarto del miocardio,"[-0.26794165, 0.12332751, -1.2134464, -0.15212...",...,29,True,29,0.337298,0.07355639,0.459166,0.0122241,wiki2vec,False,True
124,MiniMayoSRS_it.csv,C0020473,C0027627,coders,Hyperlipidemia,Tumor metastasis,1.0,iperlipidemie,metastasi neoplastiche,"[0.0065295706, -0.053818025, -0.474993, -0.136...",...,29,True,29,0.337298,0.07355639,0.459166,0.0122241,wiki2vec,False,True
131,MiniMayoSRS_it.csv,C0156543,C0000786,physicians,Abortion,Miscarriage,3.0,aborto,aborto spontaneo,"[0.16467296, -0.70436525, -0.8231383, -0.18788...",...,29,True,29,0.342068,0.0693246,0.358609,0.05609326,wiki2vec,False,False
152,MiniMayoSRS_it.csv,C0011581,C0007642,physicians,Depression,Cellulitis,1.0,disturbo depressivo,cellulite,"[0.39888608, 0.0028961822, -0.774464, -0.13539...",...,29,True,29,0.342068,0.0693246,0.358609,0.05609326,wiki2vec,False,False
379,UMNSRS_relatedness_it.csv,C0206160,C0018926,umnsrs,Reticulocytosis,Hematemesis,487.0,reticolocitosi,ematemesi,"[-0.10576051, -0.054728996, -0.4779745, 0.1355...",...,587,True,486,0.221181,8.453808e-07,0.225279,5.215283e-07,wiki2vec,True,True
401,UMNSRS_relatedness_it.csv,C0060277,C0060277,umnsrs,Iron,Iron,1550.5,gluconato ferroso,gluconato ferroso,"[-0.004519403, 0.045788936, -0.18695222, 0.072...",...,587,True,486,0.221181,8.453808e-07,0.225279,5.215283e-07,wiki2vec,True,True
780,UMNSRS_relatedness_mod458_word2vec_it.csv,C0003615,C0007546,umnsrs,appendicitis,cefazolin,879.75,appendicite,cefazolina,"[0.2775922, -0.29467842, -0.95305425, -0.14643...",...,458,True,401,0.242235,9.158733e-07,0.245269,6.615433e-07,wiki2vec,True,True
942,UMNSRS_relatedness_mod458_word2vec_it.csv,C0021359,C0005747,umnsrs,infertility,blepharospasm,411.5,infertilità,blefarospasmo,"[-0.11583069, -0.6761835, -0.94259506, -0.5229...",...,458,True,401,0.242235,9.158733e-07,0.245269,6.615433e-07,wiki2vec,True,True


saved: out/results/baseline.csv
RESULT


Unnamed: 0,filename,annotator,model,stat_significant,r,p,spearman,spearman_p,spear_stat_significant,size,n_valid
0,MayoSRS_it.csv,coders,wiki2vec,False,0.081131,0.4344587,0.103644,0.3175431,False,101,95
101,MiniMayoSRS_it.csv,coders,wiki2vec,False,0.337298,0.07355639,0.459166,0.0122241,True,29,29
130,MiniMayoSRS_it.csv,physicians,wiki2vec,False,0.342068,0.0693246,0.358609,0.05609326,False,29,29
159,UMNSRS_relatedness_it.csv,umnsrs,wiki2vec,True,0.221181,8.453808e-07,0.225279,5.215283e-07,True,587,486
746,UMNSRS_relatedness_mod458_word2vec_it.csv,umnsrs,wiki2vec,True,0.242235,9.158733e-07,0.245269,6.615433e-07,True,458,401
1204,UMNSRS_similarity_it.csv,umnsrs,wiki2vec,True,0.298263,3.743798e-11,0.294851,6.376248e-11,True,566,472
1770,UMNSRS_similarity_mod449_word2vec_it.csv,umnsrs,wiki2vec,True,0.29278,3.621792e-09,0.286765,7.742286e-09,True,449,391


all done
