# Library

In [1]:
import pandas as pd
import numpy as np
import os
import re
import collections
import unidecode
import nltk
from nltk.corpus import stopwords
import itertools 
from nltk.tokenize import word_tokenize
from string import punctuation
from functools import reduce
import seaborn as sns
from abbreviations import schwartz_hearst
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
import ast
import math

[nltk_data] Downloading package wordnet to /Users/egarcia/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
pd.set_option('display.max_colwidth', 100)

In [3]:
%matplotlib inline
from matplotlib import pyplot as plt

# Functions

In [4]:
def read_texts(path):
    """
    Load every ``.txt`` file found directly inside ``path`` into a DataFrame.

    Parameters
    ----------
    path : str
        Directory holding the text files. A trailing path separator is
        optional.

    Returns
    -------
    pandas.DataFrame
        One row per ``.txt`` file with two columns: ``nombre`` (the file name
        without its trailing ``.txt`` extension) and ``texto`` (the file
        content decoded as UTF-8). Both columns are present even when no file
        matches.
    """
    suffix = '.txt'
    data = []

    for name in os.listdir(path):
        if not name.endswith(suffix):
            continue
        # os.path.join works whether or not ``path`` ends with a separator;
        # plain string concatenation silently built a wrong path otherwise.
        with open(os.path.join(path, name), encoding="utf8") as f:
            text = f.read()
        # Strip only the trailing extension: str.replace would also delete
        # any other '.txt' occurring inside the file name.
        data.append({'nombre': name[:-len(suffix)], 'texto': text})

    # Explicit columns keep the schema stable for an empty directory.
    return pd.DataFrame(data, columns=['nombre', 'texto'])

In [5]:
# Download the NLTK stop-word corpus.
nltk.download('stopwords')
# De-duplicated Spanish stop words, kept as a list; clean_text reads this
# module-level name when filtering tokens.
swords = list(set(stopwords.words('spanish')))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/egarcia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
def clean_text(string):
    """
    Normalise a piece of text.

    Steps, in order: punctuation is removed ('/' is turned into a space,
    every other punctuation character is deleted), the text is transliterated
    with ``unidecode``, tokens found in the module-level ``swords`` list are
    dropped, and whitespace is collapsed to single spaces.

    The text is not lower-cased, so the stop-word comparison is
    case-sensitive.
    NOTE(review): the comparison also happens after transliteration, so any
    accented entry in ``swords`` can no longer match -- confirm this is
    intended.
    """
    # Translation table: '/' -> ' ', any other punctuation character -> deleted.
    punctuation_map = {
        ord(symbol): (" " if symbol == '/' else None) for symbol in punctuation
    }
    without_punctuation = string.translate(punctuation_map)

    transliterated = unidecode.unidecode(without_punctuation)

    # Keep only the tokens that are not stop words.
    kept_tokens = [
        token for token in transliterated.split() if token not in swords
    ]

    # Collapse any remaining whitespace runs and trim both ends.
    return re.sub(r'\s+', ' ', ' '.join(kept_tokens)).strip()

In [7]:
def distance_levenshtein(str1, str2):
    """
    Return the Levenshtein (edit) distance between ``str1`` and ``str2``.

    Insertions, deletions and substitutions each cost 1. Only two rows of the
    dynamic-programming table are kept at any time.
    """
    width = len(str2)
    # Row for the empty prefix of str1: reaching str2[:j] takes j insertions.
    previous_row = list(range(width + 1))

    for i in range(1, len(str1) + 1):
        # First cell: deleting all i characters of str1[:i].
        current_row = [i] + [0] * width
        for j in range(1, width + 1):
            substitution_cost = 0 if str1[i - 1] == str2[j - 1] else 1
            current_row[j] = min(
                current_row[j - 1] + 1,                    # insertion
                previous_row[j] + 1,                       # deletion
                previous_row[j - 1] + substitution_cost,   # match / substitution
            )
        previous_row = current_row

    return previous_row[width]

In [8]:
def normalize_lf(row):
    """
    Build a substitution map that collapses near-duplicate strings in ``row``
    onto a single representative.

    Two distinct items count as near-duplicates when their Levenshtein
    distance divided by the length of the longer item is below 0.2.

    Parameters
    ----------
    row : iterable of str
        Strings compared pairwise. It is iterated in a nested loop, so it
        must be re-iterable (presumably a list of long forms -- TODO confirm
        against the caller).

    Returns
    -------
    dict or None
        ``{item: representative}`` for every item that has at least one
        near-duplicate, where the representative is the item with the highest
        ``frec`` value; ``None`` when no pair is close enough.

    Notes
    -----
    Reads a module-level DataFrame ``frec`` that is not defined in this
    function, through its ``index`` and ``long_form`` columns. ``long_form``
    is used as the ranking value (presumably a frequency count -- TODO
    confirm). ``.iloc[0]`` fails if an item has no row in ``frec``.
    """
    leven2 = []
    # Every ordered pair is visited, so each close pair contributes both members.
    for i in row:
        for j in row:
            if i != j:
                long = max(len(i),len(j))
                # Distance normalised by the longer string's length.
                ratio = distance_levenshtein(i,j)/long
                if ratio < 0.2:
                    leven2.append(j)
    if leven2:
        leven2 = set(leven2)
        lista = []
        for i in leven2:
            # Ranking value of this item, taken from the global ``frec`` table.
            val = frec[frec['index'] == i]['long_form'].iloc[0]
            lista.append((i, val))
        lista = set(lista)
        # Highest value wins; ties are resolved by set iteration order.
        most_freq = sorted(set(lista), key=lambda x: x[1], reverse = True)[0][0]
        sust = {}
        # Every near-duplicate (including the winner itself) maps to the winner.
        for i in set(leven2):
            sust[i] = most_freq
        
        return sust
    else:
        # No near-duplicates: falls through and returns None.
        pass
    

In [9]:
def get_label(row):
    """Return 1 when ``long_form_x`` and ``long_form_y`` of ``row`` are equal, else 0."""
    same_long_form = row['long_form_x'] == row['long_form_y']
    return 1 if same_long_form else 0

In [10]:
def offsetA(row):
    """Return the index of the first occurrence of ``Mention_A`` in ``texto`` (-1 if absent)."""
    text, mention = row['texto'], row['Mention_A']
    return text.find(mention)
    
def offsetB(row):
    """Return the index of the first occurrence of ``Mention_B`` in ``texto`` (-1 if absent)."""
    text, mention = row['texto'], row['Mention_B']
    return text.find(mention)

def offsetB_end(row):
    """
    Return the end offset (exclusive) of the first occurrence of
    ``Mention_B`` in ``texto``.

    Returns -1 when the mention does not occur in the text, matching the
    "not found" value of ``str.find``.
    """
    mention = row['Mention_B']
    start = row['texto'].find(mention)
    if start < 0:
        # Without this guard the -1 sentinel was turned into the plausible
        # but meaningless offset ``len(mention) - 1``.
        return -1
    return start + len(mention)

def offsetA_end(row):
    """
    Return the end offset (exclusive) of ``Mention_A``, computed from the
    stored ``Mention_A_StartOffset`` plus the mention length.

    Returns -1 when the stored start offset is negative, i.e. when it carries
    the "not found" value produced by ``str.find``.
    """
    start = row['Mention_A_StartOffset']
    if start < 0:
        # Keep the "not found" sentinel instead of producing a bogus offset.
        return -1
    return start + len(row['Mention_A'])

In [11]:
def offset(row):
    """Return the index of the first occurrence of ``abrev`` in ``texto`` (-1 if absent)."""
    text, short_form = row['texto'], row['abrev']
    return text.find(short_form)

def offsetend(row):
    """
    Return the end offset (exclusive) of ``abrev``, computed from the stored
    ``StartOffset`` plus the abbreviation length.

    Returns -1 when the stored start offset is negative, i.e. when it carries
    the "not found" value produced by ``str.find``.
    """
    start = row['StartOffset']
    if start < 0:
        # Keep the "not found" sentinel instead of producing a bogus offset.
        return -1
    return start + len(row['abrev'])

In [12]:
def defin_dictionary(row, dictionary):
    """
    Return the definition to use for ``row``.

    When ``Definition`` holds the placeholder ``'no_existe'``, the row's
    ``Abbreviation`` is looked up in ``dictionary`` (``None`` if it is not
    there); any other ``Definition`` value is returned unchanged.
    """
    definition = row['Definition']
    if definition != 'no_existe':
        return definition
    # Placeholder found: fall back to the supplied dictionary.
    return dictionary.get(row['Abbreviation'])

# Load Data

### Testing

220 clinical cases.

In [23]:
testing_abbr = pd.read_csv("../data/track_1_soto_test.tsv", sep = "\t")

In [24]:
testing_abbr.head()

Unnamed: 0,# Document_ID,Mention_A_type,Mention_A_StartOffset,Mention_A_EndOffset,Mention_A,Relation_type,Mention_B_type,Mention_B_StartOffset,Mention_B_EndOffset,Mention_B
0,S1130-14732005000200003-1,SHORT_FORM,1683,1688,XOMED,SHORT-LONG,LONG_FORM,1672,1681,Medtronic
1,S0365-66912011001100006-1,SHORT_FORM,127,129,AV,SHORT-LONG,LONG_FORM,111,125,agudeza visual
2,S0365-66912011001100006-1,SHORT_FORM,206,208,OD,SHORT-LONG,LONG_FORM,193,204,ojo derecho
3,S0365-66912011001100006-1,SHORT_FORM,250,252,OI,SHORT-LONG,LONG_FORM,235,248,ojo izquierdo
4,S0212-71992004000300009-1,SHORT_FORM,1005,1009,ANAs,SHORT-LONG,LONG_FORM,960,1003,ecografía de abdomen y estudio inmunológico


In [31]:
testing_abbr = testing_abbr[['# Document_ID', 'Mention_A_StartOffset',
       'Mention_A_EndOffset', 'Mention_A']]

In [15]:
testing_raw = read_texts("../../acronym_disambiguation_tfm/data/ibereval_data/testing_set/testing_set.raw_text/")

In [32]:
testing_raw.rename(columns = {'nombre':'# Document_ID'}, inplace = True)
testing_raw.head()

Unnamed: 0,# Document_ID,texto
0,S1130-01082008001000010-1,"Varón de 43 años originario de Marruecos, que ingresó en nuestro servicio por cuadro de 4 días d..."
1,S0004-06142009000400011-1,Varón de 75 años con antecedentes de EPOC moderado sin otros antecedentes médicos de interés. En...
2,S0376-78922011000200004-1,Mujer de 44 años de edad con antecedentes médicos de insuficiencia renal crónica en tratamiento ...
3,S1137-66272014000300016-1,Mujer de 36 años sin antecedentes médicos o epidemiológicos de interés que es ingresada en la Un...
4,S1130-01082008000800019-1,"Recientemente, atendimos en nuestro hospital a un varón de 46 años, con antecedente de síndrome ..."


In [180]:
testing_raw['# Document_ID'].nunique()

220

In [181]:
# Persist the raw test texts.
# NOTE(review): no ``sep`` is passed, so to_csv's default separator is used
# even though the file name ends in .tsv -- confirm what downstream readers
# expect.
testing_raw.to_csv("../data/test_raw.tsv", index = False)

In [44]:
testing = testing_abbr.merge(testing_raw, on = '# Document_ID', how = 'left')

In [45]:
print(testing_raw.shape)
print(testing_abbr.shape)
print(testing.shape)

(220, 2)
(198, 4)
(198, 5)


In [49]:
testing.columns = ['nombre', 'StartOffset', 'EndOffset', 'abrev', 'texto']

### Training

318 clinical cases

In [105]:
train_abbr = pd.read_csv("../data/track_1_soto_train.tsv", sep = "\t")

In [106]:
train_abbr.head()

Unnamed: 0,# Document_ID,Mention_A_type,Mention_A_StartOffset,Mention_A_EndOffset,Mention_A,Relation_type,Mention_B_type,Mention_B_StartOffset,Mention_B_EndOffset,Mention_B
0,S0365-66912006000200012-1,SHORT_FORM,208,210,OD,SHORT-LONG,LONG_FORM,195,206,ojo derecho
1,S0365-66912006000200012-1,SHORT_FORM,562,564,AV,SHORT-LONG,LONG_FORM,546,560,agudeza visual
2,S0365-66912006000200012-1,SHORT_FORM,636,640,DPAR,SHORT-LONG,LONG_FORM,601,634,defecto pupilar aferente relativo
3,S0365-66912006000200012-1,SHORT_FORM,810,812,TC,SHORT-LONG,LONG_FORM,784,808,tomografía computarizada
4,S0365-66912006000200012-1,SHORT_FORM,895,898,RMN,SHORT-LONG,LONG_FORM,865,893,resonancia magnética nuclear


In [36]:
train_abbr = train_abbr[['# Document_ID', 'Mention_A_StartOffset',
       'Mention_A_EndOffset', 'Mention_A']]

In [37]:
train_raw = read_texts("../../acronym_disambiguation_tfm/data/ibereval_data/trainning_set/training_set.raw_text/")

In [39]:
train_raw.rename(columns = {'nombre':'# Document_ID'}, inplace = True)
train_raw.head()

Unnamed: 0,# Document_ID,texto
0,S1130-01082006000100014-1,"Se trata de una mujer de 35 años, con antecedentes familiares de enfermedad de Crohn y sin antec..."
1,S1130-01082009000300015-1,"Varón de 70 años, fumador, con enfisema pulmonar y vitíligo al que en mayo de 2001 se realizó un..."
2,S0210-56912010000200009-1,Se trata de una mujer de 70 años con antecedentes de HTA y diagnosticada recientemente de neopla...
3,S1130-01082008000900014-1,Varón de 41 años diagnosticado de adenocarcinoma medianamente diferenciado implantado sobre esóf...
4,S0210-48062004000500008-1,Paciente de 29 años de edad que acude al Servicio de Urgencias de nuestro Hospital ante la prese...


In [179]:
# Persist the raw training texts.
# NOTE(review): no ``sep`` is passed, so to_csv's default separator is used
# even though the file name ends in .tsv -- confirm what downstream readers
# expect.
train_raw.to_csv("../data/train_raw.tsv", index = False)

In [54]:
train = train_abbr.merge(train_raw, on = '# Document_ID', how = 'left')

In [55]:
print(train_raw.shape)
print(train_abbr.shape)
print(train.shape)

(318, 2)
(290, 4)
(290, 5)


In [56]:
train.columns = ['nombre', 'StartOffset', 'EndOffset', 'abrev', 'texto']

In [60]:
train_track2 = train.copy() 

## Test with AbreMES DB

### 1) Get the short-form from the text

The short forms are already available from the Soto process, so nothing has to be extracted here.

### Training

### 2) Assign a long form from the AbreMES database

In [57]:
abremes = pd.read_csv("../../publicacion/AbreMES-DB/DB/pairs.tsv", sep = '\t')

In [58]:
abremes.head()

Unnamed: 0,# Pair ID,Abbreviation ID,Definition ID,Frequency,Abbreviation,Definition,Appears on
0,1,3348,17876,31,DDD,diaria definida,"http://www.scielo.edu.uy/scielo.php?script=sci_arttext&pid=S1688-03902003000300004,http://scielo..."
1,2,11880,23106,11,HP-CHPR,Hospital Pediátrico del Centro Hospitalario Pereira Rossell,"http://www.scielo.edu.uy/scielo.php?script=sci_arttext&pid=S1688-03902003000300004,http://www.sc..."
2,3,1454,23213,1,EVN,de vida al nacer,http://www.scielo.edu.uy/scielo.php?script=sci_arttext&pid=S1688-03902003000300005
3,4,1112,23214,1,TDS,Total dermatoscopic score,http://www.scielo.edu.uy/scielo.php?script=sci_arttext&pid=S1688-03902003000300006
4,5,231,23215,1,AP,cases by pathologic anatomy,http://www.scielo.edu.uy/scielo.php?script=sci_arttext&pid=S1688-03902003000300006


In [59]:
# Strip punctuation from the AbreMES short forms before they are used as the
# merge key. The pattern is a regular-expression character class, so
# regex=True must be explicit: newer pandas versions treat the pattern as a
# literal string by default, which would leave every abbreviation untouched.
# The hyphen is escaped so it is read as a literal character, not a range.
abremes['Abbreviation'] = abremes['Abbreviation'].str.replace(r'[!"#$%&*+,\-./:;<=>?@^_`{|}~]', '', regex=True)

  abremes['Abbreviation'] = abremes['Abbreviation'].str.replace('[!"#$%&*+,-./:;<=>?@^_`{|}~]','')


#### Train

In [61]:
train_track2 = train_track2.rename(columns = {'abrev':'Abbreviation'})

In [62]:
train_track2['Abbreviation'] = train_track2['Abbreviation'].str.strip()

In [63]:
print(train_track2.shape)
print(train_track2.Abbreviation.nunique())
print(abremes.shape)
print(abremes.Abbreviation.nunique())

(290, 5)
149
(52551, 7)
20236


Load the dictionary of measurement units

In [64]:
# Read the measurement-unit dictionary, stored as a Python literal in a text
# file, and parse it with ast.literal_eval.
# NOTE(review): no encoding is given, so the platform default applies, while
# read_texts in this notebook opens its files with encoding="utf8" -- confirm
# the encoding of this file.
with open("dictionary_measureunits.txt", "r") as data:
    dictionary = ast.literal_eval(data.read())

Assign definitions from AbreMES DB

In [65]:
# Attach every AbreMES definition sharing the same short form; the left join
# keeps mentions without a match, and ``indicator=True`` adds a ``_merge``
# column telling where each row came from.
# NOTE(review): the previews printed further down show the same
# (Abbreviation, Definition) pair more than once for one mention -- confirm
# whether the AbreMES pairs should be de-duplicated before merging.
train_def = train_track2.merge(abremes[['Abbreviation', 'Definition']], how = 'left', on = 'Abbreviation', indicator = True)

In [66]:
train_def.head(2)

Unnamed: 0,nombre,StartOffset,EndOffset,Abbreviation,texto,Definition,_merge
0,S0365-66912006000200012-1,208,210,OD,Paciente de 76 años de edad que acudió a urgencias por TCE frontal derecho con pérdida de concie...,ojo derecho,both
1,S0365-66912006000200012-1,208,210,OD,Paciente de 76 años de edad que acudió a urgencias por TCE frontal derecho con pérdida de concie...,o-desmetilasa,both


In [67]:
train_def['Definition'] = train_def['Definition'].fillna('no_existe')

In [68]:
train_def['Definition'] = train_def.apply(lambda x: defin_dictionary(x, dictionary), axis = 1)

In [69]:
sf_notfind= train_def[train_def['Definition'].isna()]['Abbreviation'].unique().tolist()

In [70]:
len(sf_notfind)

31

In [71]:
train_def= train_def.dropna(subset = ['Definition'])

In [72]:
# In the real test set there are 600-odd
print(train_def.Abbreviation.nunique())

118


In [73]:
train_def.head()

Unnamed: 0,nombre,StartOffset,EndOffset,Abbreviation,texto,Definition,_merge
0,S0365-66912006000200012-1,208,210,OD,Paciente de 76 años de edad que acudió a urgencias por TCE frontal derecho con pérdida de concie...,ojo derecho,both
1,S0365-66912006000200012-1,208,210,OD,Paciente de 76 años de edad que acudió a urgencias por TCE frontal derecho con pérdida de concie...,o-desmetilasa,both
2,S0365-66912006000200012-1,208,210,OD,Paciente de 76 años de edad que acudió a urgencias por TCE frontal derecho con pérdida de concie...,ojo derecho,both
3,S0365-66912006000200012-1,208,210,OD,Paciente de 76 años de edad que acudió a urgencias por TCE frontal derecho con pérdida de concie...,oxígeno disuelto,both
4,S0365-66912006000200012-1,208,210,OD,Paciente de 76 años de edad que acudió a urgencias por TCE frontal derecho con pérdida de concie...,OD: cuenta dedos,both


In [74]:
print(train_def.shape)
print(train_def.Abbreviation.nunique())

(7175, 7)
118


Get lemmatized long forms

In [75]:
lemmatizer = WordNetLemmatizer()

In [76]:
# Store a lemmatized copy of each definition. The whole definition string is
# passed to WordNetLemmatizer.lemmatize in a single call (no tokenisation).
# NOTE(review): in the preview printed after this cell the lemmatized column
# is identical to 'Definition' -- confirm this lemmatizer has any effect on
# these Spanish long forms.
train_def['Definition_lemmatized'] = train_def['Definition'].map(lambda x: lemmatizer.lemmatize(x))

In [77]:
train_def.head()

Unnamed: 0,nombre,StartOffset,EndOffset,Abbreviation,texto,Definition,_merge,Definition_lemmatized
0,S0365-66912006000200012-1,208,210,OD,Paciente de 76 años de edad que acudió a urgencias por TCE frontal derecho con pérdida de concie...,ojo derecho,both,ojo derecho
1,S0365-66912006000200012-1,208,210,OD,Paciente de 76 años de edad que acudió a urgencias por TCE frontal derecho con pérdida de concie...,o-desmetilasa,both,o-desmetilasa
2,S0365-66912006000200012-1,208,210,OD,Paciente de 76 años de edad que acudió a urgencias por TCE frontal derecho con pérdida de concie...,ojo derecho,both,ojo derecho
3,S0365-66912006000200012-1,208,210,OD,Paciente de 76 años de edad que acudió a urgencias por TCE frontal derecho con pérdida de concie...,oxígeno disuelto,both,oxígeno disuelto
4,S0365-66912006000200012-1,208,210,OD,Paciente de 76 años de edad que acudió a urgencias por TCE frontal derecho con pérdida de concie...,OD: cuenta dedos,both,OD: cuenta dedos


In [78]:
del train_def['_merge']

In [79]:
train_def.to_csv("../data/data_paper/train_subtrack2_soto_parte1.csv", index = False)

### Test

In [196]:
testing = pd.read_csv("../data/marzo2023/subtrack2/OutputApproach4Disambiguation_testing.tsv", sep = '\t')

In [197]:
test_track2 = testing.copy()

In [207]:
test_track2 = test_track2[['# Document_ID', 'StartOffset', 'EndOffset', 'Abbreviation']]

In [208]:
test_track2.head(2)

Unnamed: 0,# Document_ID,StartOffset,EndOffset,Abbreviation
0,S1130-14732005000200003-1,300,302,mm
1,S1130-14732005000200003-1,649,651,TC


### 2) Assign a long form from the AbreMES database

In [199]:
abremes = pd.read_csv("../../publicacion/AbreMES-DB/DB/pairs.tsv", sep = '\t')

In [200]:
abremes.head()

Unnamed: 0,# Pair ID,Abbreviation ID,Definition ID,Frequency,Abbreviation,Definition,Appears on
0,1,3348,17876,31,DDD,diaria definida,"http://www.scielo.edu.uy/scielo.php?script=sci_arttext&pid=S1688-03902003000300004,http://scielo..."
1,2,11880,23106,11,HP-CHPR,Hospital Pediátrico del Centro Hospitalario Pereira Rossell,"http://www.scielo.edu.uy/scielo.php?script=sci_arttext&pid=S1688-03902003000300004,http://www.sc..."
2,3,1454,23213,1,EVN,de vida al nacer,http://www.scielo.edu.uy/scielo.php?script=sci_arttext&pid=S1688-03902003000300005
3,4,1112,23214,1,TDS,Total dermatoscopic score,http://www.scielo.edu.uy/scielo.php?script=sci_arttext&pid=S1688-03902003000300006
4,5,231,23215,1,AP,cases by pathologic anatomy,http://www.scielo.edu.uy/scielo.php?script=sci_arttext&pid=S1688-03902003000300006


In [201]:
# Strip punctuation from the AbreMES short forms before they are used as the
# merge key. The pattern is a regular-expression character class, so
# regex=True must be explicit: newer pandas versions treat the pattern as a
# literal string by default, which would leave every abbreviation untouched.
# The hyphen is escaped so it is read as a literal character, not a range.
abremes['Abbreviation'] = abremes['Abbreviation'].str.replace(r'[!"#$%&*+,\-./:;<=>?@^_`{|}~]', '', regex=True)

  abremes['Abbreviation'] = abremes['Abbreviation'].str.replace('[!"#$%&*+,-./:;<=>?@^_`{|}~]','')


In [202]:
test_track2 = test_track2.rename(columns = {'abrev':'Abbreviation'})

In [203]:
test_track2['Abbreviation'] = test_track2['Abbreviation'].str.strip()

In [204]:
print(test_track2.shape)
print(test_track2.Abbreviation.nunique())
print(abremes.shape)
print(abremes.Abbreviation.nunique())

(1981, 6)
407
(52551, 7)
20236


Load the dictionary of measurement units

In [205]:
# Read the measurement-unit dictionary, stored as a Python literal in a text
# file, and parse it with ast.literal_eval.
# NOTE(review): no encoding is given, so the platform default applies, while
# read_texts in this notebook opens its files with encoding="utf8" -- confirm
# the encoding of this file.
with open("dictionary_measureunits.txt", "r") as data:
    dictionary = ast.literal_eval(data.read())

Assign definitions from AbreMES DB

In [209]:
test_def = test_track2.merge(abremes[['Abbreviation', 'Definition']], how = 'left', on = 'Abbreviation', indicator = True)

In [210]:
test_def.head(2)

Unnamed: 0,# Document_ID,StartOffset,EndOffset,Abbreviation,Definition,_merge
0,S1130-14732005000200003-1,300,302,mm,aminoacidos más abundantes,both
1,S1130-14732005000200003-1,649,651,TC,tomografías computarizadas,both


In [211]:
test_def['Definition'] = test_def['Definition'].fillna('no_existe')

In [212]:
test_def['Definition'] = test_def.apply(lambda x: defin_dictionary(x, dictionary), axis = 1)

In [213]:
sf_notfind= test_def[test_def['Definition'].isna()]['Abbreviation'].unique().tolist()

In [214]:
len(sf_notfind)

88

In [215]:
test_def= test_def.dropna(subset = ['Definition'])

In [216]:
# In the real test set there are 600-odd
print(test_def.Abbreviation.nunique())

319


In [217]:
test_def.head()

Unnamed: 0,# Document_ID,StartOffset,EndOffset,Abbreviation,Definition,_merge
0,S1130-14732005000200003-1,300,302,mm,aminoacidos más abundantes,both
1,S1130-14732005000200003-1,649,651,TC,tomografías computarizadas,both
2,S1130-14732005000200003-1,649,651,TC,tomografía computarizada,both
3,S1130-14732005000200003-1,649,651,TC,tomografia computarizada,both
4,S1130-14732005000200003-1,649,651,TC,tomografía craneal,both


In [218]:
print(test_def.shape)
print(test_def.Abbreviation.nunique())

(41480, 6)
319


Get lemmatized long forms

In [219]:
lemmatizer = WordNetLemmatizer()

In [220]:
# Store a lemmatized copy of each definition. The whole definition string is
# passed to WordNetLemmatizer.lemmatize in a single call (no tokenisation).
# NOTE(review): in the preview printed after this cell the lemmatized column
# is identical to 'Definition' -- confirm this lemmatizer has any effect on
# these Spanish long forms.
test_def['Definition_lemmatized'] = test_def['Definition'].map(lambda x: lemmatizer.lemmatize(x))

In [221]:
test_def.head()

Unnamed: 0,# Document_ID,StartOffset,EndOffset,Abbreviation,Definition,_merge,Definition_lemmatized
0,S1130-14732005000200003-1,300,302,mm,aminoacidos más abundantes,both,aminoacidos más abundantes
1,S1130-14732005000200003-1,649,651,TC,tomografías computarizadas,both,tomografías computarizadas
2,S1130-14732005000200003-1,649,651,TC,tomografía computarizada,both,tomografía computarizada
3,S1130-14732005000200003-1,649,651,TC,tomografia computarizada,both,tomografia computarizada
4,S1130-14732005000200003-1,649,651,TC,tomografía craneal,both,tomografía craneal


In [222]:
del test_def['_merge']

In [224]:
test_def.to_csv("../data/marzo2023/test_subtrack2_marzo23soto_parte1.csv", sep = '\t', index = False)