# Library

In [14]:
import pandas as pd
import numpy as np
import os
import re
import collections
import unidecode
import nltk
from nltk.corpus import stopwords
import itertools 
from nltk.tokenize import word_tokenize
from string import punctuation
from functools import reduce
import seaborn as sns
from abbreviations import schwartz_hearst
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
import ast
import math

[nltk_data] Downloading package wordnet to /Users/egarcia/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [15]:
pd.set_option('display.max_colwidth', 100)

In [16]:
%matplotlib inline
from matplotlib import pyplot as plt

# Functions

In [17]:
def read_texts(path):
    """
    Load every ``.txt`` file found directly inside ``path`` into a DataFrame.

    Parameters
    ----------
    path : str
        Directory to scan. A trailing path separator is optional.

    Returns
    -------
    pandas.DataFrame
        One row per file, with columns ``nombre`` (file name without the
        trailing ``.txt``) and ``texto`` (file content decoded as UTF-8).
        Both columns are present even when no file matches.
    """
    data = []

    # Sorted so the row order does not depend on the directory listing order.
    for name in sorted(os.listdir(path)):
        if not name.endswith('.txt'):
            continue
        # os.path.join works whether or not `path` ends with a separator.
        with open(os.path.join(path, name), encoding="utf8") as f:
            text = f.read()
        # Strip only the trailing extension, not every '.txt' inside the name.
        data.append({'nombre': name[:-len('.txt')], 'texto': text})

    return pd.DataFrame(data, columns=['nombre', 'texto'])

In [18]:
# Fetch the NLTK stop-word corpus and keep the Spanish list, de-duplicated
# through set(); `swords` is the list clean_text() filters tokens against.
nltk.download('stopwords')
swords = list(set(stopwords.words('spanish')))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/egarcia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
def clean_text(string):
    """
    Normalise a text: drop punctuation, run it through unidecode, remove the
    tokens listed in ``swords`` and collapse whitespace.

    Letter case is left untouched. Returns the cleaned string.
    """
    # Every punctuation mark is deleted, except '/', which becomes a space.
    table = {ord(mark): None for mark in punctuation}
    table[ord('/')] = " "
    transliterated = unidecode.unidecode(string.translate(table))

    # NOTE(review): tokens are compared to `swords` after transliteration and
    # without lower-casing, so accented or capitalised stop words may survive —
    # confirm this is intended.
    kept = [token for token in transliterated.split() if token not in swords]

    # Collapse any remaining whitespace runs and trim both ends.
    return re.sub(r'\s+', ' ', ' '.join(kept)).strip()

In [20]:
def distance_levenshtein(str1, str2):
    """
    Return the Levenshtein (edit) distance between ``str1`` and ``str2``.

    Insertions, deletions and substitutions each cost 1. Only two rows of the
    dynamic-programming table are kept at any time.
    """
    # previous[j]: distance between the str1 prefix handled so far and str2[:j].
    previous = list(range(len(str2) + 1))
    for i, left in enumerate(str1, start=1):
        current = [i]
        for j, right in enumerate(str2, start=1):
            substitution = previous[j - 1] + (left != right)
            current.append(min(current[j - 1] + 1, previous[j] + 1, substitution))
        previous = current
    return previous[len(str2)]

In [21]:
def normalize_lf(row):
    """
    Map near-duplicate long forms in ``row`` to a single representative.

    Two distinct entries are treated as variants of each other when their
    Levenshtein distance divided by the longer length is below 0.2. Every
    entry that has at least one such variant is mapped to the entry with the
    highest value in the module-level ``frec`` DataFrame (looked up where
    ``frec['index']`` equals the entry, taking the first ``long_form`` value —
    presumably its frequency; confirm against where ``frec`` is built).

    Parameters
    ----------
    row : iterable of str
        Long forms to compare pairwise.

    Returns
    -------
    dict or None
        ``{variant: representative}`` for every entry that has a near
        duplicate, or ``None`` when no pair is similar enough.

    Raises
    ------
    IndexError
        If a similar entry has no matching row in ``frec``.
    """
    # The distance is symmetric, so each unordered pair is evaluated once.
    similar = set()
    for first, second in itertools.combinations(row, 2):
        if first == second:
            continue
        # first != second, so at least one is non-empty and longest > 0.
        longest = max(len(first), len(second))
        if distance_levenshtein(first, second) / longest < 0.2:
            similar.update((first, second))

    if not similar:
        return None

    counts = {
        form: frec[frec['index'] == form]['long_form'].iloc[0]
        for form in similar
    }
    # Highest count wins; ties are broken alphabetically so the result does
    # not depend on set iteration order.
    most_freq = min(counts, key=lambda form: (-counts[form], form))

    return {form: most_freq for form in similar}
    

In [22]:
def get_label(row):
    """Return 1 when ``long_form_x`` equals ``long_form_y`` in ``row``, else 0."""
    return 1 if row['long_form_x'] == row['long_form_y'] else 0

In [23]:
def offsetA(row):
    """Return the result of searching ``row['texto']`` for ``row['Mention_A']`` with ``find``."""
    text, mention = row['texto'], row['Mention_A']
    return text.find(mention)
    
def offsetB(row):
    """Return the result of searching ``row['texto']`` for ``row['Mention_B']`` with ``find``."""
    text, mention = row['texto'], row['Mention_B']
    return text.find(mention)

def offsetB_end(row):
    """
    Return the offset just past the first occurrence of ``row['Mention_B']``
    in ``row['texto']``, or -1 when the mention does not occur in the text.
    """
    mention = row['Mention_B']
    start = row['texto'].find(mention)
    # Propagate "not found" instead of returning len(mention) - 1, which would
    # be indistinguishable from a real offset.
    if start < 0:
        return -1
    return start + len(mention)

def offsetA_end(row):
    """
    Return ``row['Mention_A_StartOffset']`` plus the length of
    ``row['Mention_A']``. The start offset is read from the row, not searched
    for in the text.
    """
    start = row['Mention_A_StartOffset']
    return start + len(row['Mention_A'])

In [24]:
def offset(row):
    """Return the result of searching ``row['texto']`` for ``row['abrev']`` with ``find``."""
    text, short_form = row['texto'], row['abrev']
    return text.find(short_form)

def offsetend(row):
    """Return ``row['StartOffset']`` plus the length of ``row['abrev']``."""
    start = row['StartOffset']
    return start + len(row['abrev'])

In [25]:
def defin_dictionary(row,dictionary):
    """
    Return the definition to use for ``row``.

    A ``Definition`` other than the placeholder ``'no_existe'`` is returned
    unchanged; otherwise ``row['Abbreviation']`` is looked up in
    ``dictionary`` with ``get``, which gives ``None`` when the key is missing.
    """
    definition = row['Definition']
    if definition != 'no_existe':
        return definition
    return dictionary.get(row['Abbreviation'])

# Load Data

### Training

318 clinical cases

In [27]:
train_raw = pd.read_csv("../data/train_raw.tsv")

In [60]:
# Gold-standard annotations for the training set, one tab-separated file per subtrack.
#train_gold_subtrack1 = pd.read_csv("../data/clinical_cases.relations.training_set_2.tsv", sep = '\t')
train_gold_subtrack1 = pd.read_csv("../data/ibereval_data/trainning_set/clinical_cases.relations.training_set.txt", sep = '\t')
# This line was indented in the original cell, which raises IndentationError.
train_gold_subtrack2 = pd.read_csv("../data/ibereval_data/trainning_set/clinical_cases.abbreviations.training_set.tsv", sep = '\t')

In [61]:
train_raw.head(2)

Unnamed: 0,# Document_ID,texto
0,S1130-01082006000100014-1,"Se trata de una mujer de 35 años, con antecedentes familiares de enfermedad de Crohn y sin antec..."
1,S1130-01082009000300015-1,"Varón de 70 años, fumador, con enfisema pulmonar y vitíligo al que en mayo de 2001 se realizó un..."


In [62]:
train_gold_subtrack1.head(1)

Unnamed: 0,# Document_ID,Mention_A_type,Mention_A_StartOffset,Mention_A,Relation_type,Mention_B_type,Mention_B_StartOffset,Mention_B_EndOffset,Mention_B
0,S1130-01082009000400014-1,SHORT_FORM,476,NPT,SHORT-LONG,LONG_FORM,454.0,474.0,nutrición parenteral


In [63]:
train_gold_subtrack2.head(1)

Unnamed: 0,# Document_ID,StartOffset,EndOffset,Abbreviation,Definition,Definition_lemmatized
0,S0210-48062004000500008-1,1650,1652,ml,mililitro,mililitro


In [137]:
# Soto system output for both subtracks (training). The files are parsed
# without a header row (header = None); column names are assigned in a later cell.
# NOTE(review): later cells write these frames back to the same paths with
# to_csv, which emits a header line by default, so re-running this cell
# afterwards would load that header as a data row — confirm the intended workflow.
#train_soto = pd.read_csv("../data/track_1_soto_train.tsv", sep = "\t")
#train_soto = pd.read_csv("../data/track_1_soto_train_marzo23.tsv", sep = "\t")
train_soto1 = pd.read_csv("../data/marzo2023/subtrack1/OutputApproach4Relations_training.tsv", sep = "\t",header = None)
train_soto2 = pd.read_csv("../data/marzo2023/subtrack2/OutputApproach4Disambiguation_training.tsv", sep = "\t", header = None)

In [141]:
train_soto1.head(2)

Unnamed: 0,# Document_ID,Mention_A_type,Mention_A_StartOffset,Mention_A_EndtOffset,Mention_A,Relation_type,Mention_B_type,Mention_B_StartOffset,Mention_B_EndOffset,Mention_B
0,S0365-66912006000200012-1,SHORT_FORM,208,210,OD,SHORT-LONG,LONG_FORM,195,206,ojo derecho
1,S0365-66912006000200012-1,SHORT_FORM,562,564,AV,SHORT-LONG,LONG_FORM,546,560,agudeza visual


In [142]:
train_soto2.head(2)

Unnamed: 0,# Document_ID,StartOffset,EndOffset,Abbreviation,Definition,Definition_lemmatized
0,S0004-06142005000900013-1,186,188,mm,milímetro,milimetro
1,S0376-78922008000400008-1,458,461,mal,maligno,maligno


In [151]:
# Name the columns of the header-less Soto frames. Subtrack 1 names are spelled
# out by hand; subtrack 2 reuses the gold-standard frame's column index.
# NOTE(review): the train_gold_subtrack1 preview shows no Mention_A_EndOffset
# column, which is presumably why its columns are not reused here — confirm.
train_soto1.columns = ['# Document_ID', 'Mention_A_type', 'Mention_A_StartOffset','Mention_A_EndOffset', 'Mention_A',
       'Relation_type', 'Mention_B_type', 'Mention_B_StartOffset',
       'Mention_B_EndOffset', 'Mention_B']
train_soto2.columns = train_gold_subtrack2.columns

In [152]:
train_soto2.isna().sum(axis = 0)

# Document_ID            0
StartOffset              0
EndOffset                0
Abbreviation             0
Definition               0
Definition_lemmatized    0
dtype: int64

In [153]:
train_soto1.dtypes

# Document_ID            object
Mention_A_type           object
Mention_A_StartOffset     int64
Mention_A_EndOffset       int64
Mention_A                object
Relation_type            object
Mention_B_type           object
Mention_B_StartOffset     int64
Mention_B_EndOffset       int64
Mention_B                object
dtype: object

In [157]:
train_soto1.to_csv("../data/marzo2023/subtrack1/OutputApproach4Relations_training.tsv", sep = "\t",index = False)

In [156]:
train_soto2.to_csv("../data/marzo2023/subtrack2/OutputApproach4Disambiguation_training.tsv", sep = "\t",index = False)

#### Subtrack 1

Evaluate number of records

In [143]:
print(f"Number of rows in train gold standard subtrack1: {train_gold_subtrack1.shape[0]}")
print(f"Number of rows in train soto dataset: {train_soto1.shape[0]}")

Number of rows in train gold standard subtrack1: 287
Number of rows in train soto dataset: 273


Evaluate number of Docs

In [144]:
print(f"Number of distinct docs in train raw: {train_raw['# Document_ID'].nunique()}")
print(f"Number of distinct docs in train gold standard subtrack1: {train_gold_subtrack1['# Document_ID'].nunique()}")
print(f"Number of distinct docs in train soto dataset: {train_soto1['# Document_ID'].nunique()}")

Number of distinct docs in train raw: 318
Number of distinct docs in train gold standard subtrack1: 119
Number of distinct docs in train soto dataset: 119


Evaluate number of Short forms

In [145]:
print(f"Number of distinct SF in train gold standard subtrack1: {train_gold_subtrack1['Mention_A'].nunique()}")
print(f"Number of distinct SF in train soto dataset: {train_soto1['Mention_A'].nunique()}")

Number of distinct SF in train gold standard subtrack1: 176
Number of distinct SF in train soto dataset: 165


Evaluate number of Long forms

In [146]:
print(f"Number of distinct LF in train gold standard: {train_gold_subtrack1['Mention_B'].nunique()}")
print(f"Number of distinct LF in train soto dataset: {train_soto1['Mention_B'].nunique()}")

Number of distinct LF in train gold standard: 189
Number of distinct LF in train soto dataset: 179


#### Subtrack 2

Evaluate number of records

In [175]:
print(f"Number of rows in train gold standard subtrack2: {train_gold_subtrack2.shape[0]}")
print(f"Number of rows in train soto dataset: {train_soto2.shape[0]}")

Number of rows in train gold standard subtrack2: 4259
Number of rows in train soto dataset: 2529


Evaluate number of Docs

In [176]:
print(f"Number of distinct docs in train raw: {train_raw['# Document_ID'].nunique()}")
print(f"Number of distinct docs in train gold standard subtrack2: {train_gold_subtrack2['# Document_ID'].nunique()}")
print(f"Number of distinct docs in train soto dataset: {train_soto2['# Document_ID'].nunique()}")

Number of distinct docs in train raw: 318
Number of distinct docs in train gold standard subtrack2: 298
Number of distinct docs in train soto dataset: 298


Evaluate number of Short forms

In [179]:
print(f"Number of distinct SF in train gold standard subtrack2: {train_gold_subtrack2['Abbreviation'].nunique()}")
print(f"Number of distinct SF in train soto dataset: {train_soto2['Abbreviation'].nunique()}")

Number of distinct SF in train gold standard subtrack2: 768
Number of distinct SF in train soto dataset: 495


Evaluate number of Long forms

In [180]:
print(f"Number of distinct LF in train gold standard: {train_gold_subtrack2['Definition'].nunique()}")
print(f"Number of distinct LF in train soto dataset: {train_soto2['Definition'].nunique()}")

Number of distinct LF in train gold standard: 908
Number of distinct LF in train soto dataset: 538


### Testing

220 clinical cases.

In [69]:
test_raw = pd.read_csv("../data/test_raw.tsv")

In [75]:
test_gold_subtrack1 = pd.read_csv("../data/clinical_cases.relations.testing_set_2.tsv", sep = '\t')
#test_gold_subtrack1 = pd.read_csv("../data/ibereval_data/testing_set/clinical_cases.relations.testing_set.tsv", sep = '\t')
test_gold_subtrack2 = pd.read_csv("../data/ibereval_data/testing_set/clinical_cases.abbreviations.testing_set.tsv", sep = '\t')

In [76]:
test_raw.head(2)

Unnamed: 0,# Document_ID,texto
0,S1130-01082008001000010-1,"Varón de 43 años originario de Marruecos, que ingresó en nuestro servicio por cuadro de 4 días d..."
1,S0004-06142009000400011-1,Varón de 75 años con antecedentes de EPOC moderado sin otros antecedentes médicos de interés. En...


In [77]:
test_gold_subtrack1.head(1)

Unnamed: 0,# Document_ID,Mention_A_type,Mention_A_StartOffset,Mention_A_EndOffset,Mention_A,Relation_type,Mention_B_type,Mention_B_StartOffset,Mention_B_EndOffset,Mention_B
0,S0211-69952013000500019-1,SHORT_FORM,3739,3745,ARA II,SHORT-LONG,LONG_FORM,3695,3737,antagonista del receptor de angiotesina II


In [78]:
test_gold_subtrack2.head(1)

Unnamed: 0,# Document_ID,StartOffset,EndOffset,Abbreviation,Definition,Definition_lemmatized
0,S0212-71992005000500009-1,533,537,/mm3,milímetro cúbico,milímetro cúbico


In [158]:
# Soto system output for both subtracks (testing). The files are parsed
# without a header row (header = None); column names are assigned in a later cell.
# NOTE(review): later cells write these frames back to the same paths with
# to_csv, which emits a header line by default, so re-running this cell
# afterwards would load that header as a data row — confirm the intended workflow.
#test_soto = pd.read_csv("../data/track_1_soto_test.tsv", sep = "\t")
#test_soto = pd.read_csv("../data/track_1_soto_test_marzo23.tsv", sep = "\t")
test_soto1 = pd.read_csv("../data/marzo2023/subtrack1/OutputApproach4Relations_testing.tsv", sep = "\t",header = None)
test_soto2 = pd.read_csv("../data/marzo2023/subtrack2/OutputApproach4Disambiguation_testing.tsv", sep = "\t", header = None)

In [167]:
test_soto1.head(2)

Unnamed: 0,# Document_ID,Mention_A_type,Mention_A_StartOffset,Mention_A_EndOffset,Mention_A,Relation_type,Mention_B_type,Mention_B_StartOffset,Mention_B_EndOffset,Mention_B
0,S1130-14732005000200003-1,SHORT_FORM,1683,1688,XOMED,SHORT-LONG,LONG_FORM,1672,1681,Medtronic
1,S0365-66912011001100006-1,SHORT_FORM,127,129,AV,SHORT-LONG,LONG_FORM,111,125,agudeza visual


In [168]:
test_soto2.head(2)

Unnamed: 0,# Document_ID,StartOffset,EndOffset,Abbreviation,Definition,Definition_lemmatized
0,S1130-14732005000200003-1,300,302,mm,milímetro,milimetro
1,S1130-14732005000200003-1,649,651,TC,tomografía computarizada,tomografia computariz


In [169]:
test_soto1.columns = test_gold_subtrack1.columns
test_soto2.columns = test_gold_subtrack2.columns

In [170]:
test_soto2.isna().sum(axis = 0)

# Document_ID            0
StartOffset              0
EndOffset                0
Abbreviation             0
Definition               0
Definition_lemmatized    0
dtype: int64

In [174]:
test_gold_subtrack1.dtypes

# Document_ID            object
Mention_A_type           object
Mention_A_StartOffset     int64
Mention_A_EndOffset       int64
Mention_A                object
Relation_type            object
Mention_B_type           object
Mention_B_StartOffset     int64
Mention_B_EndOffset       int64
Mention_B                object
dtype: object

In [172]:
test_soto1.to_csv("../data/marzo2023/subtrack1/OutputApproach4Relations_testing.tsv", sep = "\t",index = False)

In [173]:
test_soto2.to_csv("../data/marzo2023/subtrack2/OutputApproach4Disambiguation_testing.tsv", sep = "\t",index = False)

#### Subtrack 1

Evaluate number of records

In [181]:
print(f"Number of rows in test gold standard subtrack1: {test_gold_subtrack1.shape[0]}")
print(f"Number of rows in test soto dataset: {test_soto1.shape[0]}")

Number of rows in test gold standard subtrack1: 238
Number of rows in test soto dataset: 213


Evaluate number of Docs

In [182]:
print(f"Number of distinct docs in test raw: {test_raw['# Document_ID'].nunique()}")
print(f"Number of distinct docs in test gold standard subtrack1: {test_gold_subtrack1['# Document_ID'].nunique()}")
print(f"Number of distinct docs in test soto dataset: {test_soto1['# Document_ID'].nunique()}")

Number of distinct docs in test raw: 220
Number of distinct docs in test gold standard subtrack1: 101
Number of distinct docs in test soto dataset: 99


Evaluate number of Short forms

In [183]:
print(f"Number of distinct SF in test gold standard subtrack1: {test_gold_subtrack1['Mention_A'].nunique()}")
print(f"Number of distinct SF in test soto dataset: {test_soto1['Mention_A'].nunique()}")

Number of distinct SF in test gold standard subtrack1: 147
Number of distinct SF in test soto dataset: 120


Evaluate number of Long forms

In [184]:
print(f"Number of distinct LF in test gold standard: {test_gold_subtrack1['Mention_B'].nunique()}")
print(f"Number of distinct LF in test soto dataset: {test_soto1['Mention_B'].nunique()}")

Number of distinct LF in test gold standard: 163
Number of distinct LF in test soto dataset: 139


#### Subtrack 2

Evaluate number of records

In [185]:
print(f"Number of rows in test gold standard subtrack2: {test_gold_subtrack2.shape[0]}")
print(f"Number of rows in test soto dataset: {test_soto2.shape[0]}")

Number of rows in test gold standard subtrack2: 3413
Number of rows in test soto dataset: 1981


Evaluate number of Docs

In [186]:
print(f"Number of distinct docs in test raw: {test_raw['# Document_ID'].nunique()}")
print(f"Number of distinct docs in test gold standard subtrack2: {test_gold_subtrack2['# Document_ID'].nunique()}")
print(f"Number of distinct docs in test soto dataset: {test_soto2['# Document_ID'].nunique()}")

Number of distinct docs in test raw: 220
Number of distinct docs in test gold standard subtrack2: 214
Number of distinct docs in test soto dataset: 213


Evaluate number of Short forms

In [188]:
print(f"Number of distinct SF in test gold standard subtrack2: {test_gold_subtrack2['Abbreviation'].nunique()}")
print(f"Number of distinct SF in test soto dataset: {test_soto2['Abbreviation'].nunique()}")

Number of distinct SF in test gold standard subtrack2: 685
Number of distinct SF in test soto dataset: 407


Evaluate number of Long forms

In [189]:
print(f"Number of distinct LF in test gold standard: {test_gold_subtrack2['Definition'].nunique()}")
print(f"Number of distinct LF in test soto dataset: {test_soto2['Definition'].nunique()}")

Number of distinct LF in test gold standard: 695
Number of distinct LF in test soto dataset: 439


In [64]:
# Parse the measure-unit dictionary stored as a Python literal.
# The encoding is explicit so the result does not depend on the platform
# default; ast.literal_eval only accepts literals, so no code is executed.
with open("dictionary_measureunits.txt", "r", encoding="utf8") as data:
    dictionary = ast.literal_eval(data.read())

Assign definitions from AbreMES DB

In [65]:
# Left-join every training abbreviation with all of its AbreMES definitions;
# `indicator = True` adds a `_merge` column telling whether a match was found.
# NOTE(review): `abremes` is only assigned in a later cell (in the Test section)
# and `train_track2` is not defined in the visible part of the notebook, so this
# cell fails on a fresh Restart & Run All — confirm the cell order.
# NOTE(review): the preview below shows the same (OD, ojo derecho) row twice;
# the definition table may hold repeated pairs — confirm whether to de-duplicate.
train_def = train_track2.merge(abremes[['Abbreviation', 'Definition']], how = 'left', on = 'Abbreviation', indicator = True)

In [66]:
train_def.head(2)

Unnamed: 0,nombre,StartOffset,EndOffset,Abbreviation,texto,Definition,_merge
0,S0365-66912006000200012-1,208,210,OD,Paciente de 76 años de edad que acudió a urgencias por TCE frontal derecho con pérdida de concie...,ojo derecho,both
1,S0365-66912006000200012-1,208,210,OD,Paciente de 76 años de edad que acudió a urgencias por TCE frontal derecho con pérdida de concie...,o-desmetilasa,both


In [67]:
train_def['Definition'] = train_def['Definition'].fillna('no_existe')

In [68]:
train_def['Definition'] = train_def.apply(lambda x: defin_dictionary(x, dictionary), axis = 1)

In [69]:
sf_notfind= train_def[train_def['Definition'].isna()]['Abbreviation'].unique().tolist()

In [70]:
len(sf_notfind)

31

In [71]:
train_def= train_def.dropna(subset = ['Definition'])

In [72]:
# The real test set has 600-odd distinct abbreviations
print(train_def.Abbreviation.nunique())

118


In [73]:
train_def.head()

Unnamed: 0,nombre,StartOffset,EndOffset,Abbreviation,texto,Definition,_merge
0,S0365-66912006000200012-1,208,210,OD,Paciente de 76 años de edad que acudió a urgencias por TCE frontal derecho con pérdida de concie...,ojo derecho,both
1,S0365-66912006000200012-1,208,210,OD,Paciente de 76 años de edad que acudió a urgencias por TCE frontal derecho con pérdida de concie...,o-desmetilasa,both
2,S0365-66912006000200012-1,208,210,OD,Paciente de 76 años de edad que acudió a urgencias por TCE frontal derecho con pérdida de concie...,ojo derecho,both
3,S0365-66912006000200012-1,208,210,OD,Paciente de 76 años de edad que acudió a urgencias por TCE frontal derecho con pérdida de concie...,oxígeno disuelto,both
4,S0365-66912006000200012-1,208,210,OD,Paciente de 76 años de edad que acudió a urgencias por TCE frontal derecho con pérdida de concie...,OD: cuenta dedos,both


In [74]:
print(train_def.shape)
print(train_def.Abbreviation.nunique())

(7175, 7)
118


Get lemmatized long forms

In [75]:
lemmatizer = WordNetLemmatizer()

In [76]:
# lemmatize() is called once per definition, on the whole string.
# NOTE(review): in the preview below, Definition_lemmatized is identical to
# Definition for every row shown — confirm that WordNetLemmatizer has any
# effect on these Spanish, multi-word entries.
train_def['Definition_lemmatized'] = train_def['Definition'].map(lambda x: lemmatizer.lemmatize(x))

In [77]:
train_def.head()

Unnamed: 0,nombre,StartOffset,EndOffset,Abbreviation,texto,Definition,_merge,Definition_lemmatized
0,S0365-66912006000200012-1,208,210,OD,Paciente de 76 años de edad que acudió a urgencias por TCE frontal derecho con pérdida de concie...,ojo derecho,both,ojo derecho
1,S0365-66912006000200012-1,208,210,OD,Paciente de 76 años de edad que acudió a urgencias por TCE frontal derecho con pérdida de concie...,o-desmetilasa,both,o-desmetilasa
2,S0365-66912006000200012-1,208,210,OD,Paciente de 76 años de edad que acudió a urgencias por TCE frontal derecho con pérdida de concie...,ojo derecho,both,ojo derecho
3,S0365-66912006000200012-1,208,210,OD,Paciente de 76 años de edad que acudió a urgencias por TCE frontal derecho con pérdida de concie...,oxígeno disuelto,both,oxígeno disuelto
4,S0365-66912006000200012-1,208,210,OD,Paciente de 76 años de edad que acudió a urgencias por TCE frontal derecho con pérdida de concie...,OD: cuenta dedos,both,OD: cuenta dedos


In [78]:
del train_def['_merge']

In [79]:
train_def.to_csv("../data/data_paper/train_subtrack2_soto_parte1.csv", index = False)

### Test

In [81]:
test_track2 = testing.copy()

### 2) Assign a long form from the AbreMES database

In [82]:
abremes = pd.read_csv("../../publicacion/AbreMES-DB/DB/pairs.tsv", sep = '\t')

In [83]:
abremes.head()

Unnamed: 0,# Pair ID,Abbreviation ID,Definition ID,Frequency,Abbreviation,Definition,Appears on
0,1,3348,17876,31,DDD,diaria definida,"http://www.scielo.edu.uy/scielo.php?script=sci_arttext&pid=S1688-03902003000300004,http://scielo..."
1,2,11880,23106,11,HP-CHPR,Hospital Pediátrico del Centro Hospitalario Pereira Rossell,"http://www.scielo.edu.uy/scielo.php?script=sci_arttext&pid=S1688-03902003000300004,http://www.sc..."
2,3,1454,23213,1,EVN,de vida al nacer,http://www.scielo.edu.uy/scielo.php?script=sci_arttext&pid=S1688-03902003000300005
3,4,1112,23214,1,TDS,Total dermatoscopic score,http://www.scielo.edu.uy/scielo.php?script=sci_arttext&pid=S1688-03902003000300006
4,5,231,23215,1,AP,cases by pathologic anatomy,http://www.scielo.edu.uy/scielo.php?script=sci_arttext&pid=S1688-03902003000300006


In [84]:
# Remove the listed punctuation characters from every abbreviation.
# regex=True is required: the pattern is a character class, and recent pandas
# versions treat the pattern as a literal string by default, which would
# leave the column unchanged.
abremes['Abbreviation'] = abremes['Abbreviation'].str.replace('[!"#$%&*+,-./:;<=>?@^_`{|}~]', '', regex=True)

  abremes['Abbreviation'] = abremes['Abbreviation'].str.replace('[!"#$%&*+,-./:;<=>?@^_`{|}~]','')


In [85]:
test_track2 = test_track2.rename(columns = {'abrev':'Abbreviation'})

In [86]:
test_track2['Abbreviation'] = test_track2['Abbreviation'].str.strip()

In [87]:
print(test_track2.shape)
print(test_track2.Abbreviation.nunique())
print(abremes.shape)
print(abremes.Abbreviation.nunique())

(198, 5)
116
(52551, 7)
20236


Add dictionary with measure units

In [88]:
# Parse the measure-unit dictionary stored as a Python literal.
# The encoding is explicit so the result does not depend on the platform
# default; ast.literal_eval only accepts literals, so no code is executed.
with open("dictionary_measureunits.txt", "r", encoding="utf8") as data:
    dictionary = ast.literal_eval(data.read())

Assign definitions from AbreMES DB

In [89]:
test_def = test_track2.merge(abremes[['Abbreviation', 'Definition']], how = 'left', on = 'Abbreviation', indicator = True)

In [90]:
test_def.head(2)

Unnamed: 0,nombre,StartOffset,EndOffset,Abbreviation,texto,Definition,_merge
0,S1130-14732005000200003-1,1683,1688,XOMED,"Mujer de 29 años de edad, diagnosticada de cavernomas múltiples en 1996, año en que fue interven...",,left_only
1,S0365-66912011001100006-1,127,129,AV,"Varón de 75 años, diagnosticado de queratopatía lipoidea bilateral primaria, que refería pérdida...",altura de vuelo,both


In [91]:
test_def['Definition'] = test_def['Definition'].fillna('no_existe')

In [92]:
test_def['Definition'] = test_def.apply(lambda x: defin_dictionary(x, dictionary), axis = 1)

In [93]:
sf_notfind= test_def[test_def['Definition'].isna()]['Abbreviation'].unique().tolist()

In [94]:
len(sf_notfind)

16

In [95]:
test_def= test_def.dropna(subset = ['Definition'])

In [96]:
# The real test set has 600-odd distinct abbreviations
print(test_def.Abbreviation.nunique())

100


In [97]:
test_def.head()

Unnamed: 0,nombre,StartOffset,EndOffset,Abbreviation,texto,Definition,_merge
1,S0365-66912011001100006-1,127,129,AV,"Varón de 75 años, diagnosticado de queratopatía lipoidea bilateral primaria, que refería pérdida...",altura de vuelo,both
2,S0365-66912011001100006-1,127,129,AV,"Varón de 75 años, diagnosticado de queratopatía lipoidea bilateral primaria, que refería pérdida...",acceso vascular,both
3,S0365-66912011001100006-1,127,129,AV,"Varón de 75 años, diagnosticado de queratopatía lipoidea bilateral primaria, que refería pérdida...",accesos vasculares,both
4,S0365-66912011001100006-1,127,129,AV,"Varón de 75 años, diagnosticado de queratopatía lipoidea bilateral primaria, que refería pérdida...",agudeza visual,both
5,S0365-66912011001100006-1,127,129,AV,"Varón de 75 años, diagnosticado de queratopatía lipoidea bilateral primaria, que refería pérdida...",acroqueratosis verruciforme de Hopf,both


In [98]:
print(test_def.shape)
print(test_def.Abbreviation.nunique())

(5616, 7)
100


Get lemmatized long forms

In [99]:
lemmatizer = WordNetLemmatizer()

In [100]:
# lemmatize() is called once per definition, on the whole string.
# NOTE(review): in the preview below, Definition_lemmatized is identical to
# Definition for every row shown — confirm that WordNetLemmatizer has any
# effect on these Spanish, multi-word entries.
test_def['Definition_lemmatized'] = test_def['Definition'].map(lambda x: lemmatizer.lemmatize(x))

In [101]:
test_def.head()

Unnamed: 0,nombre,StartOffset,EndOffset,Abbreviation,texto,Definition,_merge,Definition_lemmatized
1,S0365-66912011001100006-1,127,129,AV,"Varón de 75 años, diagnosticado de queratopatía lipoidea bilateral primaria, que refería pérdida...",altura de vuelo,both,altura de vuelo
2,S0365-66912011001100006-1,127,129,AV,"Varón de 75 años, diagnosticado de queratopatía lipoidea bilateral primaria, que refería pérdida...",acceso vascular,both,acceso vascular
3,S0365-66912011001100006-1,127,129,AV,"Varón de 75 años, diagnosticado de queratopatía lipoidea bilateral primaria, que refería pérdida...",accesos vasculares,both,accesos vasculares
4,S0365-66912011001100006-1,127,129,AV,"Varón de 75 años, diagnosticado de queratopatía lipoidea bilateral primaria, que refería pérdida...",agudeza visual,both,agudeza visual
5,S0365-66912011001100006-1,127,129,AV,"Varón de 75 años, diagnosticado de queratopatía lipoidea bilateral primaria, que refería pérdida...",acroqueratosis verruciforme de Hopf,both,acroqueratosis verruciforme de Hopf


In [102]:
del test_def['_merge']

In [104]:
test_def.to_csv("../data/data_paper/test_subtrack2_soto_parte1.csv", index = False)