# Library

In [104]:
import pandas as pd
import numpy as np
import os
import re
import collections
import unidecode
#from gensim.parsing.preprocessing import remove_stopwords
import nltk
from nltk.corpus import stopwords
import itertools 
from nltk.tokenize import word_tokenize
from string import punctuation

from nltk import ngrams

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Functions

In [2]:
def read_texts(path):
    data = []
    file_name = os.listdir(path)

    for name in file_name:
        if name.endswith('.txt'):
            with open(path + name,encoding="utf8") as f:
                text = f.read()
                data.append({'nombre':name.replace('.txt',''), 'texto':text})

    df = pd.DataFrame(data)
    return df

In [105]:
def clean_text(string):
    """
    A method to clean text 
    """
    
    # Removing the punctuations
    for x in string.lower(): 
        if x in punctuation: 
            string = string.replace(x, "")
        string = unidecode.unidecode(string)

#     # Converting the text to lower
#     string = string.lower()

    # Removing stop words
    string = ' '.join([word for word in string.split() if word not in swords])

    # Cleaning the whitespaces
    string = re.sub(r'\s+', ' ', string).strip()

    return string 

In [4]:
#nltk.download('stopwords')
swords = list(set(stopwords.words('spanish')))

# Load Data

### Testing

220 clinical cases.

In [3]:
testing_abbr = pd.read_csv("../datasets/testing_set/clinical_cases.abbreviations.testing_set.tsv", sep = '\t')
testing_met = pd.read_csv("../datasets/testing_set/clinical_cases.metadata.testing_set.tsv", sep = '\t')
testing_rel = pd.read_csv("../datasets/testing_set/clinical_cases.relations.testing_set.tsv", sep = '\t')

In [4]:
testing_met.head()

Unnamed: 0,# Document_ID,Case_ID,ISSN,Date,Source,Full_Text_Link
0,S1130-01082006000700014-1.txt,1.txt,1130-0108,2006-07-01,Revista Española de Enfermedades Digestivas v...,http://scielo.isciii.es/scielo.php?script=sci_...
1,S1130-01082007000300006-7.txt,7.txt,1130-0108,2007-03-01,Revista Española de Enfermedades Digestivas v...,http://scielo.isciii.es/scielo.php?script=sci_...
2,S1134-80462009000100005-1.txt,1.txt,1134-8046,2009-02-01,Revista de la Sociedad Española del Dolor v.1...,http://scielo.isciii.es/scielo.php?script=sci_...
3,S1137-66272014000300015-1.txt,1.txt,1137-6627,2014-12-01,Anales del Sistema Sanitario de Navarra v.37 ...,http://scielo.isciii.es/scielo.php?script=sci_...
4,S0365-66912004001200011-1.txt,1.txt,0365-6691,2004-12-01,Archivos de la Sociedad Española de Oftalmolog...,http://scielo.isciii.es/scielo.php?script=sci_...


In [5]:
testing_rel = testing_rel.reset_index()

In [6]:
testing_rel.columns = ['# Document_ID', 'Mention_A_type', 'Mention_A_StartOffset',
      'Mention_A', 'Relation_type', 'Mention_B_type',
       'Mention_B_StartOffset', 'Mention_B_EndOffset', 'Mention_B']

In [7]:
testing_rel = testing_rel.rename(columns = {'# Document_ID': 'doc_id'})

In [8]:
testing_rel.head()

Unnamed: 0,doc_id,Mention_A_type,Mention_A_StartOffset,Mention_A,Relation_type,Mention_B_type,Mention_B_StartOffset,Mention_B_EndOffset,Mention_B
0,S0211-69952013000500019-1,SHORT_FORM,3739,ARA II,SHORT-LONG,LONG_FORM,3695,3737,antagonista del receptor de angiotesina II
1,S0211-69952013000500019-1,SHORT_FORM,2793,PCR,SHORT-LONG,LONG_FORM,2798,2837,reacción en cadena de enzima polimerasa
2,S0365-66912004000600008-1,SHORT_FORM,406,AV,SHORT-LONG,LONG_FORM,390,404,agudeza visual
3,S0211-69952012000500025-1,SHORT_FORM,945,angio-TAC,SHORT-LONG,LONG_FORM,908,943,angiotomografía computarizada axial
4,S1130-05582017000200122-1,SHORT_FORM,940,RMN,SHORT-LONG,LONG_FORM,910,938,resonancia magnética nuclear


In [9]:
testing_rel.Relation_type.unique()

array(['SHORT-LONG', 'SHORT-NESTED', 'NESTED-LONG'], dtype=object)

In [10]:
testing_abbr = testing_abbr.rename(columns = {'# Document_ID': 'doc_id'})

In [11]:
testing_abbr.head()

Unnamed: 0,doc_id,StartOffset,EndOffset,Abbreviation,Definition,Definition_lemmatized
0,S0004-06142010000500014-1,2037,2044,16SrRNA,16s ribosomal rna,16s ribosomal rno
1,S0004-06142010000500014-1,1349,1351,M.,mycobacterium,mycobacterium
2,S0004-06142010000500014-1,1339,1342,PCR,polymerase chain reaction,polymerase chain reaction
3,S0004-06142010000500014-1,611,615,BHCG,beta-human chorionic gonadotropin,beta-humar chorionic gonadotropin
4,S0004-06142010000500014-1,594,597,CEA,carcinoembrionary antigen,carcinoembrionary antiger


In [12]:
testing_raw = read_texts("../datasets/testing_set/testing_set.raw_text/")

In [13]:
testing_raw.head()

Unnamed: 0,nombre,texto
0,S1139-76322009000700016-1,Paciente de sexo femenino de 13 años y 7 meses...
1,S0210-48062007000700015-1,Varón de 72 años con antecedentes personales d...
2,S0212-71992005000500009-1,"Varón de 81 años, con antecedentes de fibrilac..."
3,S0365-66912004001200011-1,Paciente varón de 52 años que acudió a urgenci...
4,S1130-01082009000900015-1,Varón de 54 años con episodios de pancreatitis...


### Backgound

2879 clinical cases. 220 will be use for clinical evaluation

In [14]:
back_met = pd.read_csv("../datasets/background_test/clinical_cases.metadata.background_set.tsv", sep = '\t')

In [15]:
back_met = back_met.rename(columns = {'# Document_ID': 'doc_id'})

In [16]:
back_met.head()

Unnamed: 0,doc_id,Case_ID,ISSN,Date,Source,Full_Text_Link
0,S0325-00752010000100014.txt,1.txt,0325-0075,02/2010,Archivos argentinos de pediatría,http://www.scielo.org.ar/scielo.php?script=sci...
1,S0325-00752013000200014.txt,1.txt,0325-0075,04/2013,Archivos argentinos de pediatría,http://www.scielo.org.ar/scielo.php?script=sci...
2,S0325-00752011000400017.txt,1.txt,0325-0075,08/2011,Archivos argentinos de pediatría,http://www.scielo.org.ar/scielo.php?script=sci...
3,S0325-00752013000600022.txt,1.txt,0325-0075,12/2013,Archivos argentinos de pediatría,http://www.scielo.org.ar/scielo.php?script=sci...
4,S0325-00752008000500013.txt,1.txt,0325-0075,10/2008,Archivos argentinos de pediatría,http://www.scielo.org.ar/scielo.php?script=sci...


In [17]:
back_raw = read_texts("../datasets/background_test/background_test_set/")

In [18]:
back_raw.head()

Unnamed: 0,nombre,texto
0,S0212-71992004000800006-1,Mujer de 57 años de edad con antecedentes pers...
1,S0716-10182014000100007-1,Se recibió en el Hospital Veterinario Puente A...
2,S1130-01082009000200016-1,"Mujer de 47 años de edad, sin antecedentes de ..."
3,S0716-10182015000400016-1,"Escolar de 11 años de edad, género femenino, s..."
4,S1130-01082004001200010-2,"Varón de 25 años, remitido a nuestro Servicio ..."


### Development

146 clinical cases

In [19]:
dev_abbr = pd.read_csv("../datasets/development_set/clinical_cases.abbreviations.development_set.tsv", sep = '\t')
dev_met = pd.read_csv("../datasets/development_set/clinical_cases.metadata.development_set.tsv", sep = '\t')
dev_rel = pd.read_csv("../datasets/development_set/clinical_cases.relations.development_set.tsv", sep = '\t')

In [20]:
dev_met = dev_met.rename(columns = {'# Document_ID': 'doc_id'})

In [21]:
dev_met.head()

Unnamed: 0,doc_id,Case_ID,ISSN,Date,Source,Full_Text_Link
0,S1130-14732005000300004-1.txt,1.txt,1130-1473,2005-06-01,Neurocirugía v.16 n.3 2005,http://scielo.isciii.es/scielo.php?script=sci_...
1,S1130-01082008000200009-1.txt,1.txt,1130-0108,2008-02-01,Revista Española de Enfermedades Digestivas v...,http://scielo.isciii.es/scielo.php?script=sci_...
2,S1137-66272012000300021-1.txt,1.txt,1137-6627,2012-12-01,Anales del Sistema Sanitario de Navarra v.35 ...,http://scielo.isciii.es/scielo.php?script=sci_...
3,S1699-695X2016000200010-1.txt,1.txt,1699-695X,2016-06-01,Revista Clínica de Medicina de Familia v.9 n....,http://scielo.isciii.es/scielo.php?script=sci_...
4,S1130-01082007001100009-1.txt,1.txt,1130-0108,2007-11-01,Revista Española de Enfermedades Digestivas v...,http://scielo.isciii.es/scielo.php?script=sci_...


In [22]:
dev_rel = dev_rel.reset_index()

In [23]:
dev_rel.columns = ['# Document_ID', 'Mention_A_type', 'Mention_A_StartOffset',
      'Mention_A', 'Relation_type', 'Mention_B_type',
       'Mention_B_StartOffset', 'Mention_B_EndOffset', 'Mention_B']

In [24]:
dev_rel = dev_rel.rename(columns = {'# Document_ID': 'doc_id'})

In [25]:
dev_rel.head()

Unnamed: 0,doc_id,Mention_A_type,Mention_A_StartOffset,Mention_A,Relation_type,Mention_B_type,Mention_B_StartOffset,Mention_B_EndOffset,Mention_B
0,S1888-75462015000400006-1,SHORT_FORM,1436,AP,SHORT-LONG,LONG_FORM,1419,1434,anteroposterior
1,S0210-56912009000700006-2,SHORT_FORM,876,angio-RMN,SHORT-LONG,LONG_FORM,858,874,angiorresonancia
2,S0210-56912009000700006-2,SHORT_FORM,819,RMN,SHORT-LONG,LONG_FORM,789,817,resonancia magnética nuclear
3,S1698-44472005000300015-1,SHORT_FORM,209,ATM,SHORT-LONG,LONG_FORM,177,207,articulación témporomandibular
4,S1698-44472005000300015-1,SHORT_FORM,1125,TC,SHORT-LONG,LONG_FORM,1099,1123,tomografía computerizada


In [26]:
dev_abbr = dev_abbr.rename(columns = {'# Document_ID': 'doc_id'})

In [27]:
dev_abbr.head()

Unnamed: 0,doc_id,StartOffset,EndOffset,Abbreviation,Definition,Definition_lemmatized
0,S1130-14732005000300004-1,1216,1218,C3,tercera vértebra cervical,tercero vértebra cervical
1,S1130-14732005000300004-1,717,719,C2,segunda vértebra cervical,segundo vértebra cervical
2,S1130-14732005000300004-1,3191,3193,C3,tercera vértebra cervical,tercero vértebra cervical
3,S1130-14732005000300004-1,2867,2869,C3,tercera vértebra cervical,tercero vértebra cervical
4,S1130-14732005000300004-1,2862,2864,C2,segunda vértebra cervical,segundo vértebra cervical


In [28]:
dev_raw = read_texts("../datasets/development_set/development_set.raw_text/")

In [29]:
dev_raw.head()

Unnamed: 0,nombre,texto
0,S1139-76322017000200010-1,Niña de dos años y diez meses con antecedentes...
1,S0365-66912005001100008-1,Se presenta el caso de un varón de 45 años que...
2,S1130-01082006001000017-1,Mujer de 42 años de edad con antecedentes pers...
3,S0212-71992005001200008-1,Paciente de 57 años con ingresos hospitalarios...
4,S0365-66912007000300010-1,Paciente de 33 años que el 20-08-05 es traslad...


### Sample test

15 clinical cases

In [30]:
sample_abbr = pd.read_csv("../datasets/sample_set/clinical_cases.abbreviations.sample_set.tsv", sep = '\t')
sample_met = pd.read_csv("../datasets/sample_set/clinical_cases.metadata.sample_set.tsv", sep = '\t')
sample_rel = pd.read_csv("../datasets/sample_set/clinical_cases.relations.sample_set.tsv", sep = '\t')

In [31]:
sample_met = sample_met.rename(columns = {'# Document_ID': 'doc_id'})

In [32]:
sample_met.head()

Unnamed: 0,doc_id,Case_ID,ISSN,Date,Source,Full_Text_Link
0,S0004-06142006000900015-1,1,0004-0614,2006-11-01,Archivos Españoles de Urología (Ed. impresa) ...,http://scielo.isciii.es/scielo.php?script=sci_...
1,S0004-06142006000600015-1,1,0004-0614,2006-08-01,Archivos Españoles de Urología (Ed. impresa) ...,http://scielo.isciii.es/scielo.php?script=sci_...
2,S0004-06142007000700014-1,1,0004-0614,2007-09-01,Archivos Españoles de Urología (Ed. impresa) ...,http://scielo.isciii.es/scielo.php?script=sci_...
3,S0004-06142007000900013-1,1,0004-0614,2007-11-01,Archivos Españoles de Urología (Ed. impresa) ...,http://scielo.isciii.es/scielo.php?script=sci_...
4,S0004-06142006000200014-1,1,0004-0614,2006-03-01,Archivos Españoles de Urología (Ed. impresa) ...,http://scielo.isciii.es/scielo.php?script=sci_...


In [33]:
sample_rel = sample_rel.rename(columns = {'# Document_ID': 'doc_id'})

In [34]:
sample_rel.head()

Unnamed: 0,doc_id,Mention_A_type,Mention_A_StartOffset,Mention_A_EndOffset,Mention_A,Relation_type,Mention_B_type,Mention_B_StartOffset,Mention_B_EndOffset,Mention_B
0,S0004-06142006000700014-1,SHORT_FORM,926,929,CEA,SHORT-LONG,LONG_FORM,896,924,Antígeno Carcino Embrionario
1,S0004-06142005001000011-1,SHORT_FORM,1626,1629,ROT,SHORT-LONG,LONG_FORM,1600,1624,reflejos osteotendinosos
2,S0004-06142005001000011-1,SHORT_FORM,1715,1718,RMN,SHORT-LONG,LONG_FORM,1685,1713,resonancia magnética nuclear
3,S0004-06142005001000011-1,SHORT_FORM,1663,1666,RCP,SHORT-LONG,LONG_FORM,1639,1661,reflejo cutaneoplantar
4,S0004-06142005001000011-1,SHORT_FORM,1808,1811,LCR,SHORT-LONG,LONG_FORM,1783,1806,líquido cefalorraquídeo


In [35]:
sample_abbr = sample_abbr.rename(columns = {'# Document_ID': 'doc_id'})

In [36]:
sample_abbr.head()

Unnamed: 0,doc_id,StartOffset,EndOffset,Abbreviation,Definition,Definition_lemmatized
0,S0004-06142005001000011-1,1034,1036,Kg,kilogramo,kilogramo
1,S0004-06142005001000011-1,1031,1033,mg,miligramo,miligramo
2,S0004-06142005001000011-1,196,199,IgA,inmunoglobulina a,inmunoglobulina a
3,S0004-06142005001000011-1,2057,2060,LCR,líquido cefalorraquídeo,líquido cefalorraquídeo
4,S0004-06142005001000011-1,1594,1598,EEII,extremidades inferiores,extremidad inferior


In [37]:
sample_raw = read_texts("../datasets/sample_set/sample_set.raw_text/")

In [38]:
sample_raw.head()

Unnamed: 0,nombre,texto
0,S0004-06142006000600014-1,"Paciente varón, de 40 años de edad, con antece..."
1,S0004-06142006000300015-1,Paciente de 50 años con antecedente de litiasi...
2,S0004-06142007000900013-1,Presentamos el caso de un recién nacido de tre...
3,S0004-06142005001000011-1,Varón de 58 años de edad en el momento del tra...
4,S0004-06142006000700013-1,"Paciente varón, de 63 años de edad, mestizo, d..."


### Trainning

318 clinical cases

In [5]:
train_abbr = pd.read_csv("../datasets/trainning_set/clinical_cases.abbreviations.training_set.tsv", sep = '\t')
train_met = pd.read_csv("../datasets/trainning_set/clinical_cases.metadata.training_set.tsv", sep = '\t')
train_rel = pd.read_csv("../datasets/trainning_set/clinical_cases.relations.training_set.tsv", sep = '\t')

In [6]:
train_met = train_met.rename(columns = {'# Document_ID': 'doc_id'})

In [7]:
train_met.head()

Unnamed: 0,doc_id,Case_ID,ISSN,Date,Source,Full_Text_Link
0,S1139-76322015000500009-1.txt,1.txt,1139-7632,2015-12-01,Pediatría Atención Primaria v.17 n.68 2015,http://scielo.isciii.es/scielo.php?script=sci_...
1,S1130-05582008000400007-2.txt,2.txt,1130-0558,2008-08-01,Revista Española de Cirugía Oral y Maxilofacia...,http://scielo.isciii.es/scielo.php?script=sci_...
2,S0210-48062006000100012-1.txt,1.txt,0210-4806,2006-01-01,Actas Urológicas Españolas v.30 n.1 2006,http://scielo.isciii.es/scielo.php?script=sci_...
3,S0213-12852003000500002-1.txt,1.txt,0213-1285,2003-10-01,Avances en Odontoestomatología v.19 n.5 2003,http://scielo.isciii.es/scielo.php?script=sci_...
4,S0212-71992005000400007-1.txt,1.txt,0212-7199,2005-04-01,Anales de Medicina Interna v.22 n.4 2005,http://scielo.isciii.es/scielo.php?script=sci_...


In [8]:
train_rel = train_rel.reset_index()

In [9]:
train_rel.columns = ['# Document_ID', 'Mention_A_type', 'Mention_A_StartOffset',
      'Mention_A', 'Relation_type', 'Mention_B_type',
       'Mention_B_StartOffset', 'Mention_B_EndOffset', 'Mention_B']

In [10]:
train_rel = train_rel.rename(columns = {'# Document_ID': 'doc_id'})

In [11]:
train_rel.head()

Unnamed: 0,doc_id,Mention_A_type,Mention_A_StartOffset,Mention_A,Relation_type,Mention_B_type,Mention_B_StartOffset,Mention_B_EndOffset,Mention_B
0,S1130-01082009000400014-1,SHORT_FORM,476,NPT,SHORT-LONG,LONG_FORM,454.0,474.0,nutrición parenteral
1,S1130-63432016000100009-1,SHORT_FORM,614,NIHSS,SHORT-LONG,LONG_FORM,621.0,662.0,National Institute of Health Stroke Scale
2,S1139-76322017000200007-1,SHORT_FORM,1145,CMV,SHORT-LONG,LONG_FORM,1128.0,1143.0,citomegalovirus
3,S1139-76322017000200007-1,SHORT_FORM,1243,VSG,SHORT-LONG,LONG_FORM,1206.0,1241.0,velocidad de sedimentación globular
4,S1139-76322017000200007-1,SHORT_FORM,1300,IGRA,SHORT-LONG,LONG_FORM,1267.0,1298.0,interferon-gamma release assays


In [12]:
train_abbr = train_abbr.rename(columns = {'# Document_ID': 'doc_id'})

In [13]:
train_abbr.Definition.nunique()

908

In [14]:
train_abbr.head()

Unnamed: 0,doc_id,StartOffset,EndOffset,Abbreviation,Definition,Definition_lemmatized
0,S0210-48062004000500008-1,1650,1652,ml,mililitro,mililitro
1,S0210-48062004000500008-1,708,709,l,litro,litro
2,S0210-48062004000500008-1,704,707,mEq,miliequivalente,miliequivalente
3,S0210-48062004000500008-1,677,681,pCO2,presión parcial de co2,presión parcial de co2
4,S0210-48062004000500008-1,2287,2290,HLA,human leucocyte antigen,human leucocyte antiger


In [15]:
train_raw = read_texts("../datasets/trainning_set/training_set.raw_text/")

In [16]:
train_raw = train_raw.rename(columns = {'nombre': 'doc_id'})

In [17]:
train_raw.head()

Unnamed: 0,doc_id,texto
0,S1130-05582012000300005-1,Acude a nuestras consultas a un paciente que p...
1,S0212-71992005000400009-1,"Se trataba de un varón de 27 años de edad, que..."
2,S0004-06142008000700015-2,Varón de 33 años fumador de un paquete de ciga...
3,S0210-56912006000800008-1,"Hombre de 42 años, bebedor de más de 100 g de ..."
4,S0376-78922009000300010-1,Paciente de 18 años de edad que 5 meses antes ...


## Prepare Data

In [18]:
train_abbr.shape

(4260, 6)

In [19]:
train_raw.shape

(318, 2)

In [87]:
train = train_raw.merge(train_abbr[['doc_id', 'Abbreviation', 'Definition_lemmatized']], on = 'doc_id', how = 'left')

In [88]:
train = train.merge(train_abbr, on = ['doc_id', 'Abbreviation'], how = 'left')

In [89]:
train = train.drop_duplicates()

In [91]:
train = train[['doc_id', 'texto', 'Abbreviation', 'Definition_lemmatized_x', 'StartOffset', 'EndOffset']]

In [92]:
train = train.rename(columns = {'Definition_lemmatized_x':'Definition'})

In [93]:
train.head()

Unnamed: 0,doc_id,texto,Abbreviation,Definition,StartOffset,EndOffset
0,S1130-05582012000300005-1,Acude a nuestras consultas a un paciente que p...,PAAF,punción aspiración con aguja fino,1006.0,1010.0
1,S1130-05582012000300005-1,Acude a nuestras consultas a un paciente que p...,RM,resonancia magnético,789.0,791.0
2,S1130-05582012000300005-1,Acude a nuestras consultas a un paciente que p...,cm,centímetro,1345.0,1347.0
3,S1130-05582012000300005-1,Acude a nuestras consultas a un paciente que p...,cm,centímetro,1217.0,1219.0
4,S1130-05582012000300005-1,Acude a nuestras consultas a un paciente que p...,g,gramo,1304.0,1305.0


In [26]:
train.shape

(4886, 6)

In [34]:
train['doc_id'].nunique()

318

In [35]:
train['Abbreviation'].nunique()

768

In [33]:
# train['texto'] = train.apply(replace_sf_target, axis = 1)

In [94]:
train = train.rename(columns = {'Abbreviation': 'short_form', 'Definition': 'long_form'})

In [95]:
train = train.dropna(subset = ['short_form', 'long_form'])

In [96]:
train.shape

(4355, 6)

## Processing text

### 1. Tokenize and delete stop words

In [106]:
train['texto'] = train['texto'].map(clean_text)

In [107]:
train['long_form'] = train['long_form'].map(clean_text)

In [109]:
train = train.drop_duplicates()
train.shape

(4341, 6)

In [49]:
# train['text_tokenize'] = train['text_tokenize'].map(lambda x: word_tokenize(x))

In [110]:
train.head()

Unnamed: 0,doc_id,texto,short_form,long_form,StartOffset,EndOffset
0,S1130-05582012000300005-1,Acude consultas paciente presenta tumoracion c...,PAAF,puncion aspiracion aguja fino,1006.0,1010.0
1,S1130-05582012000300005-1,Acude consultas paciente presenta tumoracion c...,RM,resonancia magnetico,789.0,791.0
2,S1130-05582012000300005-1,Acude consultas paciente presenta tumoracion c...,cm,centimetro,1345.0,1347.0
3,S1130-05582012000300005-1,Acude consultas paciente presenta tumoracion c...,cm,centimetro,1217.0,1219.0
4,S1130-05582012000300005-1,Acude consultas paciente presenta tumoracion c...,g,gramo,1304.0,1305.0


Replace SF in the text for a label target

In [31]:
# def replace_sf_target(row):
#     row['texto'] = row['texto'].map(lambda x: x.replace(x, '<target>') if x == row['Abbreviation'] else x) 
#     return row['texto']

Transform dataframe to a list of dictionaries

In [111]:
train_data = train[['doc_id', 'texto', 'short_form', 'long_form']].to_dict('records')

### 2. Build vocabulary

In [54]:
def build_vocab(data):
    """
    return a dict with each word and its id. Id is assigned based on word frecuency, it means, it 0 will be assigned to a most fecuency word
    """
    
    #coge cada palabra de cada texto y la añade a una colección que tiene la palabra y su frecuencia de aparición
    counter = collections.Counter()
    for elem in data:
        counter.update(word_tokenize(elem['texto']))

    # remove infrequent words
    min_freq = 1
    filtered = [item for item in counter.items() if item[1]>=min_freq]

    count_pairs = sorted(filtered, key=lambda x: -x[1]) #ordena la lista filtered en orden descendente de frecuencia (que es el segundo elemento de la tupla)
    words, _ = list(zip(*count_pairs)) #separa las palabras de su frecuencia
    word_to_id = dict(zip(words, range(len(words)))) #crea un diccionario donde la clave es la palabra y el valor un índice. La palabra de índice 0 será la de mayor frecuencia de aparición

    return word_to_id

In [112]:
word_id_dict = build_vocab(train_data)

In [81]:
len(word_id_dict)

13606

### 3. Calculate sense Ids

In [58]:
def build_sense_ids(data):
    words = set()
    word_to_senses = {}
    for elem in data:
        target_word = elem['short_form']
        target_sense = elem['long_form']
        if target_word not in words:
            words.add(target_word) #añade la palabra al set de palabras
            word_to_senses.update({target_word: [target_sense]}) #añade la palabra al dict con su significado en una lista de valores
        else:
            if target_sense not in word_to_senses[target_word]:
                word_to_senses[target_word].append(target_sense) #si la palabra está en el dict pero no el significado, lo añade a la lista
    
    words = list(words)
    target_word_to_id = dict(zip(words, range(len(words)))) #diccionario de cada palabra ambigua con un un índice
    target_sense_to_id = [dict(zip(word_to_senses[word], range(len(word_to_senses[word])))) for word in words] #diccionario de cada significado y un índice

    n_senses_from_word_id = dict([(target_word_to_id[word], len(word_to_senses[word])) for word in words]) #diccionario índice de cada palabra ambigua y el número de significados diferentes
    return target_word_to_id, target_sense_to_id, len(words), n_senses_from_word_id

In [113]:
sf_to_id, lf_to_id, n_words, n_lf_from_sf_id = build_sense_ids(train_data)
print('Vocabulary size: %d' % len(word_id_dict))

Vocabulary size: 13412


### 4. Context for each target sense

In [118]:
def build_context(data, word_to_id):
    target_sense_to_context = {}
    for elem in data:
        target_sense_id = elem['long_form'] # 'id' or 'target_sense'?? DEBERIA SER TARGET SENSE ID
        context = word_tokenize(elem['texto'])
        #context = sparse_matrix(context, word_to_id)
        if target_sense_id not in target_sense_to_context:
            #target_sense_to_context.update({target_sense:context})
            target_sense_to_context[target_sense_id] = []
        target_sense_to_context[target_sense_id].append(context) #dict por sense id: contexto
    
    return target_sense_to_context

In [119]:
#build context vocab of the target sense
train_target_sense_to_context = build_context(train_data , word_id_dict)

### 5. Convert to numeric

Get forward and backward context of the ambiguous word

In [None]:
def convert_to_numeric(data, word_to_id, target_word_to_id, target_sense_to_id, n_senses_from_word_id, target_sense_to_context_embedding, is_training=True):
    
    n_senses_sorted_by_target_id = [n_senses_from_word_id[target_id] for target_id in range(len(n_senses_from_word_id))] #ordena las palabras ambiguas de mayor a menor cantidad de significados
    starts = (np.cumsum(np.append([0], n_senses_sorted_by_target_id)))[:-1]
    tot_n_senses = sum(n_senses_from_word_id.values())

    all_data = []
    #target_tag_id = word_to_id['<target>']
    for instance in data:
        words = split_context(instance['context'])            
        target_word = instance['target_word'] 
        
        ctx_ints = [word_to_id[word] for word in words if word in word_to_id] #id de la palabra que obtuvimos en word_to_id
        stop_idx = words.index('<target>') #índice de la palabra target

        
        _instance = []
        xf = np.array(ctx_ints[:stop_idx]) #índices de las palabras anteriores a la target
        xb = np.array(ctx_ints[stop_idx+1:])[::-1]    #índices de las palabras posteriores a la target            
        
        instance_id = instance['id'] #id del texto O DEBERIA SER ID DEL SIGNIFICADO DE LA PALABRA (TARGET_SENSE_ID)?        
        target_id = target_word_to_id[target_word] #id de la target
        
        _instance.append(xf)
        _instance.append(xb)
        _instance.append(instance_id)

        if is_training:                   
            target_sense = instance['target_sense']   
            if instance_id in target_sense_to_context_embedding:
                sense_embedding = target_sense_to_context_embedding[instance_id] #se trae el embeddings del contexto para cada significado de la palabra
                senses = target_sense_to_id[target_id] #significados
                sense_id = senses[target_sense] if target_sense else -1 #ids de los significados
                _instance.append(sense_embedding)       

        all_data.append(_instance[:])

    return all_data

In [153]:
# make numeric
train_ndata = convert_to_numeric(train_data_, word_to_id, target_word_to_id, 
                                 target_sense_to_id, n_senses_from_target_id, target_sense_to_context_embedding, is_training = True)
#test_ndata = convert_to_numeric(test_data, word_to_id, target_word_to_id, target_sense_to_id, n_senses_from_target_id, target_sense_to_context_embedding, is_training = False)

KeyError: '1'

### 2. Get forward and backward context

a) Get theposition of the short form to get forward and backward context centered on sf

In [230]:
prueba = train['texto'].iloc[0]
st = train['StartOffset'].iloc[0]

In [232]:
def get_data(_data, n_step_f, n_step_b):
    forward_data, backward_data, target_sense_ids, sense_embeddings = [], [], [], []
    for target_id, data in _data.items():
        for instance in data:
            xf, xb, target_sense_id, sense_embedding = instance[0], instance[1], instance[2], instance[3]
            
            n_to_use_f = min(n_step_f, len(xf))
            n_to_use_b = min(n_step_b, len(xb))
            xfs = np.zeros([n_step_f], dtype=np.int32)
            xbs = np.zeros([n_step_b], dtype=np.int32)            
            if n_to_use_f != 0:
                xfs[-n_to_use_f:] = xf[-n_to_use_f:]
            if n_to_use_b != 0:
                xbs[-n_to_use_b:] = xb[-n_to_use_b:]
            
            forward_data.append(xfs)
            backward_data.append(xbs)
            target_sense_ids.append(target_sense_id)
            sense_embeddings.append(sense_embedding)
            
            #print("xf", len(xf))
            #print("xfs", len(xfs))
            #print("sense_embedding", sense_embedding)
    
    return (np.array(forward_data), np.array(backward_data), np.array(target_sense_ids), np.array(sense_embeddings))

AttributeError: 'str' object has no attribute 'start'

In [None]:
clean_data = clean_text(row['texto'])

In [204]:
row = train.iloc[2]

In [205]:
row['Abbreviation']

'cm'

In [206]:
clean_data = clean_text(row['texto'])

In [209]:
words = word_tokenize(clean_data)

In [202]:
def get_context(row):
    print(row['Abbreviation'], row['doc_id'])
    data = row['texto']
    clean_data = clean_text(data)
    words = word_tokenize(clean_data)
    print(words)
    window = 10
    
    for i,w in enumerate(words):
        if w == row['Abbreviation']:
            print('FOUND!')
            ctx =' '.join(words[i-window:i]+ words[i:i+window])
    return ctx