# Library

In [1]:
import pandas as pd
import numpy as np
import os
import re
import collections
#from gensim.parsing.preprocessing import remove_stopwords
import nltk
from nltk.corpus import stopwords
import itertools 
from nltk.tokenize import word_tokenize
from string import punctuation

from nltk import ngrams

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Functions

In [2]:
def read_texts(path):
    data = []
    file_name = os.listdir(path)

    for name in file_name:
        if name.endswith('.txt'):
            with open(path + name,encoding="utf8") as f:
                text = f.read()
                data.append({'nombre':name.replace('.txt',''), 'texto':text})

    df = pd.DataFrame(data)
    return df

In [3]:
# def filter_byindex(x,y):
#     #selectors = [x for x in col2]
#     return list(itertools.compress(x,y))

In [4]:
def check_len(x,y):
    if x==y:
        return 0
    else:
        return 1

In [5]:
def get_lf(i,j,k):
    if pd.isnull(k):
        regex_lf = re.compile(r'((?:\w+\W+){1,'+str(len(i))+'})\(\s'+i[0]+'.*\)')
        return regex_lf.findall(j)
    else:
        return k

In [6]:
def ngram_filter(doc, word, n):
    tokens = doc.split()
    all_ngrams = ngrams(tokens, n)
    filtered_ngrams = [x for x in all_ngrams if word in x]
    return filtered_ngrams

In [7]:
def get_longform(tokens, acro, margin = 2, i =1):
    long_form = ''
    #Looking for before
    for word in tokens[index-margin-len(acro):index]:
        #if first letter of word is equal to first letter os acronym
        if word[0] == acro[i].lower():
            long_form += word + ' '
            i += 1
            if i == len(acro):
                break
        elif (i == 1) and (word[0] == acro[i-1].lower()):
            long_form = word + ' '
            i = 1
            if i == len(acro):
                break
    long_form = long_form.rstrip()
    return long_form
    

# Load Data

### Testing

220 clinical cases.

In [5]:
testing_abbr = pd.read_csv("../datasets/testing_set/clinical_cases.abbreviations.testing_set.tsv", sep = '\t')
testing_met = pd.read_csv("../datasets/testing_set/clinical_cases.metadata.testing_set.tsv", sep = '\t')
testing_rel = pd.read_csv("../datasets/testing_set/clinical_cases.relations.testing_set.tsv", sep = '\t')

In [6]:
testing_met.head()

Unnamed: 0,# Document_ID,Case_ID,ISSN,Date,Source,Full_Text_Link
0,S1130-01082006000700014-1.txt,1.txt,1130-0108,2006-07-01,Revista Española de Enfermedades Digestivas v...,http://scielo.isciii.es/scielo.php?script=sci_...
1,S1130-01082007000300006-7.txt,7.txt,1130-0108,2007-03-01,Revista Española de Enfermedades Digestivas v...,http://scielo.isciii.es/scielo.php?script=sci_...
2,S1134-80462009000100005-1.txt,1.txt,1134-8046,2009-02-01,Revista de la Sociedad Española del Dolor v.1...,http://scielo.isciii.es/scielo.php?script=sci_...
3,S1137-66272014000300015-1.txt,1.txt,1137-6627,2014-12-01,Anales del Sistema Sanitario de Navarra v.37 ...,http://scielo.isciii.es/scielo.php?script=sci_...
4,S0365-66912004001200011-1.txt,1.txt,0365-6691,2004-12-01,Archivos de la Sociedad Española de Oftalmolog...,http://scielo.isciii.es/scielo.php?script=sci_...


In [7]:
testing_rel = testing_rel.reset_index()

In [8]:
testing_rel.columns = ['# Document_ID', 'Mention_A_type', 'Mention_A_StartOffset',
      'Mention_A', 'Relation_type', 'Mention_B_type',
       'Mention_B_StartOffset', 'Mention_B_EndOffset', 'Mention_B']

In [9]:
testing_rel = testing_rel.rename(columns = {'# Document_ID': 'doc_id'})

In [10]:
testing_rel.head()

Unnamed: 0,doc_id,Mention_A_type,Mention_A_StartOffset,Mention_A,Relation_type,Mention_B_type,Mention_B_StartOffset,Mention_B_EndOffset,Mention_B
0,S0211-69952013000500019-1,SHORT_FORM,3739,ARA II,SHORT-LONG,LONG_FORM,3695,3737,antagonista del receptor de angiotesina II
1,S0211-69952013000500019-1,SHORT_FORM,2793,PCR,SHORT-LONG,LONG_FORM,2798,2837,reacción en cadena de enzima polimerasa
2,S0365-66912004000600008-1,SHORT_FORM,406,AV,SHORT-LONG,LONG_FORM,390,404,agudeza visual
3,S0211-69952012000500025-1,SHORT_FORM,945,angio-TAC,SHORT-LONG,LONG_FORM,908,943,angiotomografía computarizada axial
4,S1130-05582017000200122-1,SHORT_FORM,940,RMN,SHORT-LONG,LONG_FORM,910,938,resonancia magnética nuclear


In [11]:
testing_rel.Relation_type.unique()

array(['SHORT-LONG', 'SHORT-NESTED', 'NESTED-LONG'], dtype=object)

In [12]:
testing_abbr = testing_abbr.rename(columns = {'# Document_ID': 'doc_id'})

In [13]:
testing_abbr.head()

Unnamed: 0,doc_id,StartOffset,EndOffset,Abbreviation,Definition,Definition_lemmatized
0,S0004-06142010000500014-1,2037,2044,16SrRNA,16s ribosomal rna,16s ribosomal rno
1,S0004-06142010000500014-1,1349,1351,M.,mycobacterium,mycobacterium
2,S0004-06142010000500014-1,1339,1342,PCR,polymerase chain reaction,polymerase chain reaction
3,S0004-06142010000500014-1,611,615,BHCG,beta-human chorionic gonadotropin,beta-humar chorionic gonadotropin
4,S0004-06142010000500014-1,594,597,CEA,carcinoembrionary antigen,carcinoembrionary antiger


In [14]:
testing_raw = read_texts("../datasets/testing_set/testing_set.raw_text/")

In [15]:
testing_raw.head()

Unnamed: 0,nombre,texto
0,S1139-76322009000700016-1,Paciente de sexo femenino de 13 años y 7 meses...
1,S0210-48062007000700015-1,Varón de 72 años con antecedentes personales d...
2,S0212-71992005000500009-1,"Varón de 81 años, con antecedentes de fibrilac..."
3,S0365-66912004001200011-1,Paciente varón de 52 años que acudió a urgenci...
4,S1130-01082009000900015-1,Varón de 54 años con episodios de pancreatitis...


### Backgound

2879 clinical cases. 220 will be use for clinical evaluation

In [16]:
back_met = pd.read_csv("../datasets/background_test/clinical_cases.metadata.background_set.tsv", sep = '\t')

In [17]:
back_met = back_met.rename(columns = {'# Document_ID': 'doc_id'})

In [18]:
back_met.head()

Unnamed: 0,doc_id,Case_ID,ISSN,Date,Source,Full_Text_Link
0,S0325-00752010000100014.txt,1.txt,0325-0075,02/2010,Archivos argentinos de pediatría,http://www.scielo.org.ar/scielo.php?script=sci...
1,S0325-00752013000200014.txt,1.txt,0325-0075,04/2013,Archivos argentinos de pediatría,http://www.scielo.org.ar/scielo.php?script=sci...
2,S0325-00752011000400017.txt,1.txt,0325-0075,08/2011,Archivos argentinos de pediatría,http://www.scielo.org.ar/scielo.php?script=sci...
3,S0325-00752013000600022.txt,1.txt,0325-0075,12/2013,Archivos argentinos de pediatría,http://www.scielo.org.ar/scielo.php?script=sci...
4,S0325-00752008000500013.txt,1.txt,0325-0075,10/2008,Archivos argentinos de pediatría,http://www.scielo.org.ar/scielo.php?script=sci...


In [19]:
back_raw = read_texts("../datasets/background_test/background_test_set/")

In [20]:
back_raw.head()

Unnamed: 0,nombre,texto
0,S0212-71992004000800006-1,Mujer de 57 años de edad con antecedentes pers...
1,S0716-10182014000100007-1,Se recibió en el Hospital Veterinario Puente A...
2,S1130-01082009000200016-1,"Mujer de 47 años de edad, sin antecedentes de ..."
3,S0716-10182015000400016-1,"Escolar de 11 años de edad, género femenino, s..."
4,S1130-01082004001200010-2,"Varón de 25 años, remitido a nuestro Servicio ..."


### Development

146 clinical cases

In [21]:
dev_abbr = pd.read_csv("../datasets/development_set/clinical_cases.abbreviations.development_set.tsv", sep = '\t')
dev_met = pd.read_csv("../datasets/development_set/clinical_cases.metadata.development_set.tsv", sep = '\t')
dev_rel = pd.read_csv("../datasets/development_set/clinical_cases.relations.development_set.tsv", sep = '\t')

In [22]:
dev_met = dev_met.rename(columns = {'# Document_ID': 'doc_id'})

In [23]:
dev_met.head()

Unnamed: 0,doc_id,Case_ID,ISSN,Date,Source,Full_Text_Link
0,S1130-14732005000300004-1.txt,1.txt,1130-1473,2005-06-01,Neurocirugía v.16 n.3 2005,http://scielo.isciii.es/scielo.php?script=sci_...
1,S1130-01082008000200009-1.txt,1.txt,1130-0108,2008-02-01,Revista Española de Enfermedades Digestivas v...,http://scielo.isciii.es/scielo.php?script=sci_...
2,S1137-66272012000300021-1.txt,1.txt,1137-6627,2012-12-01,Anales del Sistema Sanitario de Navarra v.35 ...,http://scielo.isciii.es/scielo.php?script=sci_...
3,S1699-695X2016000200010-1.txt,1.txt,1699-695X,2016-06-01,Revista Clínica de Medicina de Familia v.9 n....,http://scielo.isciii.es/scielo.php?script=sci_...
4,S1130-01082007001100009-1.txt,1.txt,1130-0108,2007-11-01,Revista Española de Enfermedades Digestivas v...,http://scielo.isciii.es/scielo.php?script=sci_...


In [24]:
dev_rel = dev_rel.reset_index()

In [25]:
dev_rel.columns = ['# Document_ID', 'Mention_A_type', 'Mention_A_StartOffset',
      'Mention_A', 'Relation_type', 'Mention_B_type',
       'Mention_B_StartOffset', 'Mention_B_EndOffset', 'Mention_B']

In [26]:
dev_rel = dev_rel.rename(columns = {'# Document_ID': 'doc_id'})

In [27]:
dev_rel.head()

Unnamed: 0,doc_id,Mention_A_type,Mention_A_StartOffset,Mention_A,Relation_type,Mention_B_type,Mention_B_StartOffset,Mention_B_EndOffset,Mention_B
0,S1888-75462015000400006-1,SHORT_FORM,1436,AP,SHORT-LONG,LONG_FORM,1419,1434,anteroposterior
1,S0210-56912009000700006-2,SHORT_FORM,876,angio-RMN,SHORT-LONG,LONG_FORM,858,874,angiorresonancia
2,S0210-56912009000700006-2,SHORT_FORM,819,RMN,SHORT-LONG,LONG_FORM,789,817,resonancia magnética nuclear
3,S1698-44472005000300015-1,SHORT_FORM,209,ATM,SHORT-LONG,LONG_FORM,177,207,articulación témporomandibular
4,S1698-44472005000300015-1,SHORT_FORM,1125,TC,SHORT-LONG,LONG_FORM,1099,1123,tomografía computerizada


In [28]:
dev_abbr = dev_abbr.rename(columns = {'# Document_ID': 'doc_id'})

In [29]:
dev_abbr.head()

Unnamed: 0,doc_id,StartOffset,EndOffset,Abbreviation,Definition,Definition_lemmatized
0,S1130-14732005000300004-1,1216,1218,C3,tercera vértebra cervical,tercero vértebra cervical
1,S1130-14732005000300004-1,717,719,C2,segunda vértebra cervical,segundo vértebra cervical
2,S1130-14732005000300004-1,3191,3193,C3,tercera vértebra cervical,tercero vértebra cervical
3,S1130-14732005000300004-1,2867,2869,C3,tercera vértebra cervical,tercero vértebra cervical
4,S1130-14732005000300004-1,2862,2864,C2,segunda vértebra cervical,segundo vértebra cervical


In [30]:
dev_raw = read_texts("../datasets/development_set/development_set.raw_text/")

In [31]:
dev_raw.head()

Unnamed: 0,nombre,texto
0,S1139-76322017000200010-1,Niña de dos años y diez meses con antecedentes...
1,S0365-66912005001100008-1,Se presenta el caso de un varón de 45 años que...
2,S1130-01082006001000017-1,Mujer de 42 años de edad con antecedentes pers...
3,S0212-71992005001200008-1,Paciente de 57 años con ingresos hospitalarios...
4,S0365-66912007000300010-1,Paciente de 33 años que el 20-08-05 es traslad...


### Sample test

15 clinical cases

In [44]:
sample_abbr = pd.read_csv("../datasets/sample_set/clinical_cases.abbreviations.sample_set.tsv", sep = '\t')
sample_met = pd.read_csv("../datasets/sample_set/clinical_cases.metadata.sample_set.tsv", sep = '\t')
sample_rel = pd.read_csv("../datasets/sample_set/clinical_cases.relations.sample_set.tsv", sep = '\t')

In [45]:
sample_met = sample_met.rename(columns = {'# Document_ID': 'doc_id'})

In [46]:
sample_met.head()

Unnamed: 0,doc_id,Case_ID,ISSN,Date,Source,Full_Text_Link
0,S0004-06142006000900015-1,1,0004-0614,2006-11-01,Archivos Españoles de Urología (Ed. impresa) ...,http://scielo.isciii.es/scielo.php?script=sci_...
1,S0004-06142006000600015-1,1,0004-0614,2006-08-01,Archivos Españoles de Urología (Ed. impresa) ...,http://scielo.isciii.es/scielo.php?script=sci_...
2,S0004-06142007000700014-1,1,0004-0614,2007-09-01,Archivos Españoles de Urología (Ed. impresa) ...,http://scielo.isciii.es/scielo.php?script=sci_...
3,S0004-06142007000900013-1,1,0004-0614,2007-11-01,Archivos Españoles de Urología (Ed. impresa) ...,http://scielo.isciii.es/scielo.php?script=sci_...
4,S0004-06142006000200014-1,1,0004-0614,2006-03-01,Archivos Españoles de Urología (Ed. impresa) ...,http://scielo.isciii.es/scielo.php?script=sci_...


In [47]:
sample_rel = sample_rel.rename(columns = {'# Document_ID': 'doc_id'})

In [48]:
sample_rel.head()

Unnamed: 0,doc_id,Mention_A_type,Mention_A_StartOffset,Mention_A_EndOffset,Mention_A,Relation_type,Mention_B_type,Mention_B_StartOffset,Mention_B_EndOffset,Mention_B
0,S0004-06142006000700014-1,SHORT_FORM,926,929,CEA,SHORT-LONG,LONG_FORM,896,924,Antígeno Carcino Embrionario
1,S0004-06142005001000011-1,SHORT_FORM,1626,1629,ROT,SHORT-LONG,LONG_FORM,1600,1624,reflejos osteotendinosos
2,S0004-06142005001000011-1,SHORT_FORM,1715,1718,RMN,SHORT-LONG,LONG_FORM,1685,1713,resonancia magnética nuclear
3,S0004-06142005001000011-1,SHORT_FORM,1663,1666,RCP,SHORT-LONG,LONG_FORM,1639,1661,reflejo cutaneoplantar
4,S0004-06142005001000011-1,SHORT_FORM,1808,1811,LCR,SHORT-LONG,LONG_FORM,1783,1806,líquido cefalorraquídeo


In [49]:
sample_abbr = sample_abbr.rename(columns = {'# Document_ID': 'doc_id'})

In [50]:
sample_abbr.head()

Unnamed: 0,doc_id,StartOffset,EndOffset,Abbreviation,Definition,Definition_lemmatized
0,S0004-06142005001000011-1,1034,1036,Kg,kilogramo,kilogramo
1,S0004-06142005001000011-1,1031,1033,mg,miligramo,miligramo
2,S0004-06142005001000011-1,196,199,IgA,inmunoglobulina a,inmunoglobulina a
3,S0004-06142005001000011-1,2057,2060,LCR,líquido cefalorraquídeo,líquido cefalorraquídeo
4,S0004-06142005001000011-1,1594,1598,EEII,extremidades inferiores,extremidad inferior


In [51]:
sample_raw = read_texts("../datasets/sample_set/sample_set.raw_text/")

In [52]:
sample_raw.head()

Unnamed: 0,nombre,texto
0,S0004-06142006000600014-1,"Paciente varón, de 40 años de edad, con antece..."
1,S0004-06142006000300015-1,Paciente de 50 años con antecedente de litiasi...
2,S0004-06142007000900013-1,Presentamos el caso de un recién nacido de tre...
3,S0004-06142005001000011-1,Varón de 58 años de edad en el momento del tra...
4,S0004-06142006000700013-1,"Paciente varón, de 63 años de edad, mestizo, d..."


### Trainning

318 clinical cases

In [9]:
train_abbr = pd.read_csv("../datasets/trainning_set/clinical_cases.abbreviations.training_set.tsv", sep = '\t')
train_met = pd.read_csv("../datasets/trainning_set/clinical_cases.metadata.training_set.tsv", sep = '\t')
train_rel = pd.read_csv("../datasets/trainning_set/clinical_cases.relations.training_set.tsv", sep = '\t')

In [10]:
train_met = train_met.rename(columns = {'# Document_ID': 'doc_id'})

In [11]:
train_met.head()

Unnamed: 0,doc_id,Case_ID,ISSN,Date,Source,Full_Text_Link
0,S1139-76322015000500009-1.txt,1.txt,1139-7632,2015-12-01,Pediatría Atención Primaria v.17 n.68 2015,http://scielo.isciii.es/scielo.php?script=sci_...
1,S1130-05582008000400007-2.txt,2.txt,1130-0558,2008-08-01,Revista Española de Cirugía Oral y Maxilofacia...,http://scielo.isciii.es/scielo.php?script=sci_...
2,S0210-48062006000100012-1.txt,1.txt,0210-4806,2006-01-01,Actas Urológicas Españolas v.30 n.1 2006,http://scielo.isciii.es/scielo.php?script=sci_...
3,S0213-12852003000500002-1.txt,1.txt,0213-1285,2003-10-01,Avances en Odontoestomatología v.19 n.5 2003,http://scielo.isciii.es/scielo.php?script=sci_...
4,S0212-71992005000400007-1.txt,1.txt,0212-7199,2005-04-01,Anales de Medicina Interna v.22 n.4 2005,http://scielo.isciii.es/scielo.php?script=sci_...


In [12]:
train_rel = train_rel.reset_index()

In [13]:
train_rel.columns = ['# Document_ID', 'Mention_A_type', 'Mention_A_StartOffset',
      'Mention_A', 'Relation_type', 'Mention_B_type',
       'Mention_B_StartOffset', 'Mention_B_EndOffset', 'Mention_B']

In [14]:
train_rel = train_rel.rename(columns = {'# Document_ID': 'doc_id'})

In [15]:
train_rel.head()

Unnamed: 0,doc_id,Mention_A_type,Mention_A_StartOffset,Mention_A,Relation_type,Mention_B_type,Mention_B_StartOffset,Mention_B_EndOffset,Mention_B
0,S1130-01082009000400014-1,SHORT_FORM,476,NPT,SHORT-LONG,LONG_FORM,454.0,474.0,nutrición parenteral
1,S1130-63432016000100009-1,SHORT_FORM,614,NIHSS,SHORT-LONG,LONG_FORM,621.0,662.0,National Institute of Health Stroke Scale
2,S1139-76322017000200007-1,SHORT_FORM,1145,CMV,SHORT-LONG,LONG_FORM,1128.0,1143.0,citomegalovirus
3,S1139-76322017000200007-1,SHORT_FORM,1243,VSG,SHORT-LONG,LONG_FORM,1206.0,1241.0,velocidad de sedimentación globular
4,S1139-76322017000200007-1,SHORT_FORM,1300,IGRA,SHORT-LONG,LONG_FORM,1267.0,1298.0,interferon-gamma release assays


In [16]:
train_abbr = train_abbr.rename(columns = {'# Document_ID': 'doc_id'})

In [489]:
train_abbr.head()

Unnamed: 0,doc_id,StartOffset,EndOffset,Abbreviation,Definition,Definition_lemmatized
0,S0210-48062004000500008-1,1650,1652,ml,mililitro,mililitro
1,S0210-48062004000500008-1,708,709,l,litro,litro
2,S0210-48062004000500008-1,704,707,mEq,miliequivalente,miliequivalente
3,S0210-48062004000500008-1,677,681,pCO2,presión parcial de co2,presión parcial de co2
4,S0210-48062004000500008-1,2287,2290,HLA,human leucocyte antigen,human leucocyte antiger


In [8]:
train_raw = read_texts("../datasets/trainning_set/training_set.raw_text/txt/")

In [9]:
train_raw = train_raw.rename(columns = {'nombre': 'doc_id'})

In [10]:
train_raw.head()

Unnamed: 0,doc_id,texto
0,S0004-06142005000900013-1,Se trata de una mujer de 29 años sometida a un...
1,S0004-06142005000900015-1,"Varón de 36 años, sin antecedentes de interés,..."
2,S0004-06142005000900016-1,Mujer de 29 años con antecedentes de ulcus duo...
3,S0004-06142005001000011-1,Varón de 58 años de edad en el momento del tra...
4,S0004-06142005001000011-3,Mujer de 42 años en el momento de someterse a ...


# Sub-track 1

## Found abbreviations (Short Forms)

### Regex

In [11]:
#patron = r'\(([A-Z]{2,8})\)'
#patron1 = r'\s[A-Z]{1,3}\s'
#patron2 = r'\s[a-z]{1,3}\s'
patron3 = r'[A-Z]{2,8}'
patron4 = r'\s[a-z]{1,2}\s'
patron5 = r'[a-z]+\-[A-Z]{1,8}'
patron6 = r'[a-z]+\/[a-z]+'
patron7 = r'[A-Z]?[a-z]{1,4}[A-Z]+[a-z]*[1-9]*'
patron8 = r'\/[a-z]*[A-Z]*'


# create a list with them
regexes = [ patron3, patron4, patron5, patron6, patron7,patron8]
for i in regexes:
    generic_re = re.compile("%s|%s|%s|%s|%s|%s" % (patron3, patron4, patron5, patron6, patron7,patron8))

In [12]:
#nltk.download('stopwords')
swords = list(set(stopwords.words('spanish')))

**Get Short Formns with a regex in each text**

In [13]:
train_raw['abrev'] = train_raw['texto'].map(lambda x: generic_re.findall(x))

In [205]:
# train_raw['offse'] = train_raw['texto'].map(lambda x: generic_re.search(x).span())

**Get offsets of the Short Forms founded in the text**

In [14]:
train_raw['offse'] = train_raw['texto'].map(lambda x: [(m.start(0), m.end(0)) for m in re.finditer(generic_re, x)])

In [15]:
train_raw['abrev'] = train_raw['abrev'].apply(lambda x: [i.strip() for i in x])

Filter Short Forms to delete ones which are stopwords, get their index too for filter offsets lists later

In [16]:
train_raw['abrev_index'] = train_raw['abrev'].apply(lambda x: [x.index(i) for i in x if i not in swords])

In [17]:
train_raw['abrev'] = train_raw['abrev'].apply(lambda x: [i for i in x if i not in swords])

Filter offsets lists by index

In [18]:
train_raw['offse'] = train_raw.apply(lambda x: [x['offse'][i] for i in x['abrev_index']], axis = 1)

In [19]:
train_raw.head()

Unnamed: 0,doc_id,texto,abrev,offse,abrev_index
0,S0004-06142005000900013-1,Se trata de una mujer de 29 años sometida a un...,"[mm, DAKO, HHF]","[(185, 189), (1502, 1506), (1513, 1516)]","[8, 58, 59]"
1,S0004-06142005000900015-1,"Varón de 36 años, sin antecedentes de interés,...","[CT, x, MESNA]","[(408, 410), (445, 448), (1120, 1125)]","[14, 17, 37]"
2,S0004-06142005000900016-1,Mujer de 29 años con antecedentes de ulcus duo...,"[UIV, nderson-H]","[(400, 403), (905, 914)]","[8, 22]"
3,S0004-06142005001000011-1,Varón de 58 años de edad en el momento del tra...,"[IRC, IgA, II, /K, gr/d, /K, EEII, EEII, ROT, ...","[(145, 148), (196, 199), (572, 574), (981, 983...","[7, 9, 18, 29, 30, 29, 45, 45, 53, 55, 57, 62,..."
4,S0004-06142005001000011-3,Mujer de 42 años en el momento de someterse a ...,"[HDA, II, EEII, EID, EID, ROT, RCP, RMN, PEES,...","[(234, 237), (276, 278), (829, 833), (866, 869...","[8, 10, 33, 35, 35, 40, 42, 44, 51, 54, 55, 57..."


In [635]:
train_raw[train_raw['doc_id'] == 'S0004-06142005001000011-1']['abrev']

3    [IRC, IgA, II, /K, gr/d, /K, EEII, EEII, ROT, ...
Name: abrev, dtype: object

Check abrev and offse columns has the same length

In [20]:
train_raw['abrev_len'] =train_raw['abrev'].str.len()
train_raw['offse_len'] =train_raw['offse'].str.len()

In [21]:
train_raw.head()

Unnamed: 0,doc_id,texto,abrev,offse,abrev_index,abrev_len,offse_len
0,S0004-06142005000900013-1,Se trata de una mujer de 29 años sometida a un...,"[mm, DAKO, HHF]","[(185, 189), (1502, 1506), (1513, 1516)]","[8, 58, 59]",3,3
1,S0004-06142005000900015-1,"Varón de 36 años, sin antecedentes de interés,...","[CT, x, MESNA]","[(408, 410), (445, 448), (1120, 1125)]","[14, 17, 37]",3,3
2,S0004-06142005000900016-1,Mujer de 29 años con antecedentes de ulcus duo...,"[UIV, nderson-H]","[(400, 403), (905, 914)]","[8, 22]",2,2
3,S0004-06142005001000011-1,Varón de 58 años de edad en el momento del tra...,"[IRC, IgA, II, /K, gr/d, /K, EEII, EEII, ROT, ...","[(145, 148), (196, 199), (572, 574), (981, 983...","[7, 9, 18, 29, 30, 29, 45, 45, 53, 55, 57, 62,...",19,19
4,S0004-06142005001000011-3,Mujer de 42 años en el momento de someterse a ...,"[HDA, II, EEII, EID, EID, ROT, RCP, RMN, PEES,...","[(234, 237), (276, 278), (829, 833), (866, 869...","[8, 10, 33, 35, 35, 40, 42, 44, 51, 54, 55, 57...",14,14


Check both lists have the same lenght

In [22]:
train_raw['len_check'] = train_raw.apply(lambda row: check_len(row['abrev_len'],row['offse_len']),axis = 1)

Delete rows with different lenghts (check it later)

In [23]:
train_raw.shape

(318, 8)

In [24]:
train_raw[train_raw['len_check'] != 1].shape

(318, 8)

In [390]:
#train_raw = train_raw[train_raw['len_check'] != 1]

In [641]:
train_raw.shape

(318, 8)

**Remove stopwords from text**

In [25]:
train_raw['tokens'] = train_raw['texto'].map(lambda x: word_tokenize(x))
train_raw['texto_clean'] = train_raw['tokens'].map(lambda x: ' '.join([w for w in x if w not in swords]))

In [28]:
train_raw.head()

Unnamed: 0,doc_id,texto,abrev,offse,abrev_index,abrev_len,offse_len,len_check,tokens,texto_clean
0,S0004-06142005000900013-1,Se trata de una mujer de 29 años sometida a un...,"[mm, DAKO, HHF]","[(185, 189), (1502, 1506), (1513, 1516)]","[8, 58, 59]",3,3,0,"[Se, trata, de, una, mujer, de, 29, años, some...",Se trata mujer 29 años sometida estudio ecográ...
1,S0004-06142005000900015-1,"Varón de 36 años, sin antecedentes de interés,...","[CT, x, MESNA]","[(408, 410), (445, 448), (1120, 1125)]","[14, 17, 37]",3,3,0,"[Varón, de, 36, años, ,, sin, antecedentes, de...","Varón 36 años , antecedentes interés , estudia..."
2,S0004-06142005000900016-1,Mujer de 29 años con antecedentes de ulcus duo...,"[UIV, ndersonH]","[(400, 403), (905, 914)]","[8, 22]",2,2,0,"[Mujer, de, 29, años, con, antecedentes, de, u...",Mujer 29 años antecedentes ulcus duodenal estr...
3,S0004-06142005001000011-1,Varón de 58 años de edad en el momento del tra...,"[IRC, IgA, II, K, grd, K, EEII, EEII, ROT, RCP...","[(145, 148), (196, 199), (572, 574), (981, 983...","[7, 9, 18, 29, 30, 29, 45, 45, 53, 55, 57, 62,...",19,19,0,"[Varón, de, 58, años, de, edad, en, el, moment...","Varón 58 años edad momento trasplante , 5 octu..."
4,S0004-06142005001000011-3,Mujer de 42 años en el momento de someterse a ...,"[HDA, II, EEII, EID, EID, ROT, RCP, RMN, PEES,...","[(234, 237), (276, 278), (829, 833), (866, 869...","[8, 10, 33, 35, 35, 40, 42, 44, 51, 54, 55, 57...",14,14,0,"[Mujer, de, 42, años, en, el, momento, de, som...",Mujer 42 años momento someterse trasplante hep...


Remove punctuation from abreviations

In [516]:
#train_raw['abrev'] = train_raw['abrev'].apply(lambda x: [a.split("/") for a in x])

In [517]:
#train_raw['abrev'] = train_raw['abrev'].apply(lambda x: [item for sublist in x for item in sublist])

In [27]:
for i in punctuation:
    train_raw['abrev'] = train_raw[train_raw['abrev'].notnull()]['abrev'].apply(lambda x: [a.replace(i,"") for a in x])

In [29]:
train_raw['zip'] = train_raw.apply(lambda row: list(zip(row['abrev'], row['offse'])), axis = 1)

In [30]:
train_raw.head()

Unnamed: 0,doc_id,texto,abrev,offse,abrev_index,abrev_len,offse_len,len_check,tokens,texto_clean,zip
0,S0004-06142005000900013-1,Se trata de una mujer de 29 años sometida a un...,"[mm, DAKO, HHF]","[(185, 189), (1502, 1506), (1513, 1516)]","[8, 58, 59]",3,3,0,"[Se, trata, de, una, mujer, de, 29, años, some...",Se trata mujer 29 años sometida estudio ecográ...,"[(mm, (185, 189)), (DAKO, (1502, 1506)), (HHF,..."
1,S0004-06142005000900015-1,"Varón de 36 años, sin antecedentes de interés,...","[CT, x, MESNA]","[(408, 410), (445, 448), (1120, 1125)]","[14, 17, 37]",3,3,0,"[Varón, de, 36, años, ,, sin, antecedentes, de...","Varón 36 años , antecedentes interés , estudia...","[(CT, (408, 410)), (x, (445, 448)), (MESNA, (1..."
2,S0004-06142005000900016-1,Mujer de 29 años con antecedentes de ulcus duo...,"[UIV, ndersonH]","[(400, 403), (905, 914)]","[8, 22]",2,2,0,"[Mujer, de, 29, años, con, antecedentes, de, u...",Mujer 29 años antecedentes ulcus duodenal estr...,"[(UIV, (400, 403)), (ndersonH, (905, 914))]"
3,S0004-06142005001000011-1,Varón de 58 años de edad en el momento del tra...,"[IRC, IgA, II, K, grd, K, EEII, EEII, ROT, RCP...","[(145, 148), (196, 199), (572, 574), (981, 983...","[7, 9, 18, 29, 30, 29, 45, 45, 53, 55, 57, 62,...",19,19,0,"[Varón, de, 58, años, de, edad, en, el, moment...","Varón 58 años edad momento trasplante , 5 octu...","[(IRC, (145, 148)), (IgA, (196, 199)), (II, (5..."
4,S0004-06142005001000011-3,Mujer de 42 años en el momento de someterse a ...,"[HDA, II, EEII, EID, EID, ROT, RCP, RMN, PEES,...","[(234, 237), (276, 278), (829, 833), (866, 869...","[8, 10, 33, 35, 35, 40, 42, 44, 51, 54, 55, 57...",14,14,0,"[Mujer, de, 42, años, en, el, momento, de, som...",Mujer 42 años momento someterse trasplante hep...,"[(HDA, (234, 237)), (II, (276, 278)), (EEII, (..."


**Get one row per abbreviation**

Separate elements lists in different rows

In [31]:
mine = train_raw.explode('zip')
#mine = mine.explode('offse')

In [32]:
mine.shape

(3615, 11)

In [33]:
mine[mine['zip'].isnull()].shape

(18, 11)

In [34]:
mine = mine[['doc_id', 'texto_clean', 'tokens', 'zip']]

In [35]:
mine = mine[mine['zip'].notnull()]

In [36]:
mine.head()

Unnamed: 0,doc_id,texto_clean,tokens,zip
0,S0004-06142005000900013-1,Se trata mujer 29 años sometida estudio ecográ...,"[Se, trata, de, una, mujer, de, 29, años, some...","(mm, (185, 189))"
0,S0004-06142005000900013-1,Se trata mujer 29 años sometida estudio ecográ...,"[Se, trata, de, una, mujer, de, 29, años, some...","(DAKO, (1502, 1506))"
0,S0004-06142005000900013-1,Se trata mujer 29 años sometida estudio ecográ...,"[Se, trata, de, una, mujer, de, 29, años, some...","(HHF, (1513, 1516))"
1,S0004-06142005000900015-1,"Varón 36 años , antecedentes interés , estudia...","[Varón, de, 36, años, ,, sin, antecedentes, de...","(CT, (408, 410))"
1,S0004-06142005000900015-1,"Varón 36 años , antecedentes interés , estudia...","[Varón, de, 36, años, ,, sin, antecedentes, de...","(x, (445, 448))"


In [37]:
mine['abrev'] = mine.apply(lambda row: row['zip'][0], axis = 1)

In [38]:
mine['offsets'] = mine.apply(lambda row: row['zip'][1], axis = 1)

In [39]:
mine = mine[mine['abrev'] != ""]

In [40]:
mine.head()

Unnamed: 0,doc_id,texto_clean,tokens,zip,abrev,offsets
0,S0004-06142005000900013-1,Se trata mujer 29 años sometida estudio ecográ...,"[Se, trata, de, una, mujer, de, 29, años, some...","(mm, (185, 189))",mm,"(185, 189)"
0,S0004-06142005000900013-1,Se trata mujer 29 años sometida estudio ecográ...,"[Se, trata, de, una, mujer, de, 29, años, some...","(DAKO, (1502, 1506))",DAKO,"(1502, 1506)"
0,S0004-06142005000900013-1,Se trata mujer 29 años sometida estudio ecográ...,"[Se, trata, de, una, mujer, de, 29, años, some...","(HHF, (1513, 1516))",HHF,"(1513, 1516)"
1,S0004-06142005000900015-1,"Varón 36 años , antecedentes interés , estudia...","[Varón, de, 36, años, ,, sin, antecedentes, de...","(CT, (408, 410))",CT,"(408, 410)"
1,S0004-06142005000900015-1,"Varón 36 años , antecedentes interés , estudia...","[Varón, de, 36, años, ,, sin, antecedentes, de...","(x, (445, 448))",x,"(445, 448)"


Separate offsets tuples in different columns

In [41]:
mine[['startOffset', 'endOffset']] = pd.DataFrame(mine['offsets'].tolist(), index=mine.index) 

In [42]:
mine = mine[['doc_id', 'texto_clean', 'tokens', 'abrev', 'startOffset', 'endOffset']]

Delete null values, and change ttype to integer for offsets columns

In [43]:
mine.shape

(3367, 6)

In [44]:
mine.dropna(subset=['startOffset', 'endOffset'], inplace = True)

In [45]:
mine.shape

(3367, 6)

In [46]:
mine['startOffset'] = mine['startOffset'].astype(int)
mine['endOffset'] = mine['endOffset'].astype(int)

In [47]:
mine = mine[['doc_id', 'texto_clean', 'tokens', 'abrev', 'startOffset', 'endOffset']]

In [48]:
mine.shape

(3367, 6)

In [49]:
mine = mine.drop_duplicates(subset = ['doc_id', 'texto_clean', 'abrev', 'startOffset', 'endOffset'])

In [50]:
mine.shape

(2137, 6)

In [51]:
mine.head()

Unnamed: 0,doc_id,texto_clean,tokens,abrev,startOffset,endOffset
0,S0004-06142005000900013-1,Se trata mujer 29 años sometida estudio ecográ...,"[Se, trata, de, una, mujer, de, 29, años, some...",mm,185,189
0,S0004-06142005000900013-1,Se trata mujer 29 años sometida estudio ecográ...,"[Se, trata, de, una, mujer, de, 29, años, some...",DAKO,1502,1506
0,S0004-06142005000900013-1,Se trata mujer 29 años sometida estudio ecográ...,"[Se, trata, de, una, mujer, de, 29, años, some...",HHF,1513,1516
1,S0004-06142005000900015-1,"Varón 36 años , antecedentes interés , estudia...","[Varón, de, 36, años, ,, sin, antecedentes, de...",CT,408,410
1,S0004-06142005000900015-1,"Varón 36 años , antecedentes interés , estudia...","[Varón, de, 36, años, ,, sin, antecedentes, de...",x,445,448


Check dataframes for one text

In [167]:
# mine[mine['doc_id'] == 'S1130-05582012000300005-1']

In [168]:
# train_abbr[train_abbr['doc_id'] == 'S1130-05582012000300005-1']

## Search Long Formns

### Search Long Forms in the same text

**measurement units dictionary**

In [412]:
# mu_dic = {"mL":"Mililitro",
# "mg":"Miligramo",
# "g":"Gramo",
# "L":"Litro",
# "mcg":"Microgramo",
# "mmol":"Milimol",
# "UI":"Unidades Internacionales",
# "Miles UI":"Miles de Unidades Internacionales",
# "Millones UI":"Millones de Unidades Internacionales",
# "UFC":"Unidades Formadoras de Colonias",
# "mEq":"Miliequivalente",
# "ng":"Nanogramo",
# "Lf":"Unidad Floculante",
# "UFP":"Unidad Formadora de Placa",
# "DIC":"Dosis Infectante Mediana de Cultivo Celular 50% ",
# "DIT":"Dosis Infectante Mediana de Cultivo Tisular 50% ",
# "DI":"Dosis Infectante 50% ",
# "mol":"Peso Molecular Gramo ",
# "Eq":"Peso Equivalente Gramo ",
# "Dosis":"Dosis",
# "Almh":"Almohadilla",
# "Amp":"Ampolla",
# "Anl":"Anillo",
# "Bar":"Barra",
# "Bolsa":"Bolsa",
# "Cap":"Capsula",
# "Car":"Caramelo",
# "Carp":"Carpula",
# "Cart":"Cartucho",
# "Com":"Comprimido",
# "Dia":"Dia",
# "Fras":"Frasco",
# "Fras-Amp":"Frasco Ampolla ",
# "Grag":"Gragea",
# "Hora":"Hora",
# "Imp":"Implante",
# "Jab":"Jab¢n",
# "Jer":"Jeringa Prellenada ",
# "uL":"Microlitro",
# "Ovu":"Ovulo",
# "Parche":"Parche",
# "Past":"Pastilla",
# "Perl":"Perla",
# "Pil":"Pildora",
# "Pip":"Pipeta",
# "%":"Porcentaje",
# "Sach":"Sachet",
# "Sob":"Sobre",
# "Sup":"Supositorio",
# "Tab":"Tableta",
# "Troc":"Trocisco",
# "Vial":"Vial",
# "Kg":"Kilogramo",
# "Gal":"Galon",
# "Sis":"Sistema Terapeutico",
# "mCi":"miliCuries",
# "mBq":"milibequerel",
# "UEL":"Unidades ELISA ",
# "DL":"Dosis Letal",
# "U USP":"Unidades USP ",
# "U":"Unidades",
# "Rot":"Rotacaps",
# "CCID":"Dosis Infecciosa en Cultivo de Célula ",
# "U":"UNIDAD",
# "Otros":"Otros",
# "µ Ci":"mcroCuries",
# "Esp":"Esporas",
# "mcHA":"microgramos de HA ",
# "Gom":"Goma",
# "KIU":"Unidad Inhibidora de Calicreina ",
# "mcel":"Millones de Células ",
# "DU":"Unidades de Antigeno D",
# "Dil D2":"Dil D2",
# "Tin.Mad.":"Tintura Madre",
# "Dil D4":"Dil D4",
# "Dil D5":"Dil D5",
# "Dil D1":"Dil D1",
# "Dil D8":"Dil D8",
# "Dil D3":"Dil D3",
# "OU":"Unidad de Opacidad ",
# "mm": "milimetro",
# "dm": "decimetro",
# "cm": "centimetro"}

In [411]:
# mine['long_form'] = mine['abrev'].map(mu_dic)

In [54]:
mine_lf = mine[mine['abrev'].str.len() > 1].reset_index()

**With regex looking for words before SF**

In [56]:
def get_longform(tokens, acro, margin = 2, i =0):
    index = tokens.index(acro)
    long_form = ''
    #Looking for before
    for word in tokens[index-margin-len(acro):index]:
        #if first letter of word is equal to first letter os acronym
        if word[0] == acro[i].lower():
            long_form += word + ' '
            i += 1
            if i == len(acro):
                break
        elif (i == 1) and (word[0] == acro[i-1].lower()):
            long_form = word + ' '
            i = 1
            if i == len(acro):
                break
    long_form = long_form.rstrip()
    return long_form

In [57]:
mine['long_form'] = mine.apply(lambda row: get_longform(row['tokens'], row['abrev']), axis = 1)

ValueError: 'HHF' is not in list

In [575]:
train_raw['tokens'] = train_raw['texto_clean'].map(lambda x: word_tokenize(x))

In [670]:
prueba = mine[mine['doc_id']=='S0004-06142005001000011-1']

In [620]:
for i in punctuation:
    prueba['tokens'] = prueba['tokens'].apply(lambda x: [a.replace(i,"") for a in x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [621]:
prueba

Unnamed: 0,doc_id,texto_clean,tokens,abrev,startOffset,endOffset
3,S0004-06142005001000011-1,"Varón 58 años edad momento trasplante , 5 octu...","[Varón, 58, años, edad, momento, trasplante, ,...",IRC,145,148
3,S0004-06142005001000011-1,"Varón 58 años edad momento trasplante , 5 octu...","[Varón, 58, años, edad, momento, trasplante, ,...",IgA,196,199
3,S0004-06142005001000011-1,"Varón 58 años edad momento trasplante , 5 octu...","[Varón, 58, años, edad, momento, trasplante, ,...",II,572,574
3,S0004-06142005001000011-1,"Varón 58 años edad momento trasplante , 5 octu...","[Varón, 58, años, edad, momento, trasplante, ,...",K,981,983
3,S0004-06142005001000011-1,"Varón 58 años edad momento trasplante , 5 octu...","[Varón, 58, años, edad, momento, trasplante, ,...",grd,1009,1013
3,S0004-06142005001000011-1,"Varón 58 años edad momento trasplante , 5 octu...","[Varón, 58, años, edad, momento, trasplante, ,...",EEII,1455,1459
3,S0004-06142005001000011-1,"Varón 58 años edad momento trasplante , 5 octu...","[Varón, 58, años, edad, momento, trasplante, ,...",ROT,1626,1629
3,S0004-06142005001000011-1,"Varón 58 años edad momento trasplante , 5 octu...","[Varón, 58, años, edad, momento, trasplante, ,...",RCP,1663,1666
3,S0004-06142005001000011-1,"Varón 58 años edad momento trasplante , 5 octu...","[Varón, 58, años, edad, momento, trasplante, ,...",RMN,1715,1718
3,S0004-06142005001000011-1,"Varón 58 años edad momento trasplante , 5 octu...","[Varón, 58, años, edad, momento, trasplante, ,...",LCR,1808,1811


In [622]:
prueba.apply(lambda row: get_longform(row['tokens'], row['abrev']), axis = 1)

3    
3    
3    
3    
3    
3    
3    
3    
3    
3    
3    
3    
3    
3    
3    
dtype: object

In [671]:
get_longform(mine[mine['doc_id'] == 'S0004-06142005001000011-1'].iloc[3]['tokens'], 'PCR')

''

Check how many texts don't have LF for the SF in the same text

In [341]:
# mine['long_form'] = mine['long_form'].map(lambda x: x.replace(' ',''))

In [606]:
mine_lf[(mine_lf['abrev'].notnull()) & (mine_lf['long_form']== '')]

Unnamed: 0,index,doc_id,texto_clean,tokens,abrev,startOffset,endOffset,long_form
0,0,S0004-06142005000900013-1,Se trata mujer 29 años sometida estudio ecográ...,"[Se, trata, mujer, 29, años, sometida, estudio...",mm,185,189,
1,0,S0004-06142005000900013-1,Se trata mujer 29 años sometida estudio ecográ...,"[Se, trata, mujer, 29, años, sometida, estudio...",DAKO,1502,1506,
2,0,S0004-06142005000900013-1,Se trata mujer 29 años sometida estudio ecográ...,"[Se, trata, mujer, 29, años, sometida, estudio...",HHF,1513,1516,
3,1,S0004-06142005000900015-1,"Varón 36 años , antecedentes interés , estudia...","[Varón, 36, años, ,, antecedentes, interés, ,,...",CT,408,410,
4,1,S0004-06142005000900015-1,"Varón 36 años , antecedentes interés , estudia...","[Varón, 36, años, ,, antecedentes, interés, ,,...",MESNA,1120,1125,
...,...,...,...,...,...,...,...,...
1983,317,S2340-98942015000100005-1,Presentamos caso paciente 62 años diciembre 20...,"[Presentamos, caso, paciente, 62, años, diciem...",TAD,988,992,
1984,317,S2340-98942015000100005-1,Presentamos caso paciente 62 años diciembre 20...,"[Presentamos, caso, paciente, 62, años, diciem...",mmHg,1002,1006,
1985,317,S2340-98942015000100005-1,Presentamos caso paciente 62 años diciembre 20...,"[Presentamos, caso, paciente, 62, años, diciem...",mgml,1306,1311,
1986,317,S2340-98942015000100005-1,Presentamos caso paciente 62 años diciembre 20...,"[Presentamos, caso, paciente, 62, años, diciem...",mg,1428,1432,


In [607]:
lf_null = mine_lf[(mine_lf['abrev'].notnull()) & (mine_lf['long_form'] == '')].shape[0]

In [608]:
print(f"LF has not be found in the same text where the SF is in {lf_null/mine_lf.shape[0]*100: .2f}% of texts")

LF has not be found in the same text where the SF is in  100.00% of texts


**Create dictionary with SF and LF pairs founded in the same text**

In [363]:
df_pairs = mine_lf[(mine_lf['abrev'].notnull()) & (mine_lf['long_form'] != '')]

In [364]:
df_pairs.head()

Unnamed: 0,level_0,index,doc_id,texto_clean,tokens,abrev,startOffset,endOffset,long_form


In [354]:
pairs_dic = {}
for index, row in df_pairs.iterrows():
    if not row['abrev'] in pairs_dic:
        pairs_dic[row['abrev']] = set()
    pairs_dic[row['abrev']].add(row['long_form'])

In [355]:
pairs_dic

{'RTU': {'de', 'estudio', 'iniciándose', 'lóbulo'},
 'cm': {'de', 'estudio', 'iniciándose', 'lóbulo'},
 'TAC': {'de', 'estudio', 'iniciándose', 'lóbulo', 'mostraban', 'por'},
 'HCG': {'de', 'estudio', 'iniciándose'},
 'LDH': {'de', 'estudio', 'iniciándose', 'lóbulo', 'por'},
 'l': {'de', 'estudio', 'iniciándose', 'lóbulo', 'por'},
 'ng': {'de', 'estudio', 'iniciándose', 'lóbulo', 'por'},
 'ml': {'de', 'estudio', 'iniciándose', 'lóbulo', 'mostraban', 'por'},
 'mU': {'de'},
 'x': {'de', 'estudio', 'iniciándose', 'lóbulo'},
 'PAS': {'de'},
 'PSA': {'de', 'iniciándose'},
 'VSG': {'de', 'estudio', 'iniciándose', 'lóbulo', 'por'},
 'LHRH': {'de'},
 'pT3': {'iniciándose'},
 'NoMo': {'iniciándose'},
 'CEA': {'estudio', 'iniciándose'},
 'ESWL': {'iniciándose'},
 'aNo': {'iniciándose'},
 'II': {'de', 'estudio', 'iniciándose', 'lóbulo'},
 'IV': {'estudio', 'iniciándose', 'lóbulo', 'por'},
 'FID': {'iniciándose'},
 'mg': {'de', 'estudio', 'iniciándose', 'lóbulo', 'mostraban', 'por'},
 'dl': {'de',

In [420]:
len(pairs_dic.keys())

275

## Using N-grams

In [326]:
for i in punctuation:
    mine['texto_clean'] = mine['texto_clean'].map(lambda x: x.replace(i,""))

In [328]:
a = mine.head(10)

In [335]:
a.apply(lambda row: ngram_filter(row['texto_clean'], row['abrev'], len(row['abrev'])+1), axis = 1)[2]

[('mediante', 'punciónaspiración', 'aguja', 'fina', 'PAAF'),
 ('punciónaspiración', 'aguja', 'fina', 'PAAF', 'confirmó'),
 ('aguja', 'fina', 'PAAF', 'confirmó', 'sospechas'),
 ('fina', 'PAAF', 'confirmó', 'sospechas', 'iniciales'),
 ('PAAF', 'confirmó', 'sospechas', 'iniciales', 'lipoma')]

In [333]:
mine['lf_ngrams'] = mine.apply(lambda row: ngram_filter(row['texto_clean'], row['abrev'], len(row['abrev'])+1), axis = 1)

In [334]:
mine.head()

Unnamed: 0,doc_id,texto_clean,abrev,startOffset,endOffset,long_form,lf_ngrams
0,S1130-05582012000300005-1,Acude consultas paciente presenta tumoración c...,RM,789,791,resonancia magnética,"[(resonancia, magnética, RM), (magnética, RM, ..."
1,S1130-05582012000300005-1,Acude consultas paciente presenta tumoración c...,RM,1006,1010,resonancia magnética,"[(resonancia, magnética, RM), (magnética, RM, ..."
2,S1130-05582012000300005-1,Acude consultas paciente presenta tumoración c...,PAAF,789,791,punción-aspiración aguja fina,"[(mediante, punciónaspiración, aguja, fina, PA..."
3,S1130-05582012000300005-1,Acude consultas paciente presenta tumoración c...,PAAF,1006,1010,punción-aspiración aguja fina,"[(mediante, punciónaspiración, aguja, fina, PA..."
4,S0212-71992005000400009-1,Se trataba varón 27 años edad sufrido neumoní...,mm,1056,1059,milimetro,"[(leucocitosis, 12000, mm), (12000, mm, cúbico..."


In [306]:
from nltk import ngrams

sentence = 'this is a foo bar sentences and i want to ngramize it this is it and'

n = 6
sixgrams = ngrams(sentence.split(), 2)

for grams in sixgrams:
    esBigramFreq = collections.Counter(grams)
    print(grams,esBigramFreq , esBigramFreq.most_common(10))

('this', 'is') Counter({'this': 1, 'is': 1}) [('this', 1), ('is', 1)]
('is', 'a') Counter({'is': 1, 'a': 1}) [('is', 1), ('a', 1)]
('a', 'foo') Counter({'a': 1, 'foo': 1}) [('a', 1), ('foo', 1)]
('foo', 'bar') Counter({'foo': 1, 'bar': 1}) [('foo', 1), ('bar', 1)]
('bar', 'sentences') Counter({'bar': 1, 'sentences': 1}) [('bar', 1), ('sentences', 1)]
('sentences', 'and') Counter({'sentences': 1, 'and': 1}) [('sentences', 1), ('and', 1)]
('and', 'i') Counter({'and': 1, 'i': 1}) [('and', 1), ('i', 1)]
('i', 'want') Counter({'i': 1, 'want': 1}) [('i', 1), ('want', 1)]
('want', 'to') Counter({'want': 1, 'to': 1}) [('want', 1), ('to', 1)]
('to', 'ngramize') Counter({'to': 1, 'ngramize': 1}) [('to', 1), ('ngramize', 1)]
('ngramize', 'it') Counter({'ngramize': 1, 'it': 1}) [('ngramize', 1), ('it', 1)]
('it', 'this') Counter({'it': 1, 'this': 1}) [('it', 1), ('this', 1)]
('this', 'is') Counter({'this': 1, 'is': 1}) [('this', 1), ('is', 1)]
('is', 'it') Counter({'is': 1, 'it': 1}) [('is', 1), (

**Get context vectors**

1. Bag of words model

In [344]:
count_vect = CountVectorizer()

In [347]:
text_counts = count_vect.fit_transform(mine['texto_clean'])

In [348]:
text_counts.shape

(27480, 12829)

In [349]:
tfidf_transformer = TfidfTransformer()
texto_tfidf = tfidf_transformer.fit_transform(text_counts)
texto_tfidf.shape

(27480, 12829)

In [362]:
from keras.preprocessing.text import Tokenizer

sentence = ["John likes to watch movies. Mary likes movies too."]

def print_bow(sentence: str) -> None:
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(sentence)
    sequences = tokenizer.texts_to_sequences(sentence)
    word_index = tokenizer.word_index
    print(sequences,word_index)
    bow = {}
    for key in word_index:
        bow[key] = sequences[0].count(word_index[key])
    print(bow)
    print(f"Bag of word sentence 1 :\n{bow}")
    print(f'We found {len(word_index)} unique tokens.')

print_bow(sentence)

[[3, 1, 4, 5, 2, 6, 1, 2, 7]] {'likes': 1, 'movies': 2, 'john': 3, 'to': 4, 'watch': 5, 'mary': 6, 'too': 7}
{'likes': 2, 'movies': 2, 'john': 1, 'to': 1, 'watch': 1, 'mary': 1, 'too': 1}
Bag of word sentence 1 :
{'likes': 2, 'movies': 2, 'john': 1, 'to': 1, 'watch': 1, 'mary': 1, 'too': 1}
We found 7 unique tokens.
