# Library

In [184]:
import pandas as pd
import numpy as np
import os
import re
import collections
import nltk
from nltk.corpus import stopwords
import itertools 
from nltk.tokenize import word_tokenize
from string import punctuation

from nltk import ngrams

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

import unidecode

# Functions

In [185]:
def read_texts(path):
    data = []
    file_name = os.listdir(path)

    for name in file_name:
        if name.endswith('.txt'):
            with open(path + name,encoding="utf8") as f:
                text = f.read()
                data.append({'nombre':name.replace('.txt',''), 'texto':text})

    df = pd.DataFrame(data)
    return df

In [187]:
def check_len(x,y):
    if x==y:
        return 0
    else:
        return 1

In [188]:
def get_lf(i,j,k):
    if pd.isnull(k):
        regex_lf = re.compile(r'((?:\w+\W+){1,'+str(len(i))+'})\(\s'+i[0]+'.*\)')
        return regex_lf.findall(j)
    else:
        return k

In [189]:
def ngram_filter(doc, word, n):
    tokens = doc.split()
    all_ngrams = ngrams(tokens, n)
    filtered_ngrams = [x for x in all_ngrams if word in x]
    return filtered_ngrams

In [286]:
def get_longform(tokens, acro, long):
    #acro = acro.lower()
    long_form = ''
    margin = 2
    i =0
    #Looking for before
    if pd.isna(long):
        if acro not in tokens:
            return -1
        else:
            index = tokens.index(acro)
            for word in tokens[index-margin-len(acro):index]:
                #if first letter of word is equal to first letter os acronym
                if word[0] == acro[i].lower():
                    long_form += word + ' '
                    i += 1
                    if i == len(acro):
                        break
                elif (i == 1) and (word[0] == acro[i-1].lower()):
                    long_form = word + ' '
                    i = 1
                    if i == len(acro):
                        break
            long_form = long_form.rstrip()
            #print(re.split(' |-',long_form), len(re.split(' |-',long_form)), len(acro)-1)
            if len(re.split(' |-',long_form)) == (len(acro)):
                return long_form
            elif ((len(re.split(' |-',long_form))+1) == (len(acro))) & (len(re.split(' |-',long_form)) > 1):
                return long_form
    else:
        return long


In [287]:
def get_longform_after(tokens, acro, long):
    #acro = acro.lower()
    long_form = ''
    margin = 2
    i =0
    #Looking for before
    if pd.isna(long):
        if acro not in tokens:
            return -1
        else:
            index = tokens.index(acro)
            for word in tokens[index+1:index+margin+len(acro)]:
                if tokens[index+1] == '(':
                    if word[0] in acro.lower():
                        long_form += word + ' '
            long_form = long_form.rstrip()
            if len(re.split(' |-',long_form)) == (len(acro)):
                return long_form
            elif ((len(re.split(' |-',long_form))+1) == (len(acro))) & (len(re.split(' |-',long_form)) > 1):
                return long_form
    else:
        return long
    

# Load Data

### Trainning

318 clinical cases

In [192]:
train_raw = read_texts("../datasets/trainning_set/training_set.raw_text/")

In [193]:
train_raw = train_raw.rename(columns = {'nombre': 'doc_id'})

In [194]:
train_raw.head()

Unnamed: 0,doc_id,texto
0,S1130-05582012000300005-1,Acude a nuestras consultas a un paciente que p...
1,S0212-71992005000400009-1,"Se trataba de un varón de 27 años de edad, que..."
2,S0004-06142008000700015-2,Varón de 33 años fumador de un paquete de ciga...
3,S0210-56912006000800008-1,"Hombre de 42 años, bebedor de más de 100 g de ..."
4,S0376-78922009000300010-1,Paciente de 18 años de edad que 5 meses antes ...


In [195]:
train_raw.shape

(318, 2)

# Sub-track 1

## Found abbreviations (Short Forms)

### Regex

In [196]:
patron1 = r'[A-Z]{2,8}' #Letras mayúsculas entre 2 y 8. Probar (\s|\()[A-Z]{2,8}
patron2 = r'\s[a-z]{1,2}\s' #Entre 2 y 3 letras minusculas entre espacios
patron3 = r'\b[aA-zZ]{1,4}\-[aA-zZ]{1,4}\b' #mayúsuclas o minúsculas entre guiones
patron4 = r'\b\w{2}\b\/' #medidas antes de un signo /
patron5 = r'\/\b\w{2}\b' #medidas después de un signo /
patron6 = r'[aA-zZ]{1,4}[A-Z]+[a-z]*[1-4]*'
patron7 = r'\b\w{1,3}\s[0-9]{1,3}\b' #letras y numeros separadas por espacio
patron8 = r'\b\w{1,3}[-]\w{1,3}\b' #tras y numeros separas por guión
patron9= r'\b[aA-zZ]{1,3}[0-9]{1,3}\b' #letras con numeros todo junto

# create a list with them
regexes = [ patron1, patron2, patron3, patron4, patron5, patron6, patron7, patron8, patron9]
for i in regexes:
    generic_re = re.compile("%s|%s|%s|%s|%s|%s|%s|%s|%s" % (patron1, patron2, patron3, patron4, patron5, patron6, patron7, patron8, patron9))

In [197]:
swords = list(set(stopwords.words('spanish')))

Add words to stopwords lists

In [198]:
swords = swords + ['I','II','III','VI','VII','VIII','IX', 'X', 'x']

### Get Short Formns with a regex in each text

In [199]:
train_raw['texto_clean'] = train_raw['texto'].str.split().map(lambda x: ' '.join([w for w in x if w not in swords]))

In [200]:
train_raw['abrev'] = train_raw['texto_clean'].map(lambda x: generic_re.findall(x))

**Get offsets of the Short Forms founded in the text**

In [203]:
train_raw['offse'] = train_raw['texto'].map(lambda x: [(m.start(0), m.end(0)) for m in re.finditer(generic_re, x)])

In [204]:
#remove whitespaces
train_raw['abrev'] = train_raw['abrev'].apply(lambda x: [i.strip() for i in x])

Filter Short Forms to delete ones which are stopwords, get their index too for filter offsets lists later

In [205]:
train_raw['abrev_index'] = train_raw['abrev'].apply(lambda x: [i for i,j in enumerate(x) if j not in swords])

In [206]:
train_raw['abrev'] = train_raw['abrev'].apply(lambda x: [i for i in x if i not in swords])

In [207]:
train_raw['abrev'] = train_raw['abrev'].apply(lambda x: [a.replace('/',"") for a in x])

Filter offsets lists by index

In [208]:
train_raw['offse'] = train_raw.apply(lambda x: [x['offse'][i] for i in x['abrev_index']], axis = 1)

In [209]:
train_raw.head()

Unnamed: 0,doc_id,texto,texto_clean,abrev,offse,abrev_index
0,S1130-05582012000300005-1,Acude a nuestras consultas a un paciente que p...,Acude consultas paciente presenta tumoración c...,"[RM, PAAF]","[(5, 8), (26, 29)]","[0, 1]"
1,S0212-71992005000400009-1,"Se trataba de un varón de 27 años de edad, que...","Se trataba varón 27 años edad, sufrido neumoní...","[mm, mm, mg, mg, mg, mg, LDH, UI, GOT, UI, GPT...","[(10, 14), (22, 26), (33, 37), (85, 88), (91, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
2,S0004-06142008000700015-2,Varón de 33 años fumador de un paquete de ciga...,Varón 33 años fumador paquete cigarrillos día ...,[],[],[]
3,S0210-56912006000800008-1,"Hombre de 42 años, bebedor de más de 100 g de ...","Hombre 42 años, bebedor 100 g etanol día, ante...","[g, mg, dl, dl, AST, ALT, GGT, UI, dl, TAC, mg...","[(6, 10), (26, 30), (33, 37), (40, 43), (52, 5...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
4,S0376-78922009000300010-1,Paciente de 18 años de edad que 5 meses antes ...,Paciente 18 años edad 5 meses sido víctima gra...,[],[],[]


Check abrev and offse columns has the same length

In [210]:
train_raw['abrev_len'] =train_raw['abrev'].str.len()
train_raw['offse_len'] =train_raw['offse'].str.len()

In [211]:
train_raw.head()

Unnamed: 0,doc_id,texto,texto_clean,abrev,offse,abrev_index,abrev_len,offse_len
0,S1130-05582012000300005-1,Acude a nuestras consultas a un paciente que p...,Acude consultas paciente presenta tumoración c...,"[RM, PAAF]","[(5, 8), (26, 29)]","[0, 1]",2,2
1,S0212-71992005000400009-1,"Se trataba de un varón de 27 años de edad, que...","Se trataba varón 27 años edad, sufrido neumoní...","[mm, mm, mg, mg, mg, mg, LDH, UI, GOT, UI, GPT...","[(10, 14), (22, 26), (33, 37), (85, 88), (91, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",38,38
2,S0004-06142008000700015-2,Varón de 33 años fumador de un paquete de ciga...,Varón 33 años fumador paquete cigarrillos día ...,[],[],[],0,0
3,S0210-56912006000800008-1,"Hombre de 42 años, bebedor de más de 100 g de ...","Hombre 42 años, bebedor 100 g etanol día, ante...","[g, mg, dl, dl, AST, ALT, GGT, UI, dl, TAC, mg...","[(6, 10), (26, 30), (33, 37), (40, 43), (52, 5...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",22,22
4,S0376-78922009000300010-1,Paciente de 18 años de edad que 5 meses antes ...,Paciente 18 años edad 5 meses sido víctima gra...,[],[],[],0,0


Check both lists have the same lenght

In [212]:
train_raw['len_check'] = train_raw.apply(lambda row: check_len(row['abrev_len'],row['offse_len']),axis = 1)

Delete rows with different lenghts (check it later)

In [213]:
train_raw.shape

(318, 9)

In [214]:
train_raw[train_raw['len_check'] != 1].shape

(318, 9)

### Text treatment

Remove string punctuation, lowecase, tokenize and remove stopwords

In [216]:
train_raw['texto_clean'] = train_raw['texto_clean'].str.replace('[!"#$%&*+,-./:;<=>?@^_`{|}~]',' ')

In [217]:
train_raw['texto_clean'] = train_raw['texto_clean'].apply(lambda x: unidecode.unidecode(x))

In [219]:
train_raw['tokens'] = train_raw['texto_clean'].map(lambda x: word_tokenize(x))

In [220]:
train_raw.head()

Unnamed: 0,doc_id,texto,texto_clean,abrev,offse,abrev_index,abrev_len,offse_len,len_check,tokens
0,S1130-05582012000300005-1,Acude a nuestras consultas a un paciente que p...,Acude consultas paciente presenta tumoracion c...,"[RM, PAAF]","[(5, 8), (26, 29)]","[0, 1]",2,2,0,"[Acude, consultas, paciente, presenta, tumorac..."
1,S0212-71992005000400009-1,"Se trataba de un varón de 27 años de edad, que...",Se trataba varon 27 anos edad sufrido neumoni...,"[mm, mm, mg, mg, mg, mg, LDH, UI, GOT, UI, GPT...","[(10, 14), (22, 26), (33, 37), (85, 88), (91, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",38,38,0,"[Se, trataba, varon, 27, anos, edad, sufrido, ..."
2,S0004-06142008000700015-2,Varón de 33 años fumador de un paquete de ciga...,Varon 33 anos fumador paquete cigarrillos dia ...,[],[],[],0,0,0,"[Varon, 33, anos, fumador, paquete, cigarrillo..."
3,S0210-56912006000800008-1,"Hombre de 42 años, bebedor de más de 100 g de ...",Hombre 42 anos bebedor 100 g etanol dia ante...,"[g, mg, dl, dl, AST, ALT, GGT, UI, dl, TAC, mg...","[(6, 10), (26, 30), (33, 37), (40, 43), (52, 5...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",22,22,0,"[Hombre, 42, anos, bebedor, 100, g, etanol, di..."
4,S0376-78922009000300010-1,Paciente de 18 años de edad que 5 meses antes ...,Paciente 18 anos edad 5 meses sido victima gra...,[],[],[],0,0,0,"[Paciente, 18, anos, edad, 5, meses, sido, vic..."


### Get abreviations and offsets

Remove punctuation from abreviations

In [222]:
train_raw['zip'] = train_raw.apply(lambda row: list(zip(row['abrev'], row['offse'])), axis = 1)

In [223]:
train_raw.head()

Unnamed: 0,doc_id,texto,texto_clean,abrev,offse,abrev_index,abrev_len,offse_len,len_check,tokens,zip
0,S1130-05582012000300005-1,Acude a nuestras consultas a un paciente que p...,Acude consultas paciente presenta tumoracion c...,"[RM, PAAF]","[(5, 8), (26, 29)]","[0, 1]",2,2,0,"[Acude, consultas, paciente, presenta, tumorac...","[(RM, (5, 8)), (PAAF, (26, 29))]"
1,S0212-71992005000400009-1,"Se trataba de un varón de 27 años de edad, que...",Se trataba varon 27 anos edad sufrido neumoni...,"[mm, mm, mg, mg, mg, mg, LDH, UI, GOT, UI, GPT...","[(10, 14), (22, 26), (33, 37), (85, 88), (91, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",38,38,0,"[Se, trataba, varon, 27, anos, edad, sufrido, ...","[(mm, (10, 14)), (mm, (22, 26)), (mg, (33, 37)..."
2,S0004-06142008000700015-2,Varón de 33 años fumador de un paquete de ciga...,Varon 33 anos fumador paquete cigarrillos dia ...,[],[],[],0,0,0,"[Varon, 33, anos, fumador, paquete, cigarrillo...",[]
3,S0210-56912006000800008-1,"Hombre de 42 años, bebedor de más de 100 g de ...",Hombre 42 anos bebedor 100 g etanol dia ante...,"[g, mg, dl, dl, AST, ALT, GGT, UI, dl, TAC, mg...","[(6, 10), (26, 30), (33, 37), (40, 43), (52, 5...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",22,22,0,"[Hombre, 42, anos, bebedor, 100, g, etanol, di...","[(g, (6, 10)), (mg, (26, 30)), (dl, (33, 37)),..."
4,S0376-78922009000300010-1,Paciente de 18 años de edad que 5 meses antes ...,Paciente 18 anos edad 5 meses sido victima gra...,[],[],[],0,0,0,"[Paciente, 18, anos, edad, 5, meses, sido, vic...",[]


### Get one row per abbreviation

Separate elements lists in different rows

In [224]:
mine = train_raw.explode('zip')

In [225]:
mine.shape

(3825, 11)

In [226]:
mine[mine['zip'].isnull()].shape

(15, 11)

In [227]:
mine = mine[['doc_id', 'texto_clean', 'tokens', 'zip']]

In [228]:
mine = mine[mine['zip'].notnull()]

In [229]:
mine.shape

(3810, 4)

In [230]:
mine.head()

Unnamed: 0,doc_id,texto_clean,tokens,zip
0,S1130-05582012000300005-1,Acude consultas paciente presenta tumoracion c...,"[Acude, consultas, paciente, presenta, tumorac...","(RM, (5, 8))"
0,S1130-05582012000300005-1,Acude consultas paciente presenta tumoracion c...,"[Acude, consultas, paciente, presenta, tumorac...","(PAAF, (26, 29))"
1,S0212-71992005000400009-1,Se trataba varon 27 anos edad sufrido neumoni...,"[Se, trataba, varon, 27, anos, edad, sufrido, ...","(mm, (10, 14))"
1,S0212-71992005000400009-1,Se trataba varon 27 anos edad sufrido neumoni...,"[Se, trataba, varon, 27, anos, edad, sufrido, ...","(mm, (22, 26))"
1,S0212-71992005000400009-1,Se trataba varon 27 anos edad sufrido neumoni...,"[Se, trataba, varon, 27, anos, edad, sufrido, ...","(mg, (33, 37))"


### Separate SF from Offsets in different columns

In [231]:
mine['abrev'] = mine.apply(lambda row: row['zip'][0], axis = 1)

In [232]:
mine['offsets'] = mine.apply(lambda row: row['zip'][1], axis = 1)

In [233]:
mine = mine[mine['abrev'] != ""]

In [234]:
mine.head()

Unnamed: 0,doc_id,texto_clean,tokens,zip,abrev,offsets
0,S1130-05582012000300005-1,Acude consultas paciente presenta tumoracion c...,"[Acude, consultas, paciente, presenta, tumorac...","(RM, (5, 8))",RM,"(5, 8)"
0,S1130-05582012000300005-1,Acude consultas paciente presenta tumoracion c...,"[Acude, consultas, paciente, presenta, tumorac...","(PAAF, (26, 29))",PAAF,"(26, 29)"
1,S0212-71992005000400009-1,Se trataba varon 27 anos edad sufrido neumoni...,"[Se, trataba, varon, 27, anos, edad, sufrido, ...","(mm, (10, 14))",mm,"(10, 14)"
1,S0212-71992005000400009-1,Se trataba varon 27 anos edad sufrido neumoni...,"[Se, trataba, varon, 27, anos, edad, sufrido, ...","(mm, (22, 26))",mm,"(22, 26)"
1,S0212-71992005000400009-1,Se trataba varon 27 anos edad sufrido neumoni...,"[Se, trataba, varon, 27, anos, edad, sufrido, ...","(mg, (33, 37))",mg,"(33, 37)"


Separate offsets tuples in different columns

In [235]:
mine[['startOffset', 'endOffset']] = pd.DataFrame(mine['offsets'].tolist(), index=mine.index) 

In [236]:
mine = mine[['doc_id', 'texto_clean', 'tokens', 'abrev', 'startOffset', 'endOffset']]

In [237]:
mine.head()

Unnamed: 0,doc_id,texto_clean,tokens,abrev,startOffset,endOffset
0,S1130-05582012000300005-1,Acude consultas paciente presenta tumoracion c...,"[Acude, consultas, paciente, presenta, tumorac...",RM,5,8
0,S1130-05582012000300005-1,Acude consultas paciente presenta tumoracion c...,"[Acude, consultas, paciente, presenta, tumorac...",PAAF,26,29
1,S0212-71992005000400009-1,Se trataba varon 27 anos edad sufrido neumoni...,"[Se, trataba, varon, 27, anos, edad, sufrido, ...",mm,10,14
1,S0212-71992005000400009-1,Se trataba varon 27 anos edad sufrido neumoni...,"[Se, trataba, varon, 27, anos, edad, sufrido, ...",mm,22,26
1,S0212-71992005000400009-1,Se trataba varon 27 anos edad sufrido neumoni...,"[Se, trataba, varon, 27, anos, edad, sufrido, ...",mg,33,37


Delete null values, and change ttype to integer for offsets columns

In [238]:
mine.shape

(3810, 6)

In [239]:
mine.dropna(subset=['startOffset', 'endOffset'], inplace = True)

In [240]:
mine.shape

(3810, 6)

In [241]:
mine['startOffset'] = mine['startOffset'].astype(int)
mine['endOffset'] = mine['endOffset'].astype(int)

In [242]:
mine = mine[['doc_id', 'texto_clean', 'tokens', 'abrev', 'startOffset', 'endOffset']]

In [243]:
mine.shape

(3810, 6)

In [245]:
mine = mine.drop_duplicates(subset = ['doc_id', 'texto_clean', 'abrev', 'startOffset', 'endOffset'])

In [246]:
mine.shape

(3810, 6)

In [247]:
mine.head()

Unnamed: 0,doc_id,texto_clean,tokens,abrev,startOffset,endOffset
0,S1130-05582012000300005-1,Acude consultas paciente presenta tumoracion c...,"[Acude, consultas, paciente, presenta, tumorac...",RM,5,8
0,S1130-05582012000300005-1,Acude consultas paciente presenta tumoracion c...,"[Acude, consultas, paciente, presenta, tumorac...",PAAF,26,29
1,S0212-71992005000400009-1,Se trataba varon 27 anos edad sufrido neumoni...,"[Se, trataba, varon, 27, anos, edad, sufrido, ...",mm,10,14
1,S0212-71992005000400009-1,Se trataba varon 27 anos edad sufrido neumoni...,"[Se, trataba, varon, 27, anos, edad, sufrido, ...",mm,22,26
1,S0212-71992005000400009-1,Se trataba varon 27 anos edad sufrido neumoni...,"[Se, trataba, varon, 27, anos, edad, sufrido, ...",mg,33,37


#### Delete short forms that not apply

In [251]:
mine = mine[~mine['abrev'].str.contains(r'^[0-9]+[\s|-][0-9]+$')]

In [252]:
mine.shape

(3580, 6)

In [253]:
mine = mine[~mine['abrev'].str.contains(r'^([\s\d]+)$')]

  return func(self, *args, **kwargs)


In [254]:
mine.shape

(3494, 6)

In [255]:
mine = mine[~mine['abrev'].str.contains('Día')]
mine = mine[~mine['abrev'].str.contains('día')]
mine = mine[~mine['abrev'].str.contains('Del')]
mine = mine[~mine['abrev'].str.contains('El')]

In [256]:
mine.shape

(3461, 6)

In [279]:
mine['abrev'] = mine['abrev'].str.replace('anti-','')
mine['abrev'] = mine['abrev'].str.replace('ngio','')

## Search Long Formns

### Search Long Forms in the same text

**Measurement units dictionary**

In [280]:
mu_dic = {"ml":"mililitro",
"mg":"miligramo",
"g":"gramo",
"l":"litro",
"mcg":"microgramo",
"mmol":"milimol",
"ui":"Unidades Internacionales",
"miles ui":"Miles de Unidades Internacionales",
"millones ui":"Millones de Unidades Internacionales",
"ufc":"Unidades Formadoras de Colonias",
"meq":"miliequivalente",
"ng":"manogramo",
"lf":"Unidad Floculante",
"ufp":"Unidad Formadora de Placa",
"dic":"Dosis Infectante Mediana de Cultivo Celular 50% ",
"dit":"Dosis Infectante Mediana de Cultivo Tisular 50% ",
"di":"Dosis Infectante 50% ",
"mol":"Peso Molecular Gramo ",
"eq":"Peso Equivalente Gramo ",
"dosis":"Dosis",
"almh":"Almohadilla",
"amp":"Ampolla",
"anl":"Anillo",
"bar":"Barra",
"bolsa":"Bolsa",
"cap":"Capsula",
"car":"Caramelo",
"carp":"Carpula",
"cart":"Cartucho",
"com":"Comprimido",
"dia":"Dia",
"fras":"Frasco",
"fras-amp":"Frasco Ampolla ",
"grag":"Gragea",
"gora":"Hora",
"gmp":"Implante",
"jab":"Jab¢n",
"jer":"Jeringa Prellenada ",
"uL":"Microlitro",
"ovu":"Ovulo",
"parche":"Parche",
"past":"Pastilla",
"perl":"Perla",
"pil":"Pildora",
"pip":"Pipeta",
"%":"Porcentaje",
"sach":"Sachet",
"sob":"Sobre",
"sup":"Supositorio",
"tab":"Tableta",
"troc":"Trocisco",
"vial":"Vial",
"kg":"Kilogramo",
"gal":"Galon",
"sis":"Sistema Terapeutico",
"mci":"miliCuries",
"mbq":"milibequerel",
"uel":"Unidades ELISA ",
"dl":"Dosis Letal",
"u usp":"Unidades USP ",
"u":"Unidades",
"rot":"Rotacaps",
"ccid":"Dosis Infecciosa en Cultivo de Célula ",
"U":"UNIDAD",
"otros":"Otros",
"µci":"mcroCuries",
"esp":"Esporas",
"mcha":"microgramos de HA ",
"gom":"Goma",
"kiu":"Unidad Inhibidora de Calicreina ",
"mcel":"Millones de Células ",
"du":"Unidades de Antigeno D",
"dil d2":"Dil D2",
"tin.mad.":"Tintura Madre",
"dil d4":"Dil D4",
"dil d5":"Dil D5",
"dil d1":"Dil D1",
"dil d8":"Dil D8",
"dil d3":"Dil D3",
"ou":"Unidad de Opacidad ",
"mm": "milimetro",
"dm": "decimetro",
"cm": "centimetro"}

In [281]:
mine['long_form'] = mine['abrev'].str.lower().map(mu_dic)

In [283]:
mine.head()

Unnamed: 0,doc_id,texto_clean,tokens,abrev,startOffset,endOffset,long_form
0,S1130-05582012000300005-1,Acude consultas paciente presenta tumoracion c...,"[Acude, consultas, paciente, presenta, tumorac...",RM,5,8,
0,S1130-05582012000300005-1,Acude consultas paciente presenta tumoracion c...,"[Acude, consultas, paciente, presenta, tumorac...",PAAF,26,29,
1,S0212-71992005000400009-1,Se trataba varon 27 anos edad sufrido neumoni...,"[Se, trataba, varon, 27, anos, edad, sufrido, ...",mm,10,14,milimetro
1,S0212-71992005000400009-1,Se trataba varon 27 anos edad sufrido neumoni...,"[Se, trataba, varon, 27, anos, edad, sufrido, ...",mm,22,26,milimetro
1,S0212-71992005000400009-1,Se trataba varon 27 anos edad sufrido neumoni...,"[Se, trataba, varon, 27, anos, edad, sufrido, ...",mg,33,37,miligramo


### Get LF from the text

With get_longform function, words befores SF that starts with SF letters will be detected. With get_longform_after words in brackets after SF will be detected

In [291]:
mine['long_form'] = mine.apply(lambda row: get_longform(row['tokens'], row['abrev'], row['long_form']), axis = 1)

In [292]:
mine['long_form'] = mine.apply(lambda row: get_longform_after(row['tokens'], row['abrev'], row['long_form']), axis = 1)

In [293]:
mine.head()

Unnamed: 0,doc_id,texto_clean,tokens,abrev,startOffset,endOffset,long_form
0,S1130-05582012000300005-1,Acude consultas paciente presenta tumoracion c...,"[Acude, consultas, paciente, presenta, tumorac...",RM,5,8,resonancia magnetica
0,S1130-05582012000300005-1,Acude consultas paciente presenta tumoracion c...,"[Acude, consultas, paciente, presenta, tumorac...",PAAF,26,29,puncion aspiracion aguja fina
1,S0212-71992005000400009-1,Se trataba varon 27 anos edad sufrido neumoni...,"[Se, trataba, varon, 27, anos, edad, sufrido, ...",mm,10,14,milimetro
1,S0212-71992005000400009-1,Se trataba varon 27 anos edad sufrido neumoni...,"[Se, trataba, varon, 27, anos, edad, sufrido, ...",mm,22,26,milimetro
1,S0212-71992005000400009-1,Se trataba varon 27 anos edad sufrido neumoni...,"[Se, trataba, varon, 27, anos, edad, sufrido, ...",mg,33,37,miligramo


In [295]:
mine.shape

(3461, 7)

### Number of SF and LF in tetxs

**Short Forms**

In [296]:
mine['abrev'].nunique()

855

In [297]:
pd.options.display.max_rows = 1000

In [299]:
sorted(mine['abrev'].unique().tolist())

['1-y',
 '1a',
 '1b',
 '1g',
 '3g',
 '4g',
 '5-FU',
 '5-de',
 '5U',
 '7g',
 '8h',
 '99-MDP',
 'A 1',
 'A 10',
 'A 12',
 'A 15',
 'A 18',
 'A 2',
 'A 20',
 'A 24',
 'A 3',
 'A 30',
 'A 4',
 'A 40',
 'A 48',
 'A 5',
 'A 6',
 'A 7',
 'A 8',
 'A 9',
 'A-P',
 'A1',
 'AA',
 'AAS',
 'AAT',
 'AB',
 'ABI',
 'ABVD',
 'ACCS',
 'ACL',
 'ADA',
 'ADN',
 'ADR',
 'AE',
 'AEO',
 'AFG',
 'AFP',
 'AGCM',
 'AGE',
 'AGF',
 'AI',
 'AINES',
 'AL',
 'ALAT',
 'ALK',
 'ALP',
 'ALT',
 'AMA',
 'ANA',
 'ANAS',
 'ANCA',
 'ANCAS',
 'ANOES',
 'AO',
 'AOC',
 'AP',
 'APC',
 'AR',
 'ASAT',
 'ASIA',
 'ASLO',
 'ASPEN',
 'AST',
 'AT',
 'ATA',
 'ATM',
 'ATS',
 'AUC',
 'AV',
 'AVF',
 'AVI',
 'AVK',
 'AVSC',
 'AZF',
 'Alfa-feto',
 'B 7',
 'B 80',
 'B-27',
 'B-7',
 'B1',
 'B12 609',
 'B2',
 'B27',
 'B7',
 'B8',
 'BA',
 'BAAR',
 'BACTEC',
 'BAL',
 'BAS',
 'BAV',
 'BCG',
 'BCL',
 'BD',
 'BEP',
 'BH',
 'BIODISK',
 'BM',
 'BMC',
 'BMU',
 'BP',
 'BPA',
 'BQ',
 'BT',
 'BUN',
 'Beta-hCG',
 'C-GSF',
 'C-Kit',
 'C2',
 'C2-C6',
 'C3',
 

**Long Forms**

In [301]:
mine['long_form'].nunique()

103

Study -1 values

In [302]:
mine[mine['long_form'] == -1].shape

(428, 7)

In [303]:
mine[mine['long_form'] == -1]['abrev'].nunique()

245

### Study LF founded or not founded

Study SF without LF in the running text

In [304]:
mine[mine['long_form'] == -1].shape

(428, 7)

In [305]:
mine[mine['long_form'].isnull()].shape

(1721, 7)

Check how many texts don't have LF for the SF in the same text

In [306]:
mine[(mine['abrev'].notnull()) & (mine['long_form'].isnull())].shape

(1721, 7)

In [307]:
lf_null = mine[(mine['abrev'].notnull()) & (mine['long_form'].isnull())].shape[0]
lf_null

1721

In [308]:
print(f"LF has not be found in the same text where the SF is in {lf_null/mine.shape[0]*100: .2f}% of texts")

LF has not be found in the same text where the SF is in  49.73% of texts


In [309]:
mine[mine['long_form'].isnull()]['abrev'].nunique()

564

In [310]:
sorted(mine[mine['long_form'].isnull()]['abrev'].unique().tolist())

['1a',
 '1b',
 '1g',
 '3g',
 '4g',
 '5U',
 '7g',
 '8h',
 'A1',
 'AA',
 'AAS',
 'AB',
 'ABI',
 'ABVD',
 'ACCS',
 'ADA',
 'ADN',
 'ADR',
 'AE',
 'AFG',
 'AFP',
 'AGF',
 'AI',
 'AINES',
 'AL',
 'ALAT',
 'ALK',
 'ALP',
 'ALT',
 'AMA',
 'ANA',
 'ANAS',
 'ANCA',
 'ANCAS',
 'ANOES',
 'AOC',
 'AP',
 'AR',
 'ASAT',
 'ASIA',
 'ASLO',
 'ASPEN',
 'AST',
 'AT',
 'ATA',
 'ATS',
 'AUC',
 'AV',
 'AVF',
 'AVI',
 'AVK',
 'AVSC',
 'B1',
 'B2',
 'B27',
 'B7',
 'B8',
 'BAAR',
 'BACTEC',
 'BAL',
 'BAS',
 'BAV',
 'BCG',
 'BD',
 'BEP',
 'BH',
 'BIODISK',
 'BMC',
 'BMU',
 'BP',
 'BPA',
 'BQ',
 'BT',
 'BUN',
 'C2',
 'C3',
 'C4',
 'C5',
 'C6',
 'CA',
 'CBMF',
 'CCR',
 'CD',
 'CDDP',
 'CDI',
 'CEA',
 'CEC',
 'CHCM',
 'CHOP',
 'CK',
 'CKPAN',
 'CMHG',
 'CMI',
 'CMV',
 'CO',
 'COL',
 'COPP',
 'CPAP',
 'CPK',
 'CRE',
 'CT',
 'CTX',
 'CU',
 'CV',
 'CX',
 'CellCe',
 'D1',
 'D10',
 'D12',
 'D3',
 'D4',
 'D5',
 'D6',
 'D7',
 'D8',
 'D9',
 'DAI',
 'DAKO',
 'DFVA',
 'DG',
 'DHAP',
 'DIEP',
 'DII',
 'DIII',
 'DMNID',
 'DNA

## Create dictionary with SF and LF pairs founded in the same text

In [311]:
df_pairs = mine[(mine['abrev'].notnull()) & (mine['long_form'].notnull())]

In [312]:
df_pairs.reset_index(drop = True, inplace = True)

In [313]:
df_pairs.head()

Unnamed: 0,doc_id,texto_clean,tokens,abrev,startOffset,endOffset,long_form
0,S1130-05582012000300005-1,Acude consultas paciente presenta tumoracion c...,"[Acude, consultas, paciente, presenta, tumorac...",RM,5,8,resonancia magnetica
1,S1130-05582012000300005-1,Acude consultas paciente presenta tumoracion c...,"[Acude, consultas, paciente, presenta, tumorac...",PAAF,26,29,puncion aspiracion aguja fina
2,S0212-71992005000400009-1,Se trataba varon 27 anos edad sufrido neumoni...,"[Se, trataba, varon, 27, anos, edad, sufrido, ...",mm,10,14,milimetro
3,S0212-71992005000400009-1,Se trataba varon 27 anos edad sufrido neumoni...,"[Se, trataba, varon, 27, anos, edad, sufrido, ...",mm,22,26,milimetro
4,S0212-71992005000400009-1,Se trataba varon 27 anos edad sufrido neumoni...,"[Se, trataba, varon, 27, anos, edad, sufrido, ...",mg,33,37,miligramo


In [314]:
pairs_dic = {}
for index, row in df_pairs.iterrows():
    if (row['long_form'] != -1) & (row['long_form'] != None):
        if not row['abrev'] in pairs_dic:
            pairs_dic[row['abrev']] = set()
        pairs_dic[row['abrev']].add(row['long_form'])

In [315]:
pairs_dic

{'RM': {'resonancia magnetica'},
 'PAAF': {'puncion aspiracion aguja fina'},
 'mm': {'milimetro'},
 'mg': {'miligramo'},
 'UI': {'Unidades Internacionales'},
 'kg': {'Kilogramo'},
 'g': {'gramo'},
 'dl': {'Dosis Letal'},
 'TAC': {'axial computerizada',
  'tomografia axial computadorizada',
  'tomografia axial computarizada',
  'tomografia axial computerizada'},
 'ng': {'manogramo'},
 'APC': {'argon plasma'},
 'ml': {'mililitro'},
 'cm': {'centimetro'},
 'dL': {'Dosis Letal'},
 'f': {''},
 'q': {''},
 'TBF': {'traves bioelectrica'},
 'h': {''},
 'mEq': {'miliequivalente'},
 'd': {''},
 'TC': {'tomografia computadorizada',
  'tomografia computarizada',
  'tomografia computerizada'},
 'RMN': {'resonancia magnetica nuclear'},
 'u': {'Unidades'},
 'DM': {'decimetro'},
 'HTA': {'hipotiroidismo tratamiento'},
 'l': {'litro'},
 'LLA': {'leucemia linfoblastica aguda'},
 'FO': {'fondo ojo'},
 'PIO': {'presion intraocular'},
 'mL': {'mililitro'},
 'AEO': {'aparatologia extraoral'},
 'HSA': {'hemo

# Check differences with original dataframes and SF

In [318]:
train_abbr = pd.read_csv("../datasets/trainning_set/clinical_cases.abbreviations.training_set.tsv", sep = '\t')
train_abbr = train_abbr.rename(columns = {'# Document_ID': 'doc_id'})

In [319]:
train_abbr.head()

Unnamed: 0,doc_id,StartOffset,EndOffset,Abbreviation,Definition,Definition_lemmatized
0,S0210-48062004000500008-1,1650,1652,ml,mililitro,mililitro
1,S0210-48062004000500008-1,708,709,l,litro,litro
2,S0210-48062004000500008-1,704,707,mEq,miliequivalente,miliequivalente
3,S0210-48062004000500008-1,677,681,pCO2,presión parcial de co2,presión parcial de co2
4,S0210-48062004000500008-1,2287,2290,HLA,human leucocyte antigen,human leucocyte antiger


In [320]:
df1 = train_abbr[['doc_id', 'Abbreviation']]

In [321]:
df2 = mine[['doc_id', 'abrev']]

In [322]:
df3 = df1.merge(df2, left_on=['doc_id','Abbreviation'], right_on = ['doc_id','abrev'], how = 'left')

In [323]:
df3.shape

(10405, 3)

In [324]:
df3[df3['abrev'].isnull()].shape

(1236, 3)

In [325]:
len(sorted(df3[df3['abrev'].isnull()]['Abbreviation'].unique().tolist()))

267

Get Short Forms are not founded in my dataframe

In [326]:
df1_sf = pd.DataFrame(train_abbr['Abbreviation'].unique().tolist(), columns = ['sf1'])

In [327]:
df1_sf.shape

(768, 1)

In [328]:
df2_sf = pd.DataFrame(mine['abrev'].unique().tolist(), columns = ['sf2'])

In [329]:
df2_sf.shape

(855, 1)

In [330]:
df1_sf['sf1'] = df1_sf['sf1'].str.replace('[!"#$%&*+,-./:;<=>?@^_`{|}~]','').str.lower()
df2_sf['sf2'] = df2_sf['sf2'].str.replace('[!"#$%&*+,-./:;<=>?@^_`{|}~]','').str.lower()

In [331]:
df3_sf = df1_sf.merge(df2_sf, left_on=['sf1'], right_on = ['sf2'], how = 'left')

In [332]:
df3_sf.shape

(813, 2)

In [333]:
df3_sf.head()

Unnamed: 0,sf1,sf2
0,ml,ml
1,ml,ml
2,l,l
3,meq,meq
4,pco2,pco2


In [335]:
df3_sf[df3_sf['sf2'].isnull()].shape

(183, 2)

In [334]:
df3_sf[df3_sf['sf2'].isnull()].sort_values('sf1')

Unnamed: 0,sf1,sf2
190,67ga,
192,99tc,
69,a,
382,a1at,
701,ae1ae3,
651,ae1ae3,
568,ae1ae3,
774,afib,
235,ag,
368,ala,
