# Library

In [3094]:
import pandas as pd
import numpy as np
import os
import re
import collections
#from gensim.parsing.preprocessing import remove_stopwords
import nltk
from nltk.corpus import stopwords
import itertools 
from nltk.tokenize import word_tokenize
from string import punctuation

from nltk import ngrams

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

import unidecode

# Functions

In [3095]:
def read_texts(path):
    data = []
    file_name = os.listdir(path)

    for name in file_name:
        if name.endswith('.txt'):
            with open(path + name,encoding="utf8") as f:
                text = f.read()
                data.append({'nombre':name.replace('.txt',''), 'texto':text})

    df = pd.DataFrame(data)
    return df

In [3096]:
# def filter_byindex(x,y):
#     #selectors = [x for x in col2]
#     return list(itertools.compress(x,y))

In [3097]:
def check_len(x,y):
    if x==y:
        return 0
    else:
        return 1

In [3098]:
def get_lf(i,j,k):
    if pd.isnull(k):
        regex_lf = re.compile(r'((?:\w+\W+){1,'+str(len(i))+'})\(\s'+i[0]+'.*\)')
        return regex_lf.findall(j)
    else:
        return k

In [3099]:
def ngram_filter(doc, word, n):
    tokens = doc.split()
    all_ngrams = ngrams(tokens, n)
    filtered_ngrams = [x for x in all_ngrams if word in x]
    return filtered_ngrams

In [3100]:
# def get_longform(tokens, acro, margin = 2, i =1):
#     long_form = ''
#     #Looking for before
#     for word in tokens[index-margin-len(acro):index]:
#         #if first letter of word is equal to first letter os acronym
#         if word[0] == acro[i].lower():
#             long_form += word + ' '
#             i += 1
#             if i == len(acro):
#                 break
#         elif (i == 1) and (word[0] == acro[i-1].lower()):
#             long_form = word + ' '
#             i = 1
#             if i == len(acro):
#                 break
#     long_form = long_form.rstrip()
#     return long_form
    

In [3101]:
def get_longform(tokens, acro, long):
    acro = acro.lower()
    long_form = ''
    margin = 2
    i =0
    #Looking for before
    if acro not in tokens:
        return -1
    if pd.isna(long):
        index = tokens.index(acro)
        for word in tokens[index-margin-len(acro):index]:
            #if first letter of word is equal to first letter os acronym
            if word[0] == acro[i].lower():
                long_form += word + ' '
                i += 1
                if i == len(acro):
                    break
            elif (i == 1) and (word[0] == acro[i-1].lower()):
                long_form = word + ' '
                i = 1
                if i == len(acro):
                    break
        long_form = long_form.rstrip()
        return long_form
    else:
        return long


# Load Data

### Trainning

318 clinical cases

In [3102]:
train_raw = read_texts("../datasets/trainning_set/training_set.raw_text/")

In [3103]:
train_raw = train_raw.rename(columns = {'nombre': 'doc_id'})

In [3104]:
train_raw.head()

Unnamed: 0,doc_id,texto
0,S1130-05582012000300005-1,Acude a nuestras consultas a un paciente que p...
1,S0212-71992005000400009-1,"Se trataba de un varón de 27 años de edad, que..."
2,S0004-06142008000700015-2,Varón de 33 años fumador de un paquete de ciga...
3,S0210-56912006000800008-1,"Hombre de 42 años, bebedor de más de 100 g de ..."
4,S0376-78922009000300010-1,Paciente de 18 años de edad que 5 meses antes ...


In [3105]:
train_raw.shape

(318, 2)

# Sub-track 1

## Found abbreviations (Short Forms)

### Regex

In [3106]:
patron3 = r'[A-Z]{2,8}' #Letras mayúsculas entre 2 y 8. Probar (\s|\()[A-Z]{2,8}
patron4 = r'\s[a-z]{1,2}\s' #Entre 2 y 3 letras minusculas entre espacios
patron5 = r'\b[aA-zZ]{1,4}\-[aA-zZ]{1,4}\b' #mayúsuclas o minúsculas entre guiones
#patron6 = r'[1-9]\s*[aA-zZ]{1,4}\/\b[aA-zZ]{1,4}\b' #Palabras divididas por / solo cuando las palabras no exceden de 4 caracteres
patron6 = r'\b\w{2}\b\/'
patron7 = r'\/\b\w{2}\b'
patron8 = r'[aA-zZ]{1,4}[A-Z]+[a-z]*[1-4]*'
#patron8 = r'\/[a-z]*[A-Z]*'


# create a list with them
regexes = [ patron3, patron4, patron5, patron6, patron7]
for i in regexes:
    generic_re = re.compile("%s|%s|%s|%s|%s|%s" % (patron3, patron4, patron5, patron6, patron7, patron8))

In [3107]:
#nltk.download('stopwords')
swords = list(set(stopwords.words('spanish')))

Add words to stopwords lists

In [3108]:
swords = swords + ['I','II','III','VI','VII','VIII','IX', 'X', 'x']

### Get Short Formns with a regex in each text

In [3109]:
train_raw['abrev'] = train_raw['texto'].map(lambda x: generic_re.findall(x))

Separate mesurement units separate by "/"

In [3110]:
# train_raw['abrev'] = train_raw['abrev'].apply(lambda x: [a.split("/") for a in x])

In [3111]:
# train_raw['abrev'] = train_raw['abrev'].apply(lambda x: [item for sublist in x for item in sublist])

**Get offsets of the Short Forms founded in the text**

In [3112]:
train_raw['offse'] = train_raw['texto'].map(lambda x: [(m.start(0), m.end(0)) for m in re.finditer(generic_re, x)])

In [3113]:
#remove whitespaces
train_raw['abrev'] = train_raw['abrev'].apply(lambda x: [i.strip() for i in x])

Filter Short Forms to delete ones which are stopwords, get their index too for filter offsets lists later

In [3114]:
# train_raw['abrev_index'] = train_raw['abrev'].apply(lambda x: [x.index(i) for i in x if i not in swords])
train_raw['abrev_index'] = train_raw['abrev'].apply(lambda x: [i for i,j in enumerate(x) if j not in swords])

In [3115]:
train_raw['abrev'] = train_raw['abrev'].apply(lambda x: [i for i in x if i not in swords])

In [3116]:
train_raw['abrev'] = train_raw['abrev'].apply(lambda x: [a.replace('/',"") for a in x])

Filter offsets lists by index

In [3117]:
train_raw['offse'] = train_raw.apply(lambda x: [x['offse'][i] for i in x['abrev_index']], axis = 1)

In [3118]:
train_raw.head()

Unnamed: 0,doc_id,texto,abrev,offse,abrev_index
0,S1130-05582012000300005-1,Acude a nuestras consultas a un paciente que p...,"[RM, PAAF]","[(789, 791), (1006, 1010)]","[24, 31]"
1,S0212-71992005000400009-1,"Se trataba de un varón de 27 años de edad, que...","[mm, mm, mg, mg, mg, mg, LDH, UI, GOT, UI, GPT...","[(1056, 1059), (1079, 1082), (1247, 1250), (12...","[36, 37, 44, 47, 48, 49, 50, 51, 52, 53, 54, 5..."
2,S0004-06142008000700015-2,Varón de 33 años fumador de un paquete de ciga...,[],[],[]
3,S0210-56912006000800008-1,"Hombre de 42 años, bebedor de más de 100 g de ...","[g, mg, dl, dl, AST, ALT, GGT, UI, dl, TAC, mg...","[(40, 43), (654, 657), (670, 673), (697, 700),...","[3, 23, 24, 25, 26, 27, 28, 29, 30, 37, 64, 65..."
4,S0376-78922009000300010-1,Paciente de 18 años de edad que 5 meses antes ...,[],[],[]


Check abrev and offse columns has the same length

In [3119]:
train_raw['abrev_len'] =train_raw['abrev'].str.len()
train_raw['offse_len'] =train_raw['offse'].str.len()

In [3120]:
train_raw.head()

Unnamed: 0,doc_id,texto,abrev,offse,abrev_index,abrev_len,offse_len
0,S1130-05582012000300005-1,Acude a nuestras consultas a un paciente que p...,"[RM, PAAF]","[(789, 791), (1006, 1010)]","[24, 31]",2,2
1,S0212-71992005000400009-1,"Se trataba de un varón de 27 años de edad, que...","[mm, mm, mg, mg, mg, mg, LDH, UI, GOT, UI, GPT...","[(1056, 1059), (1079, 1082), (1247, 1250), (12...","[36, 37, 44, 47, 48, 49, 50, 51, 52, 53, 54, 5...",34,34
2,S0004-06142008000700015-2,Varón de 33 años fumador de un paquete de ciga...,[],[],[],0,0
3,S0210-56912006000800008-1,"Hombre de 42 años, bebedor de más de 100 g de ...","[g, mg, dl, dl, AST, ALT, GGT, UI, dl, TAC, mg...","[(40, 43), (654, 657), (670, 673), (697, 700),...","[3, 23, 24, 25, 26, 27, 28, 29, 30, 37, 64, 65...",22,22
4,S0376-78922009000300010-1,Paciente de 18 años de edad que 5 meses antes ...,[],[],[],0,0


Check both lists have the same lenght

In [3121]:
train_raw['len_check'] = train_raw.apply(lambda row: check_len(row['abrev_len'],row['offse_len']),axis = 1)

Delete rows with different lenghts (check it later)

In [3122]:
train_raw.shape

(318, 8)

In [3123]:
train_raw[train_raw['len_check'] != 1].shape

(318, 8)

In [3124]:
#train_raw = train_raw[train_raw['len_check'] != 1]

### Text treatment

Remove string punctuation, lowecase, tokenize and remove stopwords

In [3125]:
train_raw['texto_clean'] = train_raw['texto'].str.replace('[^\w\s]',' ')

In [3126]:
train_raw['texto_clean'] = train_raw['texto_clean'].apply(lambda x: unidecode.unidecode(x))

In [3127]:
train_raw['texto_clean'] = train_raw['texto_clean'].str.split().map(lambda x: ' '.join([w for w in x if w not in swords]))

In [3128]:
train_raw['tokens'] = train_raw['texto_clean'].map(lambda x: word_tokenize(x))

In [3129]:
train_raw.head()

Unnamed: 0,doc_id,texto,abrev,offse,abrev_index,abrev_len,offse_len,len_check,texto_clean,tokens
0,S1130-05582012000300005-1,Acude a nuestras consultas a un paciente que p...,"[RM, PAAF]","[(789, 791), (1006, 1010)]","[24, 31]",2,2,0,Acude consultas paciente presenta tumoracion c...,"[Acude, consultas, paciente, presenta, tumorac..."
1,S0212-71992005000400009-1,"Se trataba de un varón de 27 años de edad, que...","[mm, mm, mg, mg, mg, mg, LDH, UI, GOT, UI, GPT...","[(1056, 1059), (1079, 1082), (1247, 1250), (12...","[36, 37, 44, 47, 48, 49, 50, 51, 52, 53, 54, 5...",34,34,0,Se trataba varon 27 anos edad habia sufrido ne...,"[Se, trataba, varon, 27, anos, edad, habia, su..."
2,S0004-06142008000700015-2,Varón de 33 años fumador de un paquete de ciga...,[],[],[],0,0,0,Varon 33 anos fumador paquete cigarrillos dia ...,"[Varon, 33, anos, fumador, paquete, cigarrillo..."
3,S0210-56912006000800008-1,"Hombre de 42 años, bebedor de más de 100 g de ...","[g, mg, dl, dl, AST, ALT, GGT, UI, dl, TAC, mg...","[(40, 43), (654, 657), (670, 673), (697, 700),...","[3, 23, 24, 25, 26, 27, 28, 29, 30, 37, 64, 65...",22,22,0,Hombre 42 anos bebedor mas 100 g etanol dia an...,"[Hombre, 42, anos, bebedor, mas, 100, g, etano..."
4,S0376-78922009000300010-1,Paciente de 18 años de edad que 5 meses antes ...,[],[],[],0,0,0,Paciente 18 anos edad 5 meses habia sido victi...,"[Paciente, 18, anos, edad, 5, meses, habia, si..."


### Get abreviations and offsets

Remove punctuation from abreviations

In [3130]:
# for i in punctuation:
#     train_raw['abrev'] = train_raw[train_raw['abrev'].notnull()]['abrev'].apply(lambda x: [a.replace(i,"") for a in x])

In [3131]:
train_raw['zip'] = train_raw.apply(lambda row: list(zip(row['abrev'], row['offse'])), axis = 1)

In [3132]:
train_raw.head()

Unnamed: 0,doc_id,texto,abrev,offse,abrev_index,abrev_len,offse_len,len_check,texto_clean,tokens,zip
0,S1130-05582012000300005-1,Acude a nuestras consultas a un paciente que p...,"[RM, PAAF]","[(789, 791), (1006, 1010)]","[24, 31]",2,2,0,Acude consultas paciente presenta tumoracion c...,"[Acude, consultas, paciente, presenta, tumorac...","[(RM, (789, 791)), (PAAF, (1006, 1010))]"
1,S0212-71992005000400009-1,"Se trataba de un varón de 27 años de edad, que...","[mm, mm, mg, mg, mg, mg, LDH, UI, GOT, UI, GPT...","[(1056, 1059), (1079, 1082), (1247, 1250), (12...","[36, 37, 44, 47, 48, 49, 50, 51, 52, 53, 54, 5...",34,34,0,Se trataba varon 27 anos edad habia sufrido ne...,"[Se, trataba, varon, 27, anos, edad, habia, su...","[(mm, (1056, 1059)), (mm, (1079, 1082)), (mg, ..."
2,S0004-06142008000700015-2,Varón de 33 años fumador de un paquete de ciga...,[],[],[],0,0,0,Varon 33 anos fumador paquete cigarrillos dia ...,"[Varon, 33, anos, fumador, paquete, cigarrillo...",[]
3,S0210-56912006000800008-1,"Hombre de 42 años, bebedor de más de 100 g de ...","[g, mg, dl, dl, AST, ALT, GGT, UI, dl, TAC, mg...","[(40, 43), (654, 657), (670, 673), (697, 700),...","[3, 23, 24, 25, 26, 27, 28, 29, 30, 37, 64, 65...",22,22,0,Hombre 42 anos bebedor mas 100 g etanol dia an...,"[Hombre, 42, anos, bebedor, mas, 100, g, etano...","[(g, (40, 43)), (mg, (654, 657)), (dl, (670, 6..."
4,S0376-78922009000300010-1,Paciente de 18 años de edad que 5 meses antes ...,[],[],[],0,0,0,Paciente 18 anos edad 5 meses habia sido victi...,"[Paciente, 18, anos, edad, 5, meses, habia, si...",[]


### Get one row per abbreviation

Separate elements lists in different rows

In [3133]:
mine = train_raw.explode('zip')

In [3134]:
mine.shape

(3201, 11)

In [3135]:
mine[mine['zip'].isnull()].shape

(18, 11)

In [3136]:
mine = mine[['doc_id', 'texto_clean', 'tokens', 'zip']]

In [3137]:
mine = mine[mine['zip'].notnull()]

In [3138]:
mine.shape

(3183, 4)

In [3139]:
mine.head()

Unnamed: 0,doc_id,texto_clean,tokens,zip
0,S1130-05582012000300005-1,Acude consultas paciente presenta tumoracion c...,"[Acude, consultas, paciente, presenta, tumorac...","(RM, (789, 791))"
0,S1130-05582012000300005-1,Acude consultas paciente presenta tumoracion c...,"[Acude, consultas, paciente, presenta, tumorac...","(PAAF, (1006, 1010))"
1,S0212-71992005000400009-1,Se trataba varon 27 anos edad habia sufrido ne...,"[Se, trataba, varon, 27, anos, edad, habia, su...","(mm, (1056, 1059))"
1,S0212-71992005000400009-1,Se trataba varon 27 anos edad habia sufrido ne...,"[Se, trataba, varon, 27, anos, edad, habia, su...","(mm, (1079, 1082))"
1,S0212-71992005000400009-1,Se trataba varon 27 anos edad habia sufrido ne...,"[Se, trataba, varon, 27, anos, edad, habia, su...","(mg, (1247, 1250))"


### Separate SF from Offsets in different columns

In [3140]:
mine['abrev'] = mine.apply(lambda row: row['zip'][0], axis = 1)

In [3141]:
mine['offsets'] = mine.apply(lambda row: row['zip'][1], axis = 1)

In [3142]:
mine = mine[mine['abrev'] != ""]

In [3143]:
mine.head()

Unnamed: 0,doc_id,texto_clean,tokens,zip,abrev,offsets
0,S1130-05582012000300005-1,Acude consultas paciente presenta tumoracion c...,"[Acude, consultas, paciente, presenta, tumorac...","(RM, (789, 791))",RM,"(789, 791)"
0,S1130-05582012000300005-1,Acude consultas paciente presenta tumoracion c...,"[Acude, consultas, paciente, presenta, tumorac...","(PAAF, (1006, 1010))",PAAF,"(1006, 1010)"
1,S0212-71992005000400009-1,Se trataba varon 27 anos edad habia sufrido ne...,"[Se, trataba, varon, 27, anos, edad, habia, su...","(mm, (1056, 1059))",mm,"(1056, 1059)"
1,S0212-71992005000400009-1,Se trataba varon 27 anos edad habia sufrido ne...,"[Se, trataba, varon, 27, anos, edad, habia, su...","(mm, (1079, 1082))",mm,"(1079, 1082)"
1,S0212-71992005000400009-1,Se trataba varon 27 anos edad habia sufrido ne...,"[Se, trataba, varon, 27, anos, edad, habia, su...","(mg, (1247, 1250))",mg,"(1247, 1250)"


Separate offsets tuples in different columns

In [3144]:
mine[['startOffset', 'endOffset']] = pd.DataFrame(mine['offsets'].tolist(), index=mine.index) 

In [3145]:
mine = mine[['doc_id', 'texto_clean', 'tokens', 'abrev', 'startOffset', 'endOffset']]

In [3146]:
mine.head()

Unnamed: 0,doc_id,texto_clean,tokens,abrev,startOffset,endOffset
0,S1130-05582012000300005-1,Acude consultas paciente presenta tumoracion c...,"[Acude, consultas, paciente, presenta, tumorac...",RM,789,791
0,S1130-05582012000300005-1,Acude consultas paciente presenta tumoracion c...,"[Acude, consultas, paciente, presenta, tumorac...",PAAF,1006,1010
1,S0212-71992005000400009-1,Se trataba varon 27 anos edad habia sufrido ne...,"[Se, trataba, varon, 27, anos, edad, habia, su...",mm,1056,1059
1,S0212-71992005000400009-1,Se trataba varon 27 anos edad habia sufrido ne...,"[Se, trataba, varon, 27, anos, edad, habia, su...",mm,1079,1082
1,S0212-71992005000400009-1,Se trataba varon 27 anos edad habia sufrido ne...,"[Se, trataba, varon, 27, anos, edad, habia, su...",mg,1247,1250


Delete null values, and change ttype to integer for offsets columns

In [3147]:
mine.shape

(3183, 6)

In [3148]:
mine.dropna(subset=['startOffset', 'endOffset'], inplace = True)

In [3149]:
mine.shape

(3183, 6)

In [3150]:
mine['startOffset'] = mine['startOffset'].astype(int)
mine['endOffset'] = mine['endOffset'].astype(int)

In [3151]:
mine = mine[['doc_id', 'texto_clean', 'tokens', 'abrev', 'startOffset', 'endOffset']]

In [3152]:
mine.shape

(3183, 6)

In [3153]:
#mine[mine.duplicated(subset=['doc_id', 'texto_clean', 'abrev', 'startOffset', 'endOffset'], keep=False)]

In [3154]:
mine = mine.drop_duplicates(subset = ['doc_id', 'texto_clean', 'abrev', 'startOffset', 'endOffset'])

In [3155]:
mine.shape

(3183, 6)

In [3156]:
mine.head()

Unnamed: 0,doc_id,texto_clean,tokens,abrev,startOffset,endOffset
0,S1130-05582012000300005-1,Acude consultas paciente presenta tumoracion c...,"[Acude, consultas, paciente, presenta, tumorac...",RM,789,791
0,S1130-05582012000300005-1,Acude consultas paciente presenta tumoracion c...,"[Acude, consultas, paciente, presenta, tumorac...",PAAF,1006,1010
1,S0212-71992005000400009-1,Se trataba varon 27 anos edad habia sufrido ne...,"[Se, trataba, varon, 27, anos, edad, habia, su...",mm,1056,1059
1,S0212-71992005000400009-1,Se trataba varon 27 anos edad habia sufrido ne...,"[Se, trataba, varon, 27, anos, edad, habia, su...",mm,1079,1082
1,S0212-71992005000400009-1,Se trataba varon 27 anos edad habia sufrido ne...,"[Se, trataba, varon, 27, anos, edad, habia, su...",mg,1247,1250


In [3157]:
# mine['abrev'] = mine['abrev'].apply(lambda x: [a.split("/") for a in x])

Check dataframes for one text

In [3158]:
# mine[mine['doc_id'] == 'S1130-05582012000300005-1']

In [3159]:
# train_abbr[train_abbr['doc_id'] == 'S1130-05582012000300005-1']

## Search Long Formns

### Search Long Forms in the same text

**measurement units dictionary**

In [3160]:
mu_dic = {"ml":"mililitro",
"mg":"miligramo",
"g":"gramo",
"l":"litro",
"mcg":"microgramo",
"mmol":"milimol",
"ui":"Unidades Internacionales",
"miles ui":"Miles de Unidades Internacionales",
"millones ui":"Millones de Unidades Internacionales",
"ufc":"Unidades Formadoras de Colonias",
"meq":"miliequivalente",
"ng":"manogramo",
"lf":"Unidad Floculante",
"ufp":"Unidad Formadora de Placa",
"dic":"Dosis Infectante Mediana de Cultivo Celular 50% ",
"dit":"Dosis Infectante Mediana de Cultivo Tisular 50% ",
"di":"Dosis Infectante 50% ",
"mol":"Peso Molecular Gramo ",
"eq":"Peso Equivalente Gramo ",
"dosis":"Dosis",
"almh":"Almohadilla",
"amp":"Ampolla",
"anl":"Anillo",
"bar":"Barra",
"bolsa":"Bolsa",
"cap":"Capsula",
"car":"Caramelo",
"carp":"Carpula",
"cart":"Cartucho",
"com":"Comprimido",
"dia":"Dia",
"fras":"Frasco",
"fras-amp":"Frasco Ampolla ",
"grag":"Gragea",
"gora":"Hora",
"gmp":"Implante",
"jab":"Jab¢n",
"jer":"Jeringa Prellenada ",
"uL":"Microlitro",
"ovu":"Ovulo",
"parche":"Parche",
"past":"Pastilla",
"perl":"Perla",
"pil":"Pildora",
"pip":"Pipeta",
"%":"Porcentaje",
"sach":"Sachet",
"sob":"Sobre",
"sup":"Supositorio",
"tab":"Tableta",
"troc":"Trocisco",
"vial":"Vial",
"kg":"Kilogramo",
"gal":"Galon",
"sis":"Sistema Terapeutico",
"mci":"miliCuries",
"mbq":"milibequerel",
"uel":"Unidades ELISA ",
"dl":"Dosis Letal",
"u usp":"Unidades USP ",
"u":"Unidades",
"rot":"Rotacaps",
"ccid":"Dosis Infecciosa en Cultivo de Célula ",
"U":"UNIDAD",
"otros":"Otros",
"µci":"mcroCuries",
"esp":"Esporas",
"mcha":"microgramos de HA ",
"gom":"Goma",
"kiu":"Unidad Inhibidora de Calicreina ",
"mcel":"Millones de Células ",
"du":"Unidades de Antigeno D",
"dil d2":"Dil D2",
"tin.mad.":"Tintura Madre",
"dil d4":"Dil D4",
"dil d5":"Dil D5",
"dil d1":"Dil D1",
"dil d8":"Dil D8",
"dil d3":"Dil D3",
"ou":"Unidad de Opacidad ",
"mm": "milimetro",
"dm": "decimetro",
"cm": "centimetro"}

In [3161]:
mine['long_form'] = mine['abrev'].str.lower().map(mu_dic)

In [3162]:
# mine[mine['doc_id'] == 'S0212-71992005000400009-1']

In [3163]:
mine.head()

Unnamed: 0,doc_id,texto_clean,tokens,abrev,startOffset,endOffset,long_form
0,S1130-05582012000300005-1,Acude consultas paciente presenta tumoracion c...,"[Acude, consultas, paciente, presenta, tumorac...",RM,789,791,
0,S1130-05582012000300005-1,Acude consultas paciente presenta tumoracion c...,"[Acude, consultas, paciente, presenta, tumorac...",PAAF,1006,1010,
1,S0212-71992005000400009-1,Se trataba varon 27 anos edad habia sufrido ne...,"[Se, trataba, varon, 27, anos, edad, habia, su...",mm,1056,1059,milimetro
1,S0212-71992005000400009-1,Se trataba varon 27 anos edad habia sufrido ne...,"[Se, trataba, varon, 27, anos, edad, habia, su...",mm,1079,1082,milimetro
1,S0212-71992005000400009-1,Se trataba varon 27 anos edad habia sufrido ne...,"[Se, trataba, varon, 27, anos, edad, habia, su...",mg,1247,1250,miligramo


In [3164]:
# mine_lf = mine[mine['abrev'].str.len() > 1].reset_index()

### Get LF before SF

In [3165]:
# mine[mine['doc_id'] == 'S0004-06142005000900013-1'].iloc[0]['tokens']

With get_longform function, words befores SF that starts with SF letters will be detected

In [3166]:
def get_longform(tokens, acro, long):
    #acro = acro.lower()
    long_form = ''
    margin = 2
    i =0
    #Looking for before
    if pd.isna(long):
        if acro not in tokens:
            return -1
        else:
            index = tokens.index(acro)
            for word in tokens[index-margin-len(acro):index]:
                #if first letter of word is equal to first letter os acronym
                if word[0] == acro[i].lower():
                    long_form += word + ' '
                    i += 1
                    if i == len(acro):
                        break
                elif (i == 1) and (word[0] == acro[i-1].lower()):
                    long_form = word + ' '
                    i = 1
                    if i == len(acro):
                        break
            long_form = long_form.rstrip()
            #print(re.split(' |-',long_form), len(re.split(' |-',long_form)), len(acro)-1)
            if len(re.split(' |-',long_form)) == (len(acro)):
                return long_form
            elif ((len(re.split(' |-',long_form))+1) == (len(acro))) & (len(re.split(' |-',long_form)) > 1):
                return long_form
    else:
        return long


In [3167]:
def get_longform_after(tokens, acro, long):
    #acro = acro.lower()
    long_form = ''
    margin = 2
    i =0
    #Looking for before
    if pd.isna(long):
        if acro not in tokens:
            return -1
        else:
            index = tokens.index(acro)
            for word in tokens[index+1:index+margin+len(acro)]:
#                 if tokens[index+1] == '(':
#                     print
                if word[0] in acro.lower():
                    long_form += word + ' '
        long_form = long_form.rstrip()
        if len(re.split(' |-',long_form)) == (len(acro)):
            return long_form
        elif ((len(re.split(' |-',long_form))+1) == (len(acro))) & (len(re.split(' |-',long_form)) > 1):
            return long_form
    else:
        return long
    

In [3168]:
# a = mine[mine['doc_id'] == 'S0376-78922016000200011-1']

In [3169]:
# a.iloc[0]['tokens']

In [3170]:
# a.apply(lambda row: get_longform_after(row['tokens'], row['abrev'], row['long_form']), axis = 1)

In [3171]:
# mine[mine['doc_id'] == 'S1137-66272013000200023-1'].apply(lambda row: get_longform(row['tokens'], row['abrev'], row['long_form']), axis = 1)

In [3172]:
mine['long_form'] = mine.apply(lambda row: get_longform(row['tokens'], row['abrev'], row['long_form']), axis = 1)

In [3173]:
mine['long_form'] = mine.apply(lambda row: get_longform_after(row['tokens'], row['abrev'], row['long_form']), axis = 1)

In [3174]:
mine.head()

Unnamed: 0,doc_id,texto_clean,tokens,abrev,startOffset,endOffset,long_form
0,S1130-05582012000300005-1,Acude consultas paciente presenta tumoracion c...,"[Acude, consultas, paciente, presenta, tumorac...",RM,789,791,resonancia magnetica
0,S1130-05582012000300005-1,Acude consultas paciente presenta tumoracion c...,"[Acude, consultas, paciente, presenta, tumorac...",PAAF,1006,1010,puncion aspiracion aguja fina
1,S0212-71992005000400009-1,Se trataba varon 27 anos edad habia sufrido ne...,"[Se, trataba, varon, 27, anos, edad, habia, su...",mm,1056,1059,milimetro
1,S0212-71992005000400009-1,Se trataba varon 27 anos edad habia sufrido ne...,"[Se, trataba, varon, 27, anos, edad, habia, su...",mm,1079,1082,milimetro
1,S0212-71992005000400009-1,Se trataba varon 27 anos edad habia sufrido ne...,"[Se, trataba, varon, 27, anos, edad, habia, su...",mg,1247,1250,miligramo


In [3175]:
# mine[mine['doc_id'] == 'S1130-05582012000300005-1'].iloc[0]['tokens']

In [3176]:
mine.shape

(3183, 7)

### Number of SF and LF in tetxs

**Short Forms**

In [3177]:
mine['abrev'].nunique()

695

In [3178]:
pd.options.display.max_rows = 1000

In [3179]:
# mine['abrev'].value_counts().sort_index()

Delete rows with SF that are just numbers or number + letter

In [3180]:
mine[(mine['abrev'].str.len() <= 2) & (~mine['abrev'].str.isalpha())| (mine['abrev'].str.len() == 1)]['abrev'].unique()

array(['g', '5U', '70', 'f', 'q', '24', '10', 'h', 'd', 'u', 'l', '1a',
       '8h', '45', '90', '80', '67', 'm', '13', '07', '12', '7g', '16',
       '34', 'm2', '68', '50', 'i', '20', '83', '4g', '88', '09', '05',
       '85', '31', '60', '1b', '40', '71', '32', '76', '15', '3g', '74',
       '02', '18', '75', 'C4', '99', '03', '11', '1g', '01', '26', '23',
       '08'], dtype=object)

In [3181]:
mine.shape

(3183, 7)

In [3182]:
mine[~((mine['abrev'].str.len() <= 2) & (~mine['abrev'].str.isalpha())| (mine['abrev'].str.len() == 1))].shape

(3020, 7)

In [3183]:
mine = mine[~((mine['abrev'].str.len() <= 2) & (~mine['abrev'].str.isalpha())| (mine['abrev'].str.len() == 1))]

In [3184]:
# mine['abrev'].value_counts().sort_index()

**Long Forms**

In [3185]:
mine['long_form'].nunique()

165

In [3186]:
# mine['long_form'].unique().tolist()

Study -1 values

In [3187]:
mine[mine['long_form'] == -1].shape

(228, 7)

In [3188]:
mine[mine['long_form'] == -1]['abrev'].nunique()

112

Delete blank space from some letter in SF

In [3189]:
# mine[mine['long_form'] == '']['abrev'].unique()

In [3190]:
# mine[mine['long_form'] == ''].shape

In [3191]:
# mine = mine[mine['long_form'] != '']

### Study LF founded or not founded

Study SF without LF in the running text

In [3192]:
mine[mine['long_form'] == -1].shape

(228, 7)

In [3193]:
mine[mine['long_form'].isnull()].shape

(1437, 7)

Check how many texts don't have LF for the SF in the same text

In [3194]:
mine[(mine['abrev'].notnull()) & (mine['long_form'].isnull())].shape

(1437, 7)

In [3195]:
lf_null = mine[(mine['abrev'].notnull()) & (mine['long_form'].isnull())].shape[0]
lf_null

1437

In [3196]:
print(f"LF has not be found in the same text where the SF is in {lf_null/mine.shape[0]*100: .2f}% of texts")

LF has not be found in the same text where the SF is in  47.58% of texts


In [3197]:
mine[mine['long_form'].isnull()]['abrev'].nunique()

468

In [3198]:
sorted(mine[mine['long_form'].isnull()]['abrev'].unique().tolist())

['AA',
 'AAS',
 'AB',
 'ABI',
 'ABVD',
 'ACCS',
 'ADA',
 'ADR',
 'AE',
 'AFG',
 'AFP',
 'AGF',
 'AI',
 'AINES',
 'AL',
 'ALAT',
 'ALK',
 'ALP',
 'ALT',
 'AMA',
 'ANA',
 'ANAS',
 'ANCA',
 'ANCAS',
 'AOC',
 'AP',
 'AR',
 'ASAT',
 'ASIA',
 'ASLO',
 'ASPEN',
 'AST',
 'AT',
 'ATA',
 'ATS',
 'AUC',
 'AV',
 'AVK',
 'AVSC',
 'BAAR',
 'BACTEC',
 'BAL',
 'BAS',
 'BAV',
 'BCG',
 'BD',
 'BEP',
 'BH',
 'BIODISK',
 'BMC',
 'BMU',
 'BP',
 'BPA',
 'BQ',
 'BT',
 'BUN',
 'CA',
 'CBMF',
 'CCR',
 'CD',
 'CDDP',
 'CEA',
 'CEC',
 'CHCM',
 'CHOP',
 'CK',
 'CKPAN',
 'CMHG',
 'CMI',
 'CMV',
 'CO',
 'COL',
 'COPP',
 'CPAP',
 'CPK',
 'CRE',
 'CT',
 'CTX',
 'CU',
 'CV',
 'CX',
 'CellCe',
 'DAI',
 'DAKO',
 'DFVA',
 'DG',
 'DHAP',
 'DIEP',
 'DII',
 'DIII',
 'DMNID',
 'DR',
 'DTIC',
 'EA',
 'EB',
 'EBHGA',
 'EC',
 'ECA',
 'ECE',
 'ECG',
 'ECO',
 'ECOT',
 'ECS',
 'EE',
 'EEG',
 'EEII',
 'EGO',
 'EIA',
 'EIAI',
 'EID',
 'EII',
 'EKG',
 'ELA',
 'ELISA',
 'EMA',
 'EMG',
 'EMLA',
 'ENA',
 'ENG',
 'EPOC',
 'EPPMA',
 'EPR'

## Create dictionary with SF and LF pairs founded in the same text

In [3199]:
df_pairs = mine[(mine['abrev'].notnull()) & (mine['long_form'].notnull())]

In [3200]:
df_pairs.reset_index(drop = True, inplace = True)

In [3201]:
df_pairs.head()

Unnamed: 0,doc_id,texto_clean,tokens,abrev,startOffset,endOffset,long_form
0,S1130-05582012000300005-1,Acude consultas paciente presenta tumoracion c...,"[Acude, consultas, paciente, presenta, tumorac...",RM,789,791,resonancia magnetica
1,S1130-05582012000300005-1,Acude consultas paciente presenta tumoracion c...,"[Acude, consultas, paciente, presenta, tumorac...",PAAF,1006,1010,puncion aspiracion aguja fina
2,S0212-71992005000400009-1,Se trataba varon 27 anos edad habia sufrido ne...,"[Se, trataba, varon, 27, anos, edad, habia, su...",mm,1056,1059,milimetro
3,S0212-71992005000400009-1,Se trataba varon 27 anos edad habia sufrido ne...,"[Se, trataba, varon, 27, anos, edad, habia, su...",mm,1079,1082,milimetro
4,S0212-71992005000400009-1,Se trataba varon 27 anos edad habia sufrido ne...,"[Se, trataba, varon, 27, anos, edad, habia, su...",mg,1247,1250,miligramo


In [3202]:
pairs_dic = {}
for index, row in df_pairs.iterrows():
    if (row['long_form'] != -1) & (row['long_form'] != None):
        if not row['abrev'] in pairs_dic:
            pairs_dic[row['abrev']] = set()
        pairs_dic[row['abrev']].add(row['long_form'])

In [3204]:
pairs_dic

{'RM': {'mostrando region', 'resonancia magnetica'},
 'PAAF': {'puncion aspiracion aguja fina'},
 'mm': {'milimetro'},
 'mg': {'miligramo'},
 'UI': {'Unidades Internacionales'},
 'VIH': {'virus hepatitis'},
 'kg': {'Kilogramo'},
 'dl': {'Dosis Letal'},
 'TAC': {'abdominal confirmandose trombosis',
  'abdominal tumor',
  'abdominales ascitis',
  'abdomino confirmaron',
  'abdomino confirmo',
  'abdomino contraste',
  'apreciar adyacente',
  'aumento atenuacion',
  'axial computerizada abdominal',
  'cervical abdominopelvico',
  'columna abdominal',
  'contrastada confirmaron',
  'contraste tejido',
  'cortes axiales craneocaudales',
  'craneal aprecian anormalidades',
  'craneal contraste',
  'craneal toracico abdominal',
  'tomografia axial computadorizada',
  'tomografia axial computarizada',
  'tomografia axial computerizada',
  'toracica afectacion',
  'toracico abdominal',
  'toraco abdominal',
  'toraco abdomino'},
 'ng': {'manogramo'},
 'APC': {'argon plasma'},
 'ml': {'mililitro

In [328]:
sorted(pairs_dic.keys())

['12',
 'A-P',
 'AA',
 'AAS',
 'AAT',
 'ACL',
 'AEO',
 'AFG',
 'AFP',
 'AGF',
 'AI',
 'AINES',
 'AL',
 'ALAT',
 'ALT',
 'AMA',
 'ANA',
 'ANCA',
 'ANCAS',
 'ANOES',
 'AO',
 'AP',
 'APC',
 'ASAT',
 'ASLO',
 'AST',
 'ATM',
 'AV',
 'AVI',
 'AVSC',
 'Alfa-feto',
 'BA',
 'BAL',
 'BAS',
 'BD',
 'BM',
 'BMC',
 'BMU',
 'BPA',
 'C-GSF',
 'C-Kit',
 'CA',
 'CD',
 'CDI',
 'CEA',
 'CK',
 'CMV',
 'COL',
 'CPK',
 'CT',
 'CU',
 'CX',
 'D-AAT',
 'DA',
 'DAI',
 'DAKO',
 'DFVA',
 'DG',
 'DII',
 'DM',
 'DP',
 'DPAR',
 'DR',
 'DRNS',
 'DTM',
 'EA',
 'EBHGA',
 'ECE',
 'ECG',
 'ECMO',
 'ECS',
 'EEII',
 'EF',
 'EFK',
 'EIA',
 'EID',
 'ELA',
 'EMA',
 'EMG',
 'ENA',
 'ENG',
 'EPOC',
 'EPR',
 'ESCHAP',
 'EVA',
 'FA',
 'FAV',
 'FID',
 'FMO',
 'FO',
 'FiO2',
 'GC',
 'GER',
 'GGT',
 'GOT',
 'GPT',
 'HBPM',
 'HCT',
 'HLA',
 'HPTS',
 'HPV',
 'HQR',
 'HSA',
 'HTA',
 'HTBI',
 'IAM',
 'ICG',
 'IF',
 'IGRA',
 'INSS',
 'IgA',
 'IgE',
 'IgG',
 'IgG4',
 'IgM',
 'LBA',
 'LCCT',
 'LCR',
 'LDH',
 'LH',
 'LLA',
 'LLBCP',
 'LLI',

In [2574]:
len(pairs_dic.keys())

94

## Check text individually

In [745]:
# texto = train_raw.iloc[4]['texto']
# texto

In [2576]:
texto = train_raw[train_raw['doc_id'] == 'S0376-78922016000200011-1'].iloc[0]['texto']
texto

'Mujer de raza mestiza de 43 años de edad, con antecedente de mamoplastia de aumento realizada en junio de 2009: incisión periareolar, disección transglandular, bolsillo subfascial, colocación de implantes anatómicos de gel cohesivo texturizado de 290 cc, (McGhan® Medical Corporation, Santa Bárbara, California, EE.UU.) En la misma intervención se le realizó también abdominoplastia y liposucción de la zona baja de la espalda. Se colocaron drenajes aspirativos que se retiraron a las 24 horas de la intervención. El postoperatorio cursó sin incidencias y el resultado fue satisfactorio.\nAl año del procedimiento, la paciente presentó molestias y edema en la mama derecha que cedieron con antinflamatorios no esteroideos a dosis de 120 mg por día durante 7 días. La evolución posterior fue satisfactoria hasta el año 2015, es decir 5 años después de esos síntomas y 6 tras la intervención, cuando la paciente nuevamente presentó molestias y edema en la mama derecha, así como aparición de galactorr

In [2577]:
sf = generic_re.findall(texto)

In [2578]:
sf_clean =  [i.strip() for i in sf if i.strip() not in swords]
# sf_clean

In [2579]:
sf_clean = [a.replace('/',"") for a in sf_clean]

In [2580]:
# for i in punctuation:
#     sf_clean = [a.replace(i,"") for a in sf_clean]

In [2581]:
sf_clean

['McGhan',
 'EE',
 'UU',
 'mg',
 'RM',
 'cc',
 'RM',
 'LACG',
 'CD',
 'CD',
 'CD',
 'EMA',
 'CKAE',
 'AE',
 'CD',
 'ALK',
 'PET']

In [2582]:
texto_clean = texto.split()

In [2583]:
texto_clean = ' '.join([w for w in texto_clean if w not in swords])

In [2584]:
texto_clean = texto_clean.replace('[^\w\s]',' ').lower()

In [2585]:
# df['column'] = df['column'].apply(remove_accents)
texto_clean = unidecode.unidecode(texto_clean)

In [2591]:
tokens = word_tokenize(texto_clean)
tokens

['mujer',
 'raza',
 'mestiza',
 '43',
 'anos',
 'edad',
 ',',
 'antecedente',
 'mamoplastia',
 'aumento',
 'realizada',
 'junio',
 '2009',
 ':',
 'incision',
 'periareolar',
 ',',
 'diseccion',
 'transglandular',
 ',',
 'bolsillo',
 'subfascial',
 ',',
 'colocacion',
 'implantes',
 'anatomicos',
 'gel',
 'cohesivo',
 'texturizado',
 '290',
 'cc',
 ',',
 '(',
 'mcghan',
 '(',
 'r',
 ')',
 'medical',
 'corporation',
 ',',
 'santa',
 'barbara',
 ',',
 'california',
 ',',
 'ee.uu',
 '.',
 ')',
 'en',
 'misma',
 'intervencion',
 'realizo',
 'abdominoplastia',
 'liposuccion',
 'zona',
 'baja',
 'espalda',
 '.',
 'se',
 'colocaron',
 'drenajes',
 'aspirativos',
 'retiraron',
 '24',
 'horas',
 'intervencion',
 '.',
 'el',
 'postoperatorio',
 'curso',
 'incidencias',
 'resultado',
 'satisfactorio',
 '.',
 'al',
 'ano',
 'procedimiento',
 ',',
 'paciente',
 'presento',
 'molestias',
 'edema',
 'mama',
 'derecha',
 'cedieron',
 'antinflamatorios',
 'esteroideos',
 'dosis',
 '120',
 'mg',
 'dia',


In [2592]:
def get_longform(tokens, acro):
    acro = acro.lower()
    long_form = ''
    margin = 2
    i =0
    #Looking for before
    if acro not in tokens:
        return -1
    index = tokens.index(acro)
    for word in tokens[index-margin-len(acro):index]:
        #if first letter of word is equal to first letter os acronym
        if word[0] == acro[i].lower():
            long_form += word + ' '
            i += 1
            if i == len(acro):
                break
        elif (i == 1) and (word[0] == acro[i-1].lower()):
            long_form = word + ' '
            i = 1
            if i == len(acro):
                break 
    long_form = long_form.rstrip()
    return long_form
    for word in tokens[index:index+margin+len(acro)]:
        print(tokens[index:index+margin+len(acro)])
#     if len(re.split(' |-',long_form)) == (len(acro)):
#         return long_form
    #elif (len(re.split(' |-',long_form)) == (len(acro)-1)) & 



In [2680]:
def get_longform_after(tokens, acro):
    acro = acro.lower()
    long_form = ''
    margin = 2
    i =0
    #Looking for before
    if acro not in tokens:
        return -1
    index = tokens.index(acro)
    for word in tokens[index+1:index+margin+len(acro)]:
        if tokens[index+1] == '(':
            if word[0] in acro:
                long_form += word + ' '
    long_form = long_form.rstrip()
    return long_form


In [2681]:
for sf in sf_clean:
    a = get_longform_after(tokens, sf)
    print(f"Esta es la short form y su forma larga\n ({sf},{a})")

Esta es la short form y su forma larga
 (McGhan,medical corporation)
Esta es la short form y su forma larga
 (EE,-1)
Esta es la short form y su forma larga
 (UU,-1)
Esta es la short form y su forma larga
 (mg,)
Esta es la short form y su forma larga
 (RM,)
Esta es la short form y su forma larga
 (cc,)
Esta es la short form y su forma larga
 (RM,)
Esta es la short form y su forma larga
 (LACG,)
Esta es la short form y su forma larga
 (CD,)
Esta es la short form y su forma larga
 (CD,)
Esta es la short form y su forma larga
 (CD,)
Esta es la short form y su forma larga
 (EMA,antigeno membrana epitelial)
Esta es la short form y su forma larga
 (CKAE,-1)
Esta es la short form y su forma larga
 (AE,-1)
Esta es la short form y su forma larga
 (CD,)
Esta es la short form y su forma larga
 (ALK,kinasa linfocitos anaplasicos)
Esta es la short form y su forma larga
 (PET,)


# Study text

In [2214]:
train_raw[train_raw['doc_id'] == 'S0211-69952016000300015-1'].iloc[0]['texto']

254    Presentamos el caso de un varón de 47 años de ...
Name: texto, dtype: object

In [1976]:
mine[mine['doc_id'] == 'S1130-05582010000200004-1'].iloc[0]['tokens']

['Mujer',
 '70',
 'años',
 'edad',
 'acudió',
 'hospital',
 'referencia',
 'tumoración',
 'cara',
 'lateral',
 'derecha',
 'lengua',
 'estadio',
 'clínico',
 'T2N0M0',
 'La',
 'biopsia',
 'informó',
 'carcinoma',
 'epidermoide',
 'La',
 'modalidad',
 'terapéutica',
 'decidida',
 'hospital',
 'radioterapia',
 'externa',
 'dividida',
 '30',
 'fracciones',
 '2',
 'Gyfracción',
 'total',
 '60',
 'Gy',
 'ambos',
 'campos',
 'cervicales',
 'El',
 'tratamiento',
 'completó',
 'braquiterapia',
 'cuatro',
 'agujas',
 'Iridium',
 '196',
 'total',
 '50',
 'Gy',
 'La',
 'enfermedad',
 'remitió',
 'completo',
 'paciente',
 'libre',
 'misma',
 'año',
 'momento',
 'acudió',
 'nuevo',
 'especialista',
 'presentar',
 'síntomas',
 'inflamatorios',
 'zona',
 'radiada',
 'dolor',
 'fístula',
 'ángulo',
 'mandibular',
 'derecho',
 'La',
 'exploración',
 'física',
 'imagen',
 'ortopantomografía',
 'diagnósticas',
 'ORN',
 'cuerpo',
 'mandibular',
 'derecho',
 'decidió',
 'remitir',
 'paciente',
 'Servicio',