# Libraries

In [1]:
import pandas as pd
import numpy as np
import os
import re
import collections
import unidecode
import nltk
from nltk.corpus import stopwords
import itertools 
from nltk.tokenize import word_tokenize
from string import punctuation
from functools import reduce
import seaborn as sns
from abbreviations import schwartz_hearst

In [2]:
pd.set_option('display.max_colwidth', 100)

In [3]:
%matplotlib inline
from matplotlib import pyplot as plt

# Functions

In [4]:
def read_texts(path):
    """
    Load every ``.txt`` file found directly inside ``path`` into a DataFrame.

    Parameters
    ----------
    path : str
        Directory holding the text files. A trailing separator is optional.

    Returns
    -------
    pandas.DataFrame
        One row per file, with columns ``nombre`` (file name without its
        final ``.txt`` extension) and ``texto`` (file content decoded as
        UTF-8). Both columns are present even when no file matches.
    """
    suffix = '.txt'
    data = []

    for name in os.listdir(path):
        if name.endswith(suffix):
            # os.path.join works whether or not `path` ends with a separator.
            with open(os.path.join(path, name), encoding="utf8") as f:
                # Strip only the trailing extension, not every '.txt' in the name.
                data.append({'nombre': name[:-len(suffix)], 'texto': f.read()})

    # Explicit columns keep the schema stable for an empty directory.
    return pd.DataFrame(data, columns=['nombre', 'texto'])

In [5]:
# Fetch the NLTK stop-word corpus (the logged output shows it is skipped when already up to date).
nltk.download('stopwords')
# De-duplicated Spanish stop words; read as a global by clean_text.
swords = list(set(stopwords.words('spanish')))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/egarcia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
def clean_text(string):
    """
    Normalise a text: strip punctuation, transliterate to ASCII and drop
    stop words.

    Every character of ``string.punctuation`` is deleted, except '/', which
    is turned into a space. The result is passed through ``unidecode``,
    tokens found in the module-level ``swords`` list are removed, and
    whitespace is collapsed to single spaces. Letter case is left untouched.
    """
    # '/' -> ' ', any other punctuation mark -> removed.
    table = {ord(mark): (' ' if mark == '/' else None) for mark in punctuation}
    ascii_text = unidecode.unidecode(string.translate(table))

    # Stop-word filtering runs after transliteration and is case-sensitive.
    kept = [token for token in ascii_text.split() if token not in swords]

    # Single spaces between tokens, nothing at either end.
    return re.sub(r'\s+', ' ', ' '.join(kept)).strip()

In [7]:
def distance_levenshtein(str1, str2):
    """
    Levenshtein edit distance between ``str1`` and ``str2``.

    Counts the minimum number of single-character insertions, deletions and
    substitutions needed to turn one string into the other. Returns an int.
    """
    # Distances from the empty prefix of str1 to every prefix of str2.
    previous = list(range(len(str2) + 1))

    for row, char1 in enumerate(str1, start=1):
        # First cell: deleting the first `row` characters of str1.
        current = [row]
        for col, char2 in enumerate(str2, start=1):
            substitution = previous[col - 1] + (0 if char1 == char2 else 1)
            current.append(min(current[col - 1] + 1, previous[col] + 1, substitution))
        previous = current

    return previous[len(str2)]

In [8]:
def normalize_lf(row):
    """
    Build a substitution map that collapses near-identical strings in ``row``.

    Two distinct strings count as variants of each other when their
    Levenshtein distance divided by the length of the longer one is below 0.2.

    Parameters
    ----------
    row : sequence of str
        Candidate strings; iterated twice in nested loops (presumably the
        long forms collected for one abbreviation — TODO confirm with caller).

    Returns
    -------
    dict or None
        ``{variant: canonical}`` for every string that has at least one close
        variant, where ``canonical`` is the variant with the largest
        ``long_form`` value looked up in the global ``frec``. ``None`` when no
        pair of strings is close enough.

    Notes
    -----
    Relies on a global ``frec`` that is not defined in this function; it is
    queried as ``frec[frec['index'] == s]['long_form']`` (presumably a
    frequency table — verify). Every close string in ``row`` is mapped to one
    single canonical form, even when the close pairs are unrelated to each
    other.
    """
    leven2 = []
    # Compare every ordered pair; the symmetric loop adds both members of a close pair.
    for i in row:
        for j in row:
            if i != j:
                # Normalise by the longer string so the threshold is length-independent.
                long = max(len(i),len(j))
                ratio = distance_levenshtein(i,j)/long
                if ratio < 0.2:
                    leven2.append(j)
    if leven2:
        leven2 = set(leven2)
        lista = []
        for i in leven2:
            # First matching row of `frec` supplies the ranking value for this string.
            val = frec[frec['index'] == i]['long_form'].iloc[0]
            lista.append((i, val))
        lista = set(lista)
        # Highest value wins; ties depend on set iteration order.
        most_freq = sorted(set(lista), key=lambda x: x[1], reverse = True)[0][0]
        sust = {}
        for i in set(leven2):
            sust[i] = most_freq
        
        return sust
    else:
        # No close pair found: the function implicitly returns None.
        pass
    

In [9]:
def get_label(row):
    """
    Binary label for a merged row: 1 when ``long_form_x`` equals
    ``long_form_y``, 0 otherwise.
    """
    left, right = row['long_form_x'], row['long_form_y']
    return 1 if left == right else 0

In [12]:
def offsetA(row):
    """
    Start offset of the first occurrence of ``row['Mention_A']`` inside
    ``row['texto']``; -1 when the mention is absent (``str.find`` semantics).
    """
    text, mention = row['texto'], row['Mention_A']
    return text.find(mention)
    
def offsetB(row):
    """
    Start offset of the first occurrence of ``row['Mention_B']`` inside
    ``row['texto']``; -1 when the mention is absent (``str.find`` semantics).
    """
    text, mention = row['texto'], row['Mention_B']
    return text.find(mention)

def offsetB_end(row):
    """
    End offset (exclusive) of the first occurrence of ``row['Mention_B']``
    inside ``row['texto']``.

    Returns -1 when the mention does not occur in the text, the same
    sentinel ``str.find`` uses for the start offset, instead of a
    plausible-looking ``len(mention) - 1``.
    """
    mention = row['Mention_B']
    start = row['texto'].find(mention)
    if start < 0:
        # Not found: do not fabricate an end offset from the -1 sentinel.
        return -1
    return start + len(mention)

def offsetA_end(row):
    """
    End offset (exclusive) of mention A: ``row['Mention_A_StartOffset']``
    plus the length of ``row['Mention_A']``.

    A negative start offset is the "not found" sentinel produced by
    ``str.find``; in that case -1 is returned rather than a
    plausible-looking ``len(mention) - 1``.
    """
    start = row['Mention_A_StartOffset']
    if start < 0:
        return -1
    return start + len(row['Mention_A'])

# Load Data

In [126]:
# Relations for the testing set (tab-separated).
testing_rel = pd.read_csv("../../acronym_disambiguation_tfm/data/ibereval_data/testing_set/clinical_cases.relations.testing_set.tsv", sep = '\t')

In [127]:
# Raw testing-set documents: one row per .txt file ('nombre', 'texto').
testing_raw = read_texts("../../acronym_disambiguation_tfm/data/ibereval_data/testing_set/testing_set.raw_text/")

In [128]:
# Relations for the training set (tab-separated).
# NOTE(review): 'trainning_set' is spelled this way in the path — presumably it matches the directory on disk; verify.
train_rel = pd.read_csv("../../acronym_disambiguation_tfm/data/ibereval_data/trainning_set/clinical_cases.relations.training_set.tsv", sep = '\t')

In [129]:
# Raw training-set documents: one row per .txt file ('nombre', 'texto').
train_raw = read_texts("../../acronym_disambiguation_tfm/data/ibereval_data/trainning_set/training_set.raw_text/")

## Transform data

### Transform gold standard

#### Train

In [130]:
# Move the index into regular columns. The preview of the result shows
# level_0..level_2 columns, i.e. read_csv built a three-level index
# (presumably because the header has fewer fields than the data rows — verify).
# NOTE: not idempotent — re-running this cell alone adds another index column.
train_rel = train_rel.reset_index()

In [131]:
train_rel.head(2)

Unnamed: 0,level_0,level_1,level_2,# Document_ID,StartOffset,EndOffset,Abbreviation,Definition,Definition_lemmatized
0,S1130-01082009000400014-1,SHORT_FORM,476,NPT,SHORT-LONG,LONG_FORM,454.0,474.0,nutrición parenteral
1,S1130-63432016000100009-1,SHORT_FORM,614,NIHSS,SHORT-LONG,LONG_FORM,621.0,662.0,National Institute of Health Stroke Scale


In [132]:
# The header names are shifted relative to the values (see the preview),
# so all nine columns are renamed to what they actually contain.
train_rel.columns = ['# Document_ID', 'Mention_A_type', 'Mention_A_StartOffset',
      'Mention_A', 'Relation_type', 'Mention_B_type',
       'Mention_B_StartOffset', 'Mention_B_EndOffset', 'Mention_B']

In [133]:
# Drop rows whose start-offset field holds the literal 'EndOffset'
# (presumably leftover header lines — verify against the TSV).
# .copy() makes the result an independent frame, so the column assignments
# that follow do not act on a slice (avoids SettingWithCopyWarning).
train_rel = train_rel[train_rel['Mention_A_StartOffset'] != 'EndOffset'].copy()

In [134]:
# Offsets become integers (the raw preview shows the Mention_B offsets as floats, e.g. 454.0).
train_rel['Mention_A_StartOffset'] = train_rel['Mention_A_StartOffset'].astype(int)
train_rel['Mention_B_StartOffset'] = train_rel['Mention_B_StartOffset'].astype(int)
train_rel['Mention_B_EndOffset'] = train_rel['Mention_B_EndOffset'].astype(int)

In [135]:
# The source has no end offset for mention A; derive it row by row with offsetA_end.
train_rel['Mention_A_EndOffset'] = train_rel.apply(offsetA_end, axis = 1)

In [136]:
# Reorder the columns so the new end offset of mention A sits right after its start offset.
train_rel = train_rel[['# Document_ID', 'Mention_A_type', 'Mention_A_StartOffset','Mention_A_EndOffset', 'Mention_A',
       'Relation_type', 'Mention_B_type', 'Mention_B_StartOffset',
       'Mention_B_EndOffset', 'Mention_B']]

In [137]:
train_rel.head(2)

Unnamed: 0,# Document_ID,Mention_A_type,Mention_A_StartOffset,Mention_A_EndOffset,Mention_A,Relation_type,Mention_B_type,Mention_B_StartOffset,Mention_B_EndOffset,Mention_B
0,S1130-01082009000400014-1,SHORT_FORM,476,479,NPT,SHORT-LONG,LONG_FORM,454,474,nutrición parenteral
1,S1130-63432016000100009-1,SHORT_FORM,614,619,NIHSS,SHORT-LONG,LONG_FORM,621,662,National Institute of Health Stroke Scale


#### Test

In [139]:
# Move the index into regular columns. The preview of the result shows
# level_0..level_2 columns, i.e. read_csv built a three-level index
# (presumably because the header has fewer fields than the data rows — verify).
# NOTE: not idempotent — re-running this cell alone adds another index column.
testing_rel = testing_rel.reset_index()

In [140]:
testing_rel.head(2)

Unnamed: 0,level_0,level_1,level_2,# Document_ID,StartOffset,EndOffset,Abbreviation,Definition,Definition_lemmatized
0,S0211-69952013000500019-1,SHORT_FORM,3739,ARA II,SHORT-LONG,LONG_FORM,3695,3737,antagonista del receptor de angiotesina II
1,S0211-69952013000500019-1,SHORT_FORM,2793,PCR,SHORT-LONG,LONG_FORM,2798,2837,reacción en cadena de enzima polimerasa


In [141]:
# The header names are shifted relative to the values (see the preview),
# so all nine columns are renamed to what they actually contain.
testing_rel.columns = ['# Document_ID', 'Mention_A_type', 'Mention_A_StartOffset',
      'Mention_A', 'Relation_type', 'Mention_B_type',
       'Mention_B_StartOffset', 'Mention_B_EndOffset', 'Mention_B']

In [142]:
# Drop rows whose start-offset field holds the literal 'EndOffset'
# (presumably leftover header lines — verify against the TSV).
# .copy() makes the result an independent frame, so the column assignments
# that follow do not act on a slice (avoids SettingWithCopyWarning).
testing_rel = testing_rel[testing_rel['Mention_A_StartOffset'] != 'EndOffset'].copy()

In [143]:
# Cast the three offset columns to integers.
testing_rel['Mention_A_StartOffset'] = testing_rel['Mention_A_StartOffset'].astype(int)
testing_rel['Mention_B_StartOffset'] = testing_rel['Mention_B_StartOffset'].astype(int)
testing_rel['Mention_B_EndOffset'] = testing_rel['Mention_B_EndOffset'].astype(int)

In [144]:
# The source has no end offset for mention A; derive it row by row with offsetA_end.
testing_rel['Mention_A_EndOffset'] = testing_rel.apply(offsetA_end, axis = 1)

In [145]:
# Reorder the columns so the new end offset of mention A sits right after its start offset.
testing_rel = testing_rel[['# Document_ID', 'Mention_A_type', 'Mention_A_StartOffset','Mention_A_EndOffset', 'Mention_A',
       'Relation_type', 'Mention_B_type', 'Mention_B_StartOffset',
       'Mention_B_EndOffset', 'Mention_B']]

In [146]:
testing_rel.head(2)

Unnamed: 0,# Document_ID,Mention_A_type,Mention_A_StartOffset,Mention_A_EndOffset,Mention_A,Relation_type,Mention_B_type,Mention_B_StartOffset,Mention_B_EndOffset,Mention_B
0,S0211-69952013000500019-1,SHORT_FORM,3739,3745,ARA II,SHORT-LONG,LONG_FORM,3695,3737,antagonista del receptor de angiotesina II
1,S0211-69952013000500019-1,SHORT_FORM,2793,2796,PCR,SHORT-LONG,LONG_FORM,2798,2837,reacción en cadena de enzima polimerasa


## Acronym extraction (Schwartz-Hearst)

#### Train

In [147]:
# Work on a copy so the raw training texts stay untouched.
train_predict = train_raw.copy()

In [148]:
train_predict.head(2)

Unnamed: 0,nombre,texto
0,S1130-01082006000100014-1,"Se trata de una mujer de 35 años, con antecedentes familiares de enfermedad de Crohn y sin antec..."
1,S1130-01082009000300015-1,"Varón de 70 años, fumador, con enfisema pulmonar y vitíligo al que en mayo de 2001 se realizó un..."


In [149]:
# Run Schwartz-Hearst abbreviation/definition extraction on every document text.
train_predict['predictions'] = train_predict['texto'].apply(
    lambda text: schwartz_hearst.extract_abbreviation_definition_pairs(doc_text=text)
)

In [150]:
# Keep only the document id and its extracted pairs for reshaping.
train_prediction = train_predict[['nombre', 'predictions']]

In [151]:
# Expand the per-document predictions into a wide frame (one column per key),
# then melt to long format: one row per (document, Mention_A, Mention_B).
# dropna() removes the keys a document did not produce.
# Assumes each prediction is a dict abbreviation -> definition, as the preview suggests — TODO confirm.
train_prediction = (pd.DataFrame(train_prediction['predictions'].tolist(), index=train_prediction['nombre'])
            .reset_index()
            .melt('nombre', var_name='Mention_A', value_name='Mention_B')
            .dropna()
            .reset_index(drop=True))

In [152]:
# Re-attach the full document text, needed to locate each mention's offsets.
# A left join keeps every prediction row (row count is preserved only if 'nombre' is unique in train_predict — verify).
train_prediction = train_prediction.merge(train_predict[['nombre','texto']], on = 'nombre', how = 'left')

In [153]:
train_prediction.head()

Unnamed: 0,nombre,Mention_A,Mention_B,texto
0,S0210-56912010000200009-1,BAL,broncoalveolar,Se trata de una mujer de 70 años con antecedentes de HTA y diagnosticada recientemente de neopla...
1,S0365-66912006000500008-1,AV,agudeza visual,"Mujer de 35 años acude a consulta por mala agudeza visual (AV) desde la infancia, sin tratamient..."
2,S0365-66912006000200012-1,AV,agudeza visual,Paciente de 76 años de edad que acudió a urgencias por TCE frontal derecho con pérdida de concie...
3,S0365-66912009001100005-1,AV,agudeza visual,Paciente varón de 39 años que acude al Servicio de Urgencias por presentar proptosis de un año d...
4,S0365-66912005000600011-1,AV,agudeza visual,"Un paciente varón de 76 años acude a consulta de oftalmología por presentar dolor, lagrimeo y se..."


In [154]:
# Constant annotation columns, using the same column names as the relations frame.
train_prediction['Mention_A_type'] = 'SHORT_FORM'
train_prediction['Relation_type'] = 'SHORT-LONG'
train_prediction['Mention_B_type'] = 'LONG_FORM'
# Offsets come from str.find, i.e. the FIRST occurrence of each mention in the text.
train_prediction['Mention_A_StartOffset'] = train_prediction.apply(offsetA, axis = 1)
train_prediction['Mention_B_StartOffset'] = train_prediction.apply(offsetB, axis = 1)
train_prediction['Mention_B_EndOffset'] = train_prediction.apply(offsetB_end, axis = 1)
# Must run after Mention_A_StartOffset is set: offsetA_end reads that column.
train_prediction['Mention_A_EndOffset'] = train_prediction.apply(offsetA_end, axis = 1)

In [155]:
# Drop 'texto' and put the columns in the same order as the transformed relations frame.
train_prediction = train_prediction[['nombre', 'Mention_A_type', 'Mention_A_StartOffset','Mention_A_EndOffset', 'Mention_A',
       'Relation_type', 'Mention_B_type', 'Mention_B_StartOffset',
       'Mention_B_EndOffset', 'Mention_B']]

In [156]:
# Use the same document-id column name as the relations frame.
train_prediction = train_prediction.rename(columns = {'nombre': '# Document_ID'})

In [157]:
train_prediction.head()

Unnamed: 0,# Document_ID,Mention_A_type,Mention_A_StartOffset,Mention_A_EndOffset,Mention_A,Relation_type,Mention_B_type,Mention_B_StartOffset,Mention_B_EndOffset,Mention_B
0,S0210-56912010000200009-1,SHORT_FORM,2068,2071,BAL,SHORT-LONG,LONG_FORM,2052,2066,broncoalveolar
1,S0365-66912006000500008-1,SHORT_FORM,59,61,AV,SHORT-LONG,LONG_FORM,43,57,agudeza visual
2,S0365-66912006000200012-1,SHORT_FORM,562,564,AV,SHORT-LONG,LONG_FORM,546,560,agudeza visual
3,S0365-66912009001100005-1,SHORT_FORM,163,165,AV,SHORT-LONG,LONG_FORM,147,161,agudeza visual
4,S0365-66912005000600011-1,SHORT_FORM,544,546,AV,SHORT-LONG,LONG_FORM,528,542,agudeza visual


#### Test

In [159]:
# Work on a copy so the raw testing texts stay untouched.
test_predict = testing_raw.copy()

In [160]:
test_predict.head(2)

Unnamed: 0,nombre,texto
0,S1130-01082008001000010-1,"Varón de 43 años originario de Marruecos, que ingresó en nuestro servicio por cuadro de 4 días d..."
1,S0004-06142009000400011-1,Varón de 75 años con antecedentes de EPOC moderado sin otros antecedentes médicos de interés. En...


In [161]:
# Run Schwartz-Hearst abbreviation/definition extraction on every document text.
test_predict['predictions'] = test_predict['texto'].apply(
    lambda text: schwartz_hearst.extract_abbreviation_definition_pairs(doc_text=text)
)

In [162]:
# Keep only the document id and its extracted pairs for reshaping.
test_prediction = test_predict[['nombre', 'predictions']]

In [163]:
# Expand the per-document predictions into a wide frame (one column per key),
# then melt to long format: one row per (document, Mention_A, Mention_B).
# dropna() removes the keys a document did not produce.
# Assumes each prediction is a dict abbreviation -> definition, as the preview suggests — TODO confirm.
test_prediction = (pd.DataFrame(test_prediction['predictions'].tolist(), index=test_prediction['nombre'])
            .reset_index()
            .melt('nombre', var_name='Mention_A', value_name='Mention_B')
            .dropna()
            .reset_index(drop=True))

In [164]:
# Re-attach the full document text, needed to locate each mention's offsets.
# A left join keeps every prediction row (row count is preserved only if 'nombre' is unique in test_predict — verify).
test_prediction = test_prediction.merge(test_predict[['nombre','texto']], on = 'nombre', how = 'left')

In [165]:
test_prediction.head()

Unnamed: 0,nombre,Mention_A,Mention_B,texto
0,S1130-01082008001000010-1,CPRE,colangiopancreatografía retrógrada endoscópica,"Varón de 43 años originario de Marruecos, que ingresó en nuestro servicio por cuadro de 4 días d..."
1,S0376-78922011000200004-1,IV,intravenosa,Mujer de 44 años de edad con antecedentes médicos de insuficiencia renal crónica en tratamiento ...
2,S1130-01082008000800019-1,TC,tomografía computerizada,"Recientemente, atendimos en nuestro hospital a un varón de 46 años, con antecedente de síndrome ..."
3,S1130-01082009000900012-1,TC,tomografía computerizada,Varón de 26 años derivado a Cirugía General tras ser diagnosticado de forma accidental tras prue...
4,S0211-69952011000400013-1,TC,tomografía computarizada,Mujer de 58 años con antecedentes personales de síndrome depresivo y estenosis del canal lumbar....


In [166]:
# Constant annotation columns, using the same column names as the relations frame.
test_prediction['Mention_A_type'] = 'SHORT_FORM'
test_prediction['Relation_type'] = 'SHORT-LONG'
test_prediction['Mention_B_type'] = 'LONG_FORM'
# Offsets come from str.find, i.e. the FIRST occurrence of each mention in the text.
test_prediction['Mention_A_StartOffset'] = test_prediction.apply(offsetA, axis = 1)
test_prediction['Mention_B_StartOffset'] = test_prediction.apply(offsetB, axis = 1)
test_prediction['Mention_B_EndOffset'] = test_prediction.apply(offsetB_end, axis = 1)
# Must run after Mention_A_StartOffset is set: offsetA_end reads that column.
test_prediction['Mention_A_EndOffset'] = test_prediction.apply(offsetA_end, axis = 1)

In [167]:
# Drop 'texto' and put the columns in the same order as the transformed relations frame.
test_prediction = test_prediction[['nombre', 'Mention_A_type', 'Mention_A_StartOffset','Mention_A_EndOffset', 'Mention_A',
       'Relation_type', 'Mention_B_type', 'Mention_B_StartOffset',
       'Mention_B_EndOffset', 'Mention_B']]

In [168]:
# Use the same document-id column name as the relations frame.
test_prediction = test_prediction.rename(columns = {'nombre': '# Document_ID'})

In [169]:
test_prediction.head()

Unnamed: 0,# Document_ID,Mention_A_type,Mention_A_StartOffset,Mention_A_EndOffset,Mention_A,Relation_type,Mention_B_type,Mention_B_StartOffset,Mention_B_EndOffset,Mention_B
0,S1130-01082008001000010-1,SHORT_FORM,2168,2172,CPRE,SHORT-LONG,LONG_FORM,2120,2166,colangiopancreatografía retrógrada endoscópica
1,S0376-78922011000200004-1,SHORT_FORM,661,663,IV,SHORT-LONG,LONG_FORM,648,659,intravenosa
2,S1130-01082008000800019-1,SHORT_FORM,1079,1081,TC,SHORT-LONG,LONG_FORM,1053,1077,tomografía computerizada
3,S1130-01082009000900012-1,SHORT_FORM,571,573,TC,SHORT-LONG,LONG_FORM,545,569,tomografía computerizada
4,S0211-69952011000400013-1,SHORT_FORM,1266,1268,TC,SHORT-LONG,LONG_FORM,1240,1264,tomografía computarizada


## Write data

In [170]:
# Training set: the transformed relations and the Schwartz-Hearst predictions, as TSV without the index.
train_rel.to_csv("../data/clinical_cases.relations.training_set_2.tsv", sep = '\t', index = False)
train_prediction.to_csv('../data/track_1_train.tsv', sep = '\t', index = False)

In [171]:
# Testing set: the transformed relations and the Schwartz-Hearst predictions, as TSV without the index.
testing_rel.to_csv("../data/clinical_cases.relations.testing_set_2.tsv", sep = '\t', index = False)
test_prediction.to_csv('../data/track_1_test.tsv', sep = '\t', index = False)