# Library

In [1056]:
import pandas as pd
import numpy as np
import os
import re
import collections
import unidecode
import nltk
from nltk.corpus import stopwords
import itertools 
from nltk.tokenize import word_tokenize
from string import punctuation
from functools import reduce
import ast
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from unidecode import unidecode

[nltk_data] Downloading package wordnet to /Users/egarcia/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [1057]:
pd.set_option('display.max_colwidth', 100)

In [1058]:
%matplotlib inline
from matplotlib import pyplot as plt

# Functions

In [1059]:
def read_texts(path):
    """Load every ``.txt`` file found directly inside ``path`` into a DataFrame.

    Parameters
    ----------
    path : str
        Directory holding the raw text files. A trailing path separator is
        optional.

    Returns
    -------
    pandas.DataFrame
        One row per file with the columns ``nombre`` (file name without its
        ``.txt`` extension) and ``texto`` (file content decoded as UTF-8).
        Rows are ordered by file name, and both columns exist even when the
        directory contains no ``.txt`` file.
    """
    suffix = '.txt'
    data = []

    # os.listdir returns entries in arbitrary order; sort for reproducible output.
    for name in sorted(os.listdir(path)):
        if not name.endswith(suffix):
            continue
        # os.path.join works whether or not ``path`` ends with a separator.
        with open(os.path.join(path, name), encoding="utf8") as f:
            text = f.read()
        # Strip only the trailing extension, never a '.txt' inside the name.
        data.append({'nombre': name[:-len(suffix)], 'texto': text})

    return pd.DataFrame(data, columns=['nombre', 'texto'])

In [1060]:
def clean_text(string):
    """
    Normalize a piece of text: drop punctuation, accents and stop words.

    Steps, in this order:
      1. Every ASCII punctuation character is removed, except ``/`` which is
         turned into a space.
      2. The text is transliterated to plain ASCII with ``unidecode``.
      3. Whitespace-separated words found in the module-level ``swords``
         collection are dropped. The comparison is case-sensitive because
         the text is not lower-cased.
      4. Words are re-joined with single spaces (no leading/trailing space).

    Parameters
    ----------
    string : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    # Imported locally under a private alias: at module level the name
    # ``unidecode`` is bound to the package and later re-bound to the function
    # (``import unidecode`` followed by ``from unidecode import unidecode``),
    # so ``unidecode.unidecode(...)`` raises AttributeError.
    from unidecode import unidecode as _to_ascii

    # One translation pass: delete punctuation, but map '/' to a space so that
    # "a/b" becomes two words instead of being glued together.
    table = {ord(ch): None for ch in punctuation}
    table[ord('/')] = ' '
    string = string.translate(table)

    string = _to_ascii(string)

    # NOTE(review): stop words are matched after transliteration, so any
    # accented entry in ``swords`` can no longer match — confirm intended.
    stop_words = set(swords)

    # split()/join already collapses runs of whitespace and trims both ends.
    return ' '.join(word for word in string.split() if word not in stop_words)

In [1061]:
nltk.download('stopwords')
swords = list(set(stopwords.words('spanish')))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/egarcia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [1062]:
def sep_num_text(data):
    """Separate letters from digits inside tokens that contain a digit.

    The text is tokenized with ``word_tokenize``. Every token holding at least
    one digit is split into consecutive runs of alphabetic / non-alphabetic
    characters, and the runs are joined with spaces. All tokens are finally
    re-joined with single spaces.

    Parameters
    ----------
    data : str
        Text to process.

    Returns
    -------
    str
        The re-joined text.
    """
    pieces = []
    for token in word_tokenize(data):
        if re.search(r'\d', token):
            # groupby(str.isalpha) yields alternating alphabetic / other runs.
            runs = [''.join(chunk) for _, chunk in itertools.groupby(token, str.isalpha)]
            pieces.append(' '.join(runs))
        else:
            pieces.append(token)

    return ' '.join(pieces)

In [1063]:
def get_f_b_context_text(data):
    '''Split each record's text around the annotated abbreviation.

    Parameters
    ----------
    data : iterable of dict
        Each record must provide ``texto``, ``StartOffset``, ``EndOffset`` and:
          * the abbreviation under ``Abbreviation`` or ``short_form``;
          * the document id under ``# Document_ID``, ``doc_id`` or ``nombre``;
          * the definition under ``Definition`` or ``long_form``.
        Both naming schemes are accepted because the data frames in this
        notebook are renamed to ``short_form`` / ``long_form`` before they are
        converted to records.

    Returns
    -------
    (all_data, sf_not_found) : tuple of list
        ``all_data`` holds one ``[doc_id, abbreviation, xf, xb, definition]``
        list per record whose abbreviation occurs in its text. ``xf`` is the
        text up to and including the abbreviation, wrapped as
        ``' <start> ' + abbreviation + ' <end> '``; ``xb`` is the text after it.
        ``sf_not_found`` lists the abbreviations that do not occur in their text.

    Raises
    ------
    KeyError
        If a record provides none of the accepted keys for a field.
    '''

    def _field(record, *keys):
        """Return the value stored under the first of ``keys`` present in ``record``."""
        for key in keys:
            if key in record:
                return record[key]
        raise KeyError(keys[0])

    all_data = []
    sf_not_found = []

    for instance in data:

        texto = instance['texto']
        target_word = _field(instance, 'Abbreviation', 'short_form')
        doc_id = _field(instance, '# Document_ID', 'doc_id', 'nombre')

        if target_word not in texto:
            sf_not_found.append(target_word)
            continue

        stop_ini_idx = instance['StartOffset']  # index where the target starts
        stop_fin_idx = instance['EndOffset']    # index where the target ends

        # Text before the target, followed by the target wrapped in markers.
        xf = texto[:stop_ini_idx] + ' <start> ' + texto[stop_ini_idx:stop_fin_idx] + ' <end> '
        # Text after the target.
        # NOTE(review): the +1 skips the character right after the target; if
        # EndOffset is exclusive (as the slice above assumes) that character is
        # dropped from the context — confirm this is intended.
        xb = texto[stop_fin_idx+1:]

        instance_id = _field(instance, 'Definition', 'long_form')  # meaning of the target

        all_data.append([doc_id, target_word, xf, xb, instance_id])

    return all_data, sf_not_found

In [1064]:
def limit_context(data, n_forward=None, n_backward=None):
    """Trim the context strings of every record to a fixed number of tokens.

    Each record is a list whose position 2 holds the text before (and
    including) the abbreviation and whose position 3 holds the text after it.
    Both strings are tokenized with ``word_tokenize``; the last
    ``n_forward + 1`` tokens of the former and the first ``n_backward`` tokens
    of the latter are kept and re-joined with single spaces.

    Parameters
    ----------
    data : list of list
        Records to trim. They are modified in place.
    n_forward : int, optional
        Token budget for position 2. Defaults to the module-level ``n_step_f``.
    n_backward : int, optional
        Token budget for position 3. Defaults to the module-level ``n_step_b``.

    Returns
    -------
    list of list
        The same ``data`` object, for convenience.
    """
    if not data:
        return data

    # Fall back to the notebook-level settings when no explicit size is given.
    keep_before = n_step_f if n_forward is None else n_forward
    keep_after = n_step_b if n_backward is None else n_backward

    for doc in data:
        xf_words = word_tokenize(doc[2])[-keep_before - 1:]
        xb_words = word_tokenize(doc[3])[:keep_after]

        doc[2] = ' '.join(xf_words)
        doc[3] = ' '.join(xb_words)

    return data

In [1065]:
def create_dict(data):
    """Turn positional context records into dictionaries.

    Parameters
    ----------
    data : iterable of sequence
        Records laid out as ``[doc_id, short_form, xf, xb, long_form]``.

    Returns
    -------
    list of dict
        One dict per record with the keys ``doc_id``, ``short_form``,
        ``context`` (``xf`` and ``xb`` joined by a single space) and
        ``long_form``.
    """
    return [
        {
            'doc_id': record[0],
            'short_form': record[1],
            'context': record[2] + ' ' + record[3],
            'long_form': record[4],
        }
        for record in data
    ]
    

In [1066]:
def distance_levenshtein(str1, str2):
    """Return the Levenshtein edit distance between ``str1`` and ``str2``.

    Insertions, deletions and substitutions each cost 1. The dynamic-programming
    table is computed row by row, keeping only the previous row in memory.
    """
    # Row 0: turning the empty prefix of str1 into str2[:j] takes j insertions.
    previous = list(range(len(str2) + 1))

    for row_idx, ch1 in enumerate(str1, start=1):
        # Column 0: turning str1[:row_idx] into the empty string takes row_idx deletions.
        current = [row_idx]
        for col_idx, ch2 in enumerate(str2, start=1):
            insertion = current[col_idx - 1] + 1
            deletion = previous[col_idx] + 1
            substitution = previous[col_idx - 1] + (ch1 != ch2)
            current.append(min(insertion, deletion, substitution))
        previous = current

    return previous[len(str2)]

In [1067]:
def normalize_lf(row, freq=None):
    """Map near-duplicate long forms of one abbreviation to a canonical spelling.

    Two long forms are considered variants of each other when their
    Levenshtein distance divided by the length of the longer one is below 0.2.
    Variants are grouped transitively, and every member of a group is mapped
    to the group's most frequent member (ties are broken alphabetically so the
    result is deterministic). Groups are kept separate: unrelated long forms of
    the same abbreviation are never merged with each other.

    Parameters
    ----------
    row : iterable of str
        The long forms recorded for a single abbreviation.
    freq : DataFrame, Series or mapping, optional
        Frequency of each long form. Defaults to the module-level ``frec``.
        A DataFrame is read positionally (first column = long form, last
        column = count), so the column names produced by
        ``value_counts().reset_index()`` do not matter.

    Returns
    -------
    dict or None
        ``{long_form: canonical_long_form}`` for every long form that belongs
        to a group of two or more variants (the canonical form maps to
        itself), or ``None`` when there is nothing to normalize.
    """
    table = frec if freq is None else freq
    if hasattr(table, 'columns'):        # DataFrame such as value_counts().reset_index()
        counts = dict(zip(table.iloc[:, 0], table.iloc[:, -1]))
    elif hasattr(table, 'to_dict'):      # Series such as value_counts()
        counts = table.to_dict()
    else:
        counts = dict(table)

    forms = list(dict.fromkeys(row))     # unique long forms, original order

    # Union-find over the long forms: linked forms end up sharing one root.
    parent = {lf: lf for lf in forms}

    def _root(lf):
        while parent[lf] != lf:
            parent[lf] = parent[parent[lf]]   # path halving
            lf = parent[lf]
        return lf

    for a, b in itertools.combinations(forms, 2):
        # a != b here, so the longer of the two is never empty.
        if distance_levenshtein(a, b) / max(len(a), len(b)) < 0.2:
            parent[_root(a)] = _root(b)

    groups = {}
    for lf in forms:
        groups.setdefault(_root(lf), []).append(lf)

    sust = {}
    for members in groups.values():
        if len(members) < 2:
            continue                     # no variant to normalize
        # Highest count wins; alphabetical order breaks ties.
        canonical = min(members, key=lambda lf: (-counts.get(lf, 0), lf))
        for lf in members:
            sust[lf] = canonical

    return sust or None
    

In [1068]:
def get_label(row):
    """Return 1 when ``long_form_x`` equals ``long_form_y`` in ``row``, else 0."""
    return 1 if row['long_form_x'] == row['long_form_y'] else 0

In [1069]:
def offsetA(row):
    """Return the index of the first occurrence of ``Mention_A`` in ``texto`` (-1 if absent)."""
    text, mention = row['texto'], row['Mention_A']
    return text.find(mention)
    
def offsetB(row):
    """Return the index of the first occurrence of ``Mention_B`` in ``texto`` (-1 if absent)."""
    text, mention = row['texto'], row['Mention_B']
    return text.find(mention)

def offsetB_end(row):
    """Return the end offset (exclusive) of the first ``Mention_B`` in ``texto``.

    Returns -1 when the mention does not occur in the text, mirroring
    ``str.find``, instead of an offset derived from the -1 "not found" marker.
    """
    mention = row['Mention_B']
    start = row['texto'].find(mention)
    if start == -1:
        return -1
    return start + len(mention)

def offsetA_end(row):
    """Return ``Mention_A_StartOffset`` plus the length of ``Mention_A``."""
    start = row['Mention_A_StartOffset']
    return start + len(row['Mention_A'])

In [1070]:
def offset(row):
    """Return the index of the first occurrence of ``abrev`` in ``texto`` (-1 if absent)."""
    text, abbreviation = row['texto'], row['abrev']
    return text.find(abbreviation)

def offsetend(row):
    """Return ``StartOffset`` plus the length of ``abrev``."""
    start = row['StartOffset']
    return start + len(row['abrev'])

In [1071]:
def defin_dictionary(row,dictionary):
    """Return the row's definition, falling back to ``dictionary`` when it is missing.

    A definition equal to the placeholder ``'no_existe'`` is replaced by
    ``dictionary.get(row['Abbreviation'])`` (``None`` when the abbreviation is
    not a key); any other definition is returned unchanged.
    """
    definition = row['Definition']
    if definition == 'no_existe':
        return dictionary.get(row['Abbreviation'])
    return definition

In [1072]:
def defin_abremes_dictionary(row,dictionary):
    """Look up the row's ``Abbreviation`` in ``dictionary`` (``None`` when it is not a key)."""
    abbreviation = row['Abbreviation']
    return dictionary.get(abbreviation)

# Load Data

### Testing

220 clinical cases.

No haría falta procesarlo, pues crearemos el fichero directamente de las notas clínicas de test. Luego lo pasaremos por el transformer y procesaremos la salida para que tenga el mismo formato que el gold standard. Aplicaremos el evaluador de IberEval.

In [1073]:
testing_abbr = pd.read_csv("../data/ibereval_data/testing_set/clinical_cases.abbreviations.testing_set.tsv", sep = '\t')
#testing_met = pd.read_csv("../data/ibereval_data/clinical_cases.metadata.testing_set.tsv", sep = '\t')
testing_rel = pd.read_csv("../data/ibereval_data/testing_set/clinical_cases.relations.testing_set.tsv", sep = '\t')

FileNotFoundError: [Errno 2] No such file or directory: '../data/ibereval_data/testing_set/clinical_cases.abbreviations.testing_set.tsv'

In [None]:
#testing_met.head()

In [None]:
testing_rel = testing_rel.reset_index()

In [None]:
testing_rel.columns = ['# Document_ID', 'Mention_A_type', 'Mention_A_StartOffset',
      'Mention_A', 'Relation_type', 'Mention_B_type',
       'Mention_B_StartOffset', 'Mention_B_EndOffset', 'Mention_B']

In [None]:
testing_rel = testing_rel.rename(columns = {'# Document_ID': 'doc_id'})

In [None]:
testing_rel.head()

In [None]:
testing_rel.Relation_type.unique()

In [None]:
testing_abbr = testing_abbr.rename(columns = {'# Document_ID': 'doc_id'})

In [None]:
testing_abbr.head()

In [None]:
testing_raw = read_texts("../data/ibereval_data/testing_set/testing_set.raw_text/")

In [None]:
testing_raw.head()

#### TEST NEW DF

In [None]:
testing= pd.read_csv("../data/data_paper/test_subtrack2_parte1.csv")

### Training

318 clinical cases

In [None]:
train_abbr = pd.read_csv("../data/ibereval_data/trainning_set/clinical_cases.abbreviations.training_set.tsv", sep = '\t')

In [None]:
train_met = pd.read_csv("../data/ibereval_data/trainning_set/clinical_cases.metadata.training_set.tsv", sep = '\t')
train_rel = pd.read_csv("../data/ibereval_data/trainning_set/clinical_cases.relations.training_set.tsv", sep = '\t')

In [None]:
#train_met = train_met.rename(columns = {'# Document_ID': 'doc_id'})

In [None]:
#train_met.head()

In [None]:
#train_rel = train_rel.reset_index()

In [None]:
#train_rel.columns = ['# Document_ID', 'Mention_A_type', 'Mention_A_StartOffset',
#      'Mention_A', 'Relation_type', 'Mention_B_type',
#       'Mention_B_StartOffset', 'Mention_B_EndOffset', 'Mention_B']

In [None]:
#train_rel = train_rel.rename(columns = {'# Document_ID': 'doc_id'})

In [None]:
#train_rel.head()

In [None]:
train_abbr = train_abbr.rename(columns = {'# Document_ID': 'doc_id'})

In [None]:
train_abbr.Definition.nunique()

In [None]:
train_abbr.head()

In [None]:
train_raw = read_texts("../data/ibereval_data/trainning_set/training_set.raw_text/")

In [None]:
train_raw = train_raw.rename(columns = {'nombre': 'doc_id'})

In [None]:
train_raw.head()

#### TRAIN NEW DF

In [None]:
train = pd.read_csv("../data/data_paper/train_subtrack2_parte1.csv")

### Development

In [None]:
dev_abbr = pd.read_csv("../../datasets/development_set/clinical_cases.abbreviations.development_set.tsv", sep = '\t')

In [None]:
dev_met = pd.read_csv("../../datasets/development_set/clinical_cases.metadata.development_set.tsv", sep = '\t')
dev_rel = pd.read_csv("../../datasets/development_set/clinical_cases.relations.development_set.tsv", sep = '\t')

In [None]:
# Rename on dev_met (not train_met) so the development metadata loaded above is kept.
dev_met = dev_met.rename(columns = {'# Document_ID': 'doc_id'})

In [None]:
dev_met.head()

In [None]:
# Reset the index of dev_rel (not train_rel) so the development relations loaded above are kept.
dev_rel = dev_rel.reset_index()

In [None]:
dev_rel.columns = ['index', '# Document_ID', 'Mention_A_type', 'Mention_A_StartOffset',
      'Mention_A', 'Relation_type', 'Mention_B_type',
       'Mention_B_StartOffset', 'Mention_B_EndOffset', 'Mention_B']

In [None]:
dev_rel = dev_rel.rename(columns = {'# Document_ID': 'doc_id'})

In [None]:
dev_rel.head()

In [None]:
dev_abbr = dev_abbr.rename(columns = {'# Document_ID': 'doc_id'})

In [None]:
dev_abbr.Definition.nunique()

In [None]:
dev_abbr.head()

In [None]:
dev_raw = read_texts("../../datasets/development_set/development_set.raw_text/")

In [None]:
dev_raw = dev_raw.rename(columns = {'nombre': 'doc_id'})

In [None]:
dev_raw.head()

#### DEV NEW DF

In [None]:
#train_raw = pd.read_csv("../data/data_paper/train_subtrack2_parte1.csv")

## Prepare Train Data

### Preprocessing

In [None]:
train.head()

In [None]:
train.shape

In [None]:
train = train.rename(columns = {'Abbreviation': 'short_form', 'Definition': 'long_form'})

In [None]:
train = train.dropna(subset = ['short_form', 'long_form'])

In [None]:
train[['StartOffset', 'EndOffset']] = train[['StartOffset', 'EndOffset']].astype(int)

Clean the long forms: remove accents, punctuation, etc.

In [None]:
train['long_form'] = train['long_form'].map(clean_text)

In [None]:
train.head(3)

### Normalize long forms

In [None]:
sf_lf = train[['short_form', 'long_form']].drop_duplicates()

In [None]:
sf_lf.head()

In [None]:
sf_lf_list = sf_lf.groupby('short_form', as_index=False).agg({'long_form': list})
sf_lf_list['len'] = sf_lf_list['long_form'].map(lambda x: len(x))
sf_lf_list.sort_values('len', ascending = False)

In [None]:
table = pd.crosstab(sf_lf_list['len'], columns = 'Count')
table.plot.bar(legend = None)
plt.xlabel("Definiciones por acrónimo")
plt.ylabel("Count")
#plt.show()
#plt.savefig('data/acron_count.png')

In [None]:
table = pd.crosstab(sf_lf_list['len'], columns = 'Count')
table

#### Apply Levenshtein distance to normalize Long Forms

Get more frequent lf per sf

In [None]:
frec = train['long_form'].value_counts().reset_index()
frec

Create a dictionary where keys are the lf to normalize and the values the normalized form

In [None]:
norm = []
for i in sf_lf_list['long_form']:
    norm.append(normalize_lf(i))

In [None]:
norm = [i for i in norm if i != None]

In [None]:
norm_dict = {}
for i in norm:
    norm_dict.update(i)
#norm_dict

Finally normalize long forms over the dataframe

In [None]:
train.head()

In [None]:
train = train.replace({"long_form": norm_dict})

Check that the definitions have been normalized

In [None]:
train.head()

### Get just ambiguous acronyms

Run this section only if just the ambiguous acronyms are needed.

In [None]:
#amb = train.groupby('short_form')['long_form'].nunique().reset_index().sort_values('long_form', ascending = False)

In [None]:
#amb.head()

In [None]:
#table = pd.crosstab(amb['long_form'], columns = 'Count')
#table.plot.bar()
#plt.show()

In [None]:
#defin = amb[amb['long_form'] != 1]['short_form'].values.tolist()

In [None]:
#train.shape

In [None]:
#train = train[train['short_form'].isin(defin)]

In [None]:
#train.shape

### Add texts from Medline

Some long forms are imbalanced. Texts from Medline are added to improve balance.

Run this section only if more texts need to be added. We won't do it at first.

Study the imbalanced long forms: number of rows for each distinct definition. Most long forms have just one row.

In [None]:
#frec2 = train['long_form'].value_counts().reset_index()

In [None]:
#train.long_form.nunique()

In [None]:
#train[['doc_id','long_form']].drop_duplicates().shape

In [None]:
#def_count = train[['doc_id','long_form']].drop_duplicates().groupby('long_form').agg({'doc_id':'count'}).reset_index()
#def_count.sort_values('doc_id').tail()

In [None]:
#table = pd.crosstab(def_count['doc_id'], columns = 'Count')
#table.plot.bar(legend = None, figsize=(15,8))
#plt.xlabel("Cantidad de textos en los que aparece la definición")
#plt.ylabel("Count")
#plt.savefig('data/def_count.png')
#plt.show()

In [None]:
#frec2[frec2['long_form'] == 1].head()

Let's find more text for definitions with 3 or less rows in train dataset.

In [None]:
#train_lf = train.groupby('long_form').size().reset_index().rename(columns={0:'count'}).sort_values('count')
#train_lf_list = train_lf[train_lf['count'] <= 3]['long_form'].unique().tolist()

In [None]:
#len(train_lf_list)

List of definitions to search for in Medline

In [None]:
#sorted(train_lf_list)

#### Medline texts

In [None]:
#medline = pd.read_csv("../data/scrapping/textos_medline_scrapping2.csv")
#medline.head()

In [None]:
#medline[medline['concept'].isin(train_lf_list)]['concept'].nunique()

Only 30 definitions were found in Medline

In [None]:
#medline[medline['concept'].isin(train_lf_list)].shape

In [None]:
#medline[medline['concept'].isin(train_lf_list)].head()

In [None]:
#train.head()

Transform Medline text with train structure

In [None]:
#med_texts = medline[medline['concept'].isin(train_lf_list)]

In [None]:
#med_texts = med_texts[['text', 'concept']].rename(columns = {'text':'texto', 'concept':'long_form'})

In [None]:
#sf_lf = train[train['long_form'].isin(train_lf_list)][['short_form', 'long_form']].set_index('long_form').to_dict()['short_form']

In [None]:
#med_texts['short_form'] = med_texts['long_form'].map(sf_lf)

In [None]:
#def replace_sf_lf(row):
#    row['texto'] = row['texto'].replace(row['long_form'], row['short_form'])
#    return row['texto']

In [None]:
#med_texts['texto'] = med_texts.apply(replace_sf_lf, axis = 1)

In [None]:
#med_texts.head()

In [None]:
#def find_offset(row):
#    start_i = row['texto'].find(row['short_form'])
#    end_i = start_i + len(row['short_form'])
    
#    return start_i, end_i   

In [None]:
#med_texts['offsets'] = med_texts.apply(find_offset, axis = 1)

In [None]:
#med_texts[['StartOffset', 'EndOffset']] = pd.DataFrame(med_texts['offsets'].tolist(), index=med_texts.index)

In [None]:
#med_texts = med_texts[['texto', 'short_form', 'long_form','StartOffset', 'EndOffset']]

In [None]:
#med_texts.head()

Concat train and Medline texts

In [None]:
#train.shape

In [None]:
#train = pd.concat([train, med_texts], axis = 0)

In [None]:
#train.shape

In [None]:
#train.reset_index(inplace = True, drop = True)
#train.head()

Check how many definitions still have few texts

In [None]:
#frec3 = train['long_form'].value_counts().reset_index()

In [None]:
#table = pd.crosstab(frec2['long_form'], columns = 'Count')
#table.plot.bar()
#plt.title('Number of deffinition records train')
#plt.show()

In [None]:
#table = pd.crosstab(frec3['long_form'], columns = 'Count')
#table.plot.bar()
#plt.title('Number of deffinition records after adding Medline texts to train')
#plt.show()

In [None]:
#lf_low = set(train_lf_list)

In [None]:
#lf_inmedline = set(medline[medline['concept'].isin(train_lf_list)]['concept'].unique().tolist())

In [None]:
#lf_low ^ lf_inmedline

### Transform dataframe to a list of dictionaries

In [None]:
train_data = train[['nombre', 'texto', 'short_form', 'long_form', 'StartOffset', 'EndOffset']].to_dict('records')

### Get the text before and after the SF

In [None]:
train_ndata, sf_not_found = get_f_b_context_text(train_data)

Check whether any SF was not found in its text

In [None]:
sf_not_found_set = set(sf_not_found)

In [None]:
len(sf_not_found_set)

Run this code only if the texts need to be cleaned: remove punctuation and accents, lower-case, collapse repeated spaces, and separate numbers from SFs

In [None]:
# def clean_text_after(data):
    
#     for instance in data:
#         instance[1] = clean_text(instance[1]).lower()
#         instance[2] = clean_text(instance[2]).lower()
#         instance[1] = re.sub(r'(\d+)', r'\g<1> ', instance[1]) #metemos espacio entre número y acrónimos que quedan pegados    
#         instance[2] = re.sub(r'(\d+)', r'\g<1> ', instance[2])
#         instance[1] = re.sub(r'\s+', ' ', instance[1]).strip()
#         instance[2] = re.sub(r'\s+', ' ', instance[2]).strip()
#     return data
        

In [None]:
# train_ndata = clean_text_after(train_ndata)

### Get the contexts 

From the texts before and after the sf, we limit the number of words

In [None]:
star_ends_tokens = 6 #number of tokens to add because <start> and <end> labels 

In [None]:
# limit_context slices xf (the text BEFORE the abbreviation) with n_step_f and
# xb (the text AFTER it) with n_step_b. xf also carries the ' <start> ' / ' <end> '
# markers, hence the extra star_ends_tokens in its budget.
n_step_f = 10 + star_ends_tokens #number of tokens kept from xf, the text before the abbreviation
n_step_b = 10 #number of tokens kept from xb, the text after the abbreviation

In [None]:
nltk.download('punkt')

In [None]:
train_ndata = limit_context(train_ndata)

In [None]:
train_ndata[0]

### Transform into df grouped by LF

In [None]:
data_dic = create_dict(train_ndata)

In [None]:
data_df = pd.DataFrame(data_dic)

Assign an id to each LF

In [None]:
data_df = data_df.assign(id=(data_df['long_form']).astype('category').cat.codes)

In [None]:
data_df.head()

In [None]:
data_df.shape

We will join each SF with all possible LFs, one for each record, so that later the model works in binary form assigning the probability that that is its LF

In [None]:
sf_lf_unique = data_df[['short_form', 'long_form']].drop_duplicates()

In [None]:
sf_lf_unique.shape

In [None]:
sf_lf_unique.sort_values('short_form').head()

In [None]:
data_merged = data_df.merge(sf_lf_unique, on = 'short_form', how = 'left')

In [None]:
data_merged.shape

In [None]:
pd.set_option('display.max_colwidth', 100)

In [None]:
data_df.sort_values(['short_form', 'context']).head()

In [None]:
data_merged.sort_values(['short_form', 'context']).head()

Target must be 1 or 0. 1 if the corresponding LF is the one assigned to it, 0 otherwise

In [None]:
data_merged['label'] = data_merged.apply(get_label, axis = 1)

In [None]:
data_merged.head()

In [None]:
data_merged = data_merged[['short_form', 'context', 'long_form_y', 'label']]

In [None]:
data_merged = data_merged.rename(columns = {'long_form_y':'long_form'})

In [None]:
data_merged.reset_index(drop = True, inplace = True)

In [None]:
data_merged.head()

In [None]:
data_merged = data_merged.drop_duplicates()

In [None]:
data_merged.isna().sum()

In [None]:
data_merged.to_csv('../data/marzo2023/train_data_beto_10_allacron_nomedline_nolevenstein_ownpreproces.csv', index = False, sep = '\t')

In [None]:
#prueba = pd.read_csv('../data/marzo2023/subtrack2/train_data_beto_10_allacron_nomedline_nolevenstein_ownpreproces.csv', sep = '\t')
#prueba = prueba.head(10)
#prueba.to_csv('../data/marzo2023/train_prueba.csv', index = False, sep = '\t')

## Prepare Dev Data

### Preprocessing

In [None]:
dev = dev_raw.merge(dev_abbr[['doc_id', 'Abbreviation', 'Definition']], on = 'doc_id', how = 'left')

In [None]:
dev = dev.merge(dev_abbr, on = ['doc_id', 'Abbreviation'], how = 'left')

In [None]:
dev = dev.drop_duplicates()

In [None]:
dev = dev[['doc_id', 'texto', 'Abbreviation', 'Definition_x', 'StartOffset', 'EndOffset']]

In [None]:
dev = dev.rename(columns = {'Definition_x':'Definition'})

In [None]:
dev.head()

In [None]:
dev = dev.rename(columns = {'Abbreviation': 'short_form', 'Definition': 'long_form'})

In [None]:
dev = dev.dropna(subset = ['short_form', 'long_form'])

In [None]:
dev[['StartOffset', 'EndOffset']] = dev[['StartOffset', 'EndOffset']].astype(int)

Clean long forms to delete acents, string punctuation etc

In [None]:
dev['long_form'] = dev['long_form'].map(clean_text)

In [None]:
dev.head(3)

### Normalize long forms

In [None]:
sf_lf = dev[['short_form', 'long_form']].drop_duplicates()

In [None]:
sf_lf.head()

In [None]:
sf_lf_list = sf_lf.groupby('short_form', as_index=False).agg({'long_form': list})
sf_lf_list['len'] = sf_lf_list['long_form'].map(lambda x: len(x))
sf_lf_list.sort_values('len', ascending = False)

In [None]:
table = pd.crosstab(sf_lf_list['len'], columns = 'Count')
table.plot.bar(legend = None)
plt.xlabel("Definiciones por acrónimo")
plt.ylabel("Count")
#plt.show()
#plt.savefig('data/acron_count.png')

In [None]:
table = pd.crosstab(sf_lf_list['len'], columns = 'Count')
table

#### Apply Levenshtein distance to normalize Long Forms

Get more frequent lf per sf

In [None]:
frec = dev['long_form'].value_counts().reset_index()
frec

Create a dictionary where keys are the lf to normalize and the values the normalized form

In [None]:
norm = []
for i in sf_lf_list['long_form']:
    norm.append(normalize_lf(i))

In [None]:
norm = [i for i in norm if i != None]

In [None]:
norm_dict = {}
for i in norm:
    norm_dict.update(i)
#norm_dict

Finally normalize long forms over the dataframe

In [None]:
dev.head()

In [None]:
dev = dev.replace({"long_form": norm_dict})

Check that deffinitions have been normalized

In [None]:
dev.head()

### Get just ambigous acronyms

Execute this code just in case only ambigous acronyms are needed.

In [None]:
#amb = train.groupby('short_form')['long_form'].nunique().reset_index().sort_values('long_form', ascending = False)

In [None]:
#amb.head()

In [None]:
#table = pd.crosstab(amb['long_form'], columns = 'Count')
#table.plot.bar()
#plt.show()

In [None]:
#defin = amb[amb['long_form'] != 1]['short_form'].values.tolist()

In [None]:
#train.shape

In [None]:
#train = train[train['short_form'].isin(defin)]

In [None]:
#train.shape

### Add texts from Medline

Some long forms are imbalanced. Texts from Medline are added to improve balance.

In [None]:
dev.head()

Study imbalaced long forms. Number of rows for each different definition. We can see mostly long forms have just one row.

In [None]:
frec2 = dev['long_form'].value_counts().reset_index()

In [None]:
dev.long_form.nunique()

In [None]:
dev[['doc_id','long_form']].drop_duplicates().shape

In [None]:
def_count = dev[['doc_id','long_form']].drop_duplicates().groupby('long_form').agg({'doc_id':'count'}).reset_index()
def_count.sort_values('doc_id').tail()

In [None]:
table = pd.crosstab(def_count['doc_id'], columns = 'Count')
table.plot.bar(legend = None, figsize=(15,8))
plt.xlabel("Cantidad de textos en los que aparece la definición")
plt.ylabel("Count")
#plt.savefig('data/def_count.png')
plt.show()

In [None]:
frec2[frec2['long_form'] == 1].head()

Let's find more texts for definitions with 3 or fewer rows in the dev dataset.

In [None]:
dev_lf = dev.groupby('long_form').size().reset_index().rename(columns={0:'count'}).sort_values('count')
dev_lf_list = dev_lf[dev_lf['count'] <= 3]['long_form'].unique().tolist()

In [None]:
len(dev_lf_list)

List of deffinition to search in Medline

In [None]:
#sorted(dev_lf_list)

#### Medline texts

In [None]:
medline = pd.read_csv("../data/scrapping/textos_medline_scrapping2.csv")
medline.head()

In [None]:
medline[medline['concept'].isin(dev_lf_list)]['concept'].nunique()

Only 8 definitions were found in Medline

In [None]:
medline[medline['concept'].isin(dev_lf_list)].shape

In [None]:
medline[medline['concept'].isin(dev_lf_list)].head()

In [None]:
dev.head()

Transform Medline text with train structure

In [None]:
med_texts = medline[medline['concept'].isin(dev_lf_list)]

In [None]:
med_texts = med_texts[['text', 'concept']].rename(columns = {'text':'texto', 'concept':'long_form'})

In [None]:
sf_lf = dev[dev['long_form'].isin(dev_lf_list)][['short_form', 'long_form']].set_index('long_form').to_dict()['short_form']

In [None]:
med_texts['short_form'] = med_texts['long_form'].map(sf_lf)

In [None]:
def replace_sf_lf(row):
    """Return ``row['texto']`` with every occurrence of the long form replaced by the short form.

    The row itself is not modified; the caller decides where to store the
    returned text.
    """
    return row['texto'].replace(row['long_form'], row['short_form'])

In [None]:
med_texts['texto'] = med_texts.apply(replace_sf_lf, axis = 1)

In [None]:
med_texts.head()

In [None]:
def find_offset(row):
    """Locate the first occurrence of ``short_form`` inside ``texto``.

    Returns
    -------
    (start, end) : tuple of int
        Start offset and exclusive end offset of the match, or ``(-1, -1)``
        when the short form does not occur in the text.
    """
    short_form = row['short_form']
    start_i = row['texto'].find(short_form)
    if start_i == -1:
        # Do not derive an end offset from the "not found" marker.
        return -1, -1

    return start_i, start_i + len(short_form)

In [None]:
med_texts['offsets'] = med_texts.apply(find_offset, axis = 1)

In [None]:
med_texts[['StartOffset', 'EndOffset']] = pd.DataFrame(med_texts['offsets'].tolist(), index=med_texts.index)

In [None]:
med_texts = med_texts[['texto', 'short_form', 'long_form','StartOffset', 'EndOffset']]

In [None]:
med_texts.head()

Concat dev and Medline texts

In [None]:
dev.shape

In [None]:
dev = pd.concat([dev, med_texts], axis = 0)

In [None]:
dev.shape

In [None]:
dev.reset_index(inplace = True, drop = True)
dev.head()

Check how many deffinition have low texts

In [None]:
frec3 = dev['long_form'].value_counts().reset_index()

In [None]:
table = pd.crosstab(frec2['long_form'], columns = 'Count')
table.plot.bar()
plt.title('Number of deffinition records train')
plt.show()

In [None]:
table = pd.crosstab(frec3['long_form'], columns = 'Count')
table.plot.bar()
plt.title('Number of deffinition records after adding Medline texts to train')
plt.show()

In [None]:
lf_low = set(dev_lf_list)

In [None]:
lf_inmedline = set(medline[medline['concept'].isin(dev_lf_list)]['concept'].unique().tolist())

In [None]:
#lf_low ^ lf_inmedline

Transform dataframe to a list of dictionaries

In [None]:
dev_data = dev[['doc_id', 'texto', 'short_form', 'long_form', 'StartOffset', 'EndOffset']].to_dict('records')

### Get the text before and after the SF

In [None]:
dev_ndata, sf_not_found = get_f_b_context_text(dev_data)

Check if any SF is not founded in the text

In [None]:
sf_not_found_set = set(sf_not_found)

In [None]:
len(sf_not_found_set)

Execute this code just in case clean the texts are needed: remove string punctuation, accents, lower case, remove double spaces, separate numbers from sf

In [None]:
# def clean_text_after(data):
    
#     for instance in data:
#         instance[1] = clean_text(instance[1]).lower()
#         instance[2] = clean_text(instance[2]).lower()
#         instance[1] = re.sub(r'(\d+)', r'\g<1> ', instance[1]) #metemos espacio entre número y acrónimos que quedan pegados    
#         instance[2] = re.sub(r'(\d+)', r'\g<1> ', instance[2])
#         instance[1] = re.sub(r'\s+', ' ', instance[1]).strip()
#         instance[2] = re.sub(r'\s+', ' ', instance[2]).strip()
#     return data
        

In [None]:
# train_ndata = clean_text_after(train_ndata)

### Get the contexts 

From the texts before and after the sf, we limit the number of words

In [None]:
star_ends_tokens = 6 #number of tokens to add because <start> and <end> labels 

In [None]:
n_step_f = 10 + star_ends_tokens #number of words to select from the forward context
n_step_b = 10 #number of words to select from the backward context

In [None]:
nltk.download('punkt')

In [None]:
dev_ndata = limit_context(dev_ndata)

In [None]:
dev_ndata[0]

### Transform into df grouped by LF

In [None]:
data_dic = create_dict(dev_ndata)

In [None]:
data_df = pd.DataFrame(data_dic)

Asign an id to each LF

In [None]:
data_df = data_df.assign(id=(data_df['long_form']).astype('category').cat.codes)

In [None]:
data_df.head()

In [None]:
data_df.shape

We will join each SF with all possible LFs, one for each record, so that later the model works in binary form assigning the probability that that is its LF

In [None]:
sf_lf_unique = data_df[['short_form', 'long_form']].drop_duplicates()

In [None]:
sf_lf_unique.shape

In [None]:
sf_lf_unique.sort_values('short_form').head()

In [None]:
data_merged = data_df.merge(sf_lf_unique, on = 'short_form', how = 'left')

In [None]:
data_merged.shape

In [None]:
pd.set_option('display.max_colwidth', 100)

In [None]:
data_df.sort_values(['short_form', 'context']).head()

In [None]:
data_merged.sort_values(['short_form', 'context']).head()

Target must be 1 or 0. 1 if the corresponding LF is the one assigned to it, 0 otherwise

In [None]:
data_merged['label'] = data_merged.apply(get_label, axis = 1)

In [None]:
data_merged.head()

In [None]:
data_merged = data_merged[['short_form', 'context', 'long_form_y', 'label']]

In [None]:
data_merged = data_merged.rename(columns = {'long_form_y':'long_form'})

In [None]:
data_merged.reset_index(drop = True, inplace = True)

In [None]:
data_merged.head()

In [None]:
data_merged.to_csv('../data/data_train/dev_data_beto_10_allacron_lfnorm_medline.csv')

## Prepare Test Data

### Preprocessing

In [None]:
testing.rename(columns={'nombre':'doc_id'}, inplace = True)

In [None]:
test = testing.rename(columns = {'Definition_lemmatized_x':'Definition'})

In [None]:
test.head()

In [None]:
test = test.rename(columns = {'Abbreviation': 'short_form', 'Definition': 'long_form'})

In [None]:
test = test.dropna(subset = ['short_form', 'long_form'])

In [None]:
test[['StartOffset', 'EndOffset']] = test[['StartOffset', 'EndOffset']].astype(int)

Clean long forms to delete acents, string punctuation etc

In [None]:
test['long_form'] = test['long_form'].map(clean_text)

### Normalize long forms

In [None]:
sf_lf_test = test[['short_form', 'long_form']].drop_duplicates()

In [None]:
sf_lf_list_test = sf_lf_test.groupby('short_form', as_index=False).agg({'long_form': list})
sf_lf_list_test['len'] = sf_lf_list_test['long_form'].map(lambda x: len(x))
sf_lf_list_test.sort_values('len', ascending = False)

#### Apply Levenshtein distance to normalize Long Forms

Get more frequent lf per sf

In [None]:
frec = test['long_form'].value_counts().reset_index()
frec

Create a dictionary where keys are the lf to normalize and the values the normalized form

In [None]:
norm_test = []
for i in sf_lf_list_test['long_form']:
    norm_test.append(normalize_lf(i))

In [None]:
norm_test = [i for i in norm_test if i != None]

In [None]:
norm_dict_test = {}
for i in norm_test:
    norm_dict_test.update(i)
#norm_dict

Finally normalize long forms over the dataframe

In [None]:
# Inspect the test frame (the one being normalized in this section) before the replacement.
test.head()

In [None]:
test = test.replace({"long_form": norm_dict_test})

In [None]:
# Inspect the test frame after its long forms have been normalized.
test.head()

### Get just ambigous acronyms

Execute this code just in case only ambigous acronyms are needed.

In [None]:
#amb_test = test.groupby('short_form')['long_form'].nunique().reset_index().sort_values('long_form', ascending = False)

In [None]:
#amb_test.head()

In [None]:
#table = pd.crosstab(amb_test['long_form'], columns = 'Count')
#table.plot.bar()
#plt.show()

In [None]:
#defin_test = amb_test[amb_test['long_form'] != 1]['short_form'].values.tolist()

In [None]:
#test.shape

In [None]:
#test = test[test['short_form'].isin(defin_test)]

In [None]:
#test.shape

### Transform dataframe to a list of dictionaries

In [None]:
test_data = test[['doc_id', 'texto', 'short_form', 'long_form', 'StartOffset', 'EndOffset']].to_dict('records')

### Get the text before and after the SF

In [None]:
test_ndata, sf_not_found = get_f_b_context_text(test_data)

Check whether any SF was not found in the text

In [None]:
sf_not_found_set = set(sf_not_found)

In [None]:
len(sf_not_found_set)

Execute this code only if the texts need to be cleaned: remove punctuation and accents, convert to lower case, collapse double spaces, and separate numbers from SFs

In [None]:
# def clean_text_after(data):
    
#     for instance in data:
#         instance[1] = clean_text(instance[1]).lower()
#         instance[2] = clean_text(instance[2]).lower()
#         instance[1] = re.sub(r'(\d+)', r'\g<1> ', instance[1]) #insert a space between numbers and acronyms that are stuck together
#         instance[2] = re.sub(r'(\d+)', r'\g<1> ', instance[2])
#         instance[1] = re.sub(r'\s+', ' ', instance[1]).strip()
#         instance[2] = re.sub(r'\s+', ' ', instance[2]).strip()
#     return data
        

In [None]:
# train_ndata = clean_text_after(train_ndata)

### Get the contexts 

From the texts before and after the sf, we limit the number of words

In [None]:
star_ends_tokens = 6 #number of extra tokens added by the <start> and <end> labels (they appear as '< start >' and '< end >' in the limited contexts, i.e. 3 tokens per label)

In [None]:
n_step_f = 10 + star_ends_tokens #number of words to select from the forward context
n_step_b = 10 #number of words to select from the backward context

In [None]:
test_ndata = limit_context(test_ndata)

In [None]:
test_ndata[0]

### Transform into df grouped by LF

In [None]:
data_dic = create_dict(test_ndata)

In [None]:
data_df = pd.DataFrame(data_dic)

Assign an id to each LF

In [None]:
data_df = data_df.assign(id=(data_df['long_form']).astype('category').cat.codes)

In [None]:
data_df.head()

In [None]:
data_df.shape

We join each SF with all of its possible LFs, one per record, so that the model can later work as a binary classifier, assigning to each (SF, LF) pair the probability that the LF is the correct one

In [None]:
sf_lf_unique = data_df[['short_form', 'long_form']].drop_duplicates()

In [None]:
sf_lf_unique.shape

In [None]:
sf_lf_unique.sort_values('short_form').head()

In [None]:
data_merged_test = data_df.merge(sf_lf_unique, on = 'short_form', how = 'left')

In [None]:
data_merged_test.shape

In [None]:
data_df.sort_values(['short_form', 'context']).head()

In [None]:
data_merged_test.sort_values(['short_form', 'context']).head()

Target must be 1 or 0. 1 if the corresponding LF is the one assigned to it, 0 otherwise

In [None]:
data_merged_test['label'] = data_merged_test.apply(get_label, axis = 1)

In [None]:
data_merged_test.head()

In [None]:
data_merged_test = data_merged_test[['short_form', 'context', 'long_form_y', 'label']]

In [None]:
data_merged_test = data_merged_test.rename(columns = {'long_form_y':'long_form'})

In [None]:
# Renumber the rows 0..n-1; reassign the result instead of mutating the frame in place.
data_merged_test = data_merged_test.reset_index(drop = True)

In [None]:
data_merged_test.head()

In [None]:
data_merged_test.to_csv('../data/data_train/test_data_beto_10_allacronim_ownprocess.csv', index = False, sep = '\t')

## Analyze acronyms included in train and test

In [None]:
data_merged.head(3)

In [None]:
data_merged_test.head(3)

In [None]:
merged = data_merged_test.merge(data_merged, on = 'short_form', indicator = True, how = 'left')

In [None]:
merged.head()

In [None]:
merged[merged['_merge'] == 'left_only'].shape

In [None]:
data_merged_test.shape

In [None]:
#sorted(merged[merged['_merge'] == 'left_only']['short_form'].unique().tolist())

# Prepare Test Soto

### 1) Get the short-form from the text

I already have it from Soto process

In [1074]:
test = pd.read_csv("../../data/marzo2023/test_subtrack2_marzo23soto_parte1.csv", sep = '\t')

In [1075]:
test.head(2)

Unnamed: 0,# Document_ID,StartOffset,EndOffset,Abbreviation,Definition,Definition_lemmatized
0,S1130-14732005000200003-1,300,302,mm,aminoacidos más abundantes,aminoacidos más abundantes
1,S1130-14732005000200003-1,649,651,TC,tomografías computarizadas,tomografías computarizadas


In [1076]:
#test = test.rename(columns = {'Abbreviation': 'short_form', 'Definition': 'long_form'})

In [1077]:
#test = test.dropna(subset = ['short_form', 'long_form'])

In [1078]:
test[['StartOffset', 'EndOffset']] = test[['StartOffset', 'EndOffset']].astype(int)

In [1079]:
test = test[['# Document_ID', 'StartOffset', 'EndOffset', 'Abbreviation']]

In [1080]:
test.head(2)

Unnamed: 0,# Document_ID,StartOffset,EndOffset,Abbreviation
0,S1130-14732005000200003-1,300,302,mm
1,S1130-14732005000200003-1,649,651,TC


Join the raw text to the Soto dataset

In [1081]:
testing_raw = testing_raw.rename(columns = {'nombre': '# Document_ID'})

In [1082]:
test = test.merge(testing_raw, on = '# Document_ID', how = 'left')

In [1083]:
test.head(2)

Unnamed: 0,# Document_ID,StartOffset,EndOffset,Abbreviation,texto
0,S1130-14732005000200003-1,300,302,mm,"Mujer de 29 años de edad, diagnosticada de cavernomas múltiples en 1996, año en que fue interven..."
1,S1130-14732005000200003-1,649,651,TC,"Mujer de 29 años de edad, diagnosticada de cavernomas múltiples en 1996, año en que fue interven..."


In [1084]:
test = test.drop_duplicates()
test.shape

(1561, 5)

### 2) Separate SF with LF in the text from the ones without explicit LF in the text

In [1085]:
test_sflf = pd.read_csv("../../data/originales_soto/OutputApproach4Relations_testing.tsv", sep = '\t')

In [1086]:
test_sflf.head()

Unnamed: 0,# Document_ID,Mention_A_type,Mention_A_StartOffset,Mention_A_EndOffset,Mention_A,Relation_type,Mention_B_type,Mention_B_StartOffset,Mention_B_EndOffset,Mention_B
0,S1130-14732005000200003-1,SHORT_FORM,1683,1688,XOMED,SHORT-LONG,LONG_FORM,1672,1681,Medtronic
1,S0365-66912011001100006-1,SHORT_FORM,127,129,AV,SHORT-LONG,LONG_FORM,111,125,agudeza visual
2,S0365-66912011001100006-1,SHORT_FORM,206,208,OD,SHORT-LONG,LONG_FORM,193,204,ojo derecho
3,S0365-66912011001100006-1,SHORT_FORM,250,252,OI,SHORT-LONG,LONG_FORM,235,248,ojo izquierdo
4,S0212-71992004000300009-1,SHORT_FORM,933,936,CEA,SHORT-LONG,LONG_FORM,907,931,con marcadores tumorales


In [1087]:
print(test.shape)
print(test_sflf.shape)
print(test['# Document_ID'].nunique())
print(test_sflf['# Document_ID'].nunique())

(1561, 5)
(213, 10)
208
99


In [1088]:
doc_id_test = test['# Document_ID'].unique().tolist()
doc_id_test_sflf = test_sflf['# Document_ID'].unique().tolist()

In [1089]:
len(set(doc_id_test) ^ set(doc_id_test_sflf))

111

We take the SFs from the other dataset, where the LF is explicitly written in the text

In [1090]:
# Keep the document id, the short-form mention (offsets + text) and its long form,
# renamed to the column names used by `test` so the two frames can be merged.
test_sflf2 = (
    test_sflf
    .loc[:, ['# Document_ID', 'Mention_A_StartOffset', 'Mention_A_EndOffset', 'Mention_A', 'Mention_B']]
    .rename(columns = {
        'Mention_A_StartOffset': 'StartOffset',
        'Mention_A_EndOffset': 'EndOffset',
        'Mention_A': 'Abbreviation',
        'Mention_B': 'Definition',
    })
)

In [1091]:
df_all = test.merge(test_sflf2, how = 'left', on = ['# Document_ID', 'Abbreviation','StartOffset','EndOffset'], indicator = True)

In [1092]:
df_all = df_all.drop_duplicates()

In [1093]:
df_all.head()

Unnamed: 0,# Document_ID,StartOffset,EndOffset,Abbreviation,texto,Definition,_merge
0,S1130-14732005000200003-1,300,302,mm,"Mujer de 29 años de edad, diagnosticada de cavernomas múltiples en 1996, año en que fue interven...",,left_only
1,S1130-14732005000200003-1,649,651,TC,"Mujer de 29 años de edad, diagnosticada de cavernomas múltiples en 1996, año en que fue interven...",,left_only
2,S1130-14732005000200003-1,741,743,mm,"Mujer de 29 años de edad, diagnosticada de cavernomas múltiples en 1996, año en que fue interven...",,left_only
3,S1130-14732005000200003-1,819,821,RM,"Mujer de 29 años de edad, diagnosticada de cavernomas múltiples en 1996, año en que fue interven...",,left_only
4,S1130-14732005000200003-1,1011,1013,IV,"Mujer de 29 años de edad, diagnosticada de cavernomas múltiples en 1996, año en que fue interven...",,left_only


In [1094]:
df_all._merge.value_counts()

left_only     1380
both           181
right_only       0
Name: _merge, dtype: int64

In [1095]:
df_all[df_all._merge == 'both'].head()

Unnamed: 0,# Document_ID,StartOffset,EndOffset,Abbreviation,texto,Definition,_merge
8,S0365-66912011001100006-1,127,129,AV,"Varón de 75 años, diagnosticado de queratopatía lipoidea bilateral primaria, que refería pérdida...",agudeza visual,both
9,S0365-66912011001100006-1,206,208,OD,"Varón de 75 años, diagnosticado de queratopatía lipoidea bilateral primaria, que refería pérdida...",ojo derecho,both
10,S0365-66912011001100006-1,250,252,OI,"Varón de 75 años, diagnosticado de queratopatía lipoidea bilateral primaria, que refería pérdida...",ojo izquierdo,both
15,S0212-71992004000300009-1,933,936,CEA,Varón de 22 años de edad que acude a consultas por presentar desde hacía 4 meses una adenopatía ...,con marcadores tumorales,both
37,S0211-69952011000400013-1,1266,1268,TC,Mujer de 58 años con antecedentes personales de síndrome depresivo y estenosis del canal lumbar....,tomografía computarizada,both


Save separately the SFs whose LF is explicit in the text and the SFs without a definition in the text

In [1096]:
df_all.shape

(1561, 7)

In [1097]:
df_all[df_all._merge == 'both'].to_csv('../../data/abril23/test_sf_lftext_soto.tsv', sep = '\t', index = False)

In [1098]:
test = df_all[df_all._merge == 'left_only']

The SFs without a definition in the text are assigned candidate definitions from the dictionary and then disambiguated with a transformer

In [1099]:
test = test[['# Document_ID', 'StartOffset', 'EndOffset', 'Abbreviation', 'texto']]

In [1100]:
test.head()

Unnamed: 0,# Document_ID,StartOffset,EndOffset,Abbreviation,texto
0,S1130-14732005000200003-1,300,302,mm,"Mujer de 29 años de edad, diagnosticada de cavernomas múltiples en 1996, año en que fue interven..."
1,S1130-14732005000200003-1,649,651,TC,"Mujer de 29 años de edad, diagnosticada de cavernomas múltiples en 1996, año en que fue interven..."
2,S1130-14732005000200003-1,741,743,mm,"Mujer de 29 años de edad, diagnosticada de cavernomas múltiples en 1996, año en que fue interven..."
3,S1130-14732005000200003-1,819,821,RM,"Mujer de 29 años de edad, diagnosticada de cavernomas múltiples en 1996, año en que fue interven..."
4,S1130-14732005000200003-1,1011,1013,IV,"Mujer de 29 años de edad, diagnosticada de cavernomas múltiples en 1996, año en que fue interven..."


In [1101]:
test[['Abbreviation']].drop_duplicates().to_csv('../../data/abril23/test_abbreviation.csv', index = False)

### 3) Assign long forms from the AbreMES database

In [1102]:
abremes = pd.read_csv("../../publicacion/AbreMES-DB/DB/test_abremes.tsv", sep = '\t')

In [1103]:
abremes.head()

Unnamed: 0,Abbreviation,Definition
0,Zn,zinc
1,Zn,zonas 1 y 2
2,WPW,wolff-parkinson-white
3,WPW,wolff-parkinson-white manifiesto y oculto
4,WAIS,wechsler adult intelligence scale


In [1105]:
#abremes['Abbreviation'] = abremes['Abbreviation'].str.replace('[!"#$%&*+,-./:;<=>?@^_`{|}~]','')

In [1106]:
test['Abbreviation'] = test['Abbreviation'].str.strip()

In [1107]:
abremes[abremes.Abbreviation == 'mm']

Unnamed: 0,Abbreviation,Definition


In [1036]:
print(test.shape)
print(test.Abbreviation.nunique())
print(abremes.shape)
print(abremes.Abbreviation.nunique())

(1380, 5)
281
(1979, 2)
232


In [1037]:
abremes = pd.read_csv("../../publicacion/AbreMES-DB/DB/pairs.tsv", sep = '\t')

In [1038]:
abremes.head()

Unnamed: 0,Abbreviation,Definition
0,ZVTN,Zonas Veredales Transitorias de Normalización
1,ZUA,Zona de Última Acción
2,ZU,zona de salud urbana
3,ZU,zonas urbanas
4,ZTPI,Zimbardo Time Perspective Inventory


In [1039]:
# Strip punctuation characters from the abbreviations. The pattern is a regex
# character class, so regex=True must be explicit: without it pandas emits a
# FutureWarning on older versions and, from pandas 2.0 on, treats the pattern
# as a literal string (which would match nothing here).
abremes['Abbreviation'] = abremes['Abbreviation'].str.replace('[!"#$%&*+,-./:;<=>?@^_`{|}~]','', regex=True)

  """Entry point for launching an IPython kernel.


In [1040]:
test = test.rename(columns = {'abrev':'Abbreviation'})

In [1041]:
test['Abbreviation'] = test['Abbreviation'].str.strip()

In [1042]:
print(test.shape)
print(test.Abbreviation.nunique())
print(abremes.shape)
print(abremes.Abbreviation.nunique())

(1380, 5)
281
(55302, 2)
20852


In [1108]:
test.head(2)

Unnamed: 0,# Document_ID,StartOffset,EndOffset,Abbreviation,texto
0,S1130-14732005000200003-1,300,302,mm,"Mujer de 29 años de edad, diagnosticada de cavernomas múltiples en 1996, año en que fue interven..."
1,S1130-14732005000200003-1,649,651,TC,"Mujer de 29 años de edad, diagnosticada de cavernomas múltiples en 1996, año en que fue interven..."


In [1109]:
test_def = test.merge(abremes, how = 'left', on = 'Abbreviation', indicator = True)

In [1110]:
test_def.head(2)

Unnamed: 0,# Document_ID,StartOffset,EndOffset,Abbreviation,texto,Definition,_merge
0,S1130-14732005000200003-1,300,302,mm,"Mujer de 29 años de edad, diagnosticada de cavernomas múltiples en 1996, año en que fue interven...",,left_only
1,S1130-14732005000200003-1,649,651,TC,"Mujer de 29 años de edad, diagnosticada de cavernomas múltiples en 1996, año en que fue interven...",tomografaa computerizada,both


In [1111]:
test_def['Definition'] = test_def['Definition'].fillna('no_existe')

In [1112]:
test_def.shape

(9580, 7)

In [1113]:
test_def.head(2)

Unnamed: 0,# Document_ID,StartOffset,EndOffset,Abbreviation,texto,Definition,_merge
0,S1130-14732005000200003-1,300,302,mm,"Mujer de 29 años de edad, diagnosticada de cavernomas múltiples en 1996, año en que fue interven...",no_existe,left_only
1,S1130-14732005000200003-1,649,651,TC,"Mujer de 29 años de edad, diagnosticada de cavernomas múltiples en 1996, año en que fue interven...",tomografaa computerizada,both


### 4) Add dictionary with measure units

In [1114]:
# Load the measure-units dictionary, stored as a Python literal in a text file.
# The encoding is passed explicitly (as read_texts does) so that the result does
# not depend on the platform's default encoding.
with open("dictionary_measureunits.txt", "r", encoding="utf8") as data:
    dictionary = ast.literal_eval(data.read())

Assign definitions from the measure-units dictionary (e.g. `mm`, which has no entry in the AbreMES DB, gets `milimetro`)

In [1115]:
test_def['Definition'] = test_def.apply(lambda x: defin_dictionary(x, dictionary), axis = 1)

In [1116]:
test_def.head(2)

Unnamed: 0,# Document_ID,StartOffset,EndOffset,Abbreviation,texto,Definition,_merge
0,S1130-14732005000200003-1,300,302,mm,"Mujer de 29 años de edad, diagnosticada de cavernomas múltiples en 1996, año en que fue interven...",milimetro,left_only
1,S1130-14732005000200003-1,649,651,TC,"Mujer de 29 años de edad, diagnosticada de cavernomas múltiples en 1996, año en que fue interven...",tomografaa computerizada,both


In [1117]:
sf_notfind= test_def[test_def['Definition'].isna()]['Abbreviation'].unique().tolist()

In [1118]:
len(sf_notfind)

35

In [1119]:
test_def= test_def.dropna(subset = ['Definition'])

In [1120]:
#In the real test set there are 600-odd (presumably unique abbreviations)
print(test_def.Abbreviation.nunique())

246


In [1121]:
test_def = test_def.drop_duplicates()

In [1122]:
test_def.head()

Unnamed: 0,# Document_ID,StartOffset,EndOffset,Abbreviation,texto,Definition,_merge
0,S1130-14732005000200003-1,300,302,mm,"Mujer de 29 años de edad, diagnosticada de cavernomas múltiples en 1996, año en que fue interven...",milimetro,left_only
1,S1130-14732005000200003-1,649,651,TC,"Mujer de 29 años de edad, diagnosticada de cavernomas múltiples en 1996, año en que fue interven...",tomografaa computerizada,both
2,S1130-14732005000200003-1,649,651,TC,"Mujer de 29 años de edad, diagnosticada de cavernomas múltiples en 1996, año en que fue interven...",trayectorias clanicas,both
3,S1130-14732005000200003-1,649,651,TC,"Mujer de 29 años de edad, diagnosticada de cavernomas múltiples en 1996, año en que fue interven...",trastornos cra3nicos,both
4,S1130-14732005000200003-1,649,651,TC,"Mujer de 29 años de edad, diagnosticada de cavernomas múltiples en 1996, año en que fue interven...",taninos condensados,both


In [1123]:
print(test_def.shape)
print(test_def.Abbreviation.nunique())

(9407, 7)
246


### Normalize long forms

In [1124]:
test_def[test_def['Abbreviation'] == 'II']

Unnamed: 0,# Document_ID,StartOffset,EndOffset,Abbreviation,texto,Definition,_merge
558,S0365-66912007001100010-1,262,264,II,Paciente de 63 años que refería déficit de agudeza visual (AV) en el ojo derecho (OD) de varios ...,infrahepa!tica,both
559,S0365-66912007001100010-1,262,264,II,Paciente de 63 años que refería déficit de agudeza visual (AV) en el ojo derecho (OD) de varios ...,inguinales directas,both
560,S0365-66912007001100010-1,262,264,II,Paciente de 63 años que refería déficit de agudeza visual (AV) en el ojo derecho (OD) de varios ...,intraoperatoria en urologaa,both
561,S0365-66912007001100010-1,262,264,II,Paciente de 63 años que refería déficit de agudeza visual (AV) en el ojo derecho (OD) de varios ...,informacia3n,both
562,S0365-66912007001100010-1,262,264,II,Paciente de 63 años que refería déficit de agudeza visual (AV) en el ojo derecho (OD) de varios ...,intestinal,both
...,...,...,...,...,...,...,...
8019,S1137-66272014000300015-1,732,734,II,"Paciente mujer de 57 años, sin antecedentes médicos de interés, exfumadora desde hace 15 años, r...",insuficiencia cardaaca,both
8020,S1137-66272014000300015-1,732,734,II,"Paciente mujer de 57 años, sin antecedentes médicos de interés, exfumadora desde hace 15 años, r...",investigacia3n urola3gica,both
8021,S1137-66272014000300015-1,732,734,II,"Paciente mujer de 57 años, sin antecedentes médicos de interés, exfumadora desde hace 15 años, r...",italia,both
8022,S1137-66272014000300015-1,732,734,II,"Paciente mujer de 57 años, sin antecedentes médicos de interés, exfumadora desde hace 15 años, r...",investigacia3n,both


In [1125]:
sf_lf_test = test_def[['Abbreviation', 'Definition']].drop_duplicates()

In [1126]:
# For every abbreviation, gather its distinct candidate definitions into a list and count them.
sf_lf_list_test = (
    sf_lf_test
    .groupby('Abbreviation', as_index=False)
    .agg({'Definition': list})
)
sf_lf_list_test['len'] = sf_lf_list_test['Definition'].map(len)
# Display the abbreviations with the most candidate definitions first.
sf_lf_list_test.sort_values(by='len', ascending=False)

Unnamed: 0,Abbreviation,Definition,len
15,AP,"[andadores de puntillas, atencia3n primaria, a!reas protegidas, alcohol peralico, angioplastia p...",58
40,DC,"[desnutricia3n cra3nica, de cabeza, dermatosis cenicienta, dendraticas, doppler color, distorsio...",57
28,CA,"[control absoluto, comportamiento alimentario, calcio en la sangre, crioterapia, ca!mara anterio...",53
0,AA,"[avena, agudizados, aguda, aminoa!cidos, adenoamigdalectomaa, avanzados, a!cido asca3rbico, abdo...",53
33,CI,"[ciento, calorimetraa indirecta, cardiaca, concentracia3n del inocula3, cuidadores informales, c...",49
...,...,...,...
143,Mg,[magnesia],1
142,Mc,[metacognicia3n],1
129,LSI,[lineal invariante a desplazamiento],1
120,LDH,[la!ctico deshidrogenasa],1


#### Apply Levenshtein distance to normalize Long Forms

Get the most frequent LF per SF

In [1127]:
frec = test_def['Definition'].value_counts().reset_index()
frec

Unnamed: 0,index,Definition
0,centimetro,143
1,tomografaa computerizada,126
2,tomografaa computada de abdomen,79
3,tomografaa axial computarizada de ta3rax,79
4,tratamiento asertivo comunitario,79
...,...,...
1970,aceite residual automotriz,1
1971,infarto agudo del miocardio,1
1972,trasplante de pa!ncreas,1
1973,tiempo programado,1


Create a dictionary where keys are the lf to normalize and the values the normalized form

In [1128]:
# Apply normalize_lf to the list of candidate definitions of every abbreviation.
norm_test = [normalize_lf(lf_group) for lf_group in sf_lf_list_test['Definition']]

In [1129]:
# Drop the entries for which normalize_lf returned None (identity check, per PEP 8).
norm_test = [mapping for mapping in norm_test if mapping is not None]

In [1130]:
# Merge the per-abbreviation mappings into a single lookup table;
# on repeated keys the mapping processed last wins.
norm_dict_test = dict()
for lf_mapping in norm_test:
    norm_dict_test.update(lf_mapping)
#norm_dict

Finally normalize long forms over the dataframe

In [1131]:
test_def.head()

Unnamed: 0,# Document_ID,StartOffset,EndOffset,Abbreviation,texto,Definition,_merge
0,S1130-14732005000200003-1,300,302,mm,"Mujer de 29 años de edad, diagnosticada de cavernomas múltiples en 1996, año en que fue interven...",milimetro,left_only
1,S1130-14732005000200003-1,649,651,TC,"Mujer de 29 años de edad, diagnosticada de cavernomas múltiples en 1996, año en que fue interven...",tomografaa computerizada,both
2,S1130-14732005000200003-1,649,651,TC,"Mujer de 29 años de edad, diagnosticada de cavernomas múltiples en 1996, año en que fue interven...",trayectorias clanicas,both
3,S1130-14732005000200003-1,649,651,TC,"Mujer de 29 años de edad, diagnosticada de cavernomas múltiples en 1996, año en que fue interven...",trastornos cra3nicos,both
4,S1130-14732005000200003-1,649,651,TC,"Mujer de 29 años de edad, diagnosticada de cavernomas múltiples en 1996, año en que fue interven...",taninos condensados,both


In [1132]:
test_def = test_def.replace({"Definition": norm_dict_test})

In [1133]:
test_def.head()

Unnamed: 0,# Document_ID,StartOffset,EndOffset,Abbreviation,texto,Definition,_merge
0,S1130-14732005000200003-1,300,302,mm,"Mujer de 29 años de edad, diagnosticada de cavernomas múltiples en 1996, año en que fue interven...",milimetro,left_only
1,S1130-14732005000200003-1,649,651,TC,"Mujer de 29 años de edad, diagnosticada de cavernomas múltiples en 1996, año en que fue interven...",tomografaa computerizada,both
2,S1130-14732005000200003-1,649,651,TC,"Mujer de 29 años de edad, diagnosticada de cavernomas múltiples en 1996, año en que fue interven...",trayectorias clanicas,both
3,S1130-14732005000200003-1,649,651,TC,"Mujer de 29 años de edad, diagnosticada de cavernomas múltiples en 1996, año en que fue interven...",trastornos cra3nicos,both
4,S1130-14732005000200003-1,649,651,TC,"Mujer de 29 años de edad, diagnosticada de cavernomas múltiples en 1996, año en que fue interven...",taninos condensados,both


In [1134]:
test_def.shape

(9407, 7)

In [1135]:
test_def = test_def.drop_duplicates()

In [1136]:
test_def.shape

(9392, 7)

### Transform dataframe to a list of dictionaries

In [1137]:
#test_data = test[['doc_id', 'texto', 'short_form', 'long_form', 'StartOffset', 'EndOffset']].to_dict('records')
test_data = test_def.to_dict('records')

In [1138]:
len(test_data)

9392

### Get the text before and after the SF

In [1139]:
test_ndata, sf_not_found = get_f_b_context_text(test_data)

In [1140]:
len(test_ndata)

9009

Check whether any SF was not found in the text

In [1141]:
sf_not_found_set = set(sf_not_found)

In [1142]:
len(sf_not_found_set)

20

In [1143]:
test_ndata[0]

['S1130-14732005000200003-1',
 'mm',
 'Mujer de 29 años de edad, diagnosticada de cavernomas múltiples en 1996, año en que fue intervenida tras hemorragia frontal de angioma cavernoso frontal derecho, con evacuación del hematoma y resección de la lesión vascular. Ese mismo año se objetivó un pequeño angioma protuberancial izquierdo de 3  <start> mm <end> ',
 'de diámetro. La paciente permaneció asintomática hasta enero de 2004, cuando ingresa por presentar cefalea y vómitos asociados a hemihipoestesia con parestesias de hemicuerpo derecho y hemiparesia 4/5 derecha. En la exploración destacaba un nistagmo vertical, una hemiparesia derecha 4/5 y una hemihipoestesia derecha con extinción sensitiva.\nLa TC craneal realizada al ingreso mostraba un hematoma protuberancial posterior derecho de 20 mm de diámetro, relacionado con la localización ya conocida del cavernoma.\nLa RM cerebral confirmaba le presencia de un angioma cavernoso protuberancial con la presencia de hemosiderina perilesional

### Get the contexts 

From the texts before and after the sf, we limit the number of words

In [1144]:
star_ends_tokens = 6 #number of extra tokens added by the <start> and <end> labels (they appear as '< start >' and '< end >' in the limited contexts, i.e. 3 tokens per label)

In [1145]:
n_step_f = 10 + star_ends_tokens #number of words to select from the forward context
n_step_b = 10 #number of words to select from the backward context

In [1146]:
test_ndata = limit_context(test_ndata)

In [1147]:
len(test_ndata)

9009

In [1148]:
test_ndata[0]

['S1130-14732005000200003-1',
 'mm',
 'año se objetivó un pequeño angioma protuberancial izquierdo de 3 < start > mm < end >',
 'de diámetro . La paciente permaneció asintomática hasta enero de',
 'milimetro']

### Assign an LF to each SF

In [1149]:
data_dic = create_dict(test_ndata)

In [1150]:
len(data_dic)

9009

In [1151]:
data_df = pd.DataFrame(data_dic)

In [1152]:
#data_df['label'] = 0

In [1158]:
# Restore the accent on the two unit names that come unaccented from the
# measure-units dictionary.
data_df['long_form'] = (
    data_df['long_form']
    .str.replace('milimetro', 'milímetro')
    .str.replace('centimetro', 'centímetro')
)

In [1162]:
data_df['long_form'] = data_df['long_form'].str.replace('Kilogramo','kilogramo')

In [1163]:
data_df.head()

Unnamed: 0,doc_id,short_form,context,long_form
0,S1130-14732005000200003-1,mm,año se objetivó un pequeño angioma protuberancial izquierdo de 3 < start > mm < end > de diámetr...,milímetro
1,S1130-14732005000200003-1,TC,4/5 y una hemihipoestesia derecha con extinción sensitiva . La < start > TC < end > craneal real...,tomografaa computerizada
2,S1130-14732005000200003-1,TC,4/5 y una hemihipoestesia derecha con extinción sensitiva . La < start > TC < end > craneal real...,trayectorias clanicas
3,S1130-14732005000200003-1,TC,4/5 y una hemihipoestesia derecha con extinción sensitiva . La < start > TC < end > craneal real...,trastornos cra3nicos
4,S1130-14732005000200003-1,TC,4/5 y una hemihipoestesia derecha con extinción sensitiva . La < start > TC < end > craneal real...,taninos condensados


In [1164]:
data_df.shape

(9009, 4)

In [1165]:
data_df[['doc_id', 'short_form', 'long_form']].to_csv('../../data/abril23/test_prueba.csv', sep = '\t',index = False)

Remove doc_id from the model input, but save it separately so that it can be joined back afterwards

In [1166]:
data_df2 = data_df[['short_form', 'context', 'long_form']]
data_doc_ids = data_df[['doc_id']]

In [1167]:
data_doc_ids.to_csv('../../data/julio23/test_data_beto_10_allacronim_normalizedlf_abremesprocessed_julio23_IDS.csv', index = False, sep = '\t')
data_df2.to_csv('../../data/julio23/test_data_beto_10_allacronim_normalizedlf_abremesprocessed_julio23.csv', index = False, sep = '\t')

## Datasets for model test

In [352]:
#prueba = pd.read_csv('../data/marzo2023/subtrack2/train_data_beto_10_allacron_nomedline_nolevenstein_ownpreproces.csv', sep = '\t')
#prueba = data_df2.head(10)
#prueba.to_csv('../data/marzo2023/subtrack2/test_prueba_input.csv', index = False, sep = '\t')

Split train into train and validation sets to check whether the model predicts correctly on the training data

In [445]:
from sklearn.model_selection import train_test_split

In [446]:
# Load the training CSV (comma-separated, which is pandas' default delimiter).
train = pd.read_csv(
    "../../data/data_train/train_data_beto_10_NOamb_lfnorm_medline.csv",
    sep=',',
)

In [449]:
# Remove the 'Unnamed: 0' column (presumably an index that was saved into the CSV — confirm).
# drop(..., errors='ignore') keeps the cell re-runnable: a second run is a no-op instead of
# the KeyError that `del` raises once the column is gone.
train = train.drop(columns=['Unnamed: 0'], errors='ignore')

In [450]:
# Show the first row of train to check its columns.
train.head(1)

Unnamed: 0,short_form,context,long_form,label
0,dl,"del líquido una glucorraquia normal , proteinorraquia de 102 mg/ < start > dl < end > 960 célula...",decilitro,1


In [469]:
# Hold out 20% of the rows as a validation set; the fixed random_state makes the split repeatable.
train_split, val = train_test_split(
    train,
    random_state=42,
    test_size=0.20,
)

In [470]:
# Strip the 'label' column from the validation frame
# (presumably so it can be fed to the model as unlabeled input — confirm).
# drop(..., errors='ignore') keeps the cell re-runnable: a second run is a no-op instead of
# the KeyError that `del` raises once the column is gone.
val = val.drop(columns=['label'], errors='ignore')

In [471]:
# Save both parts of the split without the index column.
# NOTE(review): the train file uses the default comma delimiter while the validation file is
# tab-separated — confirm that the downstream readers expect this.
train_split.to_csv(
    '../../data/abril23/train_splited_for_test.csv',
    index=False,
)
val.to_csv(
    '../../data/abril23/validation_from_train_splited_for_test.csv',
    sep='\t',
    index=False,
)

In [456]:
# Print the (rows, columns) of each part of the split, one per line.
print(train_split.shape, val.shape, sep='\n')

(5768, 4)
(1442, 4)


In [472]:
# Load the CSV with the predictions for the validation split (comma-separated, pandas' default).
val_output = pd.read_csv(
    '../../data/abril23/train_validation_split_output.csv',
    sep=',',
)

In [473]:
# Remove the 'Unnamed: 0' column (presumably an index that was saved into the CSV — confirm).
# drop(..., errors='ignore') keeps the cell re-runnable: a second run is a no-op instead of
# the KeyError that `del` raises once the column is gone.
val_output = val_output.drop(columns=['Unnamed: 0'], errors='ignore')

In [474]:
# Preview the first five rows of val_output.
val_output.head()

Unnamed: 0,short_form,context,long_form,sentences,Prediction
0,C,en este caso 15 litros de solución salina a 37º < start > C < end > para obtener un drenaje clar...,peak c,[CLS] peak c [SEP] en este caso 15 litros de solución salina a 37º < start > C < end > para obte...,0.00328
1,GGT,del perfil hepático de predominio colestásico ( fosfatasa alcalina y < start > GGT < end > más d...,gama glutamil transferasa,[CLS] gama glutamil transferasa [SEP] del perfil hepático de predominio colestásico ( fosfatasa ...,0.005237
2,UI,"GPT 96 UI/l , GGT 182 UI/l , FA 148 < start > UI < end > l ) . Valorada por Neurología , indican...",unidad internacional,"[CLS] unidad internacional [SEP] GPT 96 UI/l , GGT 182 UI/l , FA 148 < start > UI < end > l ) . ...",0.999654
3,anti-CCP,altos de factor reumatoide y anticuerpos antipéptido cíclico citrulinado ( < start > anti-CCP < ...,anticuerpos antipeptido ciclico citrulinado,[CLS] anticuerpos antipeptido ciclico citrulinado [SEP] altos de factor reumatoide y anticuerpos...,0.999379
4,GRE,"Fast Spin Eco , potenciadas en T1 , T2 , < start > GRE < end > y T1 con gadolinio , encontrando ...",gradientecho,"[CLS] gradientecho [SEP] Fast Spin Eco , potenciadas en T1 , T2 , < start > GRE < end > y T1 con...",0.658944


In [476]:
# Summary statistics of the Prediction column.
val_output['Prediction'].describe()

count    1442.000000
mean        0.716069
std         0.426520
min         0.000776
25%         0.186323
50%         0.998313
75%         0.999427
max         0.999700
Name: Prediction, dtype: float64

In [477]:
# Rows whose Prediction is at least 0.8.
val_output.loc[val_output['Prediction'].ge(0.8)]

Unnamed: 0,short_form,context,long_form,sentences,Prediction
2,UI,"GPT 96 UI/l , GGT 182 UI/l , FA 148 < start > UI < end > l ) . Valorada por Neurología , indican...",unidad internacional,"[CLS] unidad internacional [SEP] GPT 96 UI/l , GGT 182 UI/l , FA 148 < start > UI < end > l ) . ...",0.999654
3,anti-CCP,altos de factor reumatoide y anticuerpos antipéptido cíclico citrulinado ( < start > anti-CCP < ...,anticuerpos antipeptido ciclico citrulinado,[CLS] anticuerpos antipeptido ciclico citrulinado [SEP] altos de factor reumatoide y anticuerpos...,0.999379
5,UCI,", 7/10 y 1/10 durante su estancia en Urgencias , < start > UCI < end > Sala de Hospitalización y...",unidad cuidados intensivos,"[CLS] unidad cuidados intensivos [SEP] , 7/10 y 1/10 durante su estancia en Urgencias , < start ...",0.998991
6,B,El metoprolol pertenece a una clase de medicamentos llamados bloqueadores < start > B < end > Fu...,beta,[CLS] beta [SEP] El metoprolol pertenece a una clase de medicamentos llamados bloqueadores < sta...,0.998262
9,PIO,"una exploración rutinaria , se descubrió una presión intraocular ( < start > PIO < end > de 34 m...",presion intraocular,"[CLS] presion intraocular [SEP] una exploración rutinaria , se descubrió una presión intraocular...",0.999333
...,...,...,...,...,...
1433,LDH,"81 % neutrófilos ) , PCR = 15,93 mg/mL y < start > LDH < end > = 1,154 UI/L . Se realizaron los ...",lactatodeshidrogenasa,"[CLS] lactatodeshidrogenasa [SEP] 81 % neutrófilos ) , PCR = 15,93 mg/mL y < start > LDH < end >...",0.995825
1435,RMN,Ascitis . Se amplía el estudio mediante TC y angio- < start > RMN < end > abdominales : cava int...,resonancia magnetica nuclear,[CLS] resonancia magnetica nuclear [SEP] Ascitis . Se amplía el estudio mediante TC y angio- < s...,0.999469
1437,mmHg,se descubrió una presión intraocular ( PIO ) de 34 < start > mmHg < end > en el ojo derecho ( OD...,milimetro mercurio,[CLS] milimetro mercurio [SEP] se descubrió una presión intraocular ( PIO ) de 34 < start > mmHg...,0.999344
1438,dl,"134 U/l , GPT 91 U/l , BT 1,2 mg/ < start > dl < end > BD 0,5 mg/dl ) que fue diagnosticado por ...",decilitro,"[CLS] decilitro [SEP] 134 U/l , GPT 91 U/l , BT 1,2 mg/ < start > dl < end > BD 0,5 mg/dl ) que ...",0.999632


In [478]:
# Rows whose Prediction is below 0.4.
val_output.loc[val_output['Prediction'].lt(0.4)]

Unnamed: 0,short_form,context,long_form,sentences,Prediction
0,C,en este caso 15 litros de solución salina a 37º < start > C < end > para obtener un drenaje clar...,peak c,[CLS] peak c [SEP] en este caso 15 litros de solución salina a 37º < start > C < end > para obte...,0.003280
1,GGT,del perfil hepático de predominio colestásico ( fosfatasa alcalina y < start > GGT < end > más d...,gama glutamil transferasa,[CLS] gama glutamil transferasa [SEP] del perfil hepático de predominio colestásico ( fosfatasa ...,0.005237
8,VSG,rutina demuestran una hemoglobina de 90mg/lt y una eritosedimentación ( < start > VSG < end > de...,velocidad eritrosedimentacion,[CLS] velocidad eritrosedimentacion [SEP] rutina demuestran una hemoglobina de 90mg/lt y una eri...,0.050198
10,kg,"kg de masa magra ( 88.4 % ) y 10,1 < start > kg < end > de masa grasa ( 11,6 % ) . El paciente",centimetro,"[CLS] centimetro [SEP] kg de masa magra ( 88.4 % ) y 10,1 < start > kg < end > de masa grasa ( 1...",0.038794
13,L,". En el análisis bioquímico destacaban una GGT 220 U/ < start > L < end > GPT 45 U/L , GOT 44 U/...",leucocito,[CLS] leucocito [SEP] . En el análisis bioquímico destacaban una GGT 220 U/ < start > L < end > ...,0.002517
...,...,...,...,...,...
1432,ALT,y GGT más de 10 veces el valor normal y < start > ALT < end > y AST menos de 3 veces el valor no...,alanine transferase,[CLS] alanine transferase [SEP] y GGT más de 10 veces el valor normal y < start > ALT < end > y ...,0.354852
1434,FA,: Fibrilación auricular - alta ; Fib-A - alta ; < start > FA < end > - alta ; FibA - altaJanuary...,fosfatasa alcalina,[CLS] fosfatasa alcalina [SEP] : Fibrilación auricular - alta ; Fib-A - alta ; < start > FA < en...,0.062370
1436,C,"factor V de Leiden , la resistencia a la proteína < start > C < end > activada , el anticoagulan...",cysteine,"[CLS] cysteine [SEP] factor V de Leiden , la resistencia a la proteína < start > C < end > activ...",0.004386
1440,PCR,"asintomática y siendo la hemoglobina , la VSG y la < start > PCR < end > normales . Consultó por...",polymerase chain reaction,"[CLS] polymerase chain reaction [SEP] asintomática y siendo la hemoglobina , la VSG y la < start...",0.002410
