# Library

In [255]:
import pandas as pd
import numpy as np
import os
import re
import collections
import unidecode
import nltk
from nltk.corpus import stopwords
import itertools 
from nltk.tokenize import word_tokenize
from string import punctuation
from functools import reduce
import ast
from nltk.stem import WordNetLemmatizer
from unidecode import unidecode
import requests
from bs4 import BeautifulSoup

In [1]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/egarcia/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
pd.set_option('display.max_colwidth', 100)

In [3]:
%matplotlib inline
from matplotlib import pyplot as plt

## Functions

In [4]:
def distance_levenshtein(str1, str2):
    d=dict()
    for i in range(len(str1)+1):
        d[i]=dict()
        d[i][0]=i
    for i in range(len(str2)+1):
        d[0][i] = i
    for i in range(1, len(str1)+1):
        for j in range(1, len(str2)+1):
            d[i][j] = min(d[i][j-1]+1, d[i-1][j]+1, d[i-1][j-1]+(not str1[i-1] == str2[j-1]))
    return d[len(str1)][len(str2)]

In [5]:
def normalize_lf(row):
    leven2 = []
    for i in row:
        for j in row:
            if i != j:
                long = max(len(i),len(j))
                ratio = distance_levenshtein(i,j)/long
                if ratio < 0.2:
                    leven2.append(j)
    if leven2:
        leven2 = set(leven2)
        lista = []
        for i in leven2:
            #val = frec[frec['index'] == i]['long_form'].iloc[0]
            val = frec[frec['index'] == i]['Definition'].iloc[0]
            lista.append((i, val))
        lista = set(lista)
        most_freq = sorted(set(lista), key=lambda x: x[1], reverse = True)[0][0]
        sust = {}
        for i in set(leven2):
            sust[i] = most_freq
        
        return sust
    else:
        pass
    

## Process AbreMes DB

In [137]:
abremes = pd.read_csv("../../publicacion/AbreMES-DB/DB/pairs_processed.tsv", sep = '\t')

In [138]:
abremes.head()

Unnamed: 0,Abbreviation,Definition
0,ZVTN,zonas veredales transitorias de normalizacia3n
1,ZUA,zona de altima accia3n
2,ZU,zona de salud urbana
3,ZU,zonas urbanas
4,ZTPI,zimbardo time perspective inventory


In [104]:
abremes['Abbreviation'] = abremes['Abbreviation'].str.strip()

In [105]:
abremes['Definition'] = abremes['Definition'].str.lower()

In [107]:
abremes = abremes.dropna()

In [108]:
abremes['Definition'] = abremes['Definition'].map(lambda x: unidecode(x))

In [109]:
abremes = abremes[['Abbreviation', 'Definition']]
abremes = abremes.drop_duplicates()
abremes = abremes.dropna()

In [110]:
#abremes['Definition'] = abremes['Definition'].map(lambda x: re.sub(r'\W+', '', x))

In [111]:
abremes[abremes.Abbreviation == 'TC']

Unnamed: 0,Abbreviation,Definition
5802,TC,tomografaas computarizadas
5803,TC,tomografaa computarizada
5804,TC,tomografia computarizada
5805,TC,tomografaa craneal
5806,TC,trayectorias clanicas
...,...,...
5914,TC,trastornos de la comunicacia3n
5915,TC,anata3mica
5917,TC,/tomografaa computerizada
5918,TC,tomogra!ficas computarizadas


### Normalize long forms

In [112]:
sf_lf_test = abremes[['Abbreviation', 'Definition']].drop_duplicates()

In [113]:
sf_lf_list_test = sf_lf_test.groupby('Abbreviation', as_index=False).agg({'Definition': list})
sf_lf_list_test['len'] = sf_lf_list_test['Definition'].map(lambda x: len(x))
sf_lf_list_test.sort_values('len', ascending = False)

Unnamed: 0,Abbreviation,Definition,len
9932,II,"[situaciones especiales, periodontitis humana, ejecucia3n de tales mecanismos, farmacocina(c)tic...",428
4693,DE,"[edad media, resultados: la media, diferencia promedio, resultados: el valor medio, disfuncia3n ...",138
2663,CC,"[capacidad de campo, circunferencia de cintura, cuatro grupos: control, circunferencia de la cin...",112
14122,PC,"[proteanas carboniladas, paros cardaacos, pensamiento constructivo, peso de cabeza, proteana cru...",112
18586,TC,"[tomografaas computarizadas, tomografaa computarizada, tomografia computarizada, tomografaa cran...",109
...,...,...,...
8660,HC/N3,[hidratos de carbono enriquecida con n3],1
8662,HCAI,[hidrocefalia cra3nica del adulto idiopa!tica],1
8664,HCAMQ,[holistic complementary and alternative medicine questionnaire],1
8667,HCBe+,[hepatitis cra3nica b age+],1


#### Apply Levenshtein distance to normalize Long Forms

Get more frequent lf per sf

In [114]:
frec = abremes['Definition'].value_counts().reset_index()
frec

Unnamed: 0,index,Definition
0,#anombre?,45
1,por ciento,10
2,factor de necrosis tumoral alfa,9
3,internacional,9
4,respiratoria,8
...,...,...
49366,producto cruzado,1
49367,con raza3n de momios,1
49368,como seguros,1
49369,respectivasodds ratios,1


Create a dictionary where keys are the lf to normalize and the values the normalized form

In [115]:
norm_test = []
for i in sf_lf_list_test['Definition']:
    norm_test.append(normalize_lf(i))

In [116]:
norm_test = [i for i in norm_test if i != None]

In [117]:
norm_dict_test = {}
for i in norm_test:
    norm_dict_test.update(i)
#norm_dict

Finally normalize long forms over the dataframe

In [118]:
abremes.head()

Unnamed: 0,Abbreviation,Definition
0,ZVTN,zonas veredales transitorias de normalizacia3n
1,ZUA,zona de altima accia3n
2,ZU,zona de salud urbana
3,ZU,zonas urbanas
4,ZTPI,zimbardo time perspective inventory


In [119]:
abremes = abremes.replace({"Definition": norm_dict_test})

In [120]:
abremes.head()

Unnamed: 0,Abbreviation,Definition
0,ZVTN,zonas veredales transitorias de normalizacia3n
1,ZUA,zona de altima accia3n
2,ZU,zona de salud urbana
3,ZU,zonas urbanas
4,ZTPI,zimbardo time perspective inventory


In [121]:
abremes = abremes.drop_duplicates()

In [122]:
abremes[abremes.Abbreviation == 'II']

Unnamed: 0,Abbreviation,Definition
28580,II,situaciones especiales
28581,II,periodontitis humana
28582,II,ejecucia3n de tales mecanismos
28583,II,farmacocina(c)ticas
28584,II,activos y 6 sedentarios
...,...,...
29012,II,clanico de una sacrolumbalgia
29013,II,tiempos de don quijote
29014,II,hospitalariosen espaa+-a
29015,II,presenciada sin rcp-a


In [123]:
abremes.Definition.nunique()

41817

In [124]:
#abremes_dict = dict(zip(abremes.Abbreviation, abremes.Definition))

In [125]:
#import json

#with open('abremes_dictionary.txt', 'w') as file:
#     file.write(json.dumps(abremes_dict))

In [126]:
abremes.sort_values(by = 'Abbreviation',ascending = True).head()

Unnamed: 0,Abbreviation,Definition
55578,1 -MCP,1-metilciclopropeno
55576,1 min,"1,13 min"
55577,1 mÂ²,10 trampas
55570,"1,3-PD","1,3 propanodiol"
55575,1-149 meses,"51,31 + 47,98 meses"


In [127]:
abremes.to_csv("../../publicacion/AbreMES-DB/DB/pairs_processed.tsv", sep = '\t', index = False)

In [128]:
#Filter AbreMes just to get the SF os test

In [130]:
test = pd.read_csv('../../data/abril23/test_abbreviation.csv', sep = '\t')

In [132]:
test_abremes = abremes.merge(test, on = 'Abbreviation', how = 'inner')

In [133]:
test_abremes.shape

(3344, 2)

In [134]:
test_abremes.to_csv("../../publicacion/AbreMES-DB/DB/test_abremes.tsv", sep = '\t', index = False)

Filter to get just the definitions that starts with the same letter of the sf

In [147]:
test_abremes = pd.read_csv("../../publicacion/AbreMES-DB/DB/test_abremes.tsv", sep = '\t')

In [148]:
test_abremes['lista'] = test_abremes.Definition.map(lambda x: x.split())

In [149]:
test_abremes['primera'] = test_abremes.lista.map(lambda x: x[0][0])

In [150]:
test_abremes.head()

Unnamed: 0,Abbreviation,Definition,lista,primera
0,Zn,zinc,[zinc],z
1,Zn,zonas 1 y 2,"[zonas, 1, y, 2]",z
2,Zn,aznalca3llar,[aznalca3llar],a
3,XL,extra-large,[extra-large],e
4,WPW,wolff-parkinson-white,[wolff-parkinson-white],w


In [151]:
test_abremes.shape

(2985, 4)

In [152]:
def match(row):
    if row['Abbreviation'][0].lower() == row['primera'].lower():
        return 1
    else:
        return 0

In [153]:
test_abremes['match'] = test_abremes.apply(match, axis = 1)

In [155]:
test_abremes.match.value_counts()

1    2040
0     945
Name: match, dtype: int64

In [156]:
test_abremes = test_abremes[test_abremes.match == 1]

In [158]:
test_abremes = test_abremes[['Abbreviation', 'Definition']]

In [159]:
test_abremes.to_csv("../../publicacion/AbreMES-DB/DB/test_abremes.tsv", sep = '\t', index = False)

## Get texts AbreMes DB

Abremes pairs sf-lf processed (removing noise)

In [179]:
abremes = pd.read_csv("../../publicacion/AbreMES-DB/DB/pairs_processed.tsv", sep = '\t')

In [180]:
abremes.head()

Unnamed: 0,Abbreviation,Definition
0,ZVTN,zonas veredales transitorias de normalizacia3n
1,ZUA,zona de altima accia3n
2,ZU,zona de salud urbana
3,ZU,zonas urbanas
4,ZTPI,zimbardo time perspective inventory


In [192]:
abremes.Abbreviation.nunique()

21994

Ger abbreviations with the url of the text

In [210]:
abre = pd.read_csv("../../publicacion/AbreMES-DB/DB/abbreviations.tsv", sep = '\t')

In [219]:
abre['Definitions'] = abre['Definitions'].map(lambda x: x.split(','))

In [221]:
abre = abre[['Abbreviation', 'Definitions','Appears on']]

In [222]:
abre = abre.explode('Definitions')

In [223]:
abre.head()

Unnamed: 0,Abbreviation,Definitions,Appears on
0,Edaom,1,http://www.scielo.org.co/scielo.php?script=sci_arttext&pid=S0123-12942011000100006
1,el ser,2,http://www.scielo.org.co/scielo.php?script=sci_arttext&pid=S0123-12942012000100008
2,ICE,44878,"ibc-142240,ibc-145014,http://www.scielo.org.co/scielo.php?script=sci_arttext&pid=S0123-129420140..."
2,ICE,42211,"ibc-142240,ibc-145014,http://www.scielo.org.co/scielo.php?script=sci_arttext&pid=S0123-129420140..."
2,ICE,3,"ibc-142240,ibc-145014,http://www.scielo.org.co/scielo.php?script=sci_arttext&pid=S0123-129420140..."


Ger definitions with the url of the text

In [236]:
definitions = pd.read_csv("../../publicacion/AbreMES-DB/DB/definitions.tsv", sep = '\t', encoding = 'latin-1')

In [238]:
definitions = definitions[['# Definition ID', 'Definition']]

In [239]:
definitions.head()

Unnamed: 0,# Definition ID,Definition
0,1,Aprendizaje y OrientaciÃ³n Motivacional
1,2,de la persona en el rol rectoral
2,3,Ã­ndice de calidad del empleo
3,4,Experience Questionnaire
4,5,cognitivos artificiales


Join abbreviations and definitions and after that cross it with our processed Abremes file

In [243]:
abre = abre.merge(definitions, left_on = 'Definitions', right_on = '# Definition ID')

In [245]:
abre = abre[['Abbreviation', 'Definition', 'Appears on']]

In [246]:
abre.head(2)

Unnamed: 0,Abbreviation,Definition,Appears on
0,Edaom,Aprendizaje y OrientaciÃ³n Motivacional,http://www.scielo.org.co/scielo.php?script=sci_arttext&pid=S0123-12942011000100006
1,el ser,de la persona en el rol rectoral,http://www.scielo.org.co/scielo.php?script=sci_arttext&pid=S0123-12942012000100008


In [276]:
abreviaciones = abremes.merge(abre, on = ['Abbreviation', 'Definition'], how = 'left')

In [277]:
abreviaciones.head()

Unnamed: 0,Abbreviation,Definition,Appears on
0,ZVTN,zonas veredales transitorias de normalizacia3n,
1,ZUA,zona de altima accia3n,
2,ZU,zona de salud urbana,"ibc-29680,http://www.scielo.org.mx/scielo.php?script=sci_arttext&pid=S1405-31952011000500001"
3,ZU,zonas urbanas,"ibc-29680,http://www.scielo.org.mx/scielo.php?script=sci_arttext&pid=S1405-31952011000500001"
4,ZTPI,zimbardo time perspective inventory,


In [278]:
print(abremes.shape)
print(abreviaciones.dropna().shape)

(45066, 2)
(12731, 3)


In [279]:
abreviaciones = abreviaciones.dropna()

In [280]:
def buscar_url(row):
    patron = r"(http).*"
    resultado = re.search(patron, row['Appears on'])
    if resultado:
        resultado = resultado.group(0)
        
    return resultado

In [281]:
abreviaciones['Appears on'] = abreviaciones.apply(buscar_url, axis = 1)

In [284]:
abreviaciones = abreviaciones.dropna()

In [285]:
abreviaciones.shape

(10754, 3)

In [286]:
abreviaciones['Appears on'] = abreviaciones['Appears on'].map(lambda x: x.split(','))

In [288]:
abreviaciones = abreviaciones.explode('Appears on')

In [292]:
abreviaciones = abreviaciones[~abreviaciones['Appears on'].str.startswith('ibc-')]

In [297]:
abreviaciones = abreviaciones.drop_duplicates()

In [298]:
abreviaciones.shape

(293413, 3)

In [299]:
abreviaciones.head()

Unnamed: 0,Abbreviation,Definition,Appears on
2,ZU,zona de salud urbana,http://www.scielo.org.mx/scielo.php?script=sci_arttext&pid=S1405-31952011000500001
3,ZU,zonas urbanas,http://www.scielo.org.mx/scielo.php?script=sci_arttext&pid=S1405-31952011000500001
13,ZrN,zirconio,http://www.scielo.org.co/scielo.php?script=sci_arttext&pid=S0122-34612012000100002
18,ZR,zumo de remolacha,http://www.scielo.org.co/scielo.php?script=sci_arttext&pid=S0120-548X2009000100009
18,ZR,zumo de remolacha,http://www.scielo.org.mx/scielo.php?script=sci_arttext&pid=S1405-77432012000200007


In [302]:
#abreviaciones.to_csv("../../publicacion/AbreMES-DB/DB/abbrev_defin_url.tsv",index = False)

### Get the file manually cleaned

In [304]:
abreviaciones = pd.read_csv("../../publicacion/AbreMES-DB/DB/abbrev_defin_url.tsv", sep = '\t')

In [305]:
abreviaciones.head()

Unnamed: 0,Abbreviation,Definition,Appears on
0,ZrN,zirconio,http://www.scielo.org.co/scielo.php?script=sci_arttext&pid=S0122-34612012000100002
1,ZO,opsonizado,http://www.scielo.sa.cr/scielo.php?script=sci_arttext&pid=S0253-29482001000100002
2,ZO,opsonizado,http://scielo.isciii.es/scielo.php?script=sci_arttext&pid=S1887-85712015000100004
3,ZNI,zonas no interconectadas,http://www.scielo.org.co/scielo.php?script=sci_arttext&pid=S0120-35922009000100011
4,ZM,zonas metropolitanas,http://www.scielo.org.mx/scielo.php?script=sci_arttext&pid=S0187-69612014000200008


In [306]:
abreviaciones['Appears on'].nunique()

17098

In [311]:
urls = abreviaciones['Appears on'].unique().tolist()

In [320]:
#Las escribimos en un archivo para luego leerlas
with open ('abremes_url.txt','a') as file:
    for i in urls:
        file.write(i + "\n")

In [314]:
def get_text(url):
    
    articles = []
    response = requests.get(url)
    html_content = response.text

    soup = BeautifulSoup(html_content, 'html.parser')
    articulo = soup.get_text()
    articles.append({'url': url, 'text': articulo})
    return articles

In [322]:
# Path
archivo = "abremes_url.txt"

# Read lines and store them in a list
with open(archivo, 'r') as file:
    urls = file.readlines()

urls = [linea.strip() for linea in urls]

In [None]:
textos3 = []
for a in urls[57:]:
    try:
        articulos = get_text(a)
        textos3 += articulos
    except AttributeError:
        print('Fallo en la petición', a)
        continue
    except requests.exceptions.Timeout:
        # Manejar el error de time-out
        print(f"Time-out en la solicitud a {url}. Continuando con la siguiente URL.")
        continue
    except requests.exceptions.TooManyRedirects as e:
        print(f"Error de redireccionamiento: {e}")
        continue
    except requests.exceptions.RequestException as e:
        print(f"Error de solicitud: {e}")
        continue

In [317]:
len(textos)

37

In [319]:
len(textos2)

12

In [None]:
len(textos3)


In [332]:
textos = textos+textos2+textos3

In [333]:
df = pd.DataFrame(textos)

In [334]:
df.head()

Unnamed: 0,url,text
0,http://www.scielo.org.co/scielo.php?script=sci_arttext&pid=S0122-34612012000100002,\n\nResistencia a la corrosión de recubrimientos de nitruros metálicos depositados sobre acero A...
1,http://www.scielo.sa.cr/scielo.php?script=sci_arttext&pid=S0253-29482001000100002,\n\nEfecto del dimetilsulfÃ³xido en la respuesta quimioluminiscente y el consumo de oxÃ­geno de ...
2,http://scielo.isciii.es/scielo.php?script=sci_arttext&pid=S1887-85712015000100004,\n\n\nÂ¿Mejora el torniquete la supervivencia del combatiente en zonas en conflicto?\n\n\n\n\n\n...
3,http://www.scielo.org.co/scielo.php?script=sci_arttext&pid=S0120-35922009000100011,\n\n¿Ha sido efectiva la promoción de soluciones energéticas en las zonas no interconectadas (ZN...
4,http://www.scielo.org.mx/scielo.php?script=sci_arttext&pid=S0187-69612014000200008,\n\nPresencia e impacto espacial de los sectores creativos en las zonas metropolitanas de MÃ©xic...


In [337]:
df.to_csv("textos_abremes.csv", index = False)