# Context BoW

### NLP Technologies
### Miguel Soto

In [1]:
import sys
sys.path.append('/Users/mash/Documents/Git/ciencias_de_la_computacion/Doctorado/Scripts')

In [2]:
import text_preprocessing as tp
import pandas as pd, numpy as np
from collections import defaultdict
from typing import Union, List

Collecting es-core-news-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.5.0/es_core_news_sm-3.5.0-py3-none-any.whl (12.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.9/12.9 MB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('es_core_news_sm')
Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


[nltk_data] Downloading package stopwords to /Users/mash/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
def normalize_text(text: str) -> List[str]:
    """Normaliza y tokeniza el texto."""
    return text.lower().split()

def get_separate_contextual_bow(text: Union[str, List[str], pd.DataFrame], window_size: int = 2) -> dict:
    """Genera las bolsas de palabras contextuales separando contexto izquierdo y derecho."""
    # Preparar los datos de entrada
    if isinstance(text, pd.DataFrame):
        documents = [row for row in text.iloc[:,0].apply(normalize_text)]
    elif isinstance(text, list):
        documents = [normalize_text(doc) for doc in text]
    else: # asumimos que es un string
        documents = [normalize_text(text)]
    
    # Diccionario para almacenar contextos izquierdo, derecho y combinado
    bow_contextual = defaultdict(lambda: {"left": defaultdict(int), "right": defaultdict(int), "combined": defaultdict(int)})
    
    # Procesar cada documento
    for doc in documents:
        for index, word in enumerate(doc):
            # Contexto izquierdo
            left_context = doc[max(0, index - window_size):index]
            for context_word in left_context:
                bow_contextual[word]["left"][context_word] += 1
                bow_contextual[word]["combined"][context_word] += 1
            
            # Contexto derecho
            right_context = doc[index+1:min(len(doc), index + window_size + 1)]
            for context_word in right_context:
                bow_contextual[word]["right"][context_word] += 1
                bow_contextual[word]["combined"][context_word] += 1
    
    return bow_contextual


In [4]:
# Reading the file
filename = '../data/e990519_mod.htm'
text_file = open(filename, encoding='utf-8')
text = text_file.read()
text_file.close()

In [8]:
len(text.split())

111891

In [7]:
# Cleaning the text
prep_es = tp.Preprocessing(language='spanish')
cleaned_text = prep_es.main_preprocess(text, lemmatize=True, remove_stop_words=True)

In [6]:
tokens = cleaned_text.split()
print(f'Raw tokens: {len(tokens)}')

Raw tokens: 56716


In [23]:
# Trying the function with the cleaned text
separate_bow_contextual = get_separate_contextual_bow(cleaned_text, window_size=4)
for word, contexts in separate_bow_contextual.items():
    print(f"Palabra: {word}")
    print(f"\tContexto Izquierdo: {contexts['left']}")
    print(f"\tContexto Derecho: {contexts['right']}")
    print(f"\tContexto Combinado: {contexts['combined']}")

Palabra: mod
	Contexto Izquierdo: defaultdict(<class 'int'>, {'guarura': 1, 'motopatrullero': 1, 'ciudadano': 1, 'nacional': 1})
	Contexto Derecho: defaultdict(<class 'int'>, {'htm': 2, 'http': 2, 'ww': 2, 'excelsior': 2})
	Contexto Combinado: defaultdict(<class 'int'>, {'htm': 2, 'http': 2, 'ww': 2, 'excelsior': 2, 'guarura': 1, 'motopatrullero': 1, 'ciudadano': 1, 'nacional': 1})
Palabra: htm
	Contexto Izquierdo: defaultdict(<class 'int'>, {'mod': 2, 'motopatrullero': 1, 'ciudadano': 1, 'nacional': 1})
	Contexto Derecho: defaultdict(<class 'int'>, {'http': 2, 'ww': 2, 'excelsior': 2, 'com': 2})
	Contexto Combinado: defaultdict(<class 'int'>, {'mod': 2, 'http': 2, 'ww': 2, 'excelsior': 2, 'com': 2, 'motopatrullero': 1, 'ciudadano': 1, 'nacional': 1})
Palabra: http
	Contexto Izquierdo: defaultdict(<class 'int'>, {'mod': 2, 'htm': 2, 'sabre': 2, 'editorial': 30, 'nota': 86, 'siguiente': 130, 'noche': 2, 'arenoso': 2, 'magrebina': 2, 'pensamiento': 2, 'mx': 4, 'ayudarl': 2, 'terrorista':

In [24]:
def vectorize_bow(bow_dict):
    """Vectoriza la bolsa de palabras contextuales y retorna un vocabulario y los vectores."""
    vocab = sorted(set(word for context in bow_dict.values() for word in context['combined']))
    vocab_index = {word: i for i, word in enumerate(vocab)}
    
    vectors = {}
    for word, contexts in bow_dict.items():
        vector = np.zeros(len(vocab))
        for context_word, count in contexts['combined'].items():
            if context_word in vocab_index:
                vector[vocab_index[context_word]] = count
        vectors[word] = vector
    return vocab, vectors

def normalize_vectors(vectors):
    """Normaliza los vectores."""
    for word, vector in vectors.items():
        norm = np.linalg.norm(vector)
        if norm > 0:
            vectors[word] = vector / norm
    return vectors

def cosine_similarity(vector_a, vector_b):
    """Calcula la similitud coseno entre dos vectores."""
    return np.dot(vector_a, vector_b) / (np.linalg.norm(vector_a) * np.linalg.norm(vector_b))

In [25]:

# Utilizando el código anterior para obtener las bolsas de palabras contextuales
vocab, vectors = vectorize_bow(separate_bow_contextual)
vectors = normalize_vectors(vectors)

In [26]:
print(f"vocabulario: {len(vocab)}")
print(f"Tamaño de un vector (bandera): {len(vectors['bandera'])}")

vocabulario: 6644
Tamaño de un vector (bandera): 6644


In [27]:
vectors

{'mod': array([0., 0., 0., ..., 0., 0., 0.]),
 'htm': array([0., 0., 0., ..., 0., 0., 0.]),
 'http': array([0.00598197, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]),
 'ww': array([0.0055719, 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ]),
 'excelsior': array([0., 0., 0., ..., 0., 0., 0.]),
 'com': array([0., 0., 0., ..., 0., 0., 0.]),
 'mx': array([0., 0., 0., ..., 0., 0., 0.]),
 'html': array([0., 0., 0., ..., 0., 0., 0.]),
 'editorial': array([0.04072315, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]),
 'miercoles': array([0.02846705, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]),
 'mayo': array([0.07251465, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]),
 'epigrama': array([0., 0., 0., ..., 0., 0., 0.]),
 'jorge': array([0.        , 0.        , 0.12909944, ..., 0.        , 0.        ,
        0.        ]),
 'mansilla': array([0., 0., 0., ..., 0., 0., 0

In [28]:
# Ejemplo de cálculo de similitud coseno entre dos palabras
word_a = 'bandera'
word_b = 'ejercito'
similarity = cosine_similarity(vectors[word_a], vectors[word_b])
print(f"Similitud coseno entre '{word_a}' y '{word_b}': {similarity}")

Similitud coseno entre 'bandera' y 'ejercito': 0.06412364700532214


In [29]:
# Obtener las similitudes de todas las palabras
def get_all_cosine_similarities(word, vocabulary, vectors):
    """Obtiene las similitudes coseno de una palabra con todas las demás."""
    similarity_dict = {}
    for word_b in vocabulary:
        similarity_dict[word_b] = cosine_similarity(vectors[word], vectors[word_b])
    
    # Ordenarlas de mayor a menor similitud
    similarity_dict = dict(sorted(similarity_dict.items(), key=lambda x: x[1], reverse=True))
    return similarity_dict

In [30]:
# Similitud para una palabra (bandera)
palabra = 'crecimiento' 
word_similarity = get_all_cosine_similarities(palabra, vocab, vectors)
print(f'Similitud de "{palabra}" respecto a el vocabulario')
for word, similarity in word_similarity.items():
    print(f"{word}: {round(similarity, 4)}")

Similitud de "crecimiento" respecto a el vocabulario
crecimiento: 1.0
ultimo: 0.5593
o: 0.5357
crecer: 0.5349
prever: 0.5198
a: 0.5151
mexico: 0.5089
ganancia: 0.4972
trimestre: 0.4951
ciento: 0.4813
adir: 0.48
estimar: 0.4782
ocde: 0.4764
meta: 0.4716
proximo: 0.469
economia: 0.4644
recuperacion: 0.4526
cuatro: 0.4526
pasado: 0.4526
aumentar: 0.4523
cinco: 0.446
tres: 0.4429
plazo: 0.4321
economico: 0.432
unido: 0.4318
mayor: 0.4299
primero: 0.4276
hacer: 0.4244
campa: 0.4212
ambicioso: 0.42
recortar: 0.42
bruscamente: 0.42
fenasib: 0.4136
curso: 0.4098
dos: 0.4077
nueve: 0.4063
mientras: 0.4061
reducir: 0.4034
incremento: 0.4028
inflacion: 0.4018
solventar: 0.4005
inferior: 0.3975
mexicano: 0.3959
pre: 0.3932
segundo: 0.3918
contraer: 0.3907
cuatrimestre: 0.3907
hispana: 0.3907
bono: 0.3903
poder: 0.3845
invierno: 0.3844
dolareir: 0.381
mediados: 0.381
recobre: 0.381
menor: 0.3808
alrededor: 0.3799
solo: 0.378
representar: 0.375
bajo: 0.3744
presente: 0.3728
dato: 0.3728
ford: 0.3712

In [16]:
# Obtener las similitudes de todas las palabras y guardarlas en un diccionario
similarities_dict = {}
for word in vocab:
    similarities_dict[word] = get_all_cosine_similarities(word, vocab, vectors)

In [17]:
# Save similarities_dict into a txt
with open('similarities_dict.txt', 'w') as file:
    file.write(str(similarities_dict))