# Syntagmatic relations
## Natural Language Technologies
#### By: Cesar Macias

## Dependencies

In [1]:
import sys
sys.path.append('/Users/cesarmacias/Documents/GIT/CCogS-Mx/text-preprocessing/scripts')

In [16]:
import text_preprocessing as tp
import pandas as pd
import numpy as np
from nltk.tokenize import sent_tokenize

In [4]:
# Preprocessing helper from the project-local text_preprocessing module.
# NOTE(review): presumably 'spanish' selects Spanish-language resources
# (stop words, lemmatizer) — confirm against that module.
prep = tp.Preprocessing('spanish')

In [51]:
# Read the raw article dump. The context manager closes the file handle as soon
# as the text is in memory, and read-only mode is enough because nothing is
# written back to the file.
with open('../data/Excelsior/raw/e990519_mod.htm', 'r', encoding='utf-8') as corpus_file:
    corpus = corpus_file.read()

# Split the raw text into a list of sentence strings.
# NOTE(review): sent_tokenize is called with its default language although the
# corpus appears to be Spanish — consider language='spanish'. Left unchanged
# here because it would alter the sentence boundaries and every downstream score.
corpus = sent_tokenize(corpus)

# Run every sentence through the project preprocessing pipeline with the
# options below (stop-word removal and lemmatization on, apostrophes kept,
# accented Spanish characters whitelisted).
preprocessed_corpus = prep.main_preprocess(
    data=corpus,
    remove_apostrophes=False,
    remove_stop_words=True,
    lemmatize=True,
    whitelist='ñáéíóú',
)

sentences = preprocessed_corpus

In [52]:
# Number of preprocessed sentences available for the co-occurrence statistics.
len(sentences)

1511

In [53]:
# Display the preprocessed sentences for a quick visual sanity check.
sentences

['mod htm editorial miercoles mayo epigrama jorge mansilla torre honor militar enterrar chihuahua soldado villista manuel s tarandir año querer vida honor decir villista tarar muerto flor sabre editorial nota siguiente editorial miercoles mayo pri cesarismo batracomaquia leon garcia soler votar favor seguir dar cesar ir funcionar decir alfonso martinez dominguez abandonar salon compañero partido ejercitar retorica parlamentario superar propio miedo ejercitar fuerza soberano decir inauguraban cuarto etapa partido pnr prm pri',
 'hablar votar favor consulta directo bas ampliar conjuro democracia abarcar simpatizante cuanto ciudadano disponer credencial elector fotografia',
 'guerra galaxias',
 'vencer contienda ganar mayor numero distrito electoral',
 'universal eleccion indirecto hablar',
 'satisfecho orondo jose antonio gonzalez fernandez intentar acuñar epitafio presidencialismo mexicano nombre candidato pasar pino esperar presidente partido telefonazo nombre alguien',
 'ahora llamar 

In [160]:
class SyntagmaticRelation:
    """Sentence-level co-occurrence statistics between a target word and a vocabulary.

    Each sentence is treated as one observation of two binary variables:
    "word_1 occurs in the sentence" and "word_2 occurs in the sentence".
    Occurrence is tested on whitespace-separated tokens, so ``'a'`` does not
    match inside ``'ab'``.

    Probabilities are smoothed: 0.5 is added to every single-word count and
    0.25 to every joint cell, and all counts are divided by ``n + 1`` (``n`` is
    the number of sentences). This keeps every probability strictly positive,
    so ``log2`` is never evaluated at zero, and the joint cells still sum to
    the corresponding marginals.

    Attributes:
        sentences: The sentences given to the constructor (strings of
            space-separated tokens).
        word_1: The target word every vocabulary word is compared with.
        vocabulary: Sorted list of the distinct tokens found in ``sentences``.
            It contains ``word_1`` itself when that word occurs in the corpus,
            so the result dictionaries include a self-pair entry.
    """

    def __init__(self, sentences, word):
        self.sentences = sentences
        self.word_1 = word
        # Sorted so that the result dictionaries are filled in a stable order.
        self.vocabulary = sorted(set(' '.join(self.sentences).split()))

    @staticmethod
    def _probabilities_from_counts(n, count_w1, count_w2, count_w1_w2):
        """Turn sentence counts into smoothed marginal and joint probabilities.

        Args:
            n: Total number of sentences.
            count_w1: Sentences containing the first word.
            count_w2: Sentences containing the second word.
            count_w1_w2: Sentences containing both words.

        Returns:
            ``([p(w1=0), p(w1=1)], [p(w2=0), p(w2=1)], joint)`` where
            ``joint[v][u]`` is ``p(w1=u, w2=v)``.
        """
        count_w11_w20 = count_w1 - count_w1_w2
        count_w10_w21 = count_w2 - count_w1_w2

        # Word probabilities.
        p_w1_1 = (count_w1 + 0.5) / (n + 1)
        p_w1_0 = 1 - p_w1_1

        p_w2_1 = (count_w2 + 0.5) / (n + 1)
        p_w2_0 = 1 - p_w2_1

        # Co-occurrence probabilities, one per cell of the 2x2 table.
        p_w11_w21 = (count_w1_w2 + 0.25) / (n + 1)
        p_w11_w20 = (count_w11_w20 + 0.25) / (n + 1)
        p_w10_w21 = (count_w10_w21 + 0.25) / (n + 1)
        p_w10_w20 = (n - count_w1_w2 - count_w11_w20 - count_w10_w21 + 0.25) / (n + 1)

        return (
            [p_w1_0, p_w1_1],
            [p_w2_0, p_w2_1],
            [[p_w10_w20, p_w11_w20], [p_w10_w21, p_w11_w21]],
        )

    def get_probabilities(self, w_1, w_2):
        """Return the smoothed probabilities for one pair of words.

        Args:
            w_1: First word.
            w_2: Second word.

        Returns:
            ``([p(w1=0), p(w1=1)], [p(w2=0), p(w2=1)], joint)`` where
            ``joint[v][u]`` is ``p(w1=u, w2=v)``.
        """
        count_w1 = 0
        count_w2 = 0
        count_w1_w2 = 0

        for sent in self.sentences:
            # Tokenize each sentence once and reuse the set for both lookups.
            tokens = set(sent.split())
            has_w1 = w_1 in tokens
            has_w2 = w_2 in tokens
            if has_w1:
                count_w1 += 1
            if has_w2:
                count_w2 += 1
            if has_w1 and has_w2:
                count_w1_w2 += 1

        return self._probabilities_from_counts(
            len(self.sentences), count_w1, count_w2, count_w1_w2
        )

    def _pair_probabilities(self):
        """Yield ``(word_2, p_w1, p_w2, joint)`` for word_1 against each vocabulary word.

        The corpus is scanned a single time to collect, for every token, the
        number of sentences it appears in and the number of sentences it shares
        with ``word_1``. The yielded probabilities are the same values
        ``get_probabilities(self.word_1, word_2)`` returns, without rescanning
        the corpus for every vocabulary word.
        """
        from collections import Counter

        doc_freq = Counter()       # token -> sentences containing it
        cooccurrence = Counter()   # token -> sentences containing it AND word_1
        for sent in self.sentences:
            tokens = set(sent.split())
            doc_freq.update(tokens)
            if self.word_1 in tokens:
                cooccurrence.update(tokens)

        n = len(self.sentences)
        count_w1 = doc_freq[self.word_1]
        for word_2 in self.vocabulary:
            p_w1, p_w2, joint = self._probabilities_from_counts(
                n, count_w1, doc_freq[word_2], cooccurrence[word_2]
            )
            yield word_2, p_w1, p_w2, joint

    def get_entropy(self):
        """Return the conditional entropy H(word_2 | word_1) for every vocabulary word.

        Uses p(w2=v | w1=u) = p(w1=u, w2=v) / p(w1=u), with base-2 logarithms.

        Returns:
            Dict mapping each vocabulary word to its conditional entropy given
            the presence/absence of ``word_1``.
        """
        h_dict = {}
        for word_2, p_w1, _, p_w1_w2 in self._pair_probabilities():
            entropy = 0
            for u in range(2):
                weighted_logs = []
                for v in range(2):
                    cond_prob = p_w1_w2[v][u] / p_w1[u]
                    weighted_logs.append(cond_prob * np.log2(cond_prob))

                entropy += (-p_w1[u] * sum(weighted_logs))

            h_dict[word_2] = entropy

        return h_dict

    def get_mutual_information(self):
        """Return the mutual information I(word_1; word_2) for every vocabulary word.

        Returns:
            Dict mapping each vocabulary word to its mutual information (in
            bits) with ``word_1``.
        """
        m_i_dict = {}
        for word_2, p_w1, p_w2, p_w1_w2 in self._pair_probabilities():
            m_i = 0
            for u in range(2):
                for v in range(2):
                    m_i += (p_w1_w2[v][u] * np.log2(p_w1_w2[v][u] / (p_w1[u] * p_w2[v])))

            m_i_dict[word_2] = m_i
        return m_i_dict
    

In [161]:
# Compare every vocabulary word of the corpus with the target word 'crecimiento'.
synt_relations = SyntagmaticRelation(sentences, 'crecimiento')

In [162]:
# Conditional entropy of each vocabulary word given the target word (word -> score).
entropies = synt_relations.get_entropy()

In [163]:
# Mutual information between each vocabulary word and the target word (word -> score).
m_i = synt_relations.get_mutual_information()

In [164]:
def most_similar_words(n_words, dictionary, reverse=False):
    """Return the first ``n_words`` keys of ``dictionary`` ranked by their score.

    Args:
        n_words: Maximum number of keys to return.
        dictionary: Mapping from word to numeric score; it is not modified.
        reverse: When False (default) the lowest scores come first; when True
            the highest scores come first.

    Returns:
        A list of at most ``n_words`` keys. Keys with equal scores keep the
        order in which they appear in ``dictionary``.
    """
    ranked_words = list(dictionary)
    # list.sort is stable, so ties keep the mapping's own key order.
    ranked_words.sort(key=dictionary.get, reverse=reverse)
    top_words = ranked_words[:n_words]
    return top_words

In [199]:
# Ten vocabulary words with the highest conditional-entropy scores
# (reverse=True ranks from highest to lowest).
most_similar_words(10, entropies, True)

['poder',
 'hacer',
 'año',
 'mexico',
 'ser',
 'politico',
 'decir',
 'mexicano',
 'nuevo',
 'solo']

In [198]:
# Twenty vocabulary words with the highest mutual information with the target
# word; the target word is itself part of the vocabulary, so it ranks first.
most_similar_words(20, m_i, True)

['crecimiento',
 'ocde',
 'perspectiva',
 'pib',
 'exportacion',
 'mercado',
 'aumento',
 'crisis',
 'financiero',
 'inflacion',
 'economia',
 'ritmo',
 'economico',
 'global',
 'prever',
 'rapido',
 'reduccion',
 'pais',
 'añadir',
 'sector']