In [1]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords as sw
import gensim
import numpy as np

In [2]:
# English stop words from the NLTK corpus, stored as a set; lesk and
# wsd_embeddings subtract this set from their token sets.
stopWords = set(sw.words("english"))

In [3]:
def lesk(word: str, sentence: str):
    """
    Simplified Lesk word-sense disambiguation.

    Each WordNet synset of ``word`` is scored by the number of non-stop-word
    tokens that its signature (definition plus example sentences) shares with
    the sentence. Every synset is scored, including the first one.

    :param word: word to disambiguate
    :param sentence: sentence that provides the context of the word
    :return: the synset with the largest overlap; the first synset listed by
             WordNet when no synset overlaps the context (ties keep the
             earliest synset); ``None`` when WordNet has no synset for ``word``
    """
    senses = wn.synsets(word)
    if not senses:
        # Nothing to disambiguate: avoid the IndexError on senses[0].
        return None

    context = set(nltk.word_tokenize(sentence)).difference(stopWords)

    # The first synset is the fallback answer when nothing overlaps.
    best_sense = senses[0]
    max_overlap = 0
    for sense in senses:
        gloss = sense.definition() + " " + " ".join(sense.examples())
        signature = set(nltk.word_tokenize(gloss)).difference(stopWords)
        overlap = len(context.intersection(signature))
        # Strict comparison so that, on ties, the earliest synset wins.
        if overlap > max_overlap:
            max_overlap = overlap
            best_sense = sense

    return best_sense
  

In [4]:
# Try lesk on the word "bank" with a sentence about withdrawing money.
lesk("bank","Yesterday I went to the bank to withdraw the money and the credit card did not work")

Synset('depository_financial_institution.n.01')

In [5]:
def distanceCosine(a,b):
    """
    Cosine of the angle between two vectors.

    Despite the name, the returned value is a similarity, not a distance:
    larger values mean the vectors point in more similar directions.

    :param a: first vector (array-like)
    :param b: second vector (array-like), same length as ``a``
    :return: dot(a, b) / (norm(a) * norm(b)), or 0.0 when either vector has
             zero norm (the cosine is undefined there and would otherwise be
             NaN)
    """
    dot = np.dot(a, b)
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        # Avoid 0/0 -> NaN, which cannot be ranked against other scores.
        return 0.0
    return dot / denominator

In [6]:
from nltk.data import find

# Load the pre-trained word embedding sample located through NLTK's data finder
# (the file is read in word2vec text format, hence binary=False)
word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt'))
model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_sample, binary=False)


In [7]:
# Element-wise mean of all vectors in the model; wsd_embeddings uses it as the
# stand-in vector for tokens that are not in the model's vocabulary.
vector_mean =  model.vectors.mean(axis=0)

In [8]:
def _mean_embedding(tokens):
    """Average the embedding vectors of ``tokens`` into a single vector.

    Tokens that are not in the model contribute ``vector_mean``. An empty
    token collection also yields ``vector_mean`` rather than a NaN mean.
    """
    # `token in model` works on both gensim 3.x and 4.x; `model.vocab` was
    # removed in gensim 4.
    rows = [model[token] if token in model else vector_mean for token in tokens]
    if not rows:
        return vector_mean
    # Accumulate in float64, as a preallocated np.zeros matrix would.
    return np.array(rows, dtype=np.float64).mean(axis=0)


def wsd_embeddings(word: str, sentence: str):
    """
    Word-sense disambiguation based on word embeddings.

    The sentence and each synset signature (definition plus example
    sentences) are tokenized, stripped of stop words and reduced to the mean
    of their token embeddings. Each synset is scored with ``distanceCosine``
    between the sentence vector and its signature vector.

    :param word: word to disambiguate
    :param sentence: sentence that provides the context of the word
    :return: tuple ``(synset, score)`` for the highest-scoring synset (the
             earliest synset wins ties), or ``None`` when WordNet has no
             synset for ``word``
    """
    senses = wn.synsets(word)
    if not senses:
        # Nothing to rank: avoid indexing into an empty result list.
        return None

    context = set(nltk.word_tokenize(sentence)).difference(stopWords)
    context_vector = _mean_embedding(context)

    scored = []
    for sense in senses:
        gloss = sense.definition() + " " + " ".join(sense.examples())
        signature = set(nltk.word_tokenize(gloss)).difference(stopWords)
        signature_vector = _mean_embedding(signature)
        scored.append((sense, distanceCosine(context_vector, signature_vector)))

    # max() returns the first maximal pair, matching a stable descending sort.
    return max(scored, key=lambda pair: pair[1])
    

In [9]:
# Run the embedding-based disambiguation on "bank"; t holds the (synset, score)
# pair that wsd_embeddings returns.
# NOTE(review): this sentence reads "carddid" (no space), unlike the sentence
# passed to lesk earlier, so "card" is presumably not a context token here —
# confirm whether that is intended.
t = wsd_embeddings("bank", "Yesterday I went to the bank to withdraw the money and the credit carddid not work")

In [10]:
# Show the example sentences of the selected synset (first element of the pair).
t[0].examples()

['She deposits her paycheck every month']