In [126]:
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from scipy.spatial import distance
import string

In [127]:
# Sentences chosen, ambiguous word chosen is 'play'
#The Queen's Park and Titwood clubs in Glasgow have each three greens, and as they can quite comfortably play six rinks on each, it is not uncommon to see 144 players making their game simultaneously.
first_sentence = "She cannot sing and she cannot play the piano, although, as some early experiments show, she could learn mechanically to beat out a tune on the keys."
second_sentence = "The strongest direct evidence seems to be that the nuclear substances are the only parts of the cells which are always equivalent in quantity, and that in the higher plants and animals the male organ or spermatozoid is composed almost entirely of the nucleus, and that the male nucleus is carried into the female cell without a particle of cytoplasm.i Since, however, the nucleus of the female cell is always accompanied by a larger or smaller quantity of cytoplasm, and that in a large majority of the power plants and animals the male cell also contains cytoplasm, it cannot yet be definitely stated that the cytoplasm does not play some part in the process."

In [128]:
wn.synsets("play")

[Synset('play.n.01'),
 Synset('play.n.02'),
 Synset('play.n.03'),
 Synset('maneuver.n.03'),
 Synset('play.n.05'),
 Synset('play.n.06'),
 Synset('bid.n.02'),
 Synset('play.n.08'),
 Synset('playing_period.n.01'),
 Synset('free_rein.n.01'),
 Synset('shimmer.n.01'),
 Synset('fun.n.02'),
 Synset('looseness.n.05'),
 Synset('play.n.14'),
 Synset('turn.n.03'),
 Synset('gambling.n.01'),
 Synset('play.n.17'),
 Synset('play.v.01'),
 Synset('play.v.02'),
 Synset('play.v.03'),
 Synset('act.v.03'),
 Synset('play.v.05'),
 Synset('play.v.06'),
 Synset('play.v.07'),
 Synset('act.v.05'),
 Synset('play.v.09'),
 Synset('play.v.10'),
 Synset('play.v.11'),
 Synset('play.v.12'),
 Synset('play.v.13'),
 Synset('play.v.14'),
 Synset('play.v.15'),
 Synset('play.v.16'),
 Synset('play.v.17'),
 Synset('play.v.18'),
 Synset('toy.v.02'),
 Synset('play.v.20'),
 Synset('dally.v.04'),
 Synset('play.v.22'),
 Synset('dally.v.01'),
 Synset('play.v.24'),
 Synset('act.v.10'),
 Synset('play.v.26'),
 Synset('bring.v.03'),
 Syn

In [129]:
stop_words = set(stopwords.words('english'))

porter = PorterStemmer()


def stopword_remover(lst):
    return " ".join([word for word in lst.split(" ") if word not in stop_words])


def stemmer(lst):
    return "".join([porter.stem(word) for word in lst if word])


def remove_punc(sent):
    sent_without_punc = ""
    for char in sent:
        if char not in string.punctuation:
            sent_without_punc += char
        else:
            sent_without_punc += " "
    return sent_without_punc


def text_preprocess(text):
    text = text.lower()
    stems = stemmer(text)
    stems = stopword_remover(stems)
    stems = remove_punc(stems)
    return stems

first_sentence_stemmed = text_preprocess(first_sentence)
second_sentence_stemmed = text_preprocess(second_sentence)
print(first_sentence_stemmed)
print(second_sentence_stemmed)

cannot sing cannot play piano  although  early experiments show  could learn mechanically beat tune keys 
strongest direct evidence seems nuclear substances parts cells always equivalent quantity  higher plants animals male organ spermatozoid composed almost entirely nucleus  male nucleus carried female cell without particle cytoplasm i since  however  nucleus female cell always accompanied larger smaller quantity cytoplasm  large majority power plants animals male cell also contains cytoplasm  cannot yet definitely stated cytoplasm play part process 


In [130]:
# look at distance from original sentence to each of the glosses and find the most similar
documents = [first_sentence_stemmed, second_sentence_stemmed]

for syn in wn.synsets("play"):
    gloss = text_preprocess(syn.definition())
    documents.append(gloss)

print(documents)


['cannot sing cannot play piano  although  early experiments show  could learn mechanically beat tune keys ', 'strongest direct evidence seems nuclear substances parts cells always equivalent quantity  higher plants animals male organ spermatozoid composed almost entirely nucleus  male nucleus carried female cell without particle cytoplasm i since  however  nucleus female cell always accompanied larger smaller quantity cytoplasm  large majority power plants animals male cell also contains cytoplasm  cannot yet definitely stated cytoplasm play part process ', 'dramatic work intended performance actors stage', 'theatrical performance drama', 'preset plan action team sports', 'deliberate coordinated movement requiring dexterity skill', 'state action feasible', 'utilization exercise', 'attempt get something', 'activity children guided imagination fixed rules', ' in games plays performances  time play proceeds', 'removal constraints', 'weak tremulous light', 'verbal wit mockery  often anoth

In [131]:
vectorizer = TfidfVectorizer()
doc_term_matrix = vectorizer.fit_transform(documents).toarray()

In [132]:
doc_term_matrix.shape

(54, 229)

In [133]:
vectorizer.get_feature_names_out()

array(['accepted', 'accompanied', 'act', 'action', 'activities',
       'activity', 'actors', 'advantage', 'agreed', 'allowing', 'almost',
       'also', 'although', 'always', 'amuse', 'amusement', 'animals',
       'another', 'as', 'attempt', 'audio', 'battle', 'beat', 'behave',
       'bet', 'bets', 'bounded', 'cannot', 'card', 'carelessly',
       'carried', 'cause', 'cell', 'cells', 'certain', 'chance',
       'characteristic', 'children', 'composed', 'consequence',
       'consider', 'constraints', 'contains', 'contend', 'continuous',
       'coordinated', 'could', 'cytoplasm', 'definitely', 'deliberate',
       'dexterity', 'direct', 'directed', 'discharge', 'discharged',
       'diversion', 'drama', 'dramatic', 'early', 'effect', 'emit',
       'employ', 'engage', 'engaged', 'entirely', 'equivalent',
       'evidence', 'exercise', 'exhaust', 'expense', 'experiments',
       'feasible', 'female', 'fixed', 'framework', 'freely', 'game',
       'games', 'gay', 'get', 'guided', 'hap

In [143]:
# documents 0 and 1 are original sentences,
# using cosine distance to other vectors 2-7 to find closest for each

most_similar = []
for sent_vec in doc_term_matrix[0:2]:
  # first 2 documents (0,1) are the original sentences
  max_similiarity = 0
  most_similar_doc = None

  for index, syn_vec in enumerate(doc_term_matrix[2:]):
    dist = distance.cosine(sent_vec, syn_vec)
    cos_similarity = 1 - dist

    if cos_similarity > max_similiarity:
      max_similiarity = cos_similarity
      most_similar_doc = index

  most_similar.append(most_similar_doc)

print(most_similar)
best_first_def = wn.synsets("play")[most_similar[0]].definition()
best_second_def = wn.synsets("play")[most_similar[1]].definition()

[19, 20]


In [144]:
print(first_sentence)
print(f"Closest definition: {best_first_def}")

She cannot sing and she cannot play the piano, although, as some early experiments show, she could learn mechanically to beat out a tune on the keys.
Closest definition: play on an instrument


In [145]:
print(second_sentence)
print(f"Closest definition: {best_second_def}")

The strongest direct evidence seems to be that the nuclear substances are the only parts of the cells which are always equivalent in quantity, and that in the higher plants and animals the male organ or spermatozoid is composed almost entirely of the nucleus, and that the male nucleus is carried into the female cell without a particle of cytoplasm.i Since, however, the nucleus of the female cell is always accompanied by a larger or smaller quantity of cytoplasm, and that in a large majority of the power plants and animals the male cell also contains cytoplasm, it cannot yet be definitely stated that the cytoplasm does not play some part in the process.
Closest definition: play a role or part
