### Libraries

In [1]:
import nltk
import numpy as np
from nltk import word_tokenize
from nltk.corpus import brown, wordnet as wn, wordnet_ic, stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import FastText, Word2Vec, Doc2Vec
from sklearn.metrics.pairwise import cosine_similarity
from fuzzywuzzy import fuzz

### Task 1

**Let's find synsets, hyponyms, hypernyms and various semantic similiarities for words "rowan" and "crow"**

In [2]:
print(wn.synsets('rowan'))
wn.synsets('crow')

[Synset('rowan.n.01')]


[Synset('crow.n.01'),
 Synset('crow.n.02'),
 Synset('crow.n.03'),
 Synset('corvus.n.01'),
 Synset('brag.n.01'),
 Synset('crow.n.06'),
 Synset('gloat.v.01'),
 Synset('crow.v.02'),
 Synset('crow.v.03')]

**For rowan there is only 1 synset:**

In [3]:
wn.synset('rowan.n.01').definition()

'Eurasian tree with orange-red berrylike fruits'

**For crow there is a lot more to cover. We will use the first one (crow.n.01) for this exercise:**

In [4]:
crow_synsets = wn.synsets('crow')
for syn in crow_synsets:
    print(f"{syn.name()}: {syn.definition()}")

crow.n.01: black birds having a raucous call
crow.n.02: the cry of a cock (or an imitation of it)
crow.n.03: a member of the Siouan people formerly living in eastern Montana
corvus.n.01: a small quadrilateral constellation in the southern hemisphere near Virgo
brag.n.01: an instance of boastful talk
crow.n.06: a Siouan language spoken by the Crow
gloat.v.01: dwell on with satisfaction
crow.v.02: express pleasure verbally
crow.v.03: utter shrill sounds


In [5]:
crow_bird = wn.synset('crow.n.01')
types_of_crow = crow_bird.hyponyms()
print(types_of_crow)
crow_is_a = crow_bird.hypernyms()
print(crow_is_a)

[Synset('american_crow.n.01')]
[Synset('corvine_bird.n.01')]


In [6]:
rowan = wn.synset('rowan.n.01')
rowan.lowest_common_hypernyms(crow_bird)

[Synset('organism.n.01')]

**Then the script for retrieving the first hypernym and all hyponyms for 'car' and 'bus'**

In [7]:
def retrieve_hyper_hyponyms(word):
    synsets = wn.synsets(word)
    if not synsets:
        return None, None
    first_synset = synsets[0]
    hypernyms = first_synset.hypernyms()
    first_hypernym = hypernyms[0] if hypernyms else None

    hyponyms = first_synset.hyponyms()

    return first_hypernym, hyponyms

In [8]:
hypernym_car, hyponym_car = retrieve_hyper_hyponyms('car')
print('first hypernym of car:', hypernym_car)
print('hyponyms for cars:')
hyponym_car

first hypernym of car: Synset('motor_vehicle.n.01')
hyponyms for cars:


[Synset('touring_car.n.01'),
 Synset('beach_wagon.n.01'),
 Synset('coupe.n.01'),
 Synset('pace_car.n.01'),
 Synset('stanley_steamer.n.01'),
 Synset('jeep.n.01'),
 Synset('electric.n.01'),
 Synset('loaner.n.02'),
 Synset('minicar.n.01'),
 Synset('hot_rod.n.01'),
 Synset('compact.n.03'),
 Synset('cruiser.n.01'),
 Synset('hatchback.n.01'),
 Synset('sedan.n.01'),
 Synset('sports_car.n.01'),
 Synset('hardtop.n.01'),
 Synset('stock_car.n.01'),
 Synset('model_t.n.01'),
 Synset('cab.n.03'),
 Synset('racer.n.02'),
 Synset('minivan.n.01'),
 Synset('limousine.n.01'),
 Synset('used-car.n.01'),
 Synset('bus.n.04'),
 Synset('sport_utility.n.01'),
 Synset('horseless_carriage.n.01'),
 Synset('ambulance.n.01'),
 Synset('roadster.n.01'),
 Synset('convertible.n.01'),
 Synset('gas_guzzler.n.01'),
 Synset('subcompact.n.01')]

In [9]:
hypernym_bus, hyponym_bus = retrieve_hyper_hyponyms('bus')
print('first hypernym of bus:', hypernym_bus)
print('hyponyms for bus:')
hyponym_bus

first hypernym of bus: Synset('public_transport.n.01')
hyponyms for bus:


[Synset('minibus.n.01'), Synset('trolleybus.n.01'), Synset('school_bus.n.01')]

### Task 2

In [10]:
def rank_synset(word):
    synsets = wn.synsets(word, 'n')
    synset_count = [(synset, synset.lemmas()[0].count()) for synset in synsets]
    synset_rank = sorted(synset_count, key = lambda x: x[1], reverse = True)
    return synset_rank

rank_synset('car')

[(Synset('car.n.01'), 71),
 (Synset('car.n.02'), 2),
 (Synset('car.n.03'), 0),
 (Synset('car.n.04'), 0),
 (Synset('cable_car.n.01'), 0)]

### Task 3

In [11]:
def max_min_average(list):
    max_list = max(list)
    min_list = min(list)
    average = sum(list) / len(list)
    return max_list, min_list, average

In [12]:
def wu_palmer_sim_synsets(word1, word2):
    synsets_word1 = wn.synsets(word1, 'n')
    synsets_word2 = wn.synsets(word2, 'n')
    
    similarity_score = []
    
    for syn1 in synsets_word1:
        for syn2 in synsets_word2:
            similarity = syn1.wup_similarity(syn2)
            similarity_score.append(similarity)

    return max_min_average(similarity_score)

print(wu_palmer_sim_synsets('car', 'bus'))

(0.96, 0.09523809523809523, 0.46739299830604175)


In [13]:
first_hypernym_car = wn.synsets('car', 'n')[0].hypernyms()[0]
first_hypernym_bus = wn.synsets('bus', 'n')[0].hypernyms()[0]
print(first_hypernym_car, first_hypernym_bus)
print(first_hypernym_car.wup_similarity(first_hypernym_bus))

wu_palmer_sim_synsets(first_hypernym_car.name().split('.')[0], first_hypernym_bus.name().split('.')[0])


Synset('motor_vehicle.n.01') Synset('public_transport.n.01')
0.7368421052631579


(0.7368421052631579, 0.7368421052631579, 0.7368421052631579)

In [14]:
def hyponym_wu_palmer_sim_synsets(word1, word2):
    hyponyms_word1 = wn.synsets(word1, 'n')[0].hyponyms()
    hyponyms_word2 = wn.synsets(word2, 'n')[0].hyponyms()

    similarity_score = []

    for hyponym1 in hyponyms_word1:
        for hyponym2 in hyponyms_word2:
            similarity = hyponym1.wup_similarity(hyponym2)
            similarity_score.append(similarity)

    return max_min_average(similarity_score)

hyponym_wu_palmer_sim_synsets('car', 'bus')

(0.6086956521739131, 0.6086956521739131, 0.6086956521739131)

**Because we use wup-similarity and all the hyponyms are as far away from each other as every other one, the similarity is the same.**

### Task 4

In [15]:
brown_ic = wordnet_ic.ic('ic-brown.dat')
def jcn_sim_synsets(word1, word2):
    synsets_word1 = wn.synsets(word1, 'n')
    synsets_word2 = wn.synsets(word2, 'n')
    
    similarity_score = []
    
    for syn1 in synsets_word1:
        for syn2 in synsets_word2:
            similarity = syn1.jcn_similarity(syn2, brown_ic)
            similarity_score.append(similarity)

    return max_min_average(similarity_score)

print(jcn_sim_synsets('car', 'bus'))

(0.34659468740185323, 0.05161364962677664, 0.09387159388812354)


### Task 5

In [16]:
def idf_calc(word, docs):
    word_amount = sum(1 for doc in docs if word in doc)
    if word_amount > 0:
        return np.log(len(docs) / word_amount)
    else:
        return 0

def max_similarity(word, tokens):
    max_sim = 0
    for token in tokens:
        syn1 = wn.synsets(word)
        syn2 = wn.synsets(token)
        if syn1 and syn2:
            sim = syn1[0].wup_similarity(syn2[0])
            if sim and sim > max_sim:
                max_sim = sim
    return max_sim

def mihalcea_similarity(text1_tokens, text2_tokens, tokenized=True):
    if not tokenized:
        text1_tokens = word_tokenize(text1_tokens)
        text2_tokens = word_tokenize(text2_tokens)
    
    unique_words = set(text1_tokens + text2_tokens)
    idf_values = {word: idf_calc(word, [text1_tokens, text2_tokens]) for word in unique_words}
    max_text1 = {word: max_similarity(word, text2_tokens) for word in unique_words}
    max_text2 = {word: max_similarity(word, text1_tokens) for word in unique_words}

    sum_max_text1 = sum(max_text1[word] * idf_values[word] for word in unique_words)
    sum_max_text2 = sum(max_text2[word] * idf_values[word] for word in unique_words)
    idf_sum = sum(idf_values[word] for word in unique_words)

    similarity_score = float(1/2 * ((sum_max_text1 / idf_sum) * (sum_max_text2 / idf_sum)))

    return similarity_score

T1 = "Students feel unhappy today about the class today"
T2 = "Several students study hard at classes in recent days"

stop_words = set(stopwords.words('english'))
T1_tokens = word_tokenize(T1.lower())
T1_stop = [word for word in T1_tokens if word not in stop_words]
T2_tokens = word_tokenize(T2.lower())
T2_stop = [word for word in T2_tokens if word not in stop_words]

lemmatizer = WordNetLemmatizer()

T1_clean = [lemmatizer.lemmatize(word) for word in T1_stop]
T2_clean = [lemmatizer.lemmatize(word) for word in T2_stop]

print("No changes:", mihalcea_similarity(T1, T2, False))
print("Lowercase and stopword removal:", mihalcea_similarity(T1_stop, T2_stop))
print("Lowercase, stopword removal and lemmatization:", mihalcea_similarity(T1_clean, T2_clean))

No changes: 0.28281489889469863
Lowercase and stopword removal: 0.30624163374802627
Lowercase, stopword removal and lemmatization: 0.26785709895261356


**When using lemmatization (or stemming) the similarity drops because of how the similarity is calculated in the wup_similarity.**

### Task 6

In [17]:
def noun_transformation(tokens):
    noun_tokens = []
    for token in tokens:
        noun = wn.morphy(token, wn.NOUN)
        if noun:
            noun_tokens.append(noun)
        else:
            noun_tokens.append(token)
    return noun_tokens

T1_noun = noun_transformation(T1_stop)
T2_noun = noun_transformation(T2_stop)

print("Noun-transformation:", mihalcea_similarity(T1_noun, T2_noun))

Noun-transformation: 0.2645075291145569


**We get a lower similarity because of the noun-transformation. Again, we might get a better similarity if we'd use a different similarity than wup_similarity.**

### Task 7

In [18]:
data = [T1_tokens, T2_tokens]

fasttext_model = FastText(sentences=data, vector_size=300, window=5, min_count=1)
word2vec_model = Word2Vec(sentences=data, vector_size=300, window=5, min_count=1)

def average_embedding(model, tokens):
    embeddings = []
    for token in tokens:
        if token in model.wv:
            embeddings.append(model.wv[token])

    if embeddings:
        avg_embedding = np.mean(embeddings, axis=0)
        return avg_embedding
    else:
        return np.zeros(model.vector_size)

def cosine_sim(a, b):
    return cosine_similarity([a], [b])[0][0]

fasttext_vector_T1 = average_embedding(fasttext_model, T1_tokens)
fasttext_vector_T2 = average_embedding(fasttext_model, T2_tokens)
fasttext_similarity = cosine_sim(fasttext_vector_T1, fasttext_vector_T2)

word2vec_vector_T1 = average_embedding(word2vec_model, T1_tokens)
word2vec_vector_T2 = average_embedding(word2vec_model, T2_tokens)
word2vec_similarity = cosine_sim(word2vec_vector_T1, word2vec_vector_T2)

print("Fasttext:", fasttext_similarity)
print("word2vec:", word2vec_similarity)

Fasttext: 0.05813166
word2vec: 0.07250185


### Task 8

**We already have the cleaned tokens in T1_clean and T2_clean**

In [19]:
fuzzy_similarity = fuzz.ratio(T1_clean, T2_clean)
print(fuzzy_similarity)

64


### Final words

This was an interesting assignment. I had much better time to prepare for this, so I actually completed this this time. Although in the Task 7 I had trouble getting the doc2vec working, everything else went fairly smoothly. Took me around 4 hours to complete this whole set.