# Exercise 2

### Necessary imports

In [1]:
import pandas as pd
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import wordnet as wn
import warnings
warnings.filterwarnings("ignore")

lemmatizer = WordNetLemmatizer()
tokenizer = RegexpTokenizer(r'\w+')
stop_words = stopwords.words('english')

### Dataset loading

In [2]:
df = pd.read_csv('data/TLN-definitions-23.tsv', sep='\t')
df_door = df['door']
df_ladybug = df['ladybug']
df_pain = df['pain']
df_blurriness = df['blurriness']

### Preprocessing of definitions

In [3]:
def pre_process(df: pd.DataFrame) -> pd.DataFrame:
    # Stop words removal
    df_nostop = df.apply(lambda x: ' '.join([word.lower() for word in x.split() if word.lower() not in (stop_words)]))

    # Lemmatization and tokenization (with puntuaction removal)
    df_lem = df_nostop.apply(
        lambda x: tokenizer.tokenize(' '.join([lemmatizer.lemmatize(word) for word in x.split()]).lower())
    )

    return df_nostop, df_lem

In [4]:
df_door_nostop, df_door_lem = pre_process(df_door)
df_ladybug_nostop, df_ladybug_lem = pre_process(df_ladybug)
df_pain_nostop, df_pain_lem = pre_process(df_pain)
df_blurriness_nostop, df_blurriness_lem = pre_process(df_blurriness)

### Getting the frequency of words

In [5]:
# I take the words with most frequency
def get_frequent_words(df: pd.DataFrame) -> pd.Series:
    words = []
    for row in df:
        for word in row:
            if len(word) > 2:
                words.append(word)
    return pd.Series(words).value_counts()

In [6]:
words_door = get_frequent_words(df_door_lem)
words_ladybug = get_frequent_words(df_ladybug_lem)
words_pain = get_frequent_words(df_pain_lem)
words_blurriness = get_frequent_words(df_blurriness_lem)

### Function for getting the synsets of a list of words of a concept with associated scores 

In [7]:
def search_best_synset(words: pd.Series) -> list:
    # I take the hyponyms of the words with most frequency
    hyponyms_dict = {}
    for word in words.index:
        synsets = wn.synsets(word)
        hyponyms = []
        for synset in synsets:
            # I take the hyponyms of the synset and also the hyponyms of the hyponyms and also the hyponyms of the hyponyms of the hyponyms
            hyponyms_1 = synset.hyponyms()
            hyponyms_2 = []
            for hyp in hyponyms_1:
                hyponyms_2.extend(hyp.hyponyms())
            hyponyms_3 = []
            for hyp in hyponyms_2:
                hyponyms_3.extend(hyp.hyponyms())
            hyponyms.extend(hyponyms_1)
            hyponyms.extend(hyponyms_2)
            hyponyms.extend(hyponyms_3)
        hyponyms_dict[word] = hyponyms

    results = []
    for key, value in hyponyms_dict.items():
        for hyp in value:
            # I take the definition and examples of the hypernyms
            hyp_def_examples = hyp.definition() + ', '.join(hyp.examples())
            
            # I found the words that are in mine top words and in the definition of the hypernyms
            # I associate a score to the hypernyms based on the frequency of the top words founded in the definition
            score = 0
            matched_words = []
            for word in words.index:
                if word in hyp_def_examples:
                    matched_words.append(word)
                    score += words[word]
            if score > 0:
                results.append((hyp, key, matched_words, score))

    return results

### I found the synsets and i print the 5 best synsets (based on score) and the words associated

In [8]:
# Maintain only the 5 best synset with highest score
def get_best_synset(synsets: list) -> list:
    synsets.sort(key=lambda x: x[3], reverse=True)
    best_5 = [synsets[0]]
    for synset in synsets[1:]:
        if len(best_5) < 5:
            exists = False
            for best_syn in best_5:
                if best_syn[0].name() == synset[0].name():
                    exists = True
            if not exists:
                best_5.append(synset)
    return best_5

door_synsets = get_best_synset(search_best_synset(words_door))
ladybug_synsets = get_best_synset(search_best_synset(words_ladybug))
pain_synsets = get_best_synset(search_best_synset(words_pain))
bluriness_synsets = get_best_synset(search_best_synset(words_blurriness))

print("-----------------------------------")
print('Door synsets:')
for synset in door_synsets:
    print('Synset:', synset[0])
    print('Synset definition:', synset[0].definition())
    print('Genus:', synset[1])
    print('Matched words:', synset[2])
    print('Score:', synset[3])

print("-----------------------------------")
print('Ladybug synsets:')
for synset in ladybug_synsets:
    print('Synset:', synset[0])
    print('Synset definition:', synset[0].definition())
    print('Genus:', synset[1])
    print('Matched words:', synset[2])
    print('Score:', synset[3])

print("-----------------------------------")
print('Pain synsets:')
for synset in pain_synsets:
    print('Synset:', synset[0])
    print('Synset definition:', synset[0].definition())
    print('Genus:', synset[1])
    print('Matched words:', synset[2])
    print('Score:', synset[3])

print("-----------------------------------")
print('Blurriness synsets:')
for synset in bluriness_synsets:
    print('Synset:', synset[0])
    print('Synset definition:', synset[0].definition())
    print('Genus:', synset[1])
    print('Matched words:', synset[2])
    print('Score:', synset[3])

-----------------------------------
Door synsets:
Synset: Synset('communication.n.03')
Synset definition: a connection allowing access between persons or places
Genus: connection
Matched words: ['room', 'access', 'passage', 'two', 'people', 'place', 'pas', 'allow', 'way', 'connection', 'person', 'rooms']
Score: 45
Synset: Synset('doorway.n.01')
Synset definition: the entrance (the space in a wall) through which you enter or leave a room or building; the space that a door can close
Genus: access
Matched words: ['room', 'space', 'wall', 'entrance', 'building', 'enter', 'close', 'way', 'leave', 'door']
Score: 41
Synset: Synset('corridor.n.01')
Synset definition: an enclosed passageway; rooms usually open onto it
Genus: passage
Matched words: ['room', 'passage', 'usually', 'open', 'closed', 'close', 'pas', 'way', 'rooms']
Score: 33
Synset: Synset('room.n.01')
Synset definition: an area within a building enclosed by walls and floor and ceiling
Genus: area
Matched words: ['room', 'wall', 'bu