# Exercise 2

### Necessary imports

In [16]:
import pandas as pd
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import wordnet as wn
from nltk.wsd import lesk

tokenizer = RegexpTokenizer(r'\w+')
stop_words = stopwords.words('english')

### Dataset loading

In [5]:
df = pd.read_csv('data/TLN-definitions-23.tsv', sep='\t')
df_door = df['door']
df_ladybug = df['ladybug']
df_pain = df['pain']
df_blurriness = df['blurriness']

### Preprocessing of definitions

In [59]:
# Stop words removal
df_door_nostop = df_door.apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
df_ladybug_nostop = df_ladybug.apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
df_pain_nostop = df_pain.apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
df_blurriness_nostop = df_blurriness.apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

# Lemmatization and tokenization (with puntuaction removal)
lemmatizer = WordNetLemmatizer()
df_door_lem = df_door_nostop.apply(
    lambda x: tokenizer.tokenize(' '.join([lemmatizer.lemmatize(word) for word in x.split()]).lower())
)
df_ladybug_lem = df_ladybug_nostop.apply(
    lambda x: tokenizer.tokenize(' '.join([lemmatizer.lemmatize(word) for word in x.split()]).lower())
)
df_pain_lem = df_pain_nostop.apply(
    lambda x: tokenizer.tokenize(' '.join([lemmatizer.lemmatize(word) for word in x.split()]).lower())
)
df_blurriness_lem = df_blurriness_nostop.apply(
    lambda x: tokenizer.tokenize(' '.join([lemmatizer.lemmatize(word) for word in x.split()]).lower())
)

### Getting the words with highest frequency

In [71]:
# I take the words with most frequency
def get_frequent_words(df):
    words = []
    for row in df:
        for word in row:
            if len(word) > 2:
                words.append(word)
    return pd.Series(words).value_counts()

In [80]:
words_door = get_frequent_words(df_door_lem)[:20]
words_ladybug = get_frequent_words(df_ladybug_lem)[:20]
words_pain = get_frequent_words(df_pain_lem)[:20]
words_blurriness = get_frequent_words(df_blurriness_lem)[:20]

In [81]:
def search_best_synset(words: pd.Series, df: pd.DataFrame):
    # I take the hypernyms of the words with most frequency
    hypernyms = []
    for word in words.index:
        synset = lesk(df.to_list(), word, 'n')
        if synset:
            hypernyms.extend(synset.hypernyms())

    hypernyms = set(hypernyms)

    results = []
    for hyp in hypernyms:
        # I take the definition and examples of the hypernyms
        hyp_def_examples = hyp.definition() + ', '.join(hyp.examples())
        
        # I found the words that are in mine top words and in the definition of the hypernyms
        # I associate a score to the hypernyms based on the frequency of the top words founded in the definition
        score = 0
        matched_words = []
        for word in words.index:
            if word in hyp_def_examples:
                matched_words.append(word)
                score += words[word]
        if score > 0:
            results.append((hyp, matched_words, score))

    return results

In [91]:
# Maintain only the 5 best synset with highest score
def get_best_synset(results):
    results.sort(key=lambda x: x[2], reverse=True)
    return results[:5]


door_synsets = get_best_synset(search_best_synset(words_door, df_door))
ladybug_synsets = get_best_synset(search_best_synset(words_ladybug, df_ladybug))
pain_synsets = get_best_synset(search_best_synset(words_pain, df_pain))
bluriness_synsets = get_best_synset(search_best_synset(words_blurriness, df_blurriness))

print("-----------------------------------")
print('Door synsets:')
for synset in door_synsets:
    print('Synset:', synset[0])
    print('Matched words:', synset[1])
    print('Score:', synset[2])

print("-----------------------------------")
print('Ladybug synsets:')
for synset in ladybug_synsets:
    print('Synset:', synset[0])
    print('Matched words:', synset[1])
    print('Score:', synset[2])

print("-----------------------------------")
print('Pain synsets:')
for synset in pain_synsets:
    print('Synset:', synset[0])
    print('Matched words:', synset[1])
    print('Score:', synset[2])

print("-----------------------------------")
print('Blurriness synsets:')
for synset in bluriness_synsets:
    print('Synset:', synset[0])
    print('Matched words:', synset[1])
    print('Score:', synset[2])

-----------------------------------
Door synsets:
Synset: Synset('environment.n.01')
Matched words: ['room']
Score: 13
Synset: Synset('gathering.n.01')
Matched words: ['one', 'place']
Score: 6
Synset: Synset('unit.n.04')
Matched words: ['another', 'one']
Score: 6
Synset: Synset('wrestling_hold.n.01')
Matched words: ['used']
Score: 5
Synset: Synset('commercial_enterprise.n.02')
Matched words: ['used']
Score: 5
-----------------------------------
Ladybug synsets:
Synset: Synset('disk.n.01')
Matched words: ['round']
Score: 4
-----------------------------------
Pain synsets:
Synset: Synset('suffering.n.04')
Matched words: ['feeling', 'physical', 'pain']
Score: 30
Synset: Synset('pain.n.02')
Matched words: ['feeling', 'emotional', 'pain', 'emotion']
Score: 29
Synset: Synset('feeling.n.01')
Matched words: ['feeling', 'emotional', 'emotion']
Score: 27
Synset: Synset('quality.n.01')
Matched words: ['something']
Score: 4
Synset: Synset('fabric.n.01')
Matched words: ['felt']
Score: 3
-----------