#### In this exercise we need to find the correct synset starting from the definitions given by us (unito students) for 4 terms. To direct our search we will use the principle of "genus".

#### The approach
 1. Find the genus candidates (typically the most frequent terms in the definitions)
 2. Collect the wordnet synsets for:
    - each genus candidate
    - each hyponym of each genus candidate
    - each hypernym of each genus candidate
 3. Get the wordnet signature (definition + examples) of each collected synset
 4. Compare wordnet signature and the definitions through Lesk (overlap)
 5. Choose the synset that has the highest score


In [92]:
from collections import defaultdict, Counter
import re
import nltk
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus.reader.wordnet import Synset
from typing import Set, Dict, AnyStr, List

### Data Load

In [94]:
root_dir = '..'

# Load stop words from file
def import_stop_words() -> Set[AnyStr]:
    with open(f'{root_dir}/common-data/stop_words_FULL.txt') as f:
        stop_words = {line for line in f.read().splitlines()}
    return stop_words


# Load definitions and build a map
def import_definitions_map() -> Dict[AnyStr, List[Set[AnyStr]]]:
    definitions_map = defaultdict(list)
    with open(f'{root_dir}/common-data/definitions.csv') as f:

        for line in f.readlines()[1:]:
            splits = line.split("~|~")
            word = splits[0]
            definitions = splits[1:]
            
            for definition in definitions:
                if bow_words := bag_of_words(definition):
                    definitions_map[word].append(bow_words)

    return dict(definitions_map)
            

stop_words = import_stop_words()
definitions_map = import_definitions_map()

### Pre-processing

In [93]:
# Pre-processing of a sentence
def bag_of_words(sentence: AnyStr) -> Set[AnyStr]:
    return set(remove_stopwords(tokenize_sentence(remove_punctuation(sentence))))


# Remove stopwords from a word list
def remove_stopwords(words: List[AnyStr]) -> List[AnyStr]:
    return [value.lower() for value in words if value.lower() not in stop_words]


# Get tokens from sentence
def tokenize_sentence(sentence: AnyStr) -> List[AnyStr]:
    words = []
    lmtzr = WordNetLemmatizer()
    for tag in nltk.pos_tag(word_tokenize(sentence)):
        words.append(lmtzr.lemmatize(tag[0]).lower())
    return words


# Remove punctuation and multiple spaces
def remove_punctuation(sentence: AnyStr) -> AnyStr:
    return re.sub('\s\s+', ' ', re.sub(r'[^\w\s]', '', sentence))

### Main functions

In [95]:
# Find the |words_num| most frequent words present in the definitions
def find_MFW(definitions: List[Set[AnyStr]], words_num: int) -> List[AnyStr]:
    concept_counter = Counter()
    
    for definition in definitions:
        concept_counter.update(definition)
    
    most_frequent_words = [entry[0] for entry in concept_counter.most_common(words_num)]
    return most_frequent_words

# Return first wordnet synset for a word
def get_synset(word: AnyStr) -> Synset: # maybe it's possible to do disambiguation using the words in the sentence (currently just choose the 1st synset)
    synset = None
    if synsets := wordnet.synsets(word):
        synset = synsets[0]

    return synset

# Compute the most likely synset from just the definitions, using |freq_words_num| most frequent words
def compute_results(freq_words_num: int) -> Dict[AnyStr, Dict[AnyStr, List[AnyStr]]]:
    results = defaultdict(dict)
    for concept, definitions in definitions_map.items():
        most_frequent_words = find_MFW(definitions, freq_words_num)

        hyponyms = []
        for word in most_frequent_words:
            if synset := get_synset(word):
                hyponyms.extend(synset.hyponyms())
        
        res = []
        for hyp in hyponyms:
            hyp_def = hyp.definition() + ','.join(hyp.examples())
            
            match_words = []
            for word in most_frequent_words:
                if word in hyp_def:
                    match_words.append(word) # score = |words found|
            
            res.append([hyp, match_words])
    
        # sort the list using the number of important words found
        sorted_res = sorted(res, key=lambda x: len(x[1]), reverse=True)
        for synset, match_words in sorted_res[:freq_words_num]:
            results[concept][synset.name()] = match_words

    return dict(results)


### Computation

In [96]:
freq_words_num = 7
results = compute_results(freq_words_num=freq_words_num)

# print results
for concept, items in results.items():
    print(concept.upper())
    for item, match_words in items.items():
        print(f"{item}: {match_words}")
    print()

EMOTION
affection.n.01: ['feeling', 'feel', 'express']
pain.n.02: ['feeling', 'feel', 'mental']
pleasure.n.01: ['feeling', 'feel', 'mental']
affect.n.01: ['feeling', 'feel']
agitation.n.03: ['feeling', 'feel']
ambivalence.n.01: ['feeling', 'feel']
astonishment.n.01: ['feeling', 'feel']
calmness.n.03: ['feeling', 'feel']
desire.n.01: ['feeling', 'feel']
despair.n.02: ['feeling', 'feel']

PERSON
homo_habilis.n.01: ['human', 'characteristic']
world.n.08: ['human', 'living']
beard.n.03: ['person', 'homo']
homosexual.n.01: ['person', 'homo']
life.n.08: ['person', 'living']
man.n.03: ['human', 'generic']
man_jack.n.01: ['individual', 'single']
self.n.02: ['person', 'individual']

REVENGE
hate.n.01: ['feeling', 'action', 'emotion']
lightning_rod.n.01: ['action', 'reaction', 'negative']
sounding_board.n.01: ['action', 'reaction', 'person']
dander.n.02: ['anger', 'feeling']
fury.n.01: ['anger', 'feeling']
indignation.n.01: ['anger', 'feeling']
infuriation.n.01: ['anger', 'feeling']
umbrage.n.01