# Exercise 2

### Necessary imports

In [2]:
import nltk
from nltk.corpus import framenet as fn
from nltk.corpus.reader.wordnet import Synset
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('framenet_v17')
nltk.download('wordnet')
import hashlib
import re
from random import randint
from random import seed
from nltk.corpus import wordnet as wn
stop_words = set(stopwords.words('english'))
import json

[nltk_data] Downloading package framenet_v17 to
[nltk_data]     C:\Users\gabri\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\framenet_v17.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\gabri\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Extracting the frames from name and surname

In [171]:
def get_frames_IDs():
    """Return the `ID` attribute of every frame, in the order `fn.frames()` yields them."""
    frame_ids = []
    for frame in fn.frames():
        frame_ids.append(frame.ID)
    return frame_ids

def getFrameSetForStudent(surname, list_len=5):
    """Deterministically pick `list_len` FrameNet frames for a student.

    The SHA-512 digest of `surname` selects the starting position in the list
    of frame IDs. Every later pick adds a pseudo-random offset drawn after the
    global `random` generator has been reseeded with 1, so, for a given corpus,
    the same name always produces the same picks.

    Args:
        surname: Text identifying the student; it is UTF-8 encoded and hashed.
        list_len: Number of picks to make (default 5).

    Returns:
        dict mapping frame ID -> frame name. It can contain fewer than
        `list_len` entries when two picks land on the same frame.

    Side effects:
        Prints the student and each picked frame, and reseeds the global
        `random` generator with 1.
    """
    # Enumerate the corpus once and reuse the list for both the count and the lookups.
    framenet_IDs = get_frames_IDs()
    nof_frames = len(framenet_IDs)
    # A hex digest always parses to a non-negative integer, so abs() is not needed.
    base_idx = int(hashlib.sha512(surname.encode('utf-8')).hexdigest(), 16) % nof_frames
    print('\nStudent: ' + surname)
    seed(1)
    frames_extracted = {}
    offset = 0  # the first pick is the hash position itself
    for _ in range(list_len):
        fID = framenet_IDs[(base_idx + offset) % nof_frames]
        fNAME = fn.frame(fID).name
        frames_extracted[fID] = fNAME
        print('\tID: {a:4d}\tFrame: {framename}'.format(a=fID, framename=fNAME))
        # randint is inclusive on both ends; the modulo above wraps an offset equal to nof_frames.
        offset = randint(0, nof_frames)
    return frames_extracted

In [172]:
# Extract the frames for Lorenzo Botto and Gabriele Naretto and group them in a single dictionary keyed by student

frames_student1 = getFrameSetForStudent('Lorenzo Botto')
frames_student2 = getFrameSetForStudent('Gabriele Naretto')
frames_extracted = {"lorenzo": frames_student1, "gabriele": frames_student2}


Student: Lorenzo Botto
	ID: 1882	Frame: Shopping
	ID: 1148	Frame: Attributed_information
	ID: 2231	Frame: Response_scenario
	ID: 2191	Frame: Turning_out
	ID:  380	Frame: Custom

Student: Gabriele Naretto
	ID: 2018	Frame: Collocation_image_schema
	ID: 1497	Frame: Giving_in
	ID:  334	Frame: Cause_temperature_change
	ID: 1211	Frame: Dressing
	ID: 1147	Frame: Ordinal_numbers


### Defining the support function for cleaning the sentences

In [173]:
# I clean the sentences removing the punctuation and the stop words

def clean_sentence(sentence: str | list) -> list:
    """Tokenize `sentence` after removing punctuation and stop words.

    The argument is converted with `str()` first, so a list is processed
    through its textual representation (brackets, quotes and commas are then
    removed together with the rest of the punctuation).

    Args:
        sentence: Text to clean, or a list whose `str()` form is cleaned.

    Returns:
        The tokens produced by `word_tokenize` whose lowercase form is not in
        `stop_words`, in their original order and casing.
    """
    text = str(sentence)
    # Drop every character that is neither a word character nor whitespace.
    text = re.sub(r'[^\w\s]', '', text).strip()
    kept_tokens = []
    for token in word_tokenize(text):
        if token.lower() in stop_words:
            continue
        kept_tokens.append(token)
    return kept_tokens

### Defining the Lesk algorithm to find the best synset in WordNet

In [174]:
# Given a list of synsets (e.g. the hypernyms or hyponyms of a sense), return the total number of their definition and example words that also occur in the sentence

def search_in_synsets(synsets: list, sentence: str, clean: bool) -> int:
    overlap = 0
    for synset in synsets:
        for definition_word in clean_sentence(synset.definition()) if clean else synset.definition().split():
            if definition_word in sentence:
                overlap += 1
        for example_word in clean_sentence(synset.examples()) if clean else synset.examples():
            if clean:
                if example_word in sentence:
                    overlap += 1
            else:
                for example_word in example_word.split():
                    if example_word in sentence:
                        overlap += 1
    return overlap
    

In [175]:
def lesk(word: str, sentence: list, clean: bool) -> Synset | None:
    """Pick the WordNet sense of `word` whose glosses overlap most with `sentence`.

    Each candidate sense is scored with `search_in_synsets` over the sense
    itself plus its direct hypernyms and hyponyms.

    Args:
        word: The word to disambiguate; looked up with `wn.synsets`.
        sentence: Context tokens the gloss words are matched against.
        clean: Forwarded to `search_in_synsets` to choose how gloss text is
            tokenized.

    Returns:
        The sense with the highest overlap. The first sense returned by
        `wn.synsets` is kept when no sense scores above 0 and when scores tie.
        None when WordNet has no synset for `word`.
    """
    synsets = wn.synsets(word)
    if not synsets:
        return None

    best_sense = synsets[0]
    max_overlap = 0
    for synset in synsets:
        # One helper call replaces the scoring code that used to be duplicated here.
        signature = [synset, *synset.hypernyms(), *synset.hyponyms()]
        overlap = search_in_synsets(signature, sentence, clean)
        # Strict comparison: an earlier sense wins a tie.
        if overlap > max_overlap:
            max_overlap = overlap
            best_sense = synset

    return best_sense

### Defining the loop that measures the accuracy of the Lesk algorithm against the hand-written annotations

In [176]:
# Read the JSON file holding the synsets annotated by hand
# (the context manager closes the file as soon as it has been parsed)
with open('dataset/annotations.json') as annotations_file:
    annotations = json.load(annotations_file)
points = 0

# For every frame name, use the Lesk algorithm to pick the best sense in WordNet
for frames_id in frames_extracted["lorenzo"]:
    frame = fn.frame(frames_id)
    print("-----------------------------------")
    print("Frame name:", frame.name)

    # Multi-word frame names (e.g. "Turning_out") are disambiguated through their first word;
    # split() returns the whole name when it contains no underscore, so one code path covers both cases.
    target_word = frame.name.split("_")[0]
    expected = annotations[frame.name.lower()]["frame_name"]

    # Run Lesk once per frame and reuse the result for both the score and the report.
    sense = lesk(target_word, clean_sentence(frame.definition), True)
    # lesk returns None when WordNet does not know the word; report it instead of crashing.
    predicted = sense.name() if sense is not None else None

    if predicted == expected:
        points += 1
    print("Annotation:", expected)
    print("Lesk synset:", predicted)

    # TODO: check the synsets of the frame elements and of the lexical units
    # print("Frame Elements in the frame:", list(fn.frame(frames_id).FE))
    # for unit in list(fn.frame(frames_id).FE):
    #     print(lesk(unit,fn.frame(frames_id).FE[unit].definition,True))
    
    # print("Lexical Units in the frame:", list(fn.frame(frames_id).lexUnit))
    # # for every lexical unit in the frame, use the Lesk algorithm to return the best sense in WordNet
    # for lexical in list(fn.frame(frames_id).lexUnit):
    #     print(lesk(fn.frame(frames_id).lexUnit[lexical].lexemes[0].name,fn.frame(frames_id).lexUnit[lexical].definition,True))
    
print("-----------------------------------")
print("Accuracy:", points / len(frames_extracted["lorenzo"]))

-----------------------------------
Frame name: Shopping
Annotation: shopping.n.01
Lesk synset: shopping.n.01
-----------------------------------
Frame name: Attributed_information
Annotation: impute.v.01
Lesk synset: impute.v.01
-----------------------------------
Frame name: Response_scenario
Annotation: reaction.n.03
Lesk synset: response.n.01
-----------------------------------
Frame name: Turning_out
Annotation: turn.v.19
Lesk synset: turn.v.01
-----------------------------------
Frame name: Custom
Annotation: custom.n.01
Lesk synset: custom.n.01
-----------------------------------
Accuracy: 0.6
