# Exercise 2

### Necessary imports

In [24]:
import nltk
from nltk.corpus import framenet as fn
from nltk.corpus.reader.wordnet import Synset
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('framenet_v17')
nltk.download('wordnet')
import hashlib
import re
from random import randint
from random import seed
from nltk.corpus import wordnet as wn
stop_words = set(stopwords.words('english'))
import json
from enum import Enum

[nltk_data] Downloading package framenet_v17 to
[nltk_data]     C:\Users\lores\AppData\Roaming\nltk_data...
[nltk_data]   Package framenet_v17 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lores\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Extracting the frames from name and surname

In [25]:
def get_frames_IDs():
    """Return the ID of every FrameNet frame, in the order ``fn.frames()`` yields them."""
    frame_ids = []
    for frame in fn.frames():
        frame_ids.append(frame.ID)
    return frame_ids

def getFrameSetForStudent(surname, list_len=5):
    """Deterministically draw FrameNet frames for a student.

    The first frame sits at the position given by the SHA-512 hash of
    ``surname`` (modulo the number of frames); every following frame sits at
    that position plus a pseudo-random offset. The global ``random`` generator
    is re-seeded with 1 on every call, so the same name always yields the same
    frames. Each drawn frame is also printed.

    Args:
        surname: text that is hashed to choose the starting position.
        list_len: number of draws to perform.

    Returns:
        A dict mapping frame ID to frame name. It holds fewer than
        ``list_len`` entries when the same frame is drawn more than once.
    """
    frame_count = len(fn.frames())
    digest = hashlib.sha512(surname.encode('utf-8')).hexdigest()
    start_position = int(digest, 16) % frame_count
    print('\nStudent: ' + surname)
    all_frame_ids = get_frames_IDs()
    seed(1)
    drawn_frames = {}
    jump = 0
    for _ in range(list_len):
        frame_id = all_frame_ids[(start_position + jump) % frame_count]
        frame_name = fn.frame(frame_id).name
        drawn_frames[frame_id] = frame_name
        print('\tID: {a:4d}\tFrame: {framename}'.format(a=frame_id, framename=frame_name))
        # Drawn after every frame (including the last one) to keep the RNG stream unchanged
        jump = randint(0, frame_count)
    return drawn_frames

In [26]:
# Extract the frame sets of Lorenzo Botto and Gabriele Naretto and group them
# in a single dictionary keyed by student
frames_student1 = getFrameSetForStudent('Lorenzo Botto')
frames_student2 = getFrameSetForStudent('Gabriele Naretto')
frames_extracted = {"lorenzo": frames_student1, "gabriele": frames_student2}


Student: Lorenzo Botto
	ID: 1882	Frame: Shopping
	ID: 1148	Frame: Attributed_information
	ID: 2231	Frame: Response_scenario
	ID: 2191	Frame: Turning_out
	ID:  380	Frame: Custom

Student: Gabriele Naretto
	ID: 2018	Frame: Collocation_image_schema
	ID: 1497	Frame: Giving_in
	ID:  334	Frame: Cause_temperature_change
	ID: 1211	Frame: Dressing
	ID: 1147	Frame: Ordinal_numbers


### Defining the support function for cleaning the sentences

In [27]:
# I clean the sentences removing the punctuation and the stop words

def clean_sentence(sentence: str | list) -> list:
    sentence_without_puntuaction = re.sub(r'[^\w\s]', '', str(sentence)).strip()
    word_tokens = word_tokenize(sentence_without_puntuaction)
    filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
    return filtered_sentence

### Defining the Lesk algorithm to find the best synset in WordNet

For every synset in the given list (i.e., each hypernym or hyponym) we count the words of its gloss (definition and examples) that also occur in the sentence, and we return the total overlap.

In [28]:
# Given a list of synsets (e.g. hypernyms or hyponyms), return the total number
# of gloss words (definition + examples) that also occur in the sentence

def search_in_synsets(synsets: list, sentence: list, clean: bool) -> int:
    """Sum the gloss/sentence overlap over a list of synsets.

    Args:
        synsets: objects exposing ``definition()`` (a string) and
            ``examples()`` (a list of strings).
        sentence: the context to compare against. Words are looked up with
            ``in``, so a list of tokens gives exact-token matches (a plain
            string would be substring-matched instead).
        clean: when True the glosses go through ``clean_sentence``; otherwise
            they are simply split on whitespace.

    Returns:
        The number of gloss words found in ``sentence``; repeated words count
        every time they appear.
    """
    overlap = 0
    for synset in synsets:
        if clean:
            definition_words = clean_sentence(synset.definition())
            # clean_sentence flattens the list of examples into one token list
            example_words = clean_sentence(synset.examples())
        else:
            definition_words = synset.definition().split()
            example_words = [
                word for example in synset.examples() for word in example.split()
            ]
        overlap += sum(1 for word in definition_words if word in sentence)
        overlap += sum(1 for word in example_words if word in sentence)
    return overlap
    

### Lesk Algorithm

We take all the synsets of the word and, for each of them, we sum the overlap between the sentence and the glosses (definition and examples) of the synset itself, of its hypernyms and of its hyponyms. We return the synset with the highest overlap (the first synset when none of them overlaps with the sentence).

In [29]:
def lesk(word: str, sentence: list, clean: bool) -> Synset | None:
    """Pick the WordNet synset of ``word`` whose neighbourhood best overlaps the context.

    The score of a candidate synset is the number of gloss words (definition
    and examples) of its hypernyms, its hyponyms and the synset itself that
    occur in ``sentence``.

    Args:
        word: the term to disambiguate.
        sentence: the context tokens the glosses are compared against.
        clean: forwarded to ``search_in_synsets``; when True the glosses are
            cleaned with ``clean_sentence`` before being compared.

    Returns:
        The highest-scoring synset; the first synset returned by WordNet when
        no candidate scores above zero (ties keep the earliest candidate), or
        None when WordNet has no synset for ``word``.
    """
    synsets = wn.synsets(word)
    if not synsets:
        return None
    best_sense = synsets[0]
    max_overlap = 0
    for synset in synsets:
        # The synset's own gloss is scored with the same routine used for its
        # hypernyms and hyponyms instead of a copy of that loop.
        overlap = (
            search_in_synsets(synset.hypernyms(), sentence, clean)
            + search_in_synsets(synset.hyponyms(), sentence, clean)
            + search_in_synsets([synset], sentence, clean)
        )
        if overlap > max_overlap:
            max_overlap = overlap
            best_sense = synset

    return best_sense

### Defining the cycles used to measure the accuracy of the Lesk algorithm against the hand-written annotations

In [30]:
# Read the JSON file holding the synsets annotated by hand; the context manager
# closes the file once it is parsed, and the encoding is fixed to UTF-8 so the
# result does not depend on the platform's default encoding
with open('dataset/annotations.json', encoding='utf-8') as annotations_file:
    annotations = json.load(annotations_file)

In [31]:
# I create the enum class to manage the different types of cycles

class CycleType(Enum):
    """Selects how much FrameNet text calculate_accuracy passes to Lesk as context."""
    # Every term is disambiguated against its own definition only.
    FRAME = 0
    # Frame elements and lexical units get the frame definition plus their own;
    # the frame name still uses the frame definition alone.
    FRAME_FE = 1
    # Every term gets the frame definition together with the definitions of all
    # the frame elements and all the lexical units of the frame.
    FRAME_FE_LU = 2

#### Function for calculating the accuracy

We use three types of structures for the sentences:
1. The first one (CycleType = FRAME) is a cycle with only frame definitions, frame elements definitions and lexical units definitions.
2. The second one (CycleType = FRAME_FE) is a cycle with frame definitions, frame elements definitions (with frames definition) and lexical units definitions (with frames definition).
3. The third one (CycleType = FRAME_FE_LU) is a cycle with frame definitions (with frame elements definition and lexical units definition), frame elements definitions (with frames definition and lexical units definition) and lexical units definitions (with frame definition and frame elements definition).

In [32]:
# Accuracy of the Lesk algorithm against the hand-made annotations: every
# annotated frame name, frame element and lexical unit is disambiguated with a
# context that depends on the chosen CycleType

def _head_word(name: str) -> str:
    """Return the part of ``name`` that precedes its first underscore."""
    return name.split("_")[0]


def _strip_source_markers(definition: str) -> str:
    """Remove every 'COD: ' and 'FN: ' marker from a lexical-unit definition."""
    return definition.replace("COD: ", "").replace("FN: ", "")


def _is_hit(word: str, context: list, annotated_synset_name) -> bool:
    """Tell whether Lesk picks the annotated synset for ``word`` in ``context``."""
    best_sense = lesk(word, context, True)
    # lesk returns None when WordNet has no synset for the word: score a miss
    # instead of failing on None.name()
    return best_sense is not None and best_sense.name() == annotated_synset_name


def calculate_accuracy(cycle_type: CycleType) -> float:
    """Compute the share of annotated terms for which Lesk returns the annotated synset.

    Reads the module-level ``frames_extracted`` (student -> {frame ID: name})
    and ``annotations`` (lower-cased frame name -> annotated synset names).
    For every extracted frame it scores the frame name, then each frame
    element and each lexical unit that has an annotation.

    Args:
        cycle_type: selects the context given to Lesk.
            FRAME: the term's own definition.
            FRAME_FE: the frame definition plus the term's own definition
                (the frame name uses the frame definition alone).
            FRAME_FE_LU: the frame definition plus the definitions of all the
                frame elements and lexical units of the frame.

    Returns:
        Correct answers divided by scored terms, or 0.0 when nothing was scored.

    Raises:
        TypeError: if ``cycle_type`` is not a ``CycleType`` member.
        KeyError: if an extracted frame has no entry in ``annotations``.
    """
    if not isinstance(cycle_type, CycleType):
        raise TypeError(f"cycle_type must be a CycleType, got {cycle_type!r}")

    points = 0
    total = 0

    for student in frames_extracted:
        for frame_id in frames_extracted[student]:
            frame = fn.frame(frame_id)
            annotation_frame = annotations[frame.name.lower()]

            if cycle_type is CycleType.FRAME_FE_LU:
                all_definitions = [frame.definition]
                all_definitions.extend(frame.FE[fe_key].definition for fe_key in frame.FE)
                all_definitions.extend(
                    _strip_source_markers(frame.lexUnit[lu_key].definition)
                    for lu_key in frame.lexUnit
                )
                # Every term of the frame shares this context: clean it once
                # rather than once per frame element / lexical unit
                shared_context = clean_sentence(all_definitions)
                frame_context = shared_context
            else:
                shared_context = None
                frame_context = clean_sentence(frame.definition)

            # Frame name
            if _is_hit(_head_word(frame.name), frame_context, annotation_frame["frame_name"]):
                points += 1
            total += 1

            # Frame elements that have a hand-made annotation
            for fe_key in frame.FE:
                if fe_key.lower() not in annotation_frame["frame_elements"]:
                    continue
                frame_element = frame.FE[fe_key]
                if cycle_type is CycleType.FRAME_FE_LU:
                    context = shared_context
                elif cycle_type is CycleType.FRAME_FE:
                    context = clean_sentence([frame.definition, frame_element.definition])
                else:
                    context = clean_sentence(frame_element.definition)

                annotated = annotation_frame["frame_elements"][fe_key.lower()]
                if _is_hit(_head_word(frame_element.name), context, annotated):
                    points += 1
                total += 1

            # Lexical units that have a hand-made annotation (keyed by first lexeme)
            for lu_key in frame.lexUnit:
                lexical_unit = frame.lexUnit[lu_key]
                lexeme = lexical_unit.lexemes[0].name
                if lexeme.lower() not in annotation_frame["lexical_units"]:
                    continue
                if cycle_type is CycleType.FRAME_FE_LU:
                    context = shared_context
                else:
                    lu_definition = _strip_source_markers(lexical_unit.definition)
                    if cycle_type is CycleType.FRAME_FE:
                        context = clean_sentence([frame.definition, lu_definition])
                    else:
                        context = clean_sentence(lu_definition)

                annotated = annotation_frame["lexical_units"][lexeme.lower()]
                if _is_hit(lexeme, context, annotated):
                    points += 1
                total += 1

    # Nothing scored (no frames extracted): report 0.0 instead of dividing by zero
    return points / total if total else 0.0

In [33]:

# Print the accuracy of each cycle type, framed by separator lines
separator = "-----------------------------------"
cycle_reports = [
    (
        "Cycle with only frame definitions, frame elements definitions and lexical units definitions",
        CycleType.FRAME,
    ),
    (
        "Cycle with frame definitions (with frame elements definition and lexical units definition), frame elements definitions (with frames definition and lexical units definition) and lexical units definitions (with frame definition and frame elements definition)",
        CycleType.FRAME_FE_LU,
    ),
    (
        "Cycle with frame definitions, frame elements definitions (with frames definition) and lexical units definitions (with frames definition)",
        CycleType.FRAME_FE,
    ),
]
for description, cycle in cycle_reports:
    print(separator)
    print(description)
    print("Accuracy:", calculate_accuracy(cycle))
print(separator)

-----------------------------------
Cycle with only frame definitions, frame elements definitions and lexical units definitions
Accuracy: 0.375886524822695
-----------------------------------
Cycle with frame definitions (with frame elements definition and lexical units definition), frame elements definitions (with frames definition and lexical units definition) and lexical units definitions (with frame definition and frame elements definition)
Accuracy: 0.3475177304964539
-----------------------------------
Cycle with frame definitions, frame elements definitions (with frames definition) and lexical units definitions (with frames definition)
Accuracy: 0.3900709219858156
-----------------------------------
