# Exercise 2

### Necessary imports

In [100]:
import nltk
from nltk.corpus import framenet as fn
from nltk.corpus.reader.wordnet import Synset
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('framenet_v17')
nltk.download('wordnet')
import hashlib
import re
from random import randint
from random import seed
from nltk.corpus import wordnet as wn
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package framenet_v17 to
[nltk_data]     C:\Users\gabri\AppData\Roaming\nltk_data...
[nltk_data]   Package framenet_v17 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\gabri\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## First part

### Extracting the frames from name and surname

In [2]:
def get_frames_IDs():
    return [f.ID for f in fn.frames()]   

def getFrameSetForStudent(surname, list_len=5):
    nof_frames = len(fn.frames())
    base_idx = (abs(int(hashlib.sha512(surname.encode('utf-8')).hexdigest(), 16)) % nof_frames)
    print('\nStudent: ' + surname)
    framenet_IDs = get_frames_IDs()
    i = 0
    offset = 0 
    seed(1)
    frames_extracted = {}
    while i < list_len:
        fID = framenet_IDs[(base_idx+offset)%nof_frames]
        f = fn.frame(fID)
        fNAME = f.name
        frames_extracted[fID] = fNAME
        print('\tID: {a:4d}\tFrame: {framename}'.format(a=fID, framename=fNAME))
        offset = randint(0, nof_frames)
        i += 1
    return frames_extracted

In [7]:
# I extract the frames for Lorenzo Botto and Gabriele Naretto and I merge them in a single dictionary

frames_student1 = getFrameSetForStudent('Lorenzo Botto')
frames_student2 = getFrameSetForStudent('Gabriele Naretto')
frames_extracted = { "lorenzo":frames_student1} | {"gabriele" : frames_student2}


Student: Lorenzo Botto
	ID: 1882	Frame: Shopping
	ID: 1148	Frame: Attributed_information
	ID: 2231	Frame: Response_scenario
	ID: 2191	Frame: Turning_out
	ID:  380	Frame: Custom

Student: Gabriele Naretto
	ID: 2018	Frame: Collocation_image_schema
	ID: 1497	Frame: Giving_in
	ID:  334	Frame: Cause_temperature_change
	ID: 1211	Frame: Dressing
	ID: 1147	Frame: Ordinal_numbers


### execution of the lesk algoritm for found the best synset in wordnet

In [95]:
# I clean the sentences removing the punctuation and the stop words

def clean_sentence(sentence: str | list) -> list:
    sentence_without_puntuaction = re.sub(r'[^\w\s]', '', str(sentence)).strip()
    word_tokens = word_tokenize(sentence_without_puntuaction)
    filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
    return filtered_sentence

In [97]:
def lesk(word: str, sentence: list, clean: bool) -> Synset:
    synsets = wn.synsets(word)
    if not synsets:
        return None
    best_sense = synsets[0]
    max_overlap = 0
    for synset in synsets:
        overlap = 0
        # Cleaning example in wordnet 
        for definition_word in clean_sentence(synset.definition()) if clean else synset.definition().split():
            if definition_word in sentence:
                overlap += 1
        for example_word in clean_sentence(synset.examples()) if clean else synset.examples():
            if clean:
                if example_word in sentence:
                    overlap += 1
            else:
                for example_word in example_word.split():
                    if example_word in sentence:
                        overlap += 1
        if overlap > max_overlap:
            max_overlap = overlap
            best_sense = synset
        
    return best_sense

In [101]:
for name in frames_extracted:
    print("name of the student",name)
    for frames_id in frames_extracted[name]:
        print("-----------------------------------")
        print("lesk algoritm on Frame name")
        #for the frame name we use lesk algoritm for return the best sense in wordnet
        print(fn.frame(frames_id).name)
        print(fn.frame(frames_id).definition)
        if("_" in fn.frame(frames_id).name):
            print(lesk(fn.frame(frames_id).name.split("_")[0],fn.frame(frames_id).definition,True))
        else:
            print(lesk(fn.frame(frames_id).name,fn.frame(frames_id).definition,True))

        print("Lexical units in the frame")
        print(list(fn.frame(frames_id).FE))
        #for every lexical unit in the frame we use lesk algoritm for return the best sense in wordnet
        for unit in list(fn.frame(frames_id).FE): 
            if("_" in fn.frame(frames_id).name):
                print(lesk(unit.split("_")[0],fn.frame(frames_id).definition,True))
            else:
                print(lesk(unit,fn.frame(frames_id).definition,True))

name of the student lorenzo
-----------------------------------
lesk algoritm on Frame name
Shopping
A Shopper looks for Goods in order to purchase them.  'She shopped for a new hat.'
Synset('shop.v.03')
Lexical units in the frame
['Shopper', 'Ground', 'Goods', 'Manner', 'Means', 'Outcome', 'Place', 'Time', 'Purpose', 'Degree', 'Depictive', 'Co-participant']
Synset('shopper.n.02')
Synset('ground.v.07')
Synset('good.n.01')
Synset('manner.n.01')
Synset('mean.n.01')
Synset('result.n.03')
Synset('topographic_point.n.01')
Synset('time.n.01')
Synset('purpose.n.01')
Synset('degree.n.01')
Synset('delineative.s.01')
None
-----------------------------------
lesk algoritm on Frame name
Attributed_information
A Proposition is attributed to a Speaker or a Text.  'According to the Press Trust of India in an article published on its Web site Thursday, the woman has been identified as 31-year-old Aafia Siddiqui, who was being sought by U.S. officials last week along with two other men, including one w

In [88]:
wn.synsets("ground")

[Synset('land.n.04'),
 Synset('reason.n.01'),
 Synset('earth.n.02'),
 Synset('footing.n.02'),
 Synset('ground.n.05'),
 Synset('background.n.02'),
 Synset('land.n.02'),
 Synset('ground.n.08'),
 Synset('ground.n.09'),
 Synset('ground.n.10'),
 Synset('flat_coat.n.01'),
 Synset('anchor.v.01'),
 Synset('ground.v.02'),
 Synset('ground.v.03'),
 Synset('ground.v.04'),
 Synset('ground.v.05'),
 Synset('ground.v.06'),
 Synset('ground.v.07'),
 Synset('ground.v.08'),
 Synset('ground.v.09'),
 Synset('prime.v.02'),
 Synset('ground.v.11'),
 Synset('establish.v.08'),
 Synset('crunch.v.02'),
 Synset('grate.v.04'),
 Synset('labor.v.02'),
 Synset('grind.v.04'),
 Synset('grind.v.05'),
 Synset('grind.v.06'),
 Synset('grind.v.07')]