In [1]:
from nltk.corpus import framenet, wordnet
from nltk.corpus.reader.framenet import AttrDict
from nltk.corpus.reader.wordnet import Synset
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
import nltk
from random import seed, randint
import hashlib
import re
import json
from typing import AnyStr, List, Set, Dict

In [2]:
# Stop words (one per line), kept as a set because they are only used for membership tests
with open('data/stop_words_FULL.txt') as f:
    stop_words = set(f.read().splitlines())

# Hand-annotated frames, looked up by student surname further down
with open('data/frame_annotations.json') as f:
    frame_annotations = json.load(f)

### Pre-Processing

In [3]:
def bag_of_words(sentence: AnyStr) -> Set[AnyStr]:
    """Turn a sentence into its bag of words.

    The sentence is stripped of punctuation, tokenized and lemmatized, filtered
    against the stop-word list, and finally de-duplicated into a set.
    """
    cleaned = remove_punctuation(sentence)
    tokens = tokenize_sentence(cleaned)
    return set(remove_stopwords(tokens))


def remove_stopwords(words: List[AnyStr]) -> List[AnyStr]:
    """Return `words` in their original order, without the entries found in `stop_words`."""
    kept = []
    for word in words:
        if word in stop_words:
            continue
        kept.append(word)
    return kept


# Get lower-cased, lemmatized tokens from sentence
def tokenize_sentence(sentence: AnyStr) -> List[AnyStr]:
    """Tokenize `sentence` and return the lemma of every token, lower-cased.

    Each token is lemmatized with the lemmatizer's default part of speech.
    The previous implementation also ran `nltk.pos_tag` over the tokens but
    never used the resulting tags, so that (expensive) step has been dropped.

    Args:
        sentence: text to tokenize.

    Returns:
        One lemma per token, in sentence order.
    """
    lmtzr = WordNetLemmatizer()
    # Lower-case BEFORE lemmatizing: the WordNet lookup is done on lower-case
    # forms, so a capitalised inflected token (e.g. at the start of a sentence)
    # would otherwise be returned unchanged and merely lower-cased afterwards.
    return [lmtzr.lemmatize(token.lower()) for token in word_tokenize(sentence)]


# Remove punctuation and multiple spaces
def remove_punctuation(sentence: AnyStr) -> AnyStr:
    """Strip punctuation from `sentence` and collapse whitespace runs.

    Every character that is neither a word character nor whitespace is removed,
    then each run of two or more whitespace characters becomes a single space.
    A lone whitespace character (including a single tab or newline) is kept as is.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', sentence)
    # Raw string: '\s' in a plain literal is an invalid escape sequence
    return re.sub(r'\s\s+', ' ', without_punctuation)


# Lexical unit names are in the form <lu>.PoS, so we get rid of the last part
def clean_lu_name(lu_name: AnyStr) -> AnyStr:
    """Return the lexical unit name without its trailing `.PoS` suffix.

    Only the text after the LAST dot is removed, so a name that itself contains
    dots keeps them. A name without any dot is returned unchanged.
    """
    return lu_name.rsplit('.', 1)[0]


# Lexical unit definitions are in the form <type>: def, so we get rid of the first part
def clean_lu_definition(lu_definition: AnyStr) -> AnyStr:
    """Return the definition text that follows the leading `<type>:` marker.

    Only the first colon is treated as the separator, so colons inside the
    definition itself are preserved. A definition without any colon has no
    marker to strip and is returned unchanged (instead of raising IndexError).
    Whitespace after the colon is kept, as before.
    """
    _, separator, definition = lu_definition.partition(':')
    return definition if separator else lu_definition

### Custom Objects

In [4]:
# Class that contains the CONTEXTS associated with the frame, its frame elements and its lexical units
class ContextsFrame:
    """Bag-of-words contexts of one frame, of its frame elements and of its lexical units."""

    def __init__(self, frame_id: int, frame_name: AnyStr, frame_context: Set[AnyStr], frame_elements_contexts: Dict[AnyStr, Set[AnyStr]], lexical_units_contexts: Dict[AnyStr, Set[AnyStr]]):
        """Store the frame identity together with the three groups of contexts.

        Args:
            frame_id: identifier of the frame.
            frame_name: name of the frame.
            frame_context: words describing the frame as a whole.
            frame_elements_contexts: words per frame element, keyed by element name.
            lexical_units_contexts: words per lexical unit, keyed by lexical unit name.
        """
        self.frame_id, self.frame_name = frame_id, frame_name
        self.frame_context = frame_context
        self.frame_elements_contexts = frame_elements_contexts
        self.lexical_units_contexts = lexical_units_contexts

    def get_frame_id(self) -> int:
        """Return the frame identifier."""
        return self.frame_id

    def get_frame_name(self) -> AnyStr:
        """Return the frame name."""
        return self.frame_name

    def get_frame_context(self) -> Set[AnyStr]:
        """Return the context of the frame itself."""
        return self.frame_context

    def get_frame_elements_contexts(self) -> Dict[AnyStr, Set[AnyStr]]:
        """Return the contexts of the frame elements, keyed by element name."""
        return self.frame_elements_contexts

    def get_lexical_units_contexts(self) -> Dict[AnyStr, Set[AnyStr]]:
        """Return the contexts of the lexical units, keyed by lexical unit name."""
        return self.lexical_units_contexts

    def __str__(self) -> AnyStr:
        """Render the frame header followed by its three context sections, one per line group."""
        sections = (
            f'[{self.frame_id}] {self.frame_name}',
            f'FRAME CONTEXT:\n{self.frame_context}',
            f'ELEMENTS CONTEXTS:\n{self.frame_elements_contexts}',
            f'LEXICAL UNITS CONTEXTS:\n{self.lexical_units_contexts}',
        )
        return '\n'.join(sections)
    

# Class that contains the SYNSETS associated with the frame, its frame elements and its lexical units
class SynsetsFrame:
    """WordNet synsets chosen for one frame, for its frame elements and for its lexical units."""

    def __init__(self, frame_id: int, frame_name: AnyStr, frame_synset: Synset, frame_elements_synsets: Dict[AnyStr, Synset], lexical_units_synsets: Dict[AnyStr, Synset]):
        """Store the frame identity together with its synsets.

        Args:
            frame_id: identifier of the frame.
            frame_name: name of the frame.
            frame_synset: synset chosen for the frame itself; may be None when
                no synset could be found for it.
            frame_elements_synsets: synset per frame element, keyed by element name.
            lexical_units_synsets: synset per lexical unit, keyed by lexical unit name.
        """
        self.frame_id = frame_id
        self.frame_name = frame_name
        self.frame_synset = frame_synset
        self.frame_elements_synsets = frame_elements_synsets
        self.lexical_units_synsets = lexical_units_synsets

    def get_frame_id(self) -> int:
        """Return the frame identifier."""
        return self.frame_id

    def get_frame_name(self) -> AnyStr:
        """Return the frame name."""
        return self.frame_name

    def get_frame_synset(self) -> Synset:
        """Return the synset chosen for the frame itself (possibly None)."""
        return self.frame_synset

    def get_frame_elements_synsets(self) -> Dict[AnyStr, Synset]:
        """Return the synsets of the frame elements, keyed by element name."""
        return self.frame_elements_synsets

    def get_lexical_units_synsets(self) -> Dict[AnyStr, Synset]:
        """Return the synsets of the lexical units, keyed by lexical unit name."""
        return self.lexical_units_synsets

    def __str__(self) -> AnyStr:
        """Render the frame header, its synset and the element / lexical unit synsets."""
        header = f'[{self.frame_id}] {self.frame_name}'
        if self.frame_synset is None:
            # No synset was found for the frame: there is no definition to show
            context = 'FRAME SYNSET: None'
        else:
            context = f'FRAME SYNSET: {self.frame_synset} ({self.frame_synset.definition()})'
        fe_synsets = f'ELEMENTS SYNSETS:\n{self.frame_elements_synsets}'
        lu_synsets = f'LEXICAL UNITS SYNSETS: {self.lexical_units_synsets}'
        return '\n'.join([header, context, fe_synsets, lu_synsets])

    def compare(self, other) -> float:
        """Return the fraction of synsets on which this frame agrees with `other`.

        The two frames must describe the same frame (same id and name),
        otherwise the score is 0. The frame synset always counts as one
        comparison; frame elements and lexical units are compared only for the
        keys present in BOTH frames, so entries missing on either side are
        ignored rather than counted as errors.

        Args:
            other: another SynsetsFrame (e.g. the hand-annotated one).

        Returns:
            A value between 0.0 and 1.0.
        """
        if self.frame_id != other.get_frame_id() or self.frame_name != other.get_frame_name():
            return 0.0

        # The frame synset itself is always one of the compared items, so total >= 1
        total = 1
        equal = 1 if self.frame_synset == other.get_frame_synset() else 0

        synset_maps = (
            (self.frame_elements_synsets, other.get_frame_elements_synsets()),
            (self.lexical_units_synsets, other.get_lexical_units_synsets()),
        )
        for own_synsets, other_synsets in synset_maps:
            for key in own_synsets.keys() & other_synsets.keys():
                total += 1
                if own_synsets[key] == other_synsets[key]:
                    equal += 1

        return equal / total
                


### Core Functions

In [5]:
# Hash the surname and use it as base index, then get frame_num distinct frames of framenet through (frames[base_idx + random_offset_i])
def get_frameset_for_student(surname: AnyStr, frame_num: int=5) -> List[AttrDict]:
    """Deterministically pick `frame_num` distinct FrameNet frames for a student.

    The SHA-512 hash of the surname gives a base index into the list of frame
    IDs; pseudo-random offsets (fixed seed) are then added to it. A frame that
    was already drawn is skipped, so the result never contains duplicates.

    Args:
        surname: student surname used to derive the base index.
        frame_num: number of frames to return.

    Returns:
        The selected frames, in drawing order.

    Raises:
        ValueError: if more distinct frames are requested than FrameNet contains.
    """
    # Load the frame list once: it is needed both for the IDs and for the count
    framenet_IDs = [f.ID for f in framenet.frames()]
    frames_count = len(framenet_IDs)
    if frame_num > frames_count:
        raise ValueError(f'Cannot select {frame_num} distinct frames out of {frames_count}')

    base_idx = int(hashlib.sha512(surname.encode('utf-8')).hexdigest(), 16) % frames_count

    # NOTE: this re-seeds the global `random` generator, so every call draws the same offsets
    seed(0)

    frames = list()
    selected_ids = set()
    while len(frames) < frame_num:
        # The (inclusive) upper bound is kept as it was so that the frames drawn
        # for a given surname do not change; the modulo keeps the index in range.
        offset = randint(0, frames_count)
        frame_id = framenet_IDs[(base_idx + offset) % frames_count]

        # Compare IDs with IDs: the old check looked for the ID among frame objects and never matched
        if frame_id in selected_ids:
            continue

        selected_ids.add(frame_id)
        frames.append(framenet.frame(frame_id))

    return frames


# Return the context of a WordNet Synset using definition, examples, hypernyms and hyponyms
def context_for_synset(synset: Synset) -> Set[AnyStr]:
    """Build the bag-of-words context of `synset`.

    The context gathers the words of the definition and of the examples of the
    synset itself, of each of its hypernyms and of each of its hyponyms.
    """
    context = set()

    # The synset and its direct neighbours all contribute in the same way
    for sense in [synset, *synset.hypernyms(), *synset.hyponyms()]:
        context |= bag_of_words(sense.definition())
        for example in sense.examples():
            context |= bag_of_words(example)

    return context


# Select regent from frame's name
def select_regent(words: List[AnyStr]) -> AnyStr:
    """Pick the word of a (split) frame name that is used to look the frame up.

    Returns the first word tagged 'VB'; if there is none, the first word
    tagged 'NN'; if there is none either, the first word.
    """
    tagged = nltk.tag.pos_tag(words)
    fallback = 0
    for position, (_, pos) in enumerate(tagged):
        if pos == 'VB':
            return words[position]
        # Remember only the first noun: once the fallback is a noun it is never replaced
        if pos == 'NN' and tagged[fallback][1] != 'NN':
            fallback = position
    return words[fallback]


# Compute best synset intersecting FrameNet context and WordNet context (FrameNet mapped version of Lesk Algorithm)
def compute_score(wn_word: AnyStr, fn_context: Set[AnyStr]) -> Synset:
    """Return the synset of `wn_word` whose context overlaps `fn_context` the most.

    The overlap is the number of words shared by the two bags of words. On a
    tie the synset listed first by WordNet wins. Returns None when WordNet has
    no synset for the word.
    """
    candidates = wordnet.synsets(wn_word)
    if not candidates:
        return None

    def overlap(candidate):
        # score is computed using the bag-of-words approach
        return len(fn_context & context_for_synset(candidate))

    # max() keeps the first of several equally good candidates
    return max(candidates, key=overlap)


# Get a ContextsFrame for each of the input frames
def get_contexts_frames(frames: List[AttrDict]) -> List[ContextsFrame]:
    """Build the bag-of-words contexts of every frame in `frames`.

    For each frame the context of every frame element and of every lexical
    unit is computed from its definition; the context of the frame itself is
    its own definition enriched with all of those element and unit contexts.
    """
    result = []
    for frame in frames:
        # Start from the frame definition, then widen it with each FE / LU context
        whole_context = bag_of_words(frame.definition)

        fe_contexts = {}
        for element in frame.FE.values():
            fe_contexts[element.name] = bag_of_words(element.definition)
            whole_context |= fe_contexts[element.name]

        lu_contexts = {}
        for unit in frame.lexUnit.values():
            lu_contexts[unit.name] = bag_of_words(clean_lu_definition(unit.definition))
            whole_context |= lu_contexts[unit.name]

        result.append(ContextsFrame(frame.ID, frame.name, whole_context, fe_contexts, lu_contexts))
    return result


# Get a SynsetsFrame for each of the input contextsFrame
def get_synsets_frames(contexts_frames: List[ContextsFrame]) -> List[SynsetsFrame]:
    """Disambiguate every ContextsFrame into a SynsetsFrame.

    Each frame element and lexical unit is mapped to the synset that best
    matches its own context; entries for which no synset is found are left
    out. The frame itself is looked up through the regent of its name, using
    the context of the whole frame.
    """
    result = []
    for current in contexts_frames:
        fe_synsets = {}
        for name, context in current.get_frame_elements_contexts().items():
            best = compute_score(name, context)
            if best:
                fe_synsets[name] = best

        lu_synsets = {}
        for name, context in current.get_lexical_units_contexts().items():
            # WordNet is queried with the bare word, the dictionary keeps the full LU name
            best = compute_score(clean_lu_name(name), context)
            if best:
                lu_synsets[name] = best

        frame_name = current.get_frame_name()
        regent = select_regent(frame_name.split('_'))
        frame_synset = compute_score(regent, current.get_frame_context())

        result.append(SynsetsFrame(current.get_frame_id(), frame_name, frame_synset, fe_synsets, lu_synsets))
    return result


def disambiguate_frames(frameset: List[AttrDict], verbose=True, surname: AnyStr=None) -> List[SynsetsFrame]:
    """Run the whole disambiguation pipeline on `frameset`.

    Args:
        frameset: FrameNet frames to disambiguate.
        verbose: when true, print the frameset, the computed contexts and the
            resulting synsets.
        surname: student surname shown in the verbose header. When omitted,
            the notebook-level `surname` variable is used if it exists (this
            is what the function silently relied on before), otherwise a
            placeholder is printed instead of failing with NameError.

    Returns:
        One SynsetsFrame per input frame, in the same order.
    """
    contexts_frames = get_contexts_frames(frameset)
    synsets_frames = get_synsets_frames(contexts_frames)

    if verbose:
        if surname is None:
            # Backward compatibility with callers that set a global `surname` before calling
            surname = globals().get('surname', 'UNKNOWN')

        separator = '-'*40
        print(f'{separator} FRAMESET FOR STUDENT {surname} {separator}')
        for frame in frameset:
            print(f'{frame.name}\n - Regent: {select_regent(frame.name.split("_"))}\n - Definition: {frame.definition}\n')

        print(f'\n{separator} OBTAINED CONTEXTS FRAMES {separator}\n')
        for frame in contexts_frames:
            print(f'{frame}\n')

        print(f'{separator} OBTAINED SYNSETS FRAMES {separator}\n')
        for frame in synsets_frames:
            print(f'{frame}\n')

    return synsets_frames

### Disambiguation & Evaluation

In [6]:
def load_annotated_synsets_frames(surname: AnyStr) -> List[SynsetsFrame]:
    """Build the hand-annotated SynsetsFrame objects stored for `surname`.

    Every annotated frame provides its id, its name, the identifier of the
    frame synset and two mappings (frame elements and lexical units) from
    name to synset identifier; identifiers are resolved through WordNet.
    A surname without annotations raises KeyError.
    """
    def resolve(synset_ids):
        # name -> synset identifier  becomes  name -> Synset
        return {name: wordnet.synset(synset_id) for name, synset_id in synset_ids.items()}

    return [
        SynsetsFrame(
            annotation['id'],
            annotation['name'],
            wordnet.synset(annotation['synset']),
            resolve(annotation['elements_synsets']),
            resolve(annotation['lexical_units_synsets']),
        )
        for annotation in frame_annotations[surname]
    ]

In [7]:
surname = 'Favaro'

# Automatic disambiguation vs. hand-made annotation of the same student's frameset
synsets_frames = disambiguate_frames(get_frameset_for_student(surname), verbose=False)
annotated_synsets_frames = load_annotated_synsets_frames(surname)

# Frames are paired by position; a pair whose id or name differ scores 0
accuracy_list = list(map(SynsetsFrame.compare, synsets_frames, annotated_synsets_frames))
avg_accuracy = sum(accuracy_list) / len(accuracy_list)

print(f'Average accuracy for {surname} frameset: {avg_accuracy:.2f}')

Average accuracy for Favaro frameset: 0.71


In [8]:
surname = 'Senese'

# Automatic disambiguation vs. hand-made annotation of the same student's frameset.
# (A leftover debug print that recomputed and dumped the whole frameset was removed.)
synsets_frames = disambiguate_frames(get_frameset_for_student(surname), verbose=False)
annotated_synsets_frames = load_annotated_synsets_frames(surname)

# Frames are paired by position; a pair whose id or name differ scores 0
accuracy_list = [synset_frame.compare(annotated_synset_frame) for synset_frame, annotated_synset_frame in zip(synsets_frames, annotated_synsets_frames)]
avg_accuracy = sum(accuracy_list) / len(accuracy_list)

print(f'Average accuracy for {surname} frameset: {avg_accuracy:.2f}')

Average accuracy for Senese frameset: 0.12


In [9]:
# List every WordNet sense of the word 'part' together with its definition
for synset in wordnet.synsets('part'):
    print(synset, synset.definition())

Synset('part.n.01') something determined in relation to something that includes it
Synset('part.n.02') something less than the whole of a human artifact
Synset('part.n.03') a portion of a natural object
Synset('part.n.04') that which concerns a person with regard to a particular role or situation
Synset('region.n.01') the extended spatial location of something
Synset('function.n.03') the actions and activities assigned to or required or expected of a person or group
Synset('character.n.04') an actor's portrayal of someone in a play
Synset('share.n.01') assets belonging to or due to or contributed by an individual person or group
Synset('part.n.09') one of the portions into which something is regarded as divided and which together constitute a whole
Synset('part.n.10') a line of scalp that can be seen when sections of hair are combed in opposite directions
Synset('part.n.11') the melody carried by a particular voice or instrument in polyphonic music
Synset('contribution.n.01') the part 