In [1]:
import json
import pandas as pd
import numpy as np
from prettytable import PrettyTable
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet as Wn
from nltk.wsd import lesk
from nltk.tokenize import word_tokenize
from nltk import WordNetLemmatizer
from stop_words import get_stop_words

## Global Variables

In [2]:
## Stop-word list: the `stop_words` package's English list followed by NLTK's
## English list. The two lists overlap, so the result contains duplicates; the
## cells visible in this notebook only test membership (`x in stop_words`).
stop_words = list(get_stop_words('en'))
nltk_stop_words = list(stopwords.words('english'))
stop_words.extend(nltk_stop_words)

## Input / output locations, relative to the notebook's working directory.
## Forward slashes are used because they are accepted on Windows as well as on
## POSIX systems, whereas a backslash is an ordinary filename character on POSIX.
annotations_marco_1000_csv_path = "inputs/1000annotazioni.csv"
states_quantitative_analysis_with_extend_lesk_csv_path = "outputs/states_quantitative_analysis_with_extend_lesk.csv"
events_quantitative_analysis_with_extend_lesk_csv_path = "outputs/events_quantitative_analysis_with_extend_lesk.csv"
states_quantitative_analysis_groupby_lemmas_with_extend_lesk_csv_path = "outputs/states_quantitative_analysis_groupby_lemmas_with_extend_lesk.csv"
events_quantitative_analysis_groupby_lemmas_with_extend_lesk_csv_path = "outputs/events_quantitative_analysis_groupby_lemmas_with_extend_lesk.csv"

## Shared WordNet lemmatizer instance used when recording target lemmas.
wn_lemmatizer = WordNetLemmatizer()

## Common Functions and Classes

In [3]:
class Lesk() :
    """Lesk-style word-sense disambiguation over WordNet synsets.

    The sense is computed once, at construction time, and retrieved with
    `get()`. A candidate's signature is made of the tokens of its definition
    and examples plus its lemma names; the candidate whose signature shares
    the most distinct tokens with the context wins. When no candidate
    overlaps with the context, the most frequent candidate is returned.

    Parameters
    ----------
    word : str
        Word to disambiguate.
    context : iterable of str
        Tokens surrounding the word (typically the tokenized sentence).
    pos : str, optional
        When truthy, only synsets whose `pos()` equals this value are kept.
    synsets : list, optional
        Candidate synsets; when None they are looked up with `Wn.synsets(word)`.
    """

    def __init__(self, word, context, pos=None, synsets=None) :
        self.sense = self.run(word, context, pos, synsets)

    def _candidateSynsets(self, word, pos=None, synsets=None) :
        """Return the candidate synsets for `word`, restricted to `pos` if given."""
        if synsets is None:
            synsets = Wn.synsets(word)
        if pos:
            synsets = [ss for ss in synsets if str(ss.pos()) == pos]
        return synsets

    def computedMostFrequentSense(self, word, pos=None, synsets=None) :
        """Return the candidate synset with the highest summed `lemma.count()`.

        Ties are resolved in favour of the earliest candidate. Returns None
        only when there is no candidate at all.
        """
        # Start below zero so that candidates whose lemma counts are all zero
        # are still eligible: the first of them is returned instead of None.
        max_freq = -1
        sense = None

        for synset in self._candidateSynsets(word, pos, synsets) :
            freq = sum(lemma.count() for lemma in synset.lemmas())
            if freq > max_freq :
                max_freq = freq
                sense = synset
        return sense

    def getDefinitionExamples(self, sense) :
        """Build the signature of `sense`.

        The signature is the tokenized definition, followed by the tokenized
        examples, followed by the lemma names, with every token found in the
        global `stop_words` removed.
        """
        examples = []
        for example in sense.examples() :
            examples += nltk.word_tokenize(example)
        signature = nltk.word_tokenize(sense.definition()) + examples + sense.lemma_names()
        return [x for x in signature if x not in stop_words]

    def computedOverlap(self, signature, context) :
        """Return the number of distinct tokens shared by `signature` and `context`."""
        return len(set(signature).intersection(set(context)))

    def run(self, word, context, pos, synsets) :
        """Pick the best sense of `word` in `context`; see the class docstring.

        Returns None when there is no candidate synset.
        """
        # Resolve and filter the candidates once, then reuse them for both the
        # frequency fallback and the overlap scoring.
        senses = self._candidateSynsets(word, pos, synsets)
        best_sense = self.computedMostFrequentSense(word, synsets=senses)

        context_ = [x for x in context if x not in stop_words]
        max_overlap = 0
        for sense in senses :
            signature = self.getDefinitionExamples(sense)
            overlap = self.computedOverlap(signature, context_)
            # Strict comparison: the frequency fallback is replaced only by a
            # candidate with a positive overlap, and the earliest candidate
            # wins among equal overlaps.
            if overlap > max_overlap :
                max_overlap = overlap
                best_sense = sense
        return best_sense

    def get(self) :
        """Return the sense computed at construction time (possibly None)."""
        return self.sense

## Quantitative Analysis

In [4]:
nlp = spacy.load("en_core_web_sm")

# Per-sense statistics, split by annotation kind. Each inner dict maps a sense
# key (WordNet synset name, or the raw token when no synset is found) to its
# counters, definition and set of target lemmas.
verbs = {
    'states': {},
    'events': {}
}
df_marco = pd.read_csv(annotations_marco_1000_csv_path).replace(np.nan, '', regex=True)
print("Number of annotations over 1000 sentences: ", df_marco.shape[0])

for index, row in df_marco.iterrows() :
    state = str(row["STATE"]).strip().lower()
    event = str(row["EVENT"]).strip().lower()
    rep_event = str(row["REP-EVENT"]).strip().lower()
    asp_event = str(row["ASP-EVENT"]).strip().lower()
    location = str(row["LOC"]).strip()
    organization = str(row["ORG"]).strip()
    text = str(row['text'])

    # A row counts as an EVENT annotation when its EVENT cell is non-empty,
    # otherwise as a STATE annotation.
    is_event = len(event) > 0
    key_verbs = verbs['events'] if is_event else verbs['states']
    verb_state_ = event if is_event else state

    # Named-entity labels found by spaCy in the LOC and ORG cells.
    entities1 = [ent.label_ for ent in nlp(location).ents]
    entities2 = [ent.label_ for ent in nlp(organization).ents]

    # Row-level increments: identical for every token of the annotation.
    increments = {
        'rep_event': 1 if len(rep_event) > 0 else 0,
        'asp_event': 1 if len(asp_event) > 0 else 0,
        'location': 1 if "GPE" in entities1 else 0,
        'organization': 1 if "ORG" in entities2 else 0,
    }

    # The sentence is the same for every token of the row: tokenize it once.
    context_tokens = word_tokenize(text)

    verb_states = [w for w in word_tokenize(verb_state_) if not w in stop_words]
    for verb_state in verb_states :
        # Reset for every token: a previous token of the same annotation that
        # fell back to the any-POS lookup must not force this one to be
        # lemmatized as a non-verb.
        is_verb = True
        best_sense = Lesk(verb_state, context_tokens, pos=Wn.VERB).get()
        if best_sense is None :
            # No verb sense available: retry without the part-of-speech filter.
            best_sense = Lesk(verb_state, context_tokens).get()
            is_verb = False

        if best_sense is not None :
            definition = best_sense.definition()
            sense_key = best_sense.name()
        else :
            # Unknown to WordNet: use the raw token as both key and definition.
            definition = verb_state
            sense_key = verb_state

        if sense_key not in key_verbs :
            key_verbs[sense_key] = {
                'occorrences': 0,
                'definition': definition,
                'rep_event': 0,
                'asp_event': 0,
                'location': 0,
                'organization': 0,
                'targets': set(),
            }
        entry = key_verbs[sense_key]
        entry['occorrences'] += 1
        for counter, amount in increments.items() :
            entry[counter] += amount

        if is_verb :
            entry['targets'].add(wn_lemmatizer.lemmatize(verb_state, 'v'))
        else :
            entry['targets'].add(wn_lemmatizer.lemmatize(verb_state))

Number of annotations over 1000 sentences:  1795


In [5]:
def _senses_to_frame(senses, sense_column) :
    """Turn a `{sense_key: stats}` dict into a DataFrame sorted by occurrences.

    Parameters
    ----------
    senses : dict
        Maps a sense key to a dict holding 'occorrences', 'definition',
        'rep_event', 'asp_event', 'location', 'organization' and 'targets'
        (a set of lemmas).
    sense_column : str
        Name of the column that receives the sense key.

    Returns
    -------
    pandas.DataFrame
        One row per sense, sorted by 'occorrences' in descending order.
    """
    columns = [
        sense_column,
        'occorrences',
        'definition',
        'rep_event',
        'asp_event',
        'location',
        'organization',
        'targets'
    ]
    records = []
    for key, value in senses.items() :
        record = value.copy()
        record[sense_column] = key
        # Sort the lemmas: set iteration order is arbitrary, and this string
        # is later used as a group-by key, so it has to be reproducible.
        record['targets'] = ", ".join(sorted(value['targets']))
        records.append(record)
    # Build the frame in one go; DataFrame.append no longer exists in
    # pandas >= 2.0 and growing a frame row by row is quadratic.
    frame = pd.DataFrame(records, columns=columns)
    return frame.sort_values(by='occorrences', ascending=False)


df_states_quant_analysis = _senses_to_frame(verbs['states'], 'state (WORDNET SENSE IF EXIST)')
states_total = df_states_quant_analysis['occorrences'].sum()
print(
    "Number of annotations (STATE) over 1000 sentences:", 
    states_total,
    ", sia",
    (states_total * 100) / df_marco.shape[0],
    "%."
)
df_states_quant_analysis.to_csv(states_quantitative_analysis_with_extend_lesk_csv_path, index=False)

df_events_quant_analysis = _senses_to_frame(verbs['events'], 'event (WORDNET SENSE IF EXIST)')
events_total = df_events_quant_analysis['occorrences'].sum()
print(
    "Number of annotations (EVENT) over 1000 sentences:", 
    events_total,
    ", sia",
    (events_total * 100) / df_marco.shape[0],
    "%."
)
df_events_quant_analysis.to_csv(events_quantitative_analysis_with_extend_lesk_csv_path, index=False)

Number of annotations (STATE) over 1000 sentences: 677 , sia 37.71587743732591 %.
Number of annotations (EVENT) over 1000 sentences: 853 , sia 47.5208913649025 %.


In [6]:
# Merge the STATE rows that share the same 'targets' string: text columns are
# concatenated with ' ## ', counters are summed. The aggregations are named
# with the string "sum" rather than the builtin `sum`, which recent pandas
# versions deprecate as a groupby aggregation argument.
df_agg1 = (
    df_states_quant_analysis
    .groupby('targets')
    .agg({
        'state (WORDNET SENSE IF EXIST)': ' ## '.join,
        'occorrences': 'sum',
        'definition': ' ## '.join,
        'rep_event': 'sum',
        'asp_event': 'sum',
        'location': 'sum',
        'organization': 'sum',
        'targets': ' ## '.join
    })
    .sort_values(by='occorrences', ascending=False)
)
df_agg1.to_csv(states_quantitative_analysis_groupby_lemmas_with_extend_lesk_csv_path)

In [7]:
# Merge the EVENT rows that share the same 'targets' string: text columns are
# concatenated with ' ## ', counters are summed. The aggregations are named
# with the string "sum" rather than the builtin `sum`, which recent pandas
# versions deprecate as a groupby aggregation argument.
df_agg2 = (
    df_events_quant_analysis
    .groupby('targets')
    .agg({
        'event (WORDNET SENSE IF EXIST)': ' ## '.join,
        'occorrences': 'sum',
        'definition': ' ## '.join,
        'rep_event': 'sum',
        'asp_event': 'sum',
        'location': 'sum',
        'organization': 'sum',
        'targets': ' ## '.join
    })
    .sort_values(by='occorrences', ascending=False)
)
df_agg2.to_csv(events_quantitative_analysis_groupby_lemmas_with_extend_lesk_csv_path)

In [8]:
# Side-by-side check of NLTK's built-in `lesk` and the notebook's `Lesk` class
# on two example sentences. Results are printed for NLTK's `lesk` first (both
# sentences), then for `Lesk` (both sentences).
# NOTE(review): the target word is 'obtained' for both sentences, including
# the one whose text uses "received" - confirm this is intended.
demo_sentences = [
    "Ramachandran, whose father wanted him to become a physician rather than a researcher, obtained an M.B.B.S. from Stanley Medical College in Chennai, India.",
    "He received the Kerala Sahitya Academi award in the `poetry' section for his collection, Nellickal Muraleedharante Kavithakal in 2004.",
]
disambiguators = [
    lambda tokens: lesk(tokens, 'obtained', pos=Wn.VERB),
    lambda tokens: Lesk('obtained', tokens, pos=Wn.VERB).get(),
]

for disambiguate in disambiguators :
    for sentence in demo_sentences :
        sense = disambiguate(word_tokenize(sentence))
        print(sense, sense.definition())

Synset('receive.v.02') receive a specified treatment (abstract)
Synset('receive.v.02') receive a specified treatment (abstract)
Synset('prevail.v.02') be valid, applicable, or true
Synset('receive.v.02') receive a specified treatment (abstract)
