# Exercise 3

### Necessary imports

In [165]:
from datasets import load_dataset
import spacy
import pandas as pd
from nltk.wsd import lesk
from nltk.corpus import wordnet as wn

nlp = spacy.load("en_core_web_sm")

### Dataset loading

In [40]:
df = pd.DataFrame(load_dataset("ag_news")['train'])

Found cached dataset ag_news (C:/Users/lores/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548)


  0%|          | 0/2 [00:00<?, ?it/s]

### Little preprocessing

In [41]:
# Removing \\ from the text
df["text"] = df["text"].apply(
    lambda x: x.replace("\\", " ")
)

### Getting all the sentences that cointain the verb "give"

In [94]:
# Get number of sentences that contain the transitive verb "give"
sentences = []
for row in df['text']:
    if " give " in row:
        sentences.append(row)

### Finding all the objects and subjects of the verb "give"

In [129]:
# I find all the objects and subjects of the verb "give" and store them in a pandas dataframe
corpus_nan = pd.DataFrame(columns=['object', 'subject', 'obj_pos', 'subj_pos', 'sentence'])
for sentence in sentences:
    doc = nlp(sentence)
    obj = None
    subj = None
    obj_pos = None
    subj_pos = None
    for token in doc:
        if token.text == "give" and token.pos_ == 'VERB':
            for child in token.children:
                if child.dep_ in ["dobj", "pobj"] and obj is None:
                    obj = child.lemma_.lower()
                    obj_pos = child.pos_
                if child.dep_ in ["nsubj", "nsubjpass", "csubj", "csubjpass"] and subj is None:
                    subj = child.lemma_.lower()
                    subj_pos = child.pos_
    corpus_nan.loc[len(corpus_nan)] = [obj, subj, obj_pos, subj_pos, sentence]

corpus = corpus_nan[~corpus_nan.isnull().any(axis=1)]

### Get the synsets of the objects and subjects

In [190]:
# I find the synsets of the objects and subjects for each sentence and store them in a pandas dataframe
person = ["i", "you", "he", "she", "we", "they", "me", "him", "her", "his", "them", "someone", "us", "people", "anyone"]

synsets = pd.DataFrame(columns=['object', 'subject', 'obj_synset', 'subj_synset', 'sentence'])
for subj, subj_pos, obj, obj_pos, sentence in zip(corpus['subject'], corpus['subj_pos'], corpus['object'], corpus['obj_pos'], corpus['sentence']):
    subj_synset = None
    obj_synset = None
    if subj in person:
        subj_synset = 'person'
    if obj in person:
        obj_synset = 'person'
        
    # If the synset is not found, I use the lesk algorithm to find it
    if subj_synset is None:
        if subj_pos == 'NOUN':
            subj_synset = lesk(sentence, subj, 'n')
        elif subj_pos == 'VERB':
            subj_synset = lesk(sentence, subj, 'v')
        elif subj_pos == 'ADJ':
            subj_synset = lesk(sentence, subj, 'a')
        elif subj_pos == 'ADV':
            subj_synset = lesk(sentence, subj, 'r')
        else:
            subj_synset = lesk(sentence, subj)

    if obj_synset is None:
        if obj_pos == 'NOUN':
            obj_synset = lesk(sentence, obj, 'n')
        elif obj_pos == 'VERB':
            obj_synset = lesk(sentence, obj, 'v')
        elif obj_pos == 'ADJ':
            obj_synset = lesk(sentence, obj, 'a')
        elif obj_pos == 'ADV':
            obj_synset = lesk(sentence, obj, 'r')
        else:
            obj_synset = lesk(sentence, obj)
    
    # Get the supersense of the synsets
    if subj_synset is not None and type(subj_synset) is not str:
        subj_synset = subj_synset.lexname().split('.')[1]
    if obj_synset is not None and type(obj_synset) is not str:
        obj_synset = obj_synset.lexname().split('.')[1]
    synsets.loc[len(synsets)] = [obj, subj, obj_synset, subj_synset, sentence]

synsets = synsets[~synsets.isnull().any(axis=1)]

### Get the percentage of synsets

In [243]:
# Group by obj_synset and subj_synset
synsets_percentage = (synsets[["obj_synset", "subj_synset"]].value_counts()/len(synsets)*100).reset_index(name="percentage")

print("---------------------------------------------------")
print("Percentage of synsets for each pair of subject and object")
print("---------------------------------------------------")

for row in synsets_percentage.iterrows():
    print(f'"{row[1][1]}, {row[1][0]}": {row[1][2]:.2f}%')

---------------------------------------------------
Percentage of synsets for each pair of subject and object
---------------------------------------------------
"person, communication": 8.97%
"person, act": 5.38%
"person, cognition": 3.14%
"person, artifact": 3.14%
"cognition, communication": 2.69%
"person, person": 2.24%
"person, all": 2.24%
"person, attribute": 2.24%
"communication, act": 2.24%
"person, state": 1.79%
"person, event": 1.79%
"group, act": 1.79%
"group, attribute": 1.79%
"artifact, cognition": 1.79%
"cognition, act": 1.79%
"artifact, act": 1.35%
"cognition, quantity": 1.35%
"animal, attribute": 1.35%
"act, person": 1.35%
"substance, artifact": 1.35%
"person, time": 1.35%
"group, person": 1.35%
"cognition, state": 1.35%
"artifact, possession": 0.90%
"act, communication": 0.90%
"cognition, cognition": 0.90%
"person, group": 0.90%
"artifact, communication": 0.90%
"communication, communication": 0.90%
"group, communication": 0.90%
"person, possession": 0.90%
"cognition, pe