# Programming Assignment 2

In [1]:
# ! pip install -r ../requirements.txt
# ! python3 -m nltk.downloader -d .nltk_data all

In [2]:
import nltk
nltk.data.path.append('.nltk_data')

import re
import sys
import json
sys.path.insert(0, '../code')

import numpy as np
import pandas as pd
from functools import reduce

from scipy import sparse
from sklearn.preprocessing import normalize

from loader import *

from nltk import wsd
from nltk.corpus import wordnet as wn
from nltk.corpus import semcor, stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus.reader.wordnet import Lemma
from collections import Counter

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

## Load the data

In [3]:
data_f = "../code/multilingual-all-words.en.xml"
key_f = "../code/wordnet.en.key"
dev_instances, test_instances = load_instances(data_f)
dev_key, test_key = load_key(key_f)

# IMPORTANT: keys contain fewer entries than the instances; need to remove them
dev_instances = {k: v for (k, v) in dev_instances.items() if k in dev_key}
test_instances = {k: v for (k, v) in test_instances.items() if k in test_key}

# ready to use here
print(len(dev_instances))  # number of dev instances
print(len(test_instances))  # number of test instances

194
1450


In [4]:
[m for m in dir(dev_instances["d001.s001.t002"]) if not m.startswith("_")]

['context', 'id', 'index', 'lemma']

In [5]:
dev_key['d001.s001.t002']

['group%1:03:00::']

In [6]:
print(dev_instances["d001.s001.t002"])

d001.s001.t002	group	U.N. group draft plan to reduce emission	1


In [7]:
context = dev_instances["d001.s001.t002"].context
id = dev_instances["d001.s001.t002"].id
index = dev_instances["d001.s001.t002"].index
lemma = dev_instances["d001.s001.t002"].lemma

assert lemma == context[index]

## Look at synsets

In [8]:
synsets = wn.synsets('dog')
print(synsets)

[Synset('dog.n.01'), Synset('frump.n.01'), Synset('dog.n.03'), Synset('cad.n.01'), Synset('frank.n.02'), Synset('pawl.n.01'), Synset('andiron.n.01'), Synset('chase.v.01')]


In [9]:
synsets = wn.synsets(lemma)

In [10]:
for syn in synsets:
    print("Synset:", syn.name())
    print("Lemmas:", [lemma.name() for lemma in syn.lemmas()])
    print("Definition:", syn.definition())
    print("Examples:", syn.examples())
    print()

Synset: group.n.01
Lemmas: ['group', 'grouping']
Definition: any number of entities (members) considered as a unit
Examples: []

Synset: group.n.02
Lemmas: ['group', 'radical', 'chemical_group']
Definition: (chemistry) two or more atoms bound together as a single unit and forming part of a molecule
Examples: []

Synset: group.n.03
Lemmas: ['group', 'mathematical_group']
Definition: a set that is closed, associative, has an identity element and every element has an inverse
Examples: []

Synset: group.v.01
Lemmas: ['group']
Definition: arrange into a group or groups
Examples: ['Can you group these shapes together?']

Synset: group.v.02
Lemmas: ['group', 'aggroup']
Definition: form a group or group together
Examples: []



## Lesk's Algorithm

In [11]:
def preprocess(sentence, stop_words=stop_words, lemmatizer=lemmatizer):
    """
    Preprocesses the sentence by lemmatizing and removing stop words and words without any alphanumeric characters.
    Assumes sentence is a list of tokenized words.
    """
    # Helper function to check for at least one alphanumeric character in a word
    contains_alnum = lambda word: any(char.isalnum() for char in word)

    # Lemmatize words, filter out stop words and words without any alphanumeric characters
    processed = {lemmatizer.lemmatize(w) for w in sentence if w not in stop_words and contains_alnum(w)}

    return processed


def lesk(lemma, context):
    """
    Lesk's algorithm implementation.
    Assumes preprocessed_context is a set of lemmatized words without stop words.
    """
    assert isinstance(lemma, str), "Lemma is not a string"
    assert len(context) > 0, "Empty context"

    max_overlap = 0
    best_sense = None

    context = preprocess(context)

    # Obtain the synsets for the lemma
    synsets = wn.synsets(lemma)

    # Default to the most common sense if synsets are available
    if synsets:
        best_sense = synsets[0]

    for sense in synsets:
        # Preprocess the signature (definition and examples)
        signature = preprocess(word_tokenize(sense.definition()), stop_words, lemmatizer)
        for example in sense.examples():
            signature |= preprocess(word_tokenize(example), stop_words, lemmatizer)

        # The overlap is the size of the intersection
        overlap = len(context & signature)

        # Keep track of the best overlap so far
        if overlap > max_overlap:
            max_overlap = overlap
            best_sense = sense

    return best_sense


In [12]:
lemma = 'climate'
context = 'the U.N.-sponsored climate conference -- characterize so far by unruly posturing and mutual recrimination -- gain renewed focus Friday with the release of a document outline ambitious greenhouse-gas reduction over the next @card@ year , with industrialized_nation shoulder most of the burden in the near term .'.split()
best_sense = lesk(lemma, context)

In [13]:
sense_keys = [lemma.key() for lemma in best_sense.lemmas()]
sense_keys

['climate%1:26:00::', 'clime%1:26:00::']

In [14]:
for i, item in enumerate(dev_instances.items()):
    key, instance = item
    lemma = instance.lemma
    context = instance.context
    print(instance)
    print(lesk(lemma, context).definition())
    print()
    if i > 2:
        break

d001.s001.t002	group	U.N. group draft plan to reduce emission	1
arrange into a group or groups

d001.s001.t003	plan	U.N. group draft plan to reduce emission	3
a series of steps to be carried out or goals to be accomplished

d001.s001.t004	emission	U.N. group draft plan to reduce emission	6
the act of emitting; causing to flow forth

d001.s002.t001	climate	the U.N.-sponsored climate conference -- characterize so far by unruly posturing and mutual recrimination -- gain renewed focus Friday with the release of a document outline ambitious greenhouse-gas reduction over the next @card@ year , with industrialized_nation shoulder most of the burden in the near term .	2
the weather in some location averaged over some long period of time



## Lemma sense keys and synset numbers

The correspondence between lemma sense keys and synset numbers is stored in `wordnet.en.key`. We can just read from that file.

In [15]:
! head ../code/wordnet.en.key

d001 d001.s001.t002 group%1:03:00:: 
d001 d001.s001.t003 plan%1:09:00:: 
d001 d001.s001.t004 emission%1:27:00:: 
d001 d001.s002.t001 climate%1:26:00:: 
d001 d001.s002.t002 conference%1:14:00:: 
d001 d001.s002.t003 posturing%1:07:00:: 
d001 d001.s002.t004 recrimination%1:10:00:: 
d001 d001.s002.t005 focus%1:09:00:: 
d001 d001.s002.t006 friday%1:28:00:: 
d001 d001.s002.t007 release%1:22:00:: 


In [16]:
def get_lemma_sense_key_to_synset_number_correspondence():
    wordnet_key_file = '../code/wordnet.en.key'
    lsk_to_sn = {}
    with open(wordnet_key_file, 'r') as f:
        for line in f.read().strip().split("\n"):
            line = line.strip()
            _, lsk, sn = line.split(' ', 2)
            lsk_to_sn[lsk] = set(sn.split(' '))
    return lsk_to_sn

lsk_to_sn = get_lemma_sense_key_to_synset_number_correspondence()

In [17]:
# Show the top ten
{x[0]: x[1] for i, x in enumerate(lsk_to_sn.items()) if i < 10}

{'d001.s001.t002': {'group%1:03:00::'},
 'd001.s001.t003': {'plan%1:09:00::'},
 'd001.s001.t004': {'emission%1:27:00::'},
 'd001.s002.t001': {'climate%1:26:00::'},
 'd001.s002.t002': {'conference%1:14:00::'},
 'd001.s002.t003': {'posturing%1:07:00::'},
 'd001.s002.t004': {'recrimination%1:10:00::'},
 'd001.s002.t005': {'focus%1:09:00::'},
 'd001.s002.t006': {'friday%1:28:00::'},
 'd001.s002.t007': {'release%1:22:00::'}}

## Calculating accuracy

Now that I have this correspondence, I can calculate the accuracy of my lesk's algorithm implementation.

In [18]:
def build_wsd_data():
    wsd_data = []
    for key, instance in dev_instances.items():
        id = instance.id
        lemma = instance.lemma
        context = instance.context
    
        processed_context = preprocess(context)
    
        # Use lesk's algorithm to guess the synset
        synset = lesk(lemma, context)
    
        # Get the sense-keys for the predicted synset
        preds = set(lemma.key() for lemma in synset.lemmas())
        
        # Extract the synset number from the sense-key
        targets = lsk_to_sn[id]
        
        # Calculate if there is any overlap between the predicted sense and the target
        match = len(preds & targets) > 0
        
        wsd_data.append(
            dict(id=id, lemma=lemma, context=context, processed_context=processed_context, synset=synset, preds=preds, targets=targets, match=match)
        )

    return pd.DataFrame(wsd_data)

wsd_data = build_wsd_data()

In [19]:
wsd_data.head()

Unnamed: 0,id,lemma,context,processed_context,synset,preds,targets,match
0,d001.s001.t002,group,"[U.N., group, draft, plan, to, reduce, emission]","{emission, group, U.N., plan, draft, reduce}",Synset('group.v.01'),{group%2:31:00::},{group%1:03:00::},False
1,d001.s001.t003,plan,"[U.N., group, draft, plan, to, reduce, emission]","{emission, group, U.N., plan, draft, reduce}",Synset('plan.n.01'),"{program%1:09:00::, programme%1:09:00::, plan%...",{plan%1:09:00::},True
2,d001.s001.t004,emission,"[U.N., group, draft, plan, to, reduce, emission]","{emission, group, U.N., plan, draft, reduce}",Synset('emission.n.01'),"{emanation%1:04:00::, emission%1:04:00::}",{emission%1:27:00::},False
3,d001.s002.t001,climate,"[the, U.N.-sponsored, climate, conference, --,...","{outline, characterize, conference, climate, n...",Synset('climate.n.01'),"{climate%1:26:00::, clime%1:26:00::}",{climate%1:26:00::},True
4,d001.s002.t002,conference,"[the, U.N.-sponsored, climate, conference, --,...","{outline, characterize, conference, climate, n...",Synset('conference.n.01'),{conference%1:14:00::},{conference%1:14:00::},True


In [20]:
wsd_data.iloc[1,:]

id                                                      d001.s001.t003
lemma                                                             plan
context               [U.N., group, draft, plan, to, reduce, emission]
processed_context         {emission, group, U.N., plan, draft, reduce}
synset                                             Synset('plan.n.01')
preds                {program%1:09:00::, programme%1:09:00::, plan%...
targets                                               {plan%1:09:00::}
match                                                             True
Name: 1, dtype: object

I'm checking that the preprocessing step correctly resolves multi-word phrases to single entities and it appears it does.

In [21]:
wsd_data[wsd_data['processed_context'].apply(lambda x: 'America' in x)].head(2)

Unnamed: 0,id,lemma,context,processed_context,synset,preds,targets,match
99,d001.s016.t001,comment,"[Stern, make, his, comment, an, hour, after, C...","{common_sense, climate, help, make, lack, fina...",Synset('comment.v.01'),"{point_out%2:32:01::, comment%2:32:00::, notic...",{comment%1:10:00::},False
100,d001.s016.t002,hour,"[Stern, make, his, comment, an, hour, after, C...","{common_sense, climate, help, make, lack, fina...",Synset('hour.n.01'),"{hr%1:28:00::, 60_minutes%1:28:00::, hour%1:28...",{hour%1:28:01::},False


In [22]:
wsd_data.loc[101, ['context', 'processed_context']].values

array([list(['Stern', 'make', 'his', 'comment', 'an', 'hour', 'after', 'Chinese', 'vice', 'foreign_minister', 'he_yafei', 'say', 'America', "'s", 'top', 'climate', 'negotiator', 'be', 'either', 'lack', '``', 'common_sense', "''", 'or', 'be', '``', 'extremely', 'irresponsible', "''", 'for', 'say', 'earlier', 'in', 'the', 'week', 'that', 'the', 'united_states', 'would', 'not', 'help', 'China', 'financially', 'to', 'cope', 'with', 'global_warming', '.']),
       {'common_sense', 'climate', 'help', 'make', 'lack', 'financially', 'Stern', 'negotiator', 'foreign_minister', 'extremely', 'irresponsible', 'China', 'cope', 'America', 'would', 'hour', 'earlier', 'vice', "'s", 'comment', 'global_warming', 'united_states', 'top', 'either', 'say', 'Chinese', 'week', 'he_yafei'}],
      dtype=object)

## Calculate accuracy

In [23]:
print(f'Lesk\'s algorithm Accuracy: {100*wsd_data.match.mean():.1f}%')

Lesk's algorithm Accuracy: 57.2%


This seems like a decent score for this task considering the effort we put in.

Next we should calculate scores for:
- The most frequent sense baseline
- NLTK's implementation of Lesk's Algorithm

## Most frequent sense baseline

In [24]:
def most_frequent_synset(lemma):
    '''
    Given a lemma, this returns the most frequent sense for that lemma.
    '''
    return set(lemma.key() for lemma in wn.synsets(lemma)[0].lemmas())

In [25]:
wsd_data['most_frequent_synset'] = wsd_data.lemma.apply(most_frequent_synset)

In [26]:
most_frequent_synset_accuracy = wsd_data.apply(lambda x: len(x.most_frequent_synset & x.targets) > 0, axis=1).mean()
print(f'Most Frequent Synset Accuracy: {100*most_frequent_synset_accuracy:.1f}%')

Most Frequent Synset Accuracy: 67.5%


## NLTK Lesk Algorithm baseline

In [27]:
wsd_data['nltk_pred_synset'] = wsd_data.apply(lambda x: set(lemma.key() for lemma in wsd.lesk(x.context, x.lemma).lemmas()), axis = 1)
nltk_pred_synset_accuracy = wsd_data.apply(lambda x: len(x.nltk_pred_synset & x.targets) > 0, axis=1).mean()
print(f'NLTK Lesk\'s Algorithm Accuracy: {100*nltk_pred_synset_accuracy:.1f}%')

NLTK Lesk's Algorithm Accuracy: 34.0%


In [28]:
wsd_data.head()

Unnamed: 0,id,lemma,context,processed_context,synset,preds,targets,match,most_frequent_synset,nltk_pred_synset
0,d001.s001.t002,group,"[U.N., group, draft, plan, to, reduce, emission]","{emission, group, U.N., plan, draft, reduce}",Synset('group.v.01'),{group%2:31:00::},{group%1:03:00::},False,"{grouping%1:03:00::, group%1:03:00::}","{group%2:33:00::, aggroup%2:33:00::}"
1,d001.s001.t003,plan,"[U.N., group, draft, plan, to, reduce, emission]","{emission, group, U.N., plan, draft, reduce}",Synset('plan.n.01'),"{program%1:09:00::, programme%1:09:00::, plan%...",{plan%1:09:00::},True,"{program%1:09:00::, programme%1:09:00::, plan%...","{design%2:36:02::, project%2:36:01::, plan%2:3..."
2,d001.s001.t004,emission,"[U.N., group, draft, plan, to, reduce, emission]","{emission, group, U.N., plan, draft, reduce}",Synset('emission.n.01'),"{emanation%1:04:00::, emission%1:04:00::}",{emission%1:27:00::},False,"{emanation%1:04:00::, emission%1:04:00::}","{emanation%1:04:00::, emission%1:04:00::}"
3,d001.s002.t001,climate,"[the, U.N.-sponsored, climate, conference, --,...","{outline, characterize, conference, climate, n...",Synset('climate.n.01'),"{climate%1:26:00::, clime%1:26:00::}",{climate%1:26:00::},True,"{climate%1:26:00::, clime%1:26:00::}","{climate%1:26:00::, clime%1:26:00::}"
4,d001.s002.t002,conference,"[the, U.N.-sponsored, climate, conference, --,...","{outline, characterize, conference, climate, n...",Synset('conference.n.01'),{conference%1:14:00::},{conference%1:14:00::},True,{conference%1:14:00::},{conference%1:14:00::}


## Bootstrapping

Now we'll use the SemCor corpus to apply Yarowsky's algorithm.

In [29]:
%%time
def build_semcor_data():
    '''
    Uses the semcor corpus to assemble a dataset for bootstrap classification
    '''
    semcor_data = []
    for sent_id, tagged_sent in enumerate(semcor.tagged_sents(tag='sense')):
        sentence = ["_".join(tree.leaves()) for tree in tagged_sent]
        for i, tree in enumerate(tagged_sent):
            d = {}
            d['sentence'] = sentence
            d['sentence_text'] = ' '.join(sentence)
            d['processed_sentence'] = preprocess(sentence)
            d['span'] = sentence[i]
    
            if hasattr(tree, 'label') and isinstance(tree.label(), Lemma):
                d['sense_key'] = tree.label().key()
            else:
                continue
            semcor_data.append(d)
    
    return pd.DataFrame(semcor_data)

semcor_data = build_semcor_data()

CPU times: user 1min 34s, sys: 829 ms, total: 1min 35s
Wall time: 1min 35s


In [30]:
semcor_data

Unnamed: 0,sentence,sentence_text,processed_sentence,span,sense_key
0,"[The, Fulton_County_Grand_Jury, said, Friday, ...",The Fulton_County_Grand_Jury said Friday an in...,"{recent, primary_election, evidence, Friday, F...",Fulton_County_Grand_Jury,group%1:03:00::
1,"[The, Fulton_County_Grand_Jury, said, Friday, ...",The Fulton_County_Grand_Jury said Friday an in...,"{recent, primary_election, evidence, Friday, F...",said,say%2:32:00::
2,"[The, Fulton_County_Grand_Jury, said, Friday, ...",The Fulton_County_Grand_Jury said Friday an in...,"{recent, primary_election, evidence, Friday, F...",Friday,friday%1:28:00::
3,"[The, Fulton_County_Grand_Jury, said, Friday, ...",The Fulton_County_Grand_Jury said Friday an in...,"{recent, primary_election, evidence, Friday, F...",investigation,investigation%1:09:00::
4,"[The, Fulton_County_Grand_Jury, said, Friday, ...",The Fulton_County_Grand_Jury said Friday an in...,"{recent, primary_election, evidence, Friday, F...",Atlanta,atlanta%1:15:00::
...,...,...,...,...,...
224711,"[Her, reply, stung, me, ,, but, this, was, too...","Her reply stung me , but this was too importan...","{difference, reply, hurt, make, stung, Her, le...",stung,sting%2:39:02::
224712,"[Her, reply, stung, me, ,, but, this, was, too...","Her reply stung me , but this was too importan...","{difference, reply, hurt, make, stung, Her, le...",was,be%2:42:03::
224713,"[Her, reply, stung, me, ,, but, this, was, too...","Her reply stung me , but this was too importan...","{difference, reply, hurt, make, stung, Her, le...",let,let%2:41:00::
224714,"[Her, reply, stung, me, ,, but, this, was, too...","Her reply stung me , but this was too importan...","{difference, reply, hurt, make, stung, Her, le...",make,make%2:41:00::


## Blank out lemma sense keys

We need to replace the lemma sense keys that are not in our dev/test set with None. We keep the processed sentences, so that when we form term-document matrices we still have all of the tokens but we will only predict cases where the sense_key is valid and in our list.

In [31]:
lemma_sense_keys = set([k for v in list(dev_key.values()) + list(test_key.values()) for k in v])
semcor_data['sense_key'] = semcor_data.sense_key.apply(lambda x: x if x in lemma_sense_keys else None)
semcor_data

Unnamed: 0,sentence,sentence_text,processed_sentence,span,sense_key
0,"[The, Fulton_County_Grand_Jury, said, Friday, ...",The Fulton_County_Grand_Jury said Friday an in...,"{recent, primary_election, evidence, Friday, F...",Fulton_County_Grand_Jury,group%1:03:00::
1,"[The, Fulton_County_Grand_Jury, said, Friday, ...",The Fulton_County_Grand_Jury said Friday an in...,"{recent, primary_election, evidence, Friday, F...",said,
2,"[The, Fulton_County_Grand_Jury, said, Friday, ...",The Fulton_County_Grand_Jury said Friday an in...,"{recent, primary_election, evidence, Friday, F...",Friday,friday%1:28:00::
3,"[The, Fulton_County_Grand_Jury, said, Friday, ...",The Fulton_County_Grand_Jury said Friday an in...,"{recent, primary_election, evidence, Friday, F...",investigation,
4,"[The, Fulton_County_Grand_Jury, said, Friday, ...",The Fulton_County_Grand_Jury said Friday an in...,"{recent, primary_election, evidence, Friday, F...",Atlanta,
...,...,...,...,...,...
224711,"[Her, reply, stung, me, ,, but, this, was, too...","Her reply stung me , but this was too importan...","{difference, reply, hurt, make, stung, Her, le...",stung,
224712,"[Her, reply, stung, me, ,, but, this, was, too...","Her reply stung me , but this was too importan...","{difference, reply, hurt, make, stung, Her, le...",was,
224713,"[Her, reply, stung, me, ,, but, this, was, too...","Her reply stung me , but this was too importan...","{difference, reply, hurt, make, stung, Her, le...",let,
224714,"[Her, reply, stung, me, ,, but, this, was, too...","Her reply stung me , but this was too importan...","{difference, reply, hurt, make, stung, Her, le...",make,


In [32]:
print(f'We have {(~semcor_data.sense_key.isnull()).sum():,d} valid sentence-sense_key pairs in the dataset.')

We have 14,114 valid sentence-sense_key pairs in the dataset.


## Calculate lemma sense ids

In [33]:
lemma_sense_keys = sorted(lemma_sense_keys)
lemma_sense_key_to_id = {v: k for k, v in enumerate(lemma_sense_keys)}

semcor_data['sense_id'] = semcor_data.sense_key.apply(lambda x: lemma_sense_key_to_id[x] if x else None)
semcor_data.head()

Unnamed: 0,sentence,sentence_text,processed_sentence,span,sense_key,sense_id
0,"[The, Fulton_County_Grand_Jury, said, Friday, ...",The Fulton_County_Grand_Jury said Friday an in...,"{recent, primary_election, evidence, Friday, F...",Fulton_County_Grand_Jury,group%1:03:00::,305.0
1,"[The, Fulton_County_Grand_Jury, said, Friday, ...",The Fulton_County_Grand_Jury said Friday an in...,"{recent, primary_election, evidence, Friday, F...",said,,
2,"[The, Fulton_County_Grand_Jury, said, Friday, ...",The Fulton_County_Grand_Jury said Friday an in...,"{recent, primary_election, evidence, Friday, F...",Friday,friday%1:28:00::,288.0
3,"[The, Fulton_County_Grand_Jury, said, Friday, ...",The Fulton_County_Grand_Jury said Friday an in...,"{recent, primary_election, evidence, Friday, F...",investigation,,
4,"[The, Fulton_County_Grand_Jury, said, Friday, ...",The Fulton_County_Grand_Jury said Friday an in...,"{recent, primary_election, evidence, Friday, F...",Atlanta,,


## Drop sentences where no words have a valid sense key

In [34]:
keep_sentences = (semcor_data
    .groupby('sentence_text')['sense_key']
    .apply(lambda x: ~x.isna().all())
    .to_frame()
    .reset_index()
)
keep_sentences = (keep_sentences
    .loc[keep_sentences['sense_key'], ['sentence_text']]
    .reset_index(drop=True)
)

keep_sentences = set(keep_sentences.sentence_text.values)

semcor_data = semcor_data[semcor_data.sentence_text.isin(keep_sentences)]

## Vocab List

We now have enough to start making a model. We can start by constructing a vocab list and document matrices etc so we can formulate the problem as a classification task.

In [35]:
vocab_data = semcor_data.span.value_counts(ascending=True).to_frame().reset_index()

In [36]:
vocab_data

Unnamed: 0,span,count
0,Fulton_County_Grand_Jury,1
1,cackly,1
2,wails,1
3,mourning,1
4,shelves,1
...,...,...
21737,are,544
21738,be,616
21739,not,751
21740,was,1021


In [37]:
span_to_id = {span: id for id, span in enumerate(vocab_data.span)}

## Calculate document vectors

In [38]:
semcor_data = semcor_data.copy()
semcor_data.loc[:, 'document_vector'] = semcor_data.loc[:, 'sentence'].apply(lambda x: [span_to_id.get(span) for span in x if span in span_to_id.keys()])
semcor_data.loc[:, 'span_id'] = semcor_data.loc[:, 'span'].apply(lambda x: span_to_id[x])

In [39]:
semcor_data

Unnamed: 0,sentence,sentence_text,processed_sentence,span,sense_key,sense_id,document_vector,span_id
0,"[The, Fulton_County_Grand_Jury, said, Friday, ...",The Fulton_County_Grand_Jury said Friday an in...,"{recent, primary_election, evidence, Friday, F...",Fulton_County_Grand_Jury,group%1:03:00::,305.0,"[0, 21725, 20585, 19237, 19277, 21232, 21270, ...",0
1,"[The, Fulton_County_Grand_Jury, said, Friday, ...",The Fulton_County_Grand_Jury said Friday an in...,"{recent, primary_election, evidence, Friday, F...",said,,,"[0, 21725, 20585, 19237, 19277, 21232, 21270, ...",21725
2,"[The, Fulton_County_Grand_Jury, said, Friday, ...",The Fulton_County_Grand_Jury said Friday an in...,"{recent, primary_election, evidence, Friday, F...",Friday,friday%1:28:00::,288.0,"[0, 21725, 20585, 19237, 19277, 21232, 21270, ...",20585
3,"[The, Fulton_County_Grand_Jury, said, Friday, ...",The Fulton_County_Grand_Jury said Friday an in...,"{recent, primary_election, evidence, Friday, F...",investigation,,,"[0, 21725, 20585, 19237, 19277, 21232, 21270, ...",19237
4,"[The, Fulton_County_Grand_Jury, said, Friday, ...",The Fulton_County_Grand_Jury said Friday an in...,"{recent, primary_election, evidence, Friday, F...",Atlanta,,,"[0, 21725, 20585, 19237, 19277, 21232, 21270, ...",19277
...,...,...,...,...,...,...,...,...
183966,"[``, I, 've, been, waiting, to, get, these, th...",`` I 've been waiting to get these things done...,"{month, get, 've, I, waiting, done, thing, said}",get,,,"[21712, 20521, 21622, 21650, 21504, 21527, 21725]",21622
183967,"[``, I, 've, been, waiting, to, get, these, th...",`` I 've been waiting to get these things done...,"{month, get, 've, I, waiting, done, thing, said}",things,,,"[21712, 20521, 21622, 21650, 21504, 21527, 21725]",21650
183968,"[``, I, 've, been, waiting, to, get, these, th...",`` I 've been waiting to get these things done...,"{month, get, 've, I, waiting, done, thing, said}",done,,,"[21712, 20521, 21622, 21650, 21504, 21527, 21725]",21504
183969,"[``, I, 've, been, waiting, to, get, these, th...",`` I 've been waiting to get these things done...,"{month, get, 've, I, waiting, done, thing, said}",months,month%1:28:01::,447.0,"[21712, 20521, 21622, 21650, 21504, 21527, 21725]",21527


Now that we have the term-document vectors we can drop the rows where the sense keys are not in our target set.

In [40]:
semcor_data = semcor_data.copy()
semcor_data = semcor_data[~semcor_data.sense_key.isnull()]
sentence_to_id = {v: k for k, v in enumerate(semcor_data.sentence_text.unique())}
semcor_data.loc[:, 'sentence_id'] = semcor_data.loc[:, 'sentence_text'].apply(lambda x: sentence_to_id[x])

semcor_data.loc[:, 'sense_id'] = semcor_data.sense_id.astype(np.int32)
semcor_data = semcor_data.reset_index(drop=True)
semcor_data

Unnamed: 0,sentence,sentence_text,processed_sentence,span,sense_key,sense_id,document_vector,span_id,sentence_id
0,"[The, Fulton_County_Grand_Jury, said, Friday, ...",The Fulton_County_Grand_Jury said Friday an in...,"{recent, primary_election, evidence, Friday, F...",Fulton_County_Grand_Jury,group%1:03:00::,305.0,"[0, 21725, 20585, 19237, 19277, 21232, 21270, ...",0,0
1,"[The, Fulton_County_Grand_Jury, said, Friday, ...",The Fulton_County_Grand_Jury said Friday an in...,"{recent, primary_election, evidence, Friday, F...",Friday,friday%1:28:00::,288.0,"[0, 21725, 20585, 19237, 19277, 21232, 21270, ...",20585,0
2,"[The, jury, further, said, in, term, end, pres...",The jury further said in term end presentments...,"{praise, jury, election, over-all, charge, Cit...",jury,jury%1:14:00::,378.0,"[20890, 21158, 21725, 20022, 20886, 21635, 103...",20890,1
3,"[The, jury, further, said, in, term, end, pres...",The jury further said in term end presentments...,"{praise, jury, election, over-all, charge, Cit...",term,term%1:28:00::,728.0,"[20890, 21158, 21725, 20022, 20886, 21635, 103...",20886,1
4,"[The, jury, further, said, in, term, end, pres...",The jury further said in term end presentments...,"{praise, jury, election, over-all, charge, Cit...",end,end%1:28:00::,234.0,"[20890, 21158, 21725, 20022, 20886, 21635, 103...",21635,1
...,...,...,...,...,...,...,...,...,...
14109,"[Without, further, discussion, he, appeared, t...",Without further discussion he appeared the nex...,"{light, board, pile, sticking, discussion, nex...",morning,morning%1:28:00::,450.0,"[21158, 21193, 21245, 20650, 21601, 1741, 1661...",21601,8350
14110,"[Before, you, use, ', em, the, light, company,...",Before you use ' em the light company 's got t...,"{light, got, fuse, use, box, extra, circuit, n...",company,company%1:14:01::,125.0,"[21699, 21491, 21546, 21232, 21461, 21194, 200...",21546,8351
14111,"[He, oughta, be, able, to, build, a, new, hous...",He oughta be able to build a new house with al...,"{He, house, build, new, oughta, able, contrapt...",house,house%1:06:00::,332.0,"[21738, 21287, 20014, 1741, 21720, 21695, 2168...",21695,8352
14112,"[Mr._Crombie, watched, his, wife, with, an, an...",Mr._Crombie watched his wife with an anxious e...,"{wife, watched, expression, anxious, Mr._Crombie}",wife,wife%1:18:00::,807.0,"[17405, 20817, 21686, 14092, 20658]",21686,8353


## Convert to sparse matrix

In [41]:
def create_term_document_matrix(document_vectors):
    
    # Flatten the Series to create a list of (doc_id, word_id) tuples
    rows, cols = zip(*((id, token) for id, doc in enumerate(document_vectors) for token in doc))
    
    # Create a sparse matrix
    # The shape parameters (n_rows, n_cols) should match your data dimensions
    n_rows = len(semcor_data)
    n_cols = semcor_data.document_vector.apply(max).max() + 1  # assuming token IDs start from 0
    data = [1] * len(rows)  # assuming a count of 1 for each occurrence
    term_document_matrix = sparse.coo_matrix((data, (rows, cols)), shape=(n_rows, n_cols))
    
    # Convert to CSR format for efficient arithmetic and matrix-vector operations
    return term_document_matrix.tocsr()

term_document_matrix = create_term_document_matrix(semcor_data.document_vector)

In [42]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Example data - replace with your actual data
X = normalize(term_document_matrix, axis=1, norm='l2')
y = semcor_data.sense_id

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [43]:
import os
from sklearn.linear_model import LogisticRegression
from joblib import dump, load

# Define the path for the model
model_path = 'logistic.joblib'

# Check if the model exists
if os.path.exists(model_path):
    # Load the model
    model = load(model_path)
    print("Model loaded from", model_path)
else:
    # Instantiate the model
    model = LogisticRegression()

    # Fit the model
    model.fit(X_train, y_train)
    
    # Save the model
    dump(model, model_path)
    print("Model trained and saved to", model_path)

Model loaded from logistic.joblib


In [44]:
print(f"Accuracy: {100*np.mean(model.predict(X_test) == y_test):.1f}%")

Accuracy: 29.8%


# Predict on SemEval

Now that we've trained a model, we can use it to predict on the original set we were interested in.

In [45]:
wsd_data.loc[:, 'document_vector'] = wsd_data.processed_context.apply(lambda x: [span_to_id.get(span) for span in x if span in span_to_id.keys()])

In [46]:
wsd_term_document_matrix = create_term_document_matrix(wsd_data.document_vector)

In [47]:
# Example data - replace with your actual data
X = normalize(term_document_matrix, axis=1, norm='l2')

In [48]:
wsd_preds = model.predict(X).astype(np.int32)
wsd_preds

array([305, 305, 228, ..., 332, 807, 447], dtype=int32)

In [49]:
id_to_lemma_sense_keys = {v: k for k,v in lemma_sense_key_to_id.items()}

In [50]:
wsd_data['semcor_pred'] = pd.Series([id_to_lemma_sense_keys[pred] for pred in wsd_preds])

In [51]:
wsd_data.loc[:, ['targets', 'semcor_pred']]

Unnamed: 0,targets,semcor_pred
0,{group%1:03:00::},group%1:03:00::
1,{plan%1:09:00::},group%1:03:00::
2,{emission%1:27:00::},election%1:04:01::
3,{climate%1:26:00::},election%1:04:01::
4,{conference%1:14:00::},election%1:04:01::
...,...,...
189,{copenhagen%1:15:00::},day%1:28:00::
190,{big_league%1:14:00::},day%1:28:00::
191,{vice_president%1:18:00::},election%1:04:01::
192,{policy%1:10:00::},election%1:04:01::


In [52]:
print(f'Accuracy: {100 * wsd_data.apply(lambda x: x.semcor_pred in x.targets, axis=1).mean():.1f}%')

Accuracy: 0.5%


This is disappointingly low. Probably I need to make my own seed set.

## Using Llama 2

In [53]:
from transformers import AutoTokenizer
import transformers
import torch

model = "/network/weights/llama.var/llama2/Llama-2-7b-chat-hf"

In [54]:
tokenizer = AutoTokenizer.from_pretrained(model)

# Set pad_token_id
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

Using pad_token, but it is not set yet.


CPU times: user 59.1 ms, sys: 23.8 ms, total: 82.9 ms
Wall time: 198 ms


In [55]:
%%time
text_generation_pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    torch_dtype=torch.float16,
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

CPU times: user 19.5 s, sys: 34.6 s, total: 54.2 s
Wall time: 2min 56s


In [56]:
%%time
sequences = text_generation_pipeline(
    'I liked "Breaking Bad" and "Band of Brothers". Do you have any recommendations of other shows I might like?\n',
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    max_length=200,
)
for seq in sequences:
    print(f"Result: {seq['generated_text']}")

Result: I liked "Breaking Bad" and "Band of Brothers". Do you have any recommendations of other shows I might like?

Answer: Based on your interest in "Breaking Bad" and "Band of Brothers," here are some other shows you might enjoy:

1. "The Sopranos" - This HBO series explores the life of a New Jersey mob boss, Tony Soprano, as he navigates the criminal underworld and deals with personal and family issues.
2. "The Wire" - Set in Baltimore, this show delves into the drug trade and the impact it has on the city and its residents. It's known for its gritty realism and complex characters.
3. "Narcos" - This Netflix series tells the true story of the rise and fall of Colombian drug lord Pablo Escobar and the Medellín cartel.
4. "Sons
CPU times: user 5.37 s, sys: 1.06 s, total: 6.43 s
Wall time: 7.95 s


## Generating a seed set

Ok this works - we can use it to generate our own seed sets for bootstrapping. First we need to collect the lemma sense key labels we are trying to fit for.

In [57]:
def synset_from_key(sense_key):
    lemma = wn.lemma_from_key(sense_key)
    return lemma.synset().name()

def get_synset_definition(synset_id):
    synset = wn.synset(synset_id)
    return synset.definition()

def get_synset_examples(synset_id):
    synset = wn.synset(synset_id)
    return synset.examples()

def generate_prompt(word, definition, examples):
    prompt = f'Below is one definition for the word/phrase "{word}" and some\n' \
    'example sentences using the same definition.\n' \
    '\n' \
    f'Definition ({word}): {definition}'

    if examples:
        formatted_examples = "\n- ".join(examples)
        prompt += f'\nExamples:\n- {formatted_examples}\n'

    prompt += '\nPlease generate 5 more example sentences than those provided.'
    return prompt

def extract_examples(text):
    pattern = r"(\*|•|-|\d[\).])\s*(.*)"
    matches = re.findall(pattern, text)
    return [match[1] for match in matches]

file_path = '../data/lemma_sense_key_data.csv'

# Check if the file exists
if os.path.exists(file_path):
    lemma_sense_key_data = pd.read_csv(file_path)
    lemma_sense_key_data['examples'] = lemma_sense_key_data['examples'].apply(json.loads)
else:
    # Code to build the DataFrame from scratch
    lemma_sense_keys = sorted(set(key for keys in dev_key.values() for key in keys))
    lemma_sense_key_data = pd.DataFrame({'sense_key': lemma_sense_keys})
    
    lemma_sense_key_data['synset_id'] = lemma_sense_key_data['sense_key'].apply(synset_from_key)
    lemma_sense_key_data['word'] = lemma_sense_key_data['synset_id'].str.split('.').apply(lambda x: x[0])
    lemma_sense_key_data['definition'] = lemma_sense_key_data['synset_id'].apply(get_synset_definition)
    lemma_sense_key_data['examples'] = lemma_sense_key_data['synset_id'].apply(get_synset_examples)
    lemma_sense_key_data['prompt'] = lemma_sense_key_data.apply(lambda x: generate_prompt(x['word'], x['definition'], x['examples']), axis=1)

    prompts = lemma_sense_key_data['prompt'].tolist()
    dataset = Dataset.from_dict({'prompt': prompts})
    batch_size = 14  # Adjust as needed
    
    results = []
    for i in tqdm(range(0, len(dataset), batch_size)):
        batch = dataset[i:i+batch_size]['prompt']
        batch_results = text_generation_pipeline(
            batch,
            do_sample=True,
            top_k=10,
            num_return_sequences=1,
            eos_token_id=tokenizer.eos_token_id,
            max_length=400
        )
        results.extend(batch_results)
    
    lemma_sense_key_data['generated_text'] = [r[0]['generated_text'] for r in results]
    lemma_sense_key_data.loc[:, 'generated_text'] = lemma_sense_key_data.apply(lambda x: x['generated_text'].replace(x['prompt'], ''), axis=1)
    lemma_sense_key_data['generated_examples'] = lemma_sense_key_data['generated_text'].apply(extract_examples)
    
    # Save the DataFrame
    lemma_sense_key_data_to_save = lemma_sense_key_data.copy()
    lemma_sense_key_data_to_save['examples'] = lemma_sense_key_data_to_save.examples.apply(json.dumps)
    lemma_sense_key_data_to_save['generated_examples'] = lemma_sense_key_data_to_save.generated_examples.apply(json.dumps)
    lemma_sense_key_data_to_save.to_csv(file_path, index=False)

In [61]:
lemma_sense_key_data

Unnamed: 0,sense_key,synset_id,word,definition,examples,prompt,generated_text,generated_examples
0,action%1:04:02::,action.n.01,action,something done (usually as opposed to somethin...,[there were stories of murders and other unnat...,"Below is one definition for the word/phrase ""a...",-the company promised to take immediate action...,"[""the company promised to take immediate actio..."
1,action%1:04:04::,action.n.09,action,an act by a government body or supranational o...,[recent federal action undermined the segregat...,"Below is one definition for the word/phrase ""a...",\nYour Turn! Here are five more example senten...,"[""The European Union's action in imposing econ..."
2,activist%1:18:00::,militant.n.01,militant,a militant reformer,[],"Below is one definition for the word/phrase ""m...",is someone who is determined and strong in\nf...,"[""She is a militant activist who has dedicated..."
3,advance%1:11:01::,improvement.n.01,improvement,a change for the better; progress in development,[],"Below is one definition for the word/phrase ""i...",or performance.\n\nExample sentences using th...,"[""This new software has made a significant imp..."
4,adviser%1:18:00::,adviser.n.01,adviser,an expert who gives advice,[an adviser helped students select their cours...,"Below is one definition for the word/phrase ""a...",- a financial adviser counseled clients on inv...,"[""a financial adviser counseled clients on inv..."
...,...,...,...,...,...,...,...,...
126,week%1:28:02::,week.n.03,week,a period of seven consecutive days starting on...,[],"Below is one definition for the word/phrase ""w...",and ending on Saturday\n\nExample sentences:\...,"[""I will take the next week off work to go on ..."
127,working_group%1:14:00::,working_group.n.01,working_group,a group of people working together temporarily...,[the working group was supposed to report back...,"Below is one definition for the word/phrase ""w...",- the working group included representatives f...,"[""the working group included representatives f..."
128,world%1:05:00::,world.n.08,world,all of the living human inhabitants of the earth,"[all the world loves a lover, she always used ...","Below is one definition for the word/phrase ""w...",- he has no concern for the world beyond his o...,"[""he has no concern for the world beyond his o..."
129,world%1:17:00::,earth.n.01,earth,the 3rd planet from the sun; the planet we liv...,"[the Earth moves around the sun, he sailed aro...","Below is one definition for the word/phrase ""e...",- the Earth is blue\n- she studied geology on ...,"[""the Earth is blue"", ""she studied geology on ..."
