# Programming Assignment 2

In [181]:
import nltk

import sys
sys.path.insert(0, '../code')

import numpy as np
import pandas as pd
from scipy import sparse
from functools import reduce

from loader import *

from nltk import wsd
from nltk.corpus import wordnet as wn
from nltk.corpus import semcor, stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus.reader.wordnet import Lemma
from collections import Counter

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

## Load the data

In [2]:
data_f = "../code/multilingual-all-words.en.xml"
key_f = "../code/wordnet.en.key"
dev_instances, test_instances = load_instances(data_f)
dev_key, test_key = load_key(key_f)

# IMPORTANT: keys contain fewer entries than the instances; need to remove them
dev_instances = {k: v for (k, v) in dev_instances.items() if k in dev_key}
test_instances = {k: v for (k, v) in test_instances.items() if k in test_key}

# ready to use here
print(len(dev_instances))  # number of dev instances
print(len(test_instances))  # number of test instances

194
1450


In [3]:
[m for m in dir(dev_instances["d001.s001.t002"]) if not m.startswith("_")]

['context', 'id', 'index', 'lemma']

In [4]:
dev_key['d001.s001.t002']

['group%1:03:00::']

In [5]:
print(dev_instances["d001.s001.t002"])

d001.s001.t002	group	U.N. group draft plan to reduce emission	1


In [6]:
context = dev_instances["d001.s001.t002"].context
id = dev_instances["d001.s001.t002"].id
index = dev_instances["d001.s001.t002"].index
lemma = dev_instances["d001.s001.t002"].lemma

assert lemma == context[index]

## Look at synsets

In [7]:
synsets = wn.synsets('dog')
print(synsets)

[Synset('dog.n.01'), Synset('frump.n.01'), Synset('dog.n.03'), Synset('cad.n.01'), Synset('frank.n.02'), Synset('pawl.n.01'), Synset('andiron.n.01'), Synset('chase.v.01')]


In [8]:
synsets = wn.synsets(lemma)

In [9]:
for syn in synsets:
    print("Synset:", syn.name())
    print("Lemmas:", [lemma.name() for lemma in syn.lemmas()])
    print("Definition:", syn.definition())
    print("Examples:", syn.examples())
    print()

Synset: group.n.01
Lemmas: ['group', 'grouping']
Definition: any number of entities (members) considered as a unit
Examples: []

Synset: group.n.02
Lemmas: ['group', 'radical', 'chemical_group']
Definition: (chemistry) two or more atoms bound together as a single unit and forming part of a molecule
Examples: []

Synset: group.n.03
Lemmas: ['group', 'mathematical_group']
Definition: a set that is closed, associative, has an identity element and every element has an inverse
Examples: []

Synset: group.v.01
Lemmas: ['group']
Definition: arrange into a group or groups
Examples: ['Can you group these shapes together?']

Synset: group.v.02
Lemmas: ['group', 'aggroup']
Definition: form a group or group together
Examples: []



## Lesk's Algorithm

In [10]:
def preprocess(sentence, stop_words=stop_words, lemmatizer=lemmatizer):
    """
    Preprocesses the sentence by lemmatizing and removing stop words and words without any alphanumeric characters.
    Assumes sentence is a list of tokenized words.
    """
    # Helper function to check for at least one alphanumeric character in a word
    contains_alnum = lambda word: any(char.isalnum() for char in word)

    # Lemmatize words, filter out stop words and words without any alphanumeric characters
    processed = {lemmatizer.lemmatize(w) for w in sentence if w not in stop_words and contains_alnum(w)}

    return processed


def lesk(lemma, context):
    """
    Lesk's algorithm implementation.
    Assumes preprocessed_context is a set of lemmatized words without stop words.
    """
    assert isinstance(lemma, str), "Lemma is not a string"
    assert len(context) > 0, "Empty context"

    max_overlap = 0
    best_sense = None

    context = preprocess(context)

    # Obtain the synsets for the lemma
    synsets = wn.synsets(lemma)

    # Default to the most common sense if synsets are available
    if synsets:
        best_sense = synsets[0]

    for sense in synsets:
        # Preprocess the signature (definition and examples)
        signature = preprocess(word_tokenize(sense.definition()), stop_words, lemmatizer)
        for example in sense.examples():
            signature |= preprocess(word_tokenize(example), stop_words, lemmatizer)

        # The overlap is the size of the intersection
        overlap = len(context & signature)

        # Keep track of the best overlap so far
        if overlap > max_overlap:
            max_overlap = overlap
            best_sense = sense

    return best_sense


In [11]:
lemma = 'climate'
context = 'the U.N.-sponsored climate conference -- characterize so far by unruly posturing and mutual recrimination -- gain renewed focus Friday with the release of a document outline ambitious greenhouse-gas reduction over the next @card@ year , with industrialized_nation shoulder most of the burden in the near term .'.split()
best_sense = lesk(lemma, context)

In [12]:
sense_keys = [lemma.key() for lemma in best_sense.lemmas()]
sense_keys

['climate%1:26:00::', 'clime%1:26:00::']

In [13]:
for i, item in enumerate(dev_instances.items()):
    key, instance = item
    lemma = instance.lemma
    context = instance.context
    print(instance)
    print(lesk(lemma, context).definition())
    print()
    if i > 2:
        break

d001.s001.t002	group	U.N. group draft plan to reduce emission	1
arrange into a group or groups

d001.s001.t003	plan	U.N. group draft plan to reduce emission	3
a series of steps to be carried out or goals to be accomplished

d001.s001.t004	emission	U.N. group draft plan to reduce emission	6
the act of emitting; causing to flow forth

d001.s002.t001	climate	the U.N.-sponsored climate conference -- characterize so far by unruly posturing and mutual recrimination -- gain renewed focus Friday with the release of a document outline ambitious greenhouse-gas reduction over the next @card@ year , with industrialized_nation shoulder most of the burden in the near term .	2
the weather in some location averaged over some long period of time



## Lemma sense keys and synset numbers

The correspondence between lemma sense keys and synset numbers is stored in `wordnet.en.key`. We can just read from that file.

In [14]:
! head ../code/wordnet.en.key

d001 d001.s001.t002 group%1:03:00:: 
d001 d001.s001.t003 plan%1:09:00:: 
d001 d001.s001.t004 emission%1:27:00:: 
d001 d001.s002.t001 climate%1:26:00:: 
d001 d001.s002.t002 conference%1:14:00:: 
d001 d001.s002.t003 posturing%1:07:00:: 
d001 d001.s002.t004 recrimination%1:10:00:: 
d001 d001.s002.t005 focus%1:09:00:: 
d001 d001.s002.t006 friday%1:28:00:: 
d001 d001.s002.t007 release%1:22:00:: 


In [15]:
def get_lemma_sense_key_to_synset_number_correspondence():
    wordnet_key_file = '../code/wordnet.en.key'
    lsk_to_sn = {}
    with open(wordnet_key_file, 'r') as f:
        for line in f.read().strip().split("\n"):
            line = line.strip()
            _, lsk, sn = line.split(' ', 2)
            lsk_to_sn[lsk] = set(sn.split(' '))
    return lsk_to_sn

lsk_to_sn = get_lemma_sense_key_to_synset_number_correspondence()

In [16]:
# Show the top ten
{x[0]: x[1] for i, x in enumerate(lsk_to_sn.items()) if i < 10}

{'d001.s001.t002': {'group%1:03:00::'},
 'd001.s001.t003': {'plan%1:09:00::'},
 'd001.s001.t004': {'emission%1:27:00::'},
 'd001.s002.t001': {'climate%1:26:00::'},
 'd001.s002.t002': {'conference%1:14:00::'},
 'd001.s002.t003': {'posturing%1:07:00::'},
 'd001.s002.t004': {'recrimination%1:10:00::'},
 'd001.s002.t005': {'focus%1:09:00::'},
 'd001.s002.t006': {'friday%1:28:00::'},
 'd001.s002.t007': {'release%1:22:00::'}}

## Calculating accuracy

Now that I have this correspondence, I can calculate the accuracy of my lesk's algorithm implementation.

In [17]:
def build_wsd_data():
    wsd_data = []
    for key, instance in dev_instances.items():
        id = instance.id
        lemma = instance.lemma
        context = instance.context
    
        processed_context = preprocess(context)
    
        # Use lesk's algorithm to guess the synset
        synset = lesk(lemma, context)
    
        # Get the sense-keys for the predicted synset
        preds = set(lemma.key() for lemma in synset.lemmas())
        
        # Extract the synset number from the sense-key
        targets = lsk_to_sn[id]
        
        # Calculate if there is any overlap between the predicted sense and the target
        match = len(preds & targets) > 0
        
        wsd_data.append(
            dict(id=id, lemma=lemma, context=context, processed_context=processed_context, synset=synset, preds=preds, targets=targets, match=match)
        )

    return pd.DataFrame(wsd_data)

wsd_data = build_wsd_data()

In [18]:
wsd_data.head()

Unnamed: 0,id,lemma,context,processed_context,synset,preds,targets,match
0,d001.s001.t002,group,"[U.N., group, draft, plan, to, reduce, emission]","{draft, plan, group, emission, reduce, U.N.}",Synset('group.v.01'),{group%2:31:00::},{group%1:03:00::},False
1,d001.s001.t003,plan,"[U.N., group, draft, plan, to, reduce, emission]","{draft, plan, group, emission, reduce, U.N.}",Synset('plan.n.01'),"{plan%1:09:00::, program%1:09:00::, programme%...",{plan%1:09:00::},True
2,d001.s001.t004,emission,"[U.N., group, draft, plan, to, reduce, emission]","{draft, plan, group, emission, reduce, U.N.}",Synset('emission.n.01'),"{emission%1:04:00::, emanation%1:04:00::}",{emission%1:27:00::},False
3,d001.s002.t001,climate,"[the, U.N.-sponsored, climate, conference, --,...","{far, focus, renewed, gain, ambitious, documen...",Synset('climate.n.01'),"{climate%1:26:00::, clime%1:26:00::}",{climate%1:26:00::},True
4,d001.s002.t002,conference,"[the, U.N.-sponsored, climate, conference, --,...","{far, focus, renewed, gain, ambitious, documen...",Synset('conference.n.01'),{conference%1:14:00::},{conference%1:14:00::},True


In [19]:
wsd_data.iloc[1,:]

id                                                      d001.s001.t003
lemma                                                             plan
context               [U.N., group, draft, plan, to, reduce, emission]
processed_context         {draft, plan, group, emission, reduce, U.N.}
synset                                             Synset('plan.n.01')
preds                {plan%1:09:00::, program%1:09:00::, programme%...
targets                                               {plan%1:09:00::}
match                                                             True
Name: 1, dtype: object

I'm checking that the preprocessing step correctly resolves multi-word phrases to single entities and it appears it does.

In [20]:
wsd_data[wsd_data['processed_context'].apply(lambda x: 'America' in x)].head(2)

Unnamed: 0,id,lemma,context,processed_context,synset,preds,targets,match
99,d001.s016.t001,comment,"[Stern, make, his, comment, an, hour, after, C...","{united_states, Chinese, negotiator, week, say...",Synset('comment.v.01'),"{point_out%2:32:01::, remark%2:32:01::, commen...",{comment%1:10:00::},False
100,d001.s016.t002,hour,"[Stern, make, his, comment, an, hour, after, C...","{united_states, Chinese, negotiator, week, say...",Synset('hour.n.01'),"{60_minutes%1:28:00::, hour%1:28:00::, hr%1:28...",{hour%1:28:01::},False


In [21]:
wsd_data.loc[101, ['context', 'processed_context']].values

array([list(['Stern', 'make', 'his', 'comment', 'an', 'hour', 'after', 'Chinese', 'vice', 'foreign_minister', 'he_yafei', 'say', 'America', "'s", 'top', 'climate', 'negotiator', 'be', 'either', 'lack', '``', 'common_sense', "''", 'or', 'be', '``', 'extremely', 'irresponsible', "''", 'for', 'say', 'earlier', 'in', 'the', 'week', 'that', 'the', 'united_states', 'would', 'not', 'help', 'China', 'financially', 'to', 'cope', 'with', 'global_warming', '.']),
       {'united_states', 'Chinese', 'negotiator', 'week', 'say', 'top', 'China', 'lack', 'make', 'hour', 'help', 'earlier', 'Stern', 'financially', 'comment', 'irresponsible', 'vice', 'either', 'cope', 'global_warming', "'s", 'America', 'climate', 'foreign_minister', 'common_sense', 'would', 'he_yafei', 'extremely'}],
      dtype=object)

## Calculate accuracy

In [22]:
print(f'Lesk\'s algorithm Accuracy: {100*wsd_data.match.mean():.1f}%')

Lesk's algorithm Accuracy: 57.2%


This seems like a decent score for this task considering the effort we put in.

Next we should calculate scores for:
- The most frequent sense baseline
- NLTK's implementation of Lesk's Algorithm

## Most frequent sense baseline

In [23]:
def most_frequent_synset(lemma):
    '''
    Given a lemma, this returns the most frequent sense for that lemma.
    '''
    return set(lemma.key() for lemma in wn.synsets(lemma)[0].lemmas())

In [24]:
wsd_data['most_frequent_synset'] = wsd_data.lemma.apply(most_frequent_synset)

In [25]:
most_frequent_synset_accuracy = wsd_data.apply(lambda x: len(x.most_frequent_synset & x.targets) > 0, axis=1).mean()
print(f'Most Frequent Synset Accuracy: {100*most_frequent_synset_accuracy:.1f}%')

Most Frequent Synset Accuracy: 67.5%


## NLTK Lesk Algorithm baseline

In [26]:
wsd_data['nltk_pred_synset'] = wsd_data.apply(lambda x: set(lemma.key() for lemma in wsd.lesk(x.context, x.lemma).lemmas()), axis = 1)
nltk_pred_synset_accuracy = wsd_data.apply(lambda x: len(x.nltk_pred_synset & x.targets) > 0, axis=1).mean()
print(f'NLTK Lesk\'s Algorithm Accuracy: {100*nltk_pred_synset_accuracy:.1f}%')

NLTK Lesk's Algorithm Accuracy: 34.0%


In [27]:
wsd_data.head()

Unnamed: 0,id,lemma,context,processed_context,synset,preds,targets,match,most_frequent_synset,nltk_pred_synset
0,d001.s001.t002,group,"[U.N., group, draft, plan, to, reduce, emission]","{draft, plan, group, emission, reduce, U.N.}",Synset('group.v.01'),{group%2:31:00::},{group%1:03:00::},False,"{group%1:03:00::, grouping%1:03:00::}","{group%2:33:00::, aggroup%2:33:00::}"
1,d001.s001.t003,plan,"[U.N., group, draft, plan, to, reduce, emission]","{draft, plan, group, emission, reduce, U.N.}",Synset('plan.n.01'),"{plan%1:09:00::, program%1:09:00::, programme%...",{plan%1:09:00::},True,"{plan%1:09:00::, program%1:09:00::, programme%...","{plan%2:36:00::, project%2:36:01::, design%2:3..."
2,d001.s001.t004,emission,"[U.N., group, draft, plan, to, reduce, emission]","{draft, plan, group, emission, reduce, U.N.}",Synset('emission.n.01'),"{emission%1:04:00::, emanation%1:04:00::}",{emission%1:27:00::},False,"{emission%1:04:00::, emanation%1:04:00::}","{emission%1:04:00::, emanation%1:04:00::}"
3,d001.s002.t001,climate,"[the, U.N.-sponsored, climate, conference, --,...","{far, focus, renewed, gain, ambitious, documen...",Synset('climate.n.01'),"{climate%1:26:00::, clime%1:26:00::}",{climate%1:26:00::},True,"{climate%1:26:00::, clime%1:26:00::}","{climate%1:26:00::, clime%1:26:00::}"
4,d001.s002.t002,conference,"[the, U.N.-sponsored, climate, conference, --,...","{far, focus, renewed, gain, ambitious, documen...",Synset('conference.n.01'),{conference%1:14:00::},{conference%1:14:00::},True,{conference%1:14:00::},{conference%1:14:00::}


## Bootstrapping

Now we'll use the SemCor corpus to apply Yarowsky's algorithm.

In [207]:
def build_semcor_data():
    '''
    Uses the semcor corpus to assemble a dataset for bootstrap classification
    '''
    semcor_data = []
    for sent_id, tagged_sent in enumerate(semcor.tagged_sents(tag='sense')):
        sentence = ["_".join(tree.leaves()) for tree in tagged_sent]
        for i, tree in enumerate(tagged_sent):
            d = {}
            d['sentence'] = sentence
            d['sentence_text'] = ' '.join(sentence)
            d['processed_sentence'] = preprocess(sentence)
            d['span'] = sentence[i]
    
            if hasattr(tree, 'label') and isinstance(tree.label(), Lemma):
                d['sense_key'] = tree.label().key()
            else:
                continue
            semcor_data.append(d)
    
    return pd.DataFrame(semcor_data)

semcor_data = build_semcor_data()

In [208]:
semcor_data

Unnamed: 0,sentence,sentence_text,processed_sentence,span,sense_key
0,"[The, Fulton_County_Grand_Jury, said, Friday, ...",The Fulton_County_Grand_Jury said Friday an in...,"{Fulton_County_Grand_Jury, Atlanta, took_place...",Fulton_County_Grand_Jury,group%1:03:00::
1,"[The, Fulton_County_Grand_Jury, said, Friday, ...",The Fulton_County_Grand_Jury said Friday an in...,"{Fulton_County_Grand_Jury, Atlanta, took_place...",said,say%2:32:00::
2,"[The, Fulton_County_Grand_Jury, said, Friday, ...",The Fulton_County_Grand_Jury said Friday an in...,"{Fulton_County_Grand_Jury, Atlanta, took_place...",Friday,friday%1:28:00::
3,"[The, Fulton_County_Grand_Jury, said, Friday, ...",The Fulton_County_Grand_Jury said Friday an in...,"{Fulton_County_Grand_Jury, Atlanta, took_place...",investigation,investigation%1:09:00::
4,"[The, Fulton_County_Grand_Jury, said, Friday, ...",The Fulton_County_Grand_Jury said Friday an in...,"{Fulton_County_Grand_Jury, Atlanta, took_place...",Atlanta,atlanta%1:15:00::
...,...,...,...,...,...
224711,"[Her, reply, stung, me, ,, but, this, was, too...","Her reply stung me , but this was too importan...","{reply, let, Her, stung, important, hurt, diff...",stung,sting%2:39:02::
224712,"[Her, reply, stung, me, ,, but, this, was, too...","Her reply stung me , but this was too importan...","{reply, let, Her, stung, important, hurt, diff...",was,be%2:42:03::
224713,"[Her, reply, stung, me, ,, but, this, was, too...","Her reply stung me , but this was too importan...","{reply, let, Her, stung, important, hurt, diff...",let,let%2:41:00::
224714,"[Her, reply, stung, me, ,, but, this, was, too...","Her reply stung me , but this was too importan...","{reply, let, Her, stung, important, hurt, diff...",make,make%2:41:00::


## Blank out lemma sense keys

We need to replace the lemma sense keys that are not in our dev/test set with None. We keep the processed sentences, so that when we form term-document matrices we still have all of the tokens but we will only predict cases where the sense_key is valid and in our list.

In [209]:
lemma_sense_keys = set([k for v in list(dev_key.values()) + list(test_key.values()) for k in v])
semcor_data['sense_key'] = semcor_data.sense_key.apply(lambda x: x if x in lemma_sense_keys else None)
semcor_data

Unnamed: 0,sentence,sentence_text,processed_sentence,span,sense_key
0,"[The, Fulton_County_Grand_Jury, said, Friday, ...",The Fulton_County_Grand_Jury said Friday an in...,"{Fulton_County_Grand_Jury, Atlanta, took_place...",Fulton_County_Grand_Jury,group%1:03:00::
1,"[The, Fulton_County_Grand_Jury, said, Friday, ...",The Fulton_County_Grand_Jury said Friday an in...,"{Fulton_County_Grand_Jury, Atlanta, took_place...",said,
2,"[The, Fulton_County_Grand_Jury, said, Friday, ...",The Fulton_County_Grand_Jury said Friday an in...,"{Fulton_County_Grand_Jury, Atlanta, took_place...",Friday,friday%1:28:00::
3,"[The, Fulton_County_Grand_Jury, said, Friday, ...",The Fulton_County_Grand_Jury said Friday an in...,"{Fulton_County_Grand_Jury, Atlanta, took_place...",investigation,
4,"[The, Fulton_County_Grand_Jury, said, Friday, ...",The Fulton_County_Grand_Jury said Friday an in...,"{Fulton_County_Grand_Jury, Atlanta, took_place...",Atlanta,
...,...,...,...,...,...
224711,"[Her, reply, stung, me, ,, but, this, was, too...","Her reply stung me , but this was too importan...","{reply, let, Her, stung, important, hurt, diff...",stung,
224712,"[Her, reply, stung, me, ,, but, this, was, too...","Her reply stung me , but this was too importan...","{reply, let, Her, stung, important, hurt, diff...",was,
224713,"[Her, reply, stung, me, ,, but, this, was, too...","Her reply stung me , but this was too importan...","{reply, let, Her, stung, important, hurt, diff...",let,
224714,"[Her, reply, stung, me, ,, but, this, was, too...","Her reply stung me , but this was too importan...","{reply, let, Her, stung, important, hurt, diff...",make,


In [210]:
print(f'We have {(~semcor_data.sense_key.isnull()).sum():,d} valid sentence-sense_key pairs in the dataset.')

We have 14,114 valid sentence-sense_key pairs in the dataset.


## Calculate lemma sense ids

In [211]:
lemma_sense_keys = sorted(lemma_sense_keys)
lemma_sense_key_to_id = {v: k for k, v in enumerate(lemma_sense_keys)}

semcor_data['sense_id'] = semcor_data.sense_key.apply(lambda x: lemma_sense_key_to_id[x] if x else None)
semcor_data.head()

Unnamed: 0,sentence,sentence_text,processed_sentence,span,sense_key,sense_id
0,"[The, Fulton_County_Grand_Jury, said, Friday, ...",The Fulton_County_Grand_Jury said Friday an in...,"{Fulton_County_Grand_Jury, Atlanta, took_place...",Fulton_County_Grand_Jury,group%1:03:00::,305.0
1,"[The, Fulton_County_Grand_Jury, said, Friday, ...",The Fulton_County_Grand_Jury said Friday an in...,"{Fulton_County_Grand_Jury, Atlanta, took_place...",said,,
2,"[The, Fulton_County_Grand_Jury, said, Friday, ...",The Fulton_County_Grand_Jury said Friday an in...,"{Fulton_County_Grand_Jury, Atlanta, took_place...",Friday,friday%1:28:00::,288.0
3,"[The, Fulton_County_Grand_Jury, said, Friday, ...",The Fulton_County_Grand_Jury said Friday an in...,"{Fulton_County_Grand_Jury, Atlanta, took_place...",investigation,,
4,"[The, Fulton_County_Grand_Jury, said, Friday, ...",The Fulton_County_Grand_Jury said Friday an in...,"{Fulton_County_Grand_Jury, Atlanta, took_place...",Atlanta,,


## Vocab List

We now have enough to start making a model. We can start by constructing a vocab list and document matrices etc so we can formulate the problem as a classification task.

In [212]:
vocab_data = semcor_data.span.value_counts(ascending=True).to_frame().reset_index()

In [213]:
vocab_data

Unnamed: 0,span,count
0,Fulton_County_Grand_Jury,1
1,chockfull,1
2,Classified,1
3,snapshots,1
4,Journalism,1
...,...,...
33591,not,1607
33592,are,1867
33593,be,2160
33594,was,4077


In [214]:
span_to_id = {span: id for id, span in enumerate(vocab_data.span)}

## Calculate document vectors

In [215]:
semcor_data['document_vector'] = semcor_data.sentence.apply(lambda x: [span_to_id.get(span) for span in x if span in span_to_id.keys()])
semcor_data['span_id'] = semcor_data.span.apply(lambda x: span_to_id[x])

In [216]:
semcor_data

Unnamed: 0,sentence,sentence_text,processed_sentence,span,sense_key,sense_id,document_vector,span_id
0,"[The, Fulton_County_Grand_Jury, said, Friday, ...",The Fulton_County_Grand_Jury said Friday an in...,"{Fulton_County_Grand_Jury, Atlanta, took_place...",Fulton_County_Grand_Jury,group%1:03:00::,305.0,"[0, 33590, 31097, 30785, 29054, 33534, 32761, ...",0
1,"[The, Fulton_County_Grand_Jury, said, Friday, ...",The Fulton_County_Grand_Jury said Friday an in...,"{Fulton_County_Grand_Jury, Atlanta, took_place...",said,,,"[0, 33590, 31097, 30785, 29054, 33534, 32761, ...",33590
2,"[The, Fulton_County_Grand_Jury, said, Friday, ...",The Fulton_County_Grand_Jury said Friday an in...,"{Fulton_County_Grand_Jury, Atlanta, took_place...",Friday,friday%1:28:00::,288.0,"[0, 33590, 31097, 30785, 29054, 33534, 32761, ...",31097
3,"[The, Fulton_County_Grand_Jury, said, Friday, ...",The Fulton_County_Grand_Jury said Friday an in...,"{Fulton_County_Grand_Jury, Atlanta, took_place...",investigation,,,"[0, 33590, 31097, 30785, 29054, 33534, 32761, ...",30785
4,"[The, Fulton_County_Grand_Jury, said, Friday, ...",The Fulton_County_Grand_Jury said Friday an in...,"{Fulton_County_Grand_Jury, Atlanta, took_place...",Atlanta,,,"[0, 33590, 31097, 30785, 29054, 33534, 32761, ...",29054
...,...,...,...,...,...,...,...,...
224711,"[Her, reply, stung, me, ,, but, this, was, too...","Her reply stung me , but this was too importan...","{reply, let, Her, stung, important, hurt, diff...",stung,,,"[30966, 20906, 14741, 33594, 33542, 33473, 334...",20906
224712,"[Her, reply, stung, me, ,, but, this, was, too...","Her reply stung me , but this was too importan...","{reply, let, Her, stung, important, hurt, diff...",was,,,"[30966, 20906, 14741, 33594, 33542, 33473, 334...",33594
224713,"[Her, reply, stung, me, ,, but, this, was, too...","Her reply stung me , but this was too importan...","{reply, let, Her, stung, important, hurt, diff...",let,,,"[30966, 20906, 14741, 33594, 33542, 33473, 334...",33484
224714,"[Her, reply, stung, me, ,, but, this, was, too...","Her reply stung me , but this was too importan...","{reply, let, Her, stung, important, hurt, diff...",make,,,"[30966, 20906, 14741, 33594, 33542, 33473, 334...",33582


Now that we have the term-document vectors we can drop the rows where the sense keys are not in our target set.

In [235]:
semcor_data = semcor_data[~semcor_data.sense_key.isnull()]
sentence_to_id = {v: k for k, v in enumerate(semcor_data.sentence_text.unique())}
semcor_data['sentence_id'] = semcor_data.sentence_text.apply(lambda x: sentence_to_id[x])

semcor_data['sense_id'] = semcor_data.loc[:, 'sense_id'].astype(np.int32)
semcor_data = semcor_data.reset_index(drop=True)
semcor_data

Unnamed: 0,sentence,sentence_text,processed_sentence,span,sense_key,sense_id,document_vector,span_id,sentence_id
0,"[The, Fulton_County_Grand_Jury, said, Friday, ...",The Fulton_County_Grand_Jury said Friday an in...,"{Fulton_County_Grand_Jury, Atlanta, took_place...",Fulton_County_Grand_Jury,group%1:03:00::,305,"[0, 33590, 31097, 30785, 29054, 33534, 32761, ...",0,0
1,"[The, Fulton_County_Grand_Jury, said, Friday, ...",The Fulton_County_Grand_Jury said Friday an in...,"{Fulton_County_Grand_Jury, Atlanta, took_place...",Friday,friday%1:28:00::,288,"[0, 33590, 31097, 30785, 29054, 33534, 32761, ...",31097,0
2,"[The, jury, further, said, in, term, end, pres...",The jury further said in term end presentments...,"{praise, said, presentment, over-all, deserves...",jury,jury%1:14:00::,378,"[31734, 32919, 33590, 31785, 32231, 33457, 149...",31734,1
3,"[The, jury, further, said, in, term, end, pres...",The jury further said in term end presentments...,"{praise, said, presentment, over-all, deserves...",term,term%1:28:00::,728,"[31734, 32919, 33590, 31785, 32231, 33457, 149...",32231,1
4,"[The, jury, further, said, in, term, end, pres...",The jury further said in term end presentments...,"{praise, said, presentment, over-all, deserves...",end,end%1:28:00::,234,"[31734, 32919, 33590, 31785, 32231, 33457, 149...",33457,1
...,...,...,...,...,...,...,...,...,...
14109,"[Without, further, discussion, he, appeared, t...",Without further discussion he appeared the nex...,"{stained, morning, proceeded, according_to, di...",morning,morning%1:28:00::,450,"[32919, 32753, 33334, 32607, 33172, 16705, 287...",33172,8350
14110,"[Before, you, use, ', em, the, light, company,...",Before you use ' em the light company 's got t...,"{extra, circuit, 's, company, fuse, light, Bef...",company,company%1:14:01::,125,"[33544, 33411, 33153, 33534, 33506, 33391, 317...",33153,8351
14111,"[He, oughta, be, able, to, build, a, new, hous...",He oughta be able to build a new house with al...,"{able, build, He, house, oughta, new, contrapt...",house,house%1:06:00::,332,"[33593, 33240, 32995, 16705, 33561, 33444, 335...",33444,8352
14112,"[Mr._Crombie, watched, his, wife, with, an, an...",Mr._Crombie watched his wife with an anxious e...,"{anxious, wife, watched, expression, Mr._Crombie}",wife,wife%1:18:00::,807,"[29571, 33087, 33395, 19670, 32701]",33395,8353


## Convert to sparse matrix

In [236]:
def create_term_document_matrix(semcor_data):
    
    # Flatten the Series to create a list of (doc_id, word_id) tuples
    rows, cols = zip(*((id, token) for id, row in semcor_data.iterrows() for token in row.document_vector))
    
    # Create a sparse matrix
    # The shape parameters (n_rows, n_cols) should match your data dimensions
    n_rows = len(semcor_data)
    n_cols = semcor_data.document_vector.apply(max).max() + 1  # assuming token IDs start from 0
    data = [1] * len(rows)  # assuming a count of 1 for each occurrence
    term_document_matrix = sparse.coo_matrix((data, (rows, cols)), shape=(n_rows, n_cols))
    
    # Convert to CSR format for efficient arithmetic and matrix-vector operations
    return term_document_matrix.tocsr()

term_document_matrix = create_term_document_matrix(semcor_data)

In [238]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Example data - replace with your actual data
X = term_document_matrix
y = semcor_data.sense_id

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Initialize the Logistic Regression model
model = LogisticRegression()

# Train the model
model.fit(X_train, y_train)