# Word Sense Disambiguation (WSD)


In [None]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
from nltk import pos_tag

* Collect a small corpus of example sentences of varying lengths from any newspaper or
magazine. Using WordNet or any standard dictionary, determine how many senses there are for
each of the open-class words in each sentence.

In [None]:
# Fetch the NLTK resources used below (tokenizer, POS tagger, WordNet data)
for _resource in ('punkt', 'averaged_perceptron_tagger', 'wordnet', 'omw-1.4'):
    nltk.download(_resource)

# Small example corpus in newspaper/magazine style
sentences = [
    "The stock market closed higher today.",
    "The president announced new policies during the press conference.",
    "Investors are optimistic about the company's future performance.",
    "The weather was unusually warm for this time of year.",
    "He quickly adapted to the fast-paced environment.",
]

def get_wordnet_pos(treebank_tag):
    """Map a Treebank-style POS tag onto the matching WordNet POS constant.

    Tags beginning with J, V, N or R map to adjective, verb, noun and
    adverb respectively. Any other tag yields ``None``.
    """
    prefix_to_pos = {
        'J': wn.ADJ,
        'V': wn.VERB,
        'N': wn.NOUN,
        'R': wn.ADV,
    }
    # Only the first character of the tag decides the word class.
    return prefix_to_pos.get(treebank_tag[:1])

def analyze_sentence(sentence):
    """Print the number of WordNet senses for each token of *sentence*.

    The sentence is tokenized and POS-tagged. A token whose tag maps to a
    WordNet part of speech is printed with the number of synsets it has
    for that part of speech; every other token is printed as not being an
    open-class word. Nothing is returned.
    """
    for word, tag in pos_tag(word_tokenize(sentence)):
        wordnet_pos = get_wordnet_pos(tag)
        if not wordnet_pos:
            # No WordNet part of speech for this tag: nothing to count
            print(f"Word: '{word}' | POS: {tag} | Not an open-class word")
            continue
        senses = wn.synsets(word, pos=wordnet_pos)
        print(f"Word: '{word}' | POS: {tag} | Senses: {len(senses)}")

# Report the sense counts sentence by sentence, with a divider after each
divider = "\n" + "-"*50 + "\n"
for sentence in sentences:
    print(f"Sentence: '{sentence}'")
    analyze_sentence(sentence)
    print(divider)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Sentence: 'The stock market closed higher today.'
Word: 'The' | POS: DT | Not an open-class word
Word: 'stock' | POS: NN | Senses: 17
Word: 'market' | POS: NN | Senses: 5
Word: 'closed' | POS: VBD | Senses: 17
Word: 'higher' | POS: JJR | Senses: 9
Word: 'today' | POS: NN | Senses: 2
Word: '.' | POS: . | Not an open-class word

--------------------------------------------------

Sentence: 'The president announced new policies during the press conference.'
Word: 'The' | POS: DT | Not an open-class word
Word: 'president' | POS: NN | Senses: 6
Word: 'announced' | POS: VBD | Senses: 4
Word: 'new' | POS: JJ | Senses: 11
Word: 'policies' | POS: NNS | Senses: 3
Word: 'during' | POS: IN | Not an open-class word
Word: 'the' | POS: DT | Not an open-class word
Word: 'press' | POS: NN | Senses: 9
Word: 'conference' | POS: NN | Senses: 3
Word: '.' | POS: . | Not an open-class word

--------------------------------------------------

Sentence: 'Investors are optimistic about the company's future perf

* Implement Lesk algorithm for Word Sense Disambiguation (WSD)

In [None]:
def get_wordnet_pos(treebank_tag):
    """Return the WordNet POS constant for a Treebank-style tag, or None."""
    # Checked in order; the first matching prefix decides the result.
    prefix_table = (
        ('J', wn.ADJ),
        ('V', wn.VERB),
        ('N', wn.NOUN),
        ('R', wn.ADV),
    )
    for prefix, wordnet_pos in prefix_table:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    return None

def lesk_algorithm(word, sentence, pos=None):
    """Pick the WordNet sense of *word* that best fits *sentence* (simplified Lesk).

    Each candidate synset is scored by the number of distinct context
    words that also occur in its signature, i.e. its definition plus its
    example sentences. Matching is case-insensitive and ignores tokens
    made only of punctuation, so a sentence-final "." or a capitalised
    "The" can neither create nor hide an overlap.

    Args:
        word: The word to disambiguate.
        sentence: The sentence that provides the context for ``word``.
        pos: Optional WordNet part of speech (e.g. ``wn.NOUN``) used to
            restrict the candidate senses. ``None`` (the default)
            considers every part of speech.

    Returns:
        The synset with the largest overlap; ties keep the earlier synset.
        When no candidate shares a word with the context, the first synset
        returned by WordNet is used as the fallback. ``None`` is returned
        only when WordNet has no synset for ``word`` (and ``pos``).
    """
    def content_words(text):
        # Lower-case every token and drop those without a letter or digit
        # (".", ",", quotes, ...), which carry no sense information.
        return {
            token.lower()
            for token in word_tokenize(text)
            if any(ch.isalnum() for ch in token)
        }

    candidates = wn.synsets(word, pos=pos)
    if not candidates:
        return None

    context = content_words(sentence)

    # Start from the first listed sense so the word still receives a sense
    # when no signature overlaps with the context.
    best_sense = candidates[0]
    max_overlap = 0

    for sense in candidates:
        # Signature = words of the definition plus words of all examples
        signature = content_words(sense.definition())
        for example in sense.examples():
            signature |= content_words(example)

        overlap = len(context & signature)

        # Strict comparison: on ties the earlier-listed sense is kept
        if overlap > max_overlap:
            max_overlap = overlap
            best_sense = sense

    return best_sense

# Demo input: the word "bank" used in a sentence about money
sentence = "I went to the bank to deposit my money."
word = "bank"

# Disambiguate the word against its sentence
best_sense = lesk_algorithm(word, sentence)

# Report the chosen sense, or say that none was selected
if not best_sense:
    print(f"No sense found for the word '{word}' in the given context.")
else:
    print(f"Best sense for word '{word}':")
    print(f"Sense: {best_sense.name()}")
    print(f"Definition: {best_sense.definition()}")

Best sense for word 'bank':
Sense: depository_financial_institution.n.01
Definition: a financial institution that accepts deposits and channels the money into lending activities


* Using WordNet or a standard reference dictionary, tag each open-class word in your
corpus with its correct tag.

In [None]:
def get_wordnet_pos(treebank_tag):
    """Translate a Treebank-style POS tag into WordNet's POS constant.

    J*, V*, N* and R* tags map to adjective, verb, noun and adverb
    respectively; every other tag gives ``None``.
    """
    prefix_table = (
        ('J', wn.ADJ),
        ('V', wn.VERB),
        ('N', wn.NOUN),
        ('R', wn.ADV),
    )
    # First matching prefix wins; None when no prefix matches.
    return next(
        (wordnet_pos for prefix, wordnet_pos in prefix_table
         if treebank_tag.startswith(prefix)),
        None,
    )

def lesk_algorithm(word, sentence, pos=None):
    """Pick the WordNet sense of *word* that best fits *sentence* (simplified Lesk).

    Each candidate synset is scored by the number of distinct context
    words that also occur in its signature, i.e. its definition plus its
    example sentences. Matching is case-insensitive and ignores tokens
    made only of punctuation, so a sentence-final "." or a capitalised
    "The" can neither create nor hide an overlap.

    Args:
        word: The word to disambiguate.
        sentence: The sentence that provides the context for ``word``.
        pos: Optional WordNet part of speech (e.g. ``wn.NOUN``) used to
            restrict the candidate senses. ``None`` (the default)
            considers every part of speech.

    Returns:
        The synset with the largest overlap; ties keep the earlier synset.
        When no candidate shares a word with the context, the first synset
        returned by WordNet is used as the fallback. ``None`` is returned
        only when WordNet has no synset for ``word`` (and ``pos``).
    """
    def content_words(text):
        # Lower-case every token and drop those without a letter or digit
        # (".", ",", quotes, ...), which carry no sense information.
        return {
            token.lower()
            for token in word_tokenize(text)
            if any(ch.isalnum() for ch in token)
        }

    candidates = wn.synsets(word, pos=pos)
    if not candidates:
        return None

    context = content_words(sentence)

    # Start from the first listed sense so the word still receives a sense
    # when no signature overlaps with the context.
    best_sense = candidates[0]
    max_overlap = 0

    for sense in candidates:
        # Signature = words of the definition plus words of all examples
        signature = content_words(sense.definition())
        for example in sense.examples():
            signature |= content_words(example)

        overlap = len(context & signature)

        # Strict comparison: on ties the earlier-listed sense is kept
        if overlap > max_overlap:
            max_overlap = overlap
            best_sense = sense

    return best_sense


def tag_open_class_words(sentence):
    """Tag every token of *sentence* with a WordNet sense name.

    The sentence is tokenized and POS-tagged. Each token whose tag maps to
    a WordNet part of speech is disambiguated with ``lesk_algorithm``,
    restricted to that part of speech so that, for example, a token tagged
    as a verb cannot receive a noun sense.

    Returns:
        A list of ``(token, label)`` pairs in sentence order. The label is
        the chosen synset's name, ``"No sense found"`` when
        ``lesk_algorithm`` returns nothing, or ``"Not an open-class word"``
        for tokens without a WordNet part of speech.
    """
    tagged_sentence = []

    for word, tag in pos_tag(word_tokenize(sentence)):
        wn_tag = get_wordnet_pos(tag)
        if not wn_tag:
            tagged_sentence.append((word, "Not an open-class word"))
            continue

        # Use the tagger's part of speech to narrow the candidate senses
        sense = lesk_algorithm(word, sentence, pos=wn_tag)
        label = sense.name() if sense else "No sense found"
        tagged_sentence.append((word, label))

    return tagged_sentence

# Example corpus to sense-tag
sentences = [
    "The stock market closed higher today.",
    "The president announced new policies during the press conference.",
    "Investors are optimistic about the company's future performance.",
    "The weather was unusually warm for this time of year.",
    "He quickly adapted to the fast-paced environment.",
]

# Print the tag of every token, sentence by sentence, with a divider after each
divider = "\n" + "-"*50 + "\n"
for sentence in sentences:
    print(f"Sentence: '{sentence}'")
    tagged_sentence = tag_open_class_words(sentence)
    for word, tag in tagged_sentence:
        print(f"Word: '{word}' | Tag: {tag}")
    print(divider)


Sentence: 'The stock market closed higher today.'
Word: 'The' | Tag: Not an open-class word
Word: 'stock' | Tag: broth.n.01
Word: 'market' | Tag: market.n.01
Word: 'closed' | Tag: close.v.07
Word: 'higher' | Tag: higher.s.01
Word: 'today' | Tag: today.n.01
Word: '.' | Tag: Not an open-class word

--------------------------------------------------

Sentence: 'The president announced new policies during the press conference.'
Word: 'The' | Tag: Not an open-class word
Word: 'president' | Tag: president_of_the_united_states.n.01
Word: 'announced' | Tag: announce.v.03
Word: 'new' | Tag: new.a.01
Word: 'policies' | Tag: policy.n.01
Word: 'during' | Tag: Not an open-class word
Word: 'the' | Tag: Not an open-class word
Word: 'press' | Tag: imperativeness.n.01
Word: 'conference' | Tag: No sense found
Word: '.' | Tag: Not an open-class word

--------------------------------------------------

Sentence: 'Investors are optimistic about the company's future performance.'
Word: 'Investors' | Tag: No