#POS using NLTK

In [3]:
# Importing the NLTK library
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
# Sample text
text = "NLTK is a powerful library for natural language processing."
words = word_tokenize(text)
# Performing PoS tagging
pos_tags = pos_tag(words)

# Displaying the PoS tagged result in separate lines
print("Original Text:")
print(text)

print("\nPoS Tagging Result:")
for word, pos_tag in pos_tags:
	print(f"{word}: {pos_tag}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Original Text:
NLTK is a powerful library for natural language processing.

PoS Tagging Result:
NLTK: NNP
is: VBZ
a: DT
powerful: JJ
library: NN
for: IN
natural: JJ
language: NN
processing: NN
.: .


#POS using Spacy

In [4]:
!pip install spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m42.0 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [5]:
#importing libraries
import spacy

# Load the English language model
nlp = spacy.load("en_core_web_sm")

# Sample text
text = "SpaCy is a popular natural language processing library."

# Process the text with SpaCy
doc = nlp(text)

# Display the PoS tagged result
print("Original Text: ", text)
print("PoS Tagging Result:")
for token in doc:
	print(f"{token.text}: {token.pos_}")


Original Text:  SpaCy is a popular natural language processing library.
PoS Tagging Result:
SpaCy: PROPN
is: AUX
a: DET
popular: ADJ
natural: ADJ
language: NOUN
processing: NOUN
library: NOUN
.: PUNCT


# chinking and Chunking

In [6]:
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.chunk import RegexpParser

# Download required NLTK data
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Example sentence
sentence = "The quick brown fox jumps over the lazy dog"

# Tokenize the sentence
tokens = word_tokenize(sentence)

# Perform POS tagging
pos_tags = pos_tag(tokens)

# Define a chunk grammar
chunk_grammar = """
    NP: {<DT>?<JJ>*<NN>}    # Noun Phrase: optional determiner (DT), any number of adjectives (JJ), followed by a noun (NN)
    VP: {<VB.*><RP|IN>}     # Verb Phrase: verb (VB.*), followed by a particle (RP) or preposition (IN)
"""

# Create a chunk parser
chunk_parser = RegexpParser(chunk_grammar)

# Parse the POS tagged sentence
chunked = chunk_parser.parse(pos_tags)

# Print the chunked structure
print(chunked)



(S
  (NP The/DT quick/JJ brown/NN)
  (NP fox/NN)
  (VP jumps/VBZ over/IN)
  (NP the/DT lazy/JJ dog/NN))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


 # Named entity recognition (NER)

In [7]:
import pandas as pd
import spacy
import requests
from bs4 import BeautifulSoup
nlp = spacy.load("en_core_web_sm")
pd.set_option("display.max_rows", 200)
content = "Trinamool Congress leader Mahua Moitra has moved the Supreme Court against her expulsion from the Lok Sabha over the cash-for-query allegations against her. Moitra was ousted from the Parliament last week after the Ethics Committee of the Lok Sabha found her guilty of jeopardising national security by sharing her parliamentary portal's login credentials with businessman Darshan Hiranandani."

doc = nlp(content)

for ent in doc.ents:
	print(ent.text, ent.start_char, ent.end_char, ent.label_)


Congress 10 18 ORG
Mahua Moitra 26 38 PERSON
the Supreme Court 49 66 ORG
the Lok Sabha 94 107 PERSON
Moitra 157 163 ORG
Parliament 184 194 ORG
last week 195 204 DATE
the Ethics Committee 211 231 ORG
Darshan Hiranandani 373 392 PERSON


In [8]:
from spacy import displacy
displacy.render(doc, style="ent")


In [9]:
entities = [(ent.text, ent.label_, ent.lemma_) for ent in doc.ents]
df = pd.DataFrame(entities, columns=['text', 'type', 'lemma'])
print(df)


                   text    type                 lemma
0              Congress     ORG              Congress
1          Mahua Moitra  PERSON          Mahua Moitra
2     the Supreme Court     ORG     the Supreme Court
3         the Lok Sabha  PERSON         the Lok Sabha
4                Moitra     ORG                Moitra
5            Parliament     ORG            Parliament
6             last week    DATE             last week
7  the Ethics Committee     ORG  the Ethics Committee
8   Darshan Hiranandani  PERSON   Darshan Hiranandani


# Word sense disambiguation:

In [10]:
import nltk
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer

# Download required NLTK data
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Define the word and its context
word = "bank"
context = "I went to the bank to deposit my paycheck."

# Tokenize and lemmatize the context
lemmatizer = WordNetLemmatizer()
context_words = word_tokenize(context)
context_pos = nltk.pos_tag(context_words)

# Function to convert nltk POS tags to wordnet POS tags
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wn.ADJ
    elif treebank_tag.startswith('V'):
        return wn.VERB
    elif treebank_tag.startswith('N'):
        return wn.NOUN
    elif treebank_tag.startswith('R'):
        return wn.ADV
    else:
        return None

# Lemmatize context words with POS tags
context_lemmas = [lemmatizer.lemmatize(word, get_wordnet_pos(pos) or wn.NOUN) for word, pos in context_pos]

# Get synsets for the word
word_synsets = wordnet.synsets(word)

# Get synsets for the context words
context_synsets = [wordnet.synsets(lemma, get_wordnet_pos(pos) or wn.NOUN) for lemma, pos in context_pos]
context_synsets = [synset for sublist in context_synsets for synset in sublist]  # Flatten the list

# Function to calculate the best synset for the word based on the context
def best_synset_for_context(word_synsets, context_synsets):
    best_synset = None
    max_similarity = 0

    for synset in word_synsets:
        similarity_scores = [synset.wup_similarity(context_synset) for context_synset in context_synsets if synset.wup_similarity(context_synset) is not None]
        if similarity_scores:
            avg_similarity = sum(similarity_scores) / len(similarity_scores)
            if avg_similarity > max_similarity:
                best_synset = synset
                max_similarity = avg_similarity

    return best_synset

# Find the best synset for the word given the context
best_synset = best_synset_for_context(word_synsets, context_synsets)

# Print the best synset
print("Best synset:", best_synset.name())


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Best synset: bank.v.05
