<a href="https://colab.research.google.com/github/mgnarag/NLP_LLM/blob/main/Segmenting_to_Lemmatization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Credits to: https://github.com/mochen862/nlp-tutorial/blob/main/nlp_hands_on_exercise.ipynb

In [1]:
text = "Millions of people across the UK and beyond have celebrated the coronation of King Charles III - a symbolic ceremony combining a religious service and pageantry. The ceremony was held at Westminster Abbey, with the King becoming the 40th reigning monarch to be crowned there since 1066. Queen Camilla was crowned alongside him before a huge parade back to Buckingham Palace. Here's how the day of splendour and formality, which featured customs dating back more than 1,000 years, unfolded."
text

"Millions of people across the UK and beyond have celebrated the coronation of King Charles III - a symbolic ceremony combining a religious service and pageantry. The ceremony was held at Westminster Abbey, with the King becoming the 40th reigning monarch to be crowned there since 1066. Queen Camilla was crowned alongside him before a huge parade back to Buckingham Palace. Here's how the day of splendour and formality, which featured customs dating back more than 1,000 years, unfolded."

# **Segmentation**

In [2]:
# import
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
# Split text into sentences
sentences = sent_tokenize(text)
sentences

['Millions of people across the UK and beyond have celebrated the coronation of King Charles III - a symbolic ceremony combining a religious service and pageantry.',
 'The ceremony was held at Westminster Abbey, with the King becoming the 40th reigning monarch to be crowned there since 1066.',
 'Queen Camilla was crowned alongside him before a huge parade back to Buckingham Palace.',
 "Here's how the day of splendour and formality, which featured customs dating back more than 1,000 years, unfolded."]

In [4]:
sentences[2]

'Queen Camilla was crowned alongside him before a huge parade back to Buckingham Palace.'

In [5]:
# Punctuation removal
import re

# Remove punctuation characters
text = re.sub(r"[^a-zA-Z0-9]", " ", sentences[2])
text

'Queen Camilla was crowned alongside him before a huge parade back to Buckingham Palace '

# **Tokenization**

In [6]:
from nltk.tokenize import word_tokenize

In [7]:
words = word_tokenize(text)
print(words)

['Queen', 'Camilla', 'was', 'crowned', 'alongside', 'him', 'before', 'a', 'huge', 'parade', 'back', 'to', 'Buckingham', 'Palace']


# **Stop Words**

In [8]:
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [9]:
# Remove stop words
words = [w for w in words if w not in stopwords.words("english")]
print(words)

['Queen', 'Camilla', 'crowned', 'alongside', 'huge', 'parade', 'back', 'Buckingham', 'Palace']


In [12]:
# have a look at the stop words in nltk's corpus
print(stopwords.words("english"))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

# **Stemming and Lemmatization**

In [13]:
nltk.download('wordnet') # download for lemmatization
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [14]:
# Stemming
from nltk.stem.porter import PorterStemmer

# Reduce words to their stems
stemmed = [PorterStemmer().stem(w) for w in words]
print(stemmed)

['queen', 'camilla', 'crown', 'alongsid', 'huge', 'parad', 'back', 'buckingham', 'palac']


In [15]:
# Lemmatize
from nltk.stem.wordnet import WordNetLemmatizer

# Reduce words to their root form
lemmatized = [WordNetLemmatizer().lemmatize(w) for w in words]
print(lemmatized)

['Queen', 'Camilla', 'crowned', 'alongside', 'huge', 'parade', 'back', 'Buckingham', 'Palace']


In [16]:
# Another stemming and lemmatization example
words2 = ['wait', 'waiting' , 'studies', 'studying', 'computers']

# Stemming
# Reduce words to their stems
stemmed = [PorterStemmer().stem(w) for w in words2]
print("Stemming output: {}".format(stemmed))

# Lemmatization
# Reduce words to their root form
lemmatized = [WordNetLemmatizer().lemmatize(w) for w in words2]
print("Lemmatization output: {}".format(lemmatized))

Stemming output: ['wait', 'wait', 'studi', 'studi', 'comput']
Lemmatization output: ['wait', 'waiting', 'study', 'studying', 'computer']


# **Parts of Speech Tagging**


In [17]:
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.


True

In [18]:
from nltk import pos_tag

In [19]:
# tag each word with part of speech
pos_tag(words)

[('Queen', 'NNP'),
 ('Camilla', 'NNP'),
 ('crowned', 'VBD'),
 ('alongside', 'RB'),
 ('huge', 'JJ'),
 ('parade', 'NN'),
 ('back', 'RB'),
 ('Buckingham', 'NNP'),
 ('Palace', 'NNP')]

In [20]:
"""
POS

CC: It is the conjunction of coordinating
CD: It is a digit of cardinal
DT: It is the determiner
EX: Existential
FW: It is a foreign word
IN: Preposition and conjunction
JJ: Adjective
JJR and JJS: Adjective and superlative
LS: List marker
MD: Modal
NN: Singular noun
NNS, NNP, NNPS: Proper and plural noun
PDT: Predeterminer
WRB: Adverb of wh
WP$: Possessive wh
WP: Pronoun of wh
WDT: Determiner of wp
VBZ: Verb
VBP, VBN, VBG, VBD, VB: Forms of verbs
UH: Interjection
TO: To go
RP: Particle
RBS, RB, RBR: Adverb
PRP, PRP$: Pronoun personal and professional

"""

'\nPOS\n\nCC: It is the conjunction of coordinating\nCD: It is a digit of cardinal\nDT: It is the determiner\nEX: Existential\nFW: It is a foreign word\nIN: Preposition and conjunction\nJJ: Adjective\nJJR and JJS: Adjective and superlative\nLS: List marker\nMD: Modal\nNN: Singular noun\nNNS, NNP, NNPS: Proper and plural noun\nPDT: Predeterminer\nWRB: Adverb of wh\nWP$: Possessive wh\nWP: Pronoun of wh\nWDT: Determiner of wp\nVBZ: Verb\nVBP, VBN, VBG, VBD, VB: Forms of verbs\nUH: Interjection\nTO: To go\nRP: Particle\nRBS, RB, RBR: Adverb\nPRP, PRP$: Pronoun personal and professional\n\n'

#**Named entity recognition**


In [21]:
from nltk import ne_chunk
nltk.download('words')

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [22]:
ner_tree = ne_chunk(pos_tag(word_tokenize(sentences[2])))
print(ner_tree)

(S
  (PERSON Queen/NNP)
  (PERSON Camilla/NNP)
  was/VBD
  crowned/VBN
  alongside/RB
  him/PRP
  before/IN
  a/DT
  huge/JJ
  parade/NN
  back/RB
  to/TO
  (PERSON Buckingham/NNP Palace/NNP)
  ./.)


In [23]:
text = "Millions of people across the UK and beyond have celebrated the coronation of King Charles III - a symbolic ceremony combining a religious service and pageantry. The ceremony was held at Westminster Abbey, with the King becoming the 40th reigning monarch to be crowned there since 1066. Queen Camilla was crowned alongside him before a huge parade back to Buckingham Palace. Here's how the day of splendour and formality, which featured customs dating back more than 1,000 years, unfolded."

ner_tree = ne_chunk(pos_tag(word_tokenize(text)))
print(ner_tree)

(S
  Millions/NNS
  of/IN
  people/NNS
  across/IN
  the/DT
  (ORGANIZATION UK/NNP)
  and/CC
  beyond/IN
  have/VBP
  celebrated/VBN
  the/DT
  coronation/NN
  of/IN
  King/NNP
  (PERSON Charles/NNP III/NNP)
  -/:
  a/DT
  symbolic/JJ
  ceremony/NN
  combining/VBG
  a/DT
  religious/JJ
  service/NN
  and/CC
  pageantry/NN
  ./.
  The/DT
  ceremony/NN
  was/VBD
  held/VBN
  at/IN
  (ORGANIZATION Westminster/NNP Abbey/NNP)
  ,/,
  with/IN
  the/DT
  King/NNP
  becoming/VBG
  the/DT
  40th/CD
  reigning/VBG
  monarch/NN
  to/TO
  be/VB
  crowned/VBN
  there/RB
  since/IN
  1066/CD
  ./.
  (PERSON Queen/NNP Camilla/NNP)
  was/VBD
  crowned/VBN
  alongside/RB
  him/PRP
  before/IN
  a/DT
  huge/JJ
  parade/NN
  back/RB
  to/TO
  (PERSON Buckingham/NNP Palace/NNP)
  ./.
  Here/RB
  's/VBZ
  how/WRB
  the/DT
  day/NN
  of/IN
  splendour/NN
  and/CC
  formality/NN
  ,/,
  which/WDT
  featured/VBD
  customs/NNS
  dating/VBG
  back/RB
  more/JJR
  than/IN
  1,000/CD
  years/NNS
  ,/,
  unfolded/

In [24]:
text = "Twitter CEO Elon Musk arrived at the Staples Center in Los Angeles, California. "
ner_tree = ne_chunk(pos_tag(word_tokenize(text)))
print(ner_tree)

(S
  (PERSON Twitter/NNP)
  (ORGANIZATION CEO/NNP Elon/NNP Musk/NNP)
  arrived/VBD
  at/IN
  the/DT
  (FACILITY Staples/NNP Center/NNP)
  in/IN
  (GPE Los/NNP Angeles/NNP)
  ,/,
  (GPE California/NNP)
  ./.)
