<a href="https://colab.research.google.com/github/mgnarag/NLP_LLM/blob/main/Segmenting_to_Lemmatization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Credits to: https://github.com/mochen862/nlp-tutorial/blob/main/nlp_hands_on_exercise.ipynb

In [None]:
# Sample passage used throughout the notebook. The literal is written as
# adjacent string fragments (one per sentence) that Python joins into a
# single string at compile time.
text = (
    "Millions of people across the UK and beyond have celebrated the coronation of King Charles III - a symbolic ceremony combining a religious service and pageantry. "
    "The ceremony was held at Westminster Abbey, with the King becoming the 40th reigning monarch to be crowned there since 1066. "
    "Queen Camilla was crowned alongside him before a huge parade back to Buckingham Palace. "
    "Here's how the day of splendour and formality, which featured customs dating back more than 1,000 years, unfolded."
)
text

# **Segmentation**

In [None]:
# Sentence-tokenizer setup
import nltk

# Recent NLTK releases (3.9+) load the pickle-free "punkt_tab" models, while
# older releases load "punkt". Fetch both so sent_tokenize works on a fresh
# kernel regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize

In [None]:
# Break the passage into a list of sentence strings.
# The tokenizer language is spelled out; "english" is also its default.
sentences = sent_tokenize(text, language="english")
sentences

In [None]:
# Show the third sentence (index 2); the punctuation-removal and first NER
# cells both operate on this same element.
sentences[2]

In [None]:
# Punctuation removal
import re

# Replace every character that is not an ASCII letter or digit with a space.
# NOTE: this rebinds `text` from the full passage to the cleaned third
# sentence; the tokenization cell reads `text` after this point.
text = re.compile(r"[^a-zA-Z0-9]").sub(" ", sentences[2])
text

# **Tokenization**

In [None]:
from nltk.tokenize import word_tokenize

In [None]:
# Split the cleaned sentence held in `text` into word tokens.
# The tokenizer language is spelled out; "english" is also its default.
words = word_tokenize(text, language="english")
print(words)

# **Stop Words**

In [None]:
# Fetch NLTK's stop-word corpus, then import the corpus reader that the
# stop-word cells below call.
nltk.download('stopwords')
from nltk.corpus import stopwords

In [None]:
# Remove stop words
# Build the lookup once as a set: calling stopwords.words("english") inside
# the comprehension re-creates the whole list for every token, and list
# membership is a linear scan.
stop_words = set(stopwords.words("english"))

# NLTK's English stop-word list is lowercase, so compare case-insensitively;
# otherwise capitalised stop words (e.g. "The" at the start of a sentence)
# would be kept. Surviving tokens keep their original casing.
words = [w for w in words if w.lower() not in stop_words]
print(words)

In [None]:
# Have a look at the stop words in NLTK's corpus: prints the complete list
# that the corpus reader returns for "english".
print(stopwords.words("english"))

# **Stemming and Lemmatization**

In [None]:
# Data packages fetched ahead of the lemmatization cells below.
nltk.download('wordnet') # WordNet data, downloaded for lemmatization
# 'omw-1.4': presumably the Open Multilingual WordNet data that accompanies
# WordNet -- TODO confirm it is still required by the installed NLTK version.
nltk.download('omw-1.4')

In [None]:
# Stemming
from nltk.stem.porter import PorterStemmer

# Reduce words to their stems.
# A single stemmer is created up front and reused for every token instead of
# constructing a new PorterStemmer inside the comprehension for each word.
stemmer = PorterStemmer()
stemmed = [stemmer.stem(w) for w in words]
print(stemmed)

In [None]:
# Lemmatize
from nltk.stem.wordnet import WordNetLemmatizer

# Reduce words to their root form.
# A single lemmatizer is created up front and reused for every token instead
# of constructing a new WordNetLemmatizer inside the comprehension per word.
# No `pos` argument is passed, so the lemmatizer's default part of speech
# applies to every token.
lemmatizer = WordNetLemmatizer()
lemmatized = [lemmatizer.lemmatize(w) for w in words]
print(lemmatized)

In [None]:
# Another stemming and lemmatization example: the same five words are run
# through both reducers so the two outputs can be compared side by side.
words2 = ['wait', 'waiting', 'studies', 'studying', 'computers']

# Stemming
# Reduce words to their stems, reusing one stemmer for the whole list rather
# than building a new PorterStemmer per word.
stemmer = PorterStemmer()
stemmed = [stemmer.stem(w) for w in words2]
print("Stemming output: {}".format(stemmed))

# Lemmatization
# Reduce words to their root form, reusing one lemmatizer for the whole list.
# No `pos` argument is passed, so the lemmatizer's default part of speech
# applies to every word.
lemmatizer = WordNetLemmatizer()
lemmatized = [lemmatizer.lemmatize(w) for w in words2]
print("Lemmatization output: {}".format(lemmatized))

# **Parts of Speech Tagging**


In [None]:
# Model data for POS tagging and for the named-entity chunker used later.
# Recent NLTK releases (3.9+) load the pickle-free packages
# ('averaged_perceptron_tagger_eng', 'maxent_ne_chunker_tab'); older releases
# load the original names. Fetch both sets so pos_tag / ne_chunk work on a
# fresh kernel regardless of the installed NLTK version.
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('maxent_ne_chunker')
nltk.download('maxent_ne_chunker_tab')

In [None]:
from nltk import pos_tag

In [None]:
# Tag each word with its part of speech.
# When the notebook is run top to bottom, `words` is the token list left by
# the stop-word removal cell.
pos_tag(words)

In [None]:
"""
POS

CC: It is the conjunction of coordinating
CD: It is a digit of cardinal
DT: It is the determiner
EX: Existential
FW: It is a foreign word
IN: Preposition and conjunction
JJ: Adjective
JJR and JJS: Adjective and superlative
LS: List marker
MD: Modal
NN: Singular noun
NNS, NNP, NNPS: Proper and plural noun
PDT: Predeterminer
WRB: Adverb of wh
WP$: Possessive wh
WP: Pronoun of wh
WDT: Determiner of wp
VBZ: Verb
VBP, VBN, VBG, VBD, VB: Forms of verbs
UH: Interjection
TO: To go
RP: Particle
RBS, RB, RBR: Adverb
PRP, PRP$: Pronoun personal and professional

"""

# **Named entity recognition**


In [None]:
# ne_chunk builds the named-entity tree that the cells below print.
from nltk import ne_chunk
# 'words' corpus: presumably needed by the NE chunker at run time -- TODO confirm.
nltk.download('words')

In [None]:
# NER on the third sentence as produced by sent_tokenize (punctuation and
# casing intact), written out as three explicit stages.
ner_tokens = word_tokenize(sentences[2])   # 1. split into word tokens
ner_tagged = pos_tag(ner_tokens)           # 2. attach a POS tag to each token
ner_tree = ne_chunk(ner_tagged)            # 3. group tagged tokens into entities
print(ner_tree)

In [None]:
# Rebind `text` to the full passage (an earlier cell overwrote it with the
# cleaned third sentence). The literal is written as adjacent fragments, one
# per sentence, which Python joins into a single string.
text = (
    "Millions of people across the UK and beyond have celebrated the coronation of King Charles III - a symbolic ceremony combining a religious service and pageantry. "
    "The ceremony was held at Westminster Abbey, with the King becoming the 40th reigning monarch to be crowned there since 1066. "
    "Queen Camilla was crowned alongside him before a huge parade back to Buckingham Palace. "
    "Here's how the day of splendour and formality, which featured customs dating back more than 1,000 years, unfolded."
)

# NER over the whole passage, one stage per line.
ner_tokens = word_tokenize(text)    # 1. split into word tokens
ner_tagged = pos_tag(ner_tokens)    # 2. attach a POS tag to each token
ner_tree = ne_chunk(ner_tagged)     # 3. group tagged tokens into entities
print(ner_tree)

In [None]:
# A second NER example on a short sentence containing person, organisation
# and place names; `text` is rebound to this sentence.
text = "Twitter CEO Elon Musk arrived at the Staples Center in Los Angeles, California. "

ner_tokens = word_tokenize(text)    # 1. split into word tokens
ner_tagged = pos_tag(ner_tokens)    # 2. attach a POS tag to each token
ner_tree = ne_chunk(ner_tagged)     # 3. group tagged tokens into entities
print(ner_tree)