# Tokenization

In [3]:
# Sample two-line text reused by the tokenizer demos below. The unbalanced
# double quote after "course!" is part of the text and shows up as its own
# token in the tokenizer outputs.
corpus = (
    " Hello welcome, to Mohsin's NLP tutorials.\n"
    'Please do watch the entire course!" to become expert in NLP\n'
)

print(corpus)

 Hello welcome, to Mohsin's NLP tutorials.
Please do watch the entire course!" to become expert in NLP



In [4]:
## Tokenization
## Paragraph --> sentences
from nltk.tokenize import sent_tokenize
# Split the raw corpus into sentence strings; the result is a list.
document = sent_tokenize(corpus)

In [5]:
# Check which container type sent_tokenize returned.
type(document)

list

In [6]:
# Show each detected sentence on its own line.
for sentence in document:
    print(sentence)

 Hello welcome, to Mohsin's NLP tutorials.
Please do watch the entire course!"
to become expert in NLP


In [7]:
# Tokenization
## Paragraph --> words
### Sentence --> words
from nltk.tokenize import word_tokenize

# Tokenize the whole corpus at once; punctuation and the possessive "'s"
# come out as separate tokens.
word_tokenize(corpus)

['Hello',
 'welcome',
 ',',
 'to',
 'Mohsin',
 "'s",
 'NLP',
 'tutorials',
 '.',
 'Please',
 'do',
 'watch',
 'the',
 'entire',
 'course',
 '!',
 "''",
 'to',
 'become',
 'expert',
 'in',
 'NLP']

In [8]:
# Word-tokenize sentence by sentence, printing one token list per sentence.
for sentence in document:
    print(word_tokenize(sentence))

['Hello', 'welcome', ',', 'to', 'Mohsin', "'s", 'NLP', 'tutorials', '.']
['Please', 'do', 'watch', 'the', 'entire', 'course', '!', "''"]
['to', 'become', 'expert', 'in', 'NLP']


In [9]:
from nltk.tokenize import wordpunct_tokenize

# Compared with word_tokenize above, this splits "Mohsin's" into
# "Mohsin", "'", "s" and keeps the adjacent punctuation '!"' as one token.
wordpunct_tokenize(corpus)

['Hello',
 'welcome',
 ',',
 'to',
 'Mohsin',
 "'",
 's',
 'NLP',
 'tutorials',
 '.',
 'Please',
 'do',
 'watch',
 'the',
 'entire',
 'course',
 '!"',
 'to',
 'become',
 'expert',
 'in',
 'NLP']

In [10]:
# Treebank word tokenization

from nltk.tokenize import TreebankWordTokenizer

# One tokenizer instance, reused by the next cell.
tokenizer = TreebankWordTokenizer()

In [11]:
# Treebank-tokenize the whole corpus. In the output the period of the first
# sentence stays attached ('tutorials.'); presumably because this tokenizer
# expects one sentence at a time -- TODO confirm against the NLTK docs.
tokenizer.tokenize(corpus)

['Hello',
 'welcome',
 ',',
 'to',
 'Mohsin',
 "'s",
 'NLP',
 'tutorials.',
 'Please',
 'do',
 'watch',
 'the',
 'entire',
 'course',
 '!',
 "''",
 'to',
 'become',
 'expert',
 'in',
 'NLP']

# Stemming

In [12]:
# Motivation: in a classification problem (e.g. deciding whether a product
# comment is a positive or a negative review) inflected forms should collapse
# to a single base form:
#   [eating, eats, eaten] --> eat,  [going, goes, gone] --> go

# Word forms fed to the stemmers in the following cells.
words = [
    "eating", "eats", "eaten",
    "going", "goes", "gone",
    "programming", "programms",
    "history",
    "finally", "finalized",
]


## PorterStemmer

In [13]:
# Porter stemmer: print every sample word next to its stem.
# The stems are not always dictionary words (see "goe" and "histori" below).
from nltk.stem import PorterStemmer

stemming = PorterStemmer()

for word in words:
    print(f"{word}--->{stemming.stem(word)}")

eating--->eat
eats--->eat
eaten--->eaten
going--->go
goes--->goe
gone--->gone
programming--->program
programms--->programm
history--->histori
finally--->final
finalized--->final


In [14]:
# A stem does not have to be a real word: this yields 'congratul'.
stemming.stem("congratulation")

'congratul'

In [15]:
# Here the doubled consonant is reduced as well: 'sitting' -> 'sit'.
stemming.stem("sitting")

'sit'

# RegexpStemmer Class

In [16]:
from nltk.stem import RegexpStemmer

# Alternation of suffixes to strip; each one is anchored to the end of the word.
suffix_pattern = 'ing$|s$|e$|able$|ed$'

# min=5: presumably the minimum word length for stemming to apply -- confirm
# against the NLTK RegexpStemmer documentation.
reg_stemmer = RegexpStemmer(suffix_pattern, min=5)
reg_stemmer.stem("eated")

'eat'

# Snowball Stemmer

In [17]:
from nltk.stem import SnowballStemmer

# English Snowball stemmer, applied to the same sample words as the
# Porter stemmer so the two outputs can be compared line by line.
snow_ball = SnowballStemmer('english')

for word in words:
    print(f"{word}---> {snow_ball.stem(word)}")

eating---> eat
eats---> eat
eaten---> eaten
going---> go
goes---> goe
gone---> gone
programming---> program
programms---> programm
history---> histori
finally---> final
finalized---> final


In [18]:
# Two adverbs; the capitalised input comes back lower-cased ('sport').
tuple(snow_ball.stem(token) for token in ("fairly", "Sportingly"))

('fair', 'sport')

# Lemmatization
- Lemmatization is a technique similar to stemming.
- The output of lemmatization is called a lemma.
- A lemma is the root word rather than the root stem.

In [19]:
# Typical use cases: Q&A, chatbots, text summarization

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# The bare string below is only a note listing the codes passed as `pos`.
'''
POS- Noun-n
verb - v
adjective -a
adverb - r
'''

# Lemmatize "going" as a verb.
lemmatizer.lemmatize("going" , pos='v')

'go'

In [20]:
# Sample word forms, lemmatized with the noun tag. As the output shows,
# pos='n' leaves the verb forms untouched ("eating" stays "eating"); only
# forms such as "goes" and "programs" change.
words = [
    "eating", "eats", "eaten",
    "going", "goes", "gone",
    "programing", "programs",
    "history",
    "finally", "finalized",
]

for word in words:
    print(f"{word}--->{lemmatizer.lemmatize(word, pos='n')}")

eating--->eating
eats--->eats
eaten--->eaten
going--->going
goes--->go
gone--->gone
programing--->programing
programs--->program
history--->history
finally--->finally
finalized--->finalized


# Stopwords 

In [21]:
# Multi-sentence sample text used by the stop-word, stemming, lemmatization
# and POS-tagging cells below.
paragraph = """ Ramadan is the ninth month of the Islamic lunar calendar and is observed by Muslims worldwide as a month of fasting, prayer, reflection, and community. It is a time for spiritual growth, self-discipline, and empathy for those less fortunate. During Ramadan, Muslims abstain from food, drink, and other physical needs from dawn until sunset, focusing instead on spiritual pursuits such as reciting the Quran, performing extra prayers, and engaging in acts of charity. The fast is broken each evening with a meal called iftar, often shared with family and friends, and the month culminates in the joyous celebration of Eid al-Fitr, a time of feasting, prayer, and giving thanks for the blessings of Ramadan. """

In [22]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [23]:
# Dump NLTK's English stop-word list (all entries are lower-case).
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [24]:
# The same corpus reader also provides lists for other languages, e.g. Swedish.
print(stopwords.words('swedish'))

['och', 'det', 'att', 'i', 'en', 'jag', 'hon', 'som', 'han', 'på', 'den', 'med', 'var', 'sig', 'för', 'så', 'till', 'är', 'men', 'ett', 'om', 'hade', 'de', 'av', 'icke', 'mig', 'du', 'henne', 'då', 'sin', 'nu', 'har', 'inte', 'hans', 'honom', 'skulle', 'hennes', 'där', 'min', 'man', 'ej', 'vid', 'kunde', 'något', 'från', 'ut', 'när', 'efter', 'upp', 'vi', 'dem', 'vara', 'vad', 'över', 'än', 'dig', 'kan', 'sina', 'här', 'ha', 'mot', 'alla', 'under', 'någon', 'eller', 'allt', 'mycket', 'sedan', 'ju', 'denna', 'själv', 'detta', 'åt', 'utan', 'varit', 'hur', 'ingen', 'mitt', 'ni', 'bli', 'blev', 'oss', 'din', 'dessa', 'några', 'deras', 'blir', 'mina', 'samma', 'vilken', 'er', 'sådan', 'vår', 'blivit', 'dess', 'inom', 'mellan', 'sådant', 'varför', 'varje', 'vilka', 'ditt', 'vem', 'vilket', 'sitta', 'sådana', 'vart', 'dina', 'vars', 'vårt', 'våra', 'ert', 'era', 'vilkas']


In [25]:
import nltk
from nltk.stem import PorterStemmer

# Porter stemmer instance used by the stop-word filtering cell below.
stemmer  = PorterStemmer()


In [26]:
# Paragraph ---> sentences (despite the singular name, `sentence` is a list)
sentence = nltk.sent_tokenize(paragraph)

In [27]:
# Confirm that sent_tokenize returned a list.
type(sentence)

list

In [28]:
## Remove English stop words from every sentence, then Porter-stem what is left.

# Build the stop-word set once. It used to be rebuilt from
# stopwords.words('english') for every single token inside the comprehension.
stop_words = set(stopwords.words('english'))

# NOTE: this loop overwrites `sentence` in place and later cells build on the
# overwritten list, so re-running this cell stems already-stemmed text.
for i in range(len(sentence)):
    words = nltk.word_tokenize(sentence[i])
    # NOTE(review): the membership test is case-sensitive and happens before
    # the stemmer lower-cases the token, so capitalised stop words such as
    # "It" and "The" survive (visible in the output below).
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    sentence[i] = ' '.join(words)  # join the surviving stems back into one string

sentence

['ramadan ninth month islam lunar calendar observ muslim worldwid month fast , prayer , reflect , commun .',
 'it time spiritu growth , self-disciplin , empathi less fortun .',
 'dure ramadan , muslim abstain food , drink , physic need dawn sunset , focus instead spiritu pursuit recit quran , perform extra prayer , engag act chariti .',
 'the fast broken even meal call iftar , often share famili friend , month culmin joyou celebr eid al-fitr , time feast , prayer , give thank bless ramadan .']

# SnowballStemmer 

In [29]:
from nltk.stem import SnowballStemmer
# English Snowball stemmer used by the next cell.
snowballstemmer = SnowballStemmer('english')

In [30]:
## Remove English stop words, then apply Snowball stemming.

# Build the stop-word set once. It used to be rebuilt from
# stopwords.words('english') for every single token inside the comprehension.
stop_words = set(stopwords.words('english'))

# NOTE: `sentence` is not re-tokenized from `paragraph` here, so this runs on
# whatever the previous cells left in it, and overwrites it again in place.
for i in range(len(sentence)):
    words = nltk.word_tokenize(sentence[i])
    words = [snowballstemmer.stem(word) for word in words if word not in stop_words]
    sentence[i] = ' '.join(words)  # join the surviving stems back into one string

sentence

['ramadan ninth month islam lunar calendar observ muslim worldwid month fast , prayer , reflect , commun .',
 'time spiritu growth , self-disciplin , empathi less fortun .',
 'dure ramadan , muslim abstain food , drink , physic need dawn sunset , focus instead spiritu pursuit recit quran , perform extra prayer , engag act chariti .',
 'fast broken even meal call iftar , often share famili friend , month culmin joyou celebr eid al-fitr , time feast , prayer , give thank bless ramadan .']

# Lemmatization (stopwords)

In [31]:
from nltk.stem import WordNetLemmatizer

# WordNet lemmatizer used by the next cell.
lemmatizer = WordNetLemmatizer()

In [32]:
## Remove English stop words, then lemmatize each remaining token as a verb.

# Build the stop-word set once. It used to be rebuilt from
# stopwords.words('english') for every single token inside the comprehension.
stop_words = set(stopwords.words('english'))

# NOTE: `sentence` is not re-tokenized from `paragraph` here, so the lemmatizer
# receives whatever the previous cells left in it (stems such as "observ" in
# the output below), and the list is overwritten again in place.
for i in range(len(sentence)):
    words = nltk.word_tokenize(sentence[i])
    words = [lemmatizer.lemmatize(word, pos='v') for word in words if word not in stop_words]
    sentence[i] = ' '.join(words)  # join the lemmas back into one string

sentence

['ramadan ninth month islam lunar calendar observ muslim worldwid month fast , prayer , reflect , commun .',
 'time spiritu growth , self-disciplin , empathi less fortun .',
 'dure ramadan , muslim abstain food , drink , physic need dawn sunset , focus instead spiritu pursuit recit quran , perform extra prayer , engag act chariti .',
 'fast break even meal call iftar , often share famili friend , month culmin joyou celebr eid al-fitr , time feast , prayer , give thank bless ramadan .']

# Part of speech tags 

In [33]:
import nltk
# Re-split the raw paragraph: `sentence` was overwritten with processed text
# by the cells above, and POS tagging needs the original words.
sentence = nltk.sent_tokenize(paragraph)
sentence

[' Ramadan is the ninth month of the Islamic lunar calendar and is observed by Muslims worldwide as a month of fasting, prayer, reflection, and community.',
 'It is a time for spiritual growth, self-discipline, and empathy for those less fortunate.',
 'During Ramadan, Muslims abstain from food, drink, and other physical needs from dawn until sunset, focusing instead on spiritual pursuits such as reciting the Quran, performing extra prayers, and engaging in acts of charity.',
 'The fast is broken each evening with a meal called iftar, often shared with family and friends, and the month culminates in the joyous celebration of Eid al-Fitr, a time of feasting, prayer, and giving thanks for the blessings of Ramadan.']

In [34]:
# Drop English stop words from each sentence, then print its POS tags.

# Build the stop-word set once. It used to be rebuilt from
# stopwords.words('english') for every single token inside the comprehension.
stop_words = set(stopwords.words('english'))

for i in range(len(sentence)):
    words = nltk.word_tokenize(sentence[i])
    # NOTE(review): the membership test is case-sensitive, so capitalised stop
    # words such as "It" and "During" are kept (visible in the output below).
    words = [word for word in words if word not in stop_words]
    # One list of (token, tag) pairs per sentence; `sentence` itself is not modified.
    pos_tag = nltk.pos_tag(words)
    print(pos_tag)

[('Ramadan', 'NNP'), ('ninth', 'JJ'), ('month', 'NN'), ('Islamic', 'NNP'), ('lunar', 'NN'), ('calendar', 'NN'), ('observed', 'VBD'), ('Muslims', 'NNP'), ('worldwide', 'IN'), ('month', 'NN'), ('fasting', 'NN'), (',', ','), ('prayer', 'NN'), (',', ','), ('reflection', 'NN'), (',', ','), ('community', 'NN'), ('.', '.')]
[('It', 'PRP'), ('time', 'NN'), ('spiritual', 'JJ'), ('growth', 'NN'), (',', ','), ('self-discipline', 'JJ'), (',', ','), ('empathy', 'JJ'), ('less', 'JJR'), ('fortunate', 'NN'), ('.', '.')]
[('During', 'IN'), ('Ramadan', 'NNP'), (',', ','), ('Muslims', 'NNP'), ('abstain', 'VBP'), ('food', 'NN'), (',', ','), ('drink', 'NN'), (',', ','), ('physical', 'JJ'), ('needs', 'NNS'), ('dawn', 'NN'), ('sunset', 'NN'), (',', ','), ('focusing', 'VBG'), ('instead', 'RB'), ('spiritual', 'JJ'), ('pursuits', 'NNS'), ('reciting', 'VBG'), ('Quran', 'NNP'), (',', ','), ('performing', 'VBG'), ('extra', 'JJ'), ('prayers', 'NNS'), (',', ','), ('engaging', 'VBG'), ('acts', 'NNS'), ('charity', 'NN

In [35]:
# Short example sentence, split on whitespace into the word list that the
# next cell tags.
example_text = "Taj mahal is beautiful mounment"
words = example_text.split()

In [36]:
# POS-tag the pre-split word list; the result is a list of (token, tag) pairs.
nltk.pos_tag(words)

[('Taj', 'NNP'),
 ('mahal', 'NN'),
 ('is', 'VBZ'),
 ('beautiful', 'JJ'),
 ('mounment', 'NN')]

# Named Entity Recognition

In [37]:
# Sample sentence for the named-entity demo; it is tokenized two cells below.
sentece = "The Eiffel Tower was built from 1887 to 1889 by French enginneer Gustva Eiffel , whose company specialized in building metal framworks and structures.  "

# Reference list of entity categories with examples. It is a bare string and
# the last expression of the cell, so the notebook echoes it as the output.
"""
Person eg : Muhsin Ahmad
Place or Location : London
Dateeg: September , 23-09-2023
Time eg: 04:30pm
Money eg: 1 million dollar
Orginazation eg : Ineuron Privated Limited
Percent eg : 20% , twenty percent
"""

'\nPerson eg : Muhsin Ahmad\nPlace or Location : London\nDateeg: September , 23-09-2023\nTime eg: 04:30pm\nMoney eg: 1 million dollar\nOrginazation eg : Ineuron Privated Limited\nPercent eg : 20% , twenty percent\n'

In [38]:
import nltk
# Tokenize the NER sample sentence (the variable really is spelled `sentece`).
words = nltk.word_tokenize(sentece)

In [39]:
# POS-tag the tokens; ne_chunk in the next cell takes these tagged tokens.
tag_element = nltk.pos_tag(words)

In [40]:
# Chunk the tagged tokens into named entities and draw the resulting tree.
# NOTE(review): draw() presumably opens a separate GUI window rather than
# rendering inline (the cell shows no output) -- confirm; it may not work on a
# headless kernel.
nltk.ne_chunk(tag_element).draw()

: 