Senetence and Word Tokenization using Spacy 

In [1]:
# Instalation 
!pip install -U pip setuptools wheel
!pip install -U spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.0/en_core_web_sm-3.7.0-py3-none-any.whl (12.8 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m0:01[0m01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


# Perform Sentence and Word tokenaization

In [18]:
# Import required libraries 
import spacy

# Load the language model
nlp = spacy.load("en_core_web_sm")

# Text to tokenize
text = "SpaCy is an awesome library for natural language processing. It's easy to use and provides excellent features."

# Tokenize the text into words
doc = nlp(text)

# Word tokenization
print("Word tokens:")
dcount =0
for (token) in (doc):
    # print(dcount+1)
    dcount=dcount+1
    print(dcount,token.text)

# Sentence tokenization
print("\nSentence tokens:")
scount =0
for sentence in doc.sents:
    # print(sentence.text)
    scount=scount+1
    print(scount,sentence.text)


Word tokens:
1 SpaCy
2 is
3 an
4 awesome
5 library
6 for
7 natural
8 language
9 processing
10 .
11 It
12 's
13 easy
14 to
15 use
16 and
17 provides
18 excellent
19 features
20 .

Sentence tokens:
1 SpaCy is an awesome library for natural language processing.
2 It's easy to use and provides excellent features.


# Remove punctuation in text


In [None]:
def remove_punctuation(text):
    # Process the text using SpaCy
    doc = nlp(text)

    # Filter out tokens that are not punctuation
    tokens_without_punct = [token.text for token in doc if not token.is_punct]

    # Join the filtered tokens back into a string
    return ' '.join(tokens_without_punct)

# Example text
text_with_punctuation = "Hello, this is an example sentence! It has punctuation."

# Remove punctuation
text_without_punctuation = remove_punctuation(text_with_punctuation)
print(text_without_punctuation)


# Remove Stops words in a text

In [None]:
# remove punctualsion

from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')
tokenizer.tokenize('Eighty-seven miles to go, yet.  Onward!')


# Tag the words in a given text using POS tagger 

In [1]:
import nltk
text = "learn php from guru99"
tokens = nltk.word_tokenize(text)
print(tokens)
tag = nltk.pos_tag(tokens)
print(tag)
grammar = "NP: {<DT>?<JJ>*<NN>}"
cp  =nltk.RegexpParser(grammar)
result = cp.parse(tag)
print(result)

['learn', 'php', 'from', 'guru99']
[('learn', 'JJ'), ('php', 'NN'), ('from', 'IN'), ('guru99', 'NN')]
(S (NP learn/JJ php/NN) from/IN (NP guru99/NN))


In [2]:
import spacy 

# Load English tokenizer, tagger, 
# parser, NER and word vectors 
nlp = spacy.load("en_core_web_sm") 

# Process whole documents 
text = ("""My name is Shaurya Uppal. 
I enjoy writing articles on GeeksforGeeks checkout 
my other article by going to my profile section.""") 

doc = nlp(text) 

# Token and Tag 
for token in doc: 
    print(token, token.pos_) 

# You want list of Verb tokens 
print("Verbs:", [token.text for token in doc if token.pos_ == "VERB"]) 


My PRON
name NOUN
is AUX
Shaurya PROPN
Uppal PROPN
. PUNCT

 SPACE
I PRON
enjoy VERB
writing VERB
articles NOUN
on ADP
GeeksforGeeks PROPN
checkout VERB

 SPACE
my PRON
other ADJ
article NOUN
by ADP
going VERB
to ADP
my PRON
profile NOUN
section NOUN
. PUNCT
Verbs: ['enjoy', 'writing', 'checkout', 'going']


# Steming and lematization

In [3]:
# stemming


from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from nltk.stem import PorterStemmer

set(stopwords.words('english'))

text = """He determined to drop his litigation with the monastry, and relinguish his claims to the wood-cuting and 
fishery rihgts at once. He was the more ready to do this becuase the rights had become much less valuable, and he had 
indeed the vaguest idea where the wood and river in question were."""

stop_words = set(stopwords.words('english')) 
  
word_tokens = word_tokenize(text) 
    
filtered_sentence = [] 
  
for w in word_tokens: 
    if w not in stop_words: 
        filtered_sentence.append(w) 

Stem_words = []
ps =PorterStemmer()
for w in filtered_sentence:
    rootWord=ps.stem(w)
    Stem_words.append(rootWord)
print(filtered_sentence)
print(Stem_words)



#steamming in spacy is not possible

['He', 'determined', 'drop', 'litigation', 'monastry', ',', 'relinguish', 'claims', 'wood-cuting', 'fishery', 'rihgts', '.', 'He', 'ready', 'becuase', 'rights', 'become', 'much', 'less', 'valuable', ',', 'indeed', 'vaguest', 'idea', 'wood', 'river', 'question', '.']
['he', 'determin', 'drop', 'litig', 'monastri', ',', 'relinguish', 'claim', 'wood-cut', 'fisheri', 'rihgt', '.', 'he', 'readi', 'becuas', 'right', 'becom', 'much', 'less', 'valuabl', ',', 'inde', 'vaguest', 'idea', 'wood', 'river', 'question', '.']


# User defined Function for Coustom stop words 

In [4]:
def remove_coustom_stopwords(text, custom_stopwords):
    # Tokenize the text into words
    words = text.split()
    print(words)
    
    # Filter out the custom stop words
    filtered_words = [word for word in words if word.lower() not in custom_stopwords]
    print(filtered_words)
    # Join the filtered words back into a string
    filtered_text = ' '.join(filtered_words)

    return filtered_text

# Define your custom list of stop words
custom_stopwords = ["iit","the","to"]

# Example usage
text = "This is a sample text with some custom stop words you want to remove. IIT is far better than MIT"
filtered_text = remove_coustom_stopwords(text, custom_stopwords)
print(filtered_text)


['This', 'is', 'a', 'sample', 'text', 'with', 'some', 'custom', 'stop', 'words', 'you', 'want', 'to', 'remove.', 'IIT', 'is', 'far', 'better', 'than', 'MIT']
['This', 'is', 'a', 'sample', 'text', 'with', 'some', 'custom', 'stop', 'words', 'you', 'want', 'remove.', 'is', 'far', 'better', 'than', 'MIT']
This is a sample text with some custom stop words you want remove. is far better than MIT
