In [14]:
import spacy

In [15]:
# Load the "en_core_web_sm" English pipeline. The cells below read token text,
# POS tags, lemmas, stop-word flags, and named entities from its output.
nlp = spacy.load("en_core_web_sm")

In [16]:
# Sample input: two short sentences that every later cell analyses.
# NOTE(review): "John go" is ungrammatical -- presumably unintentional; fixing it
# would change the recorded outputs below, so confirm before editing.
text = "This is an example sentence. John go to the school."

In [17]:
# Run the pipeline once over the sample text; the resulting `doc` is reused by
# all of the cells below.
doc = nlp(text)

Tokenization:
Tokenization is the process of splitting text into individual units called tokens, such as words and punctuation marks.

In [18]:
# Print the text of every token in the document, one per line
for tok in doc:
    print(tok.text)

This
is
an
example
sentence
.
John
go
to
the
school
.


Part-of-speech (POS) Tagging:
POS tagging assigns each token a grammatical category, such as noun, verb, or adjective.

In [19]:
# Print each token's text followed by its part-of-speech tag (token.pos_)
for tok in doc:
    print(f"{tok.text} {tok.pos_}")


This PRON
is AUX
an DET
example NOUN
sentence NOUN
. PUNCT
John PROPN
go VERB
to ADP
the DET
school NOUN
. PUNCT


Lemmatization:
Lemmatization reduces each word to its base (dictionary) form, called its lemma; for example, "is" becomes "be".

In [20]:
# Print each token's original text next to its lemma (token.lemma_)
for tok in doc:
    print(f"{tok.text} {tok.lemma_}")


This this
is be
an an
example example
sentence sentence
. .
John John
go go
to to
the the
school school
. .


Removing Stopwords:
Stopwords are common words (e.g., "the", "is", "and") that are often removed during preprocessing.

In [23]:
from spacy.lang.en.stop_words import STOP_WORDS

In [24]:
# Keep only the tokens whose lower-cased text is not in STOP_WORDS
filtered_tokens = [
    tok.text
    for tok in doc
    if tok.text.lower() not in STOP_WORDS
]

# Re-assemble the surviving tokens into one space-separated string and show it
filtered_text = " ".join(filtered_tokens)
print(filtered_text)

example sentence . John school .


In [21]:
# Alternative filter: rely on each token's own `is_stop` flag instead of a
# manual set lookup. This rebinds `filtered_tokens` from the previous cell.
filtered_tokens = [
    tok.text
    for tok in doc
    if not tok.is_stop
]
filtered_tokens

['example', 'sentence', '.', 'John', 'school', '.']

Named Entity Recognition (NER):
NER identifies and labels named entities in the text, such as persons, organizations, and locations.

In [22]:
# Print every named entity found in the document together with its label
for entity in doc.ents:
    print(f"{entity.text} {entity.label_}")


John PERSON
