## Installing spaCy and downloading the model

In [2]:
!pip install spacy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
!python -m spacy download en_core_web_sm

2023-04-05 09:47:34.729232: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-05 09:47:37.695532: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:266] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m58.5 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [6]:
import spacy
# Build the pipeline object from the model package downloaded above
nlp = spacy.load(name="en_core_web_sm")



## Tokenization

In [7]:

import spacy

# Build the pipeline from the small English model
nlp = spacy.load("en_core_web_sm")

# Sentence that will be split into tokens
text = "Spacy is an open-source NLP library designed for various preprocessing tasks."

# Calling the pipeline on a string returns a document that can be iterated token by token
doc = nlp(text)

# Show the text of every token, one per line
print(*(token.text for token in doc), sep="\n")

Spacy
is
an
open
-
source
NLP
library
designed
for
various
preprocessing
tasks
.


## Stop words removal

In [8]:
import spacy

# Build the pipeline from the small English model
nlp = spacy.load("en_core_web_sm")

# Sentence to filter
text = "Spacy is an open-source NLP library designed for various preprocessing tasks."

# Run the pipeline to obtain the tokenized document
doc = nlp(text)

# Keep the text of every token whose is_stop flag is not set;
# punctuation tokens are not filtered here and stay in the result
filtered_tokens = []
for token in doc:
    if token.is_stop:
        continue
    filtered_tokens.append(token.text)

# Display the surviving tokens as a list
print(filtered_tokens)

['Spacy', 'open', '-', 'source', 'NLP', 'library', 'designed', 'preprocessing', 'tasks', '.']


## Lemmatization

In [9]:
import spacy

# Build the pipeline from the small English model
nlp = spacy.load("en_core_web_sm")

# Sentence containing inflected forms ("foxes", "are", "jumping", "dogs")
text = "The quick brown foxes are jumping over the lazy dogs."

# Run the pipeline; each resulting token exposes its lemma as `lemma_`
doc = nlp(text)

# Gather the lemma of every token, in document order
lemmas = []
for token in doc:
    lemmas.append(token.lemma_)

# Display the lemmas as a list
print(lemmas)

['the', 'quick', 'brown', 'fox', 'be', 'jump', 'over', 'the', 'lazy', 'dog', '.']


## Part-of-speech (POS) tagging

In [10]:
import spacy

# Build the pipeline from the small English model
nlp = spacy.load("en_core_web_sm")

# Sentence to tag
text = "The quick brown foxes are jumping over the lazy dogs."

# Run the pipeline; each resulting token exposes its part-of-speech tag as `pos_`
doc = nlp(text)

# Format one "token: TAG" line per token and print them together
print("\n".join(f"{token.text}: {token.pos_}" for token in doc))

The: DET
quick: ADJ
brown: ADJ
foxes: NOUN
are: AUX
jumping: VERB
over: ADP
the: DET
lazy: ADJ
dogs: NOUN
.: PUNCT


## Named Entity Recognition (NER)

In [11]:
import spacy

# Build the pipeline from the small English model
nlp = spacy.load("en_core_web_sm")

# Sentence mentioning an organization, a nationality and two places
text = "Apple Inc. is an American multinational technology company headquartered in Cupertino, California."

# Run the pipeline; recognized entity spans are exposed on `doc.ents`
doc = nlp(text)

# Format each entity as "text: LABEL" and print one per line
# (nothing is printed when no entities are found)
for line in (f"{ent.text}: {ent.label_}" for ent in doc.ents):
    print(line)

Apple Inc.: ORG
American: NORP
Cupertino: GPE
California: GPE


## Dependency parsing

In [12]:
import spacy

# Build the pipeline from the small English model
nlp = spacy.load("en_core_web_sm")

# Sentence to parse
text = "The quick brown fox jumps over the lazy dog."

# Run the pipeline; each token then carries a dependency label (`dep_`)
# and a reference to its head token (`head`)
doc = nlp(text)

# Show every token as "token: label -> head", one per line
print(*(f"{token.text}: {token.dep_} -> {token.head.text}" for token in doc), sep="\n")


The: det -> fox
quick: amod -> fox
brown: amod -> fox
fox: nsubj -> jumps
jumps: ROOT -> jumps
over: prep -> jumps
the: det -> dog
lazy: amod -> dog
dog: pobj -> over
.: punct -> jumps


## Customizing spaCy's pipeline

In [18]:
import spacy
from spacy.language import Language

@Language.component('custom_component')
def custom_component(doc):
    """Placeholder pipeline component that returns the document unchanged.

    The decorator registers this function under the name 'custom_component'
    so it can be added to a pipeline by that name.

    Args:
        doc: The document object handed to the component by the pipeline.

    Returns:
        The same ``doc`` object that was passed in.
    """
    # Custom processing logic would go here; currently the doc is passed through as-is
    return doc

# Load the language model
nlp = spacy.load("en_core_web_sm")

# Append the registered custom component to the end of the pipeline
nlp.add_pipe('custom_component', last=True)

# Temporarily disable the named entity recognizer while processing the text.
# select_pipes is the spaCy v3 replacement for the deprecated disable_pipes;
# used as a context manager, it re-enables the component when the block exits.
with nlp.select_pipes(disable=["ner"]):
    # Process the text with the customized pipeline
    text = "The quick brown fox jumps over the lazy dog."
    doc = nlp(text)

# Print tokens, POS tags, and dependency labels
for token in doc:
    print(f"{token.text}: {token.pos_}, {token.dep_}")

The: DET, det
quick: ADJ, amod
brown: ADJ, amod
fox: NOUN, nsubj
jumps: VERB, ROOT
over: ADP, prep
the: DET, det
lazy: ADJ, amod
dog: NOUN, pobj
.: PUNCT, punct
