In [None]:
# spaCy Complete Tutorial - 1 Hour Crash Course
# ===============================================

# Installation (run this in terminal first):
# pip install spacy
# python -m spacy download en_core_web_sm
# python -m spacy download en_core_web_md  # Optional: for word vectors

import spacy
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt

print("spaCy 1-Hour Complete Tutorial")
print("=" * 40)

# Load spaCy model
try:
    nlp = spacy.load("en_core_web_sm")
    print("✅ spaCy model loaded successfully!")
except OSError:
    print("❌ Please install the English model: python -m spacy download en_core_web_sm")
    exit()

spaCy 1-Hour Complete Tutorial
✅ spaCy model loaded successfully!


In [None]:
# =============================================================================
# SECTION 1: BASICS & DOCUMENT PROCESSING (8 minutes)
# =============================================================================

print("\n\n1. BASICS & DOCUMENT PROCESSING")
print("-" * 35)

# Creating spaCy documents
text1 = "Hello world! This is a simple example."
doc1 = nlp(text1)

text2 = """Apple Inc. is an American multinational technology company headquartered in Cupertino, California.
Tim Cook is the CEO. The company was founded by Steve Jobs, Steve Wozniak, and Ronald Wayne in 1976. """
doc2 = nlp(text2)

text3 = """He was running over the top of the hill and sits on a road for eating his breakfast """
doc3 = nlp(text3)

print(f"Original text: {text1}")
print(f"spaCy Doc object: {doc1}")
print(f"Type: {type(doc1)}")

# Document properties
print(f"\nDocument properties:")
print(f"Length: {len(doc2)} tokens")
print(f"Text: {doc2.text[:100]}...")
print(f"Language: {doc2.lang_}")

# Iterating through tokens
print(f"\nFirst 10 tokens:")
for i, token in enumerate(doc2):
    print(f"  {i+1:2d}: '{token.text}' (pos: {token.pos_}, lemma: {token.lemma_})")




1. BASICS & DOCUMENT PROCESSING
-----------------------------------
Original text: Hello world! This is a simple example.
spaCy Doc object: Hello world! This is a simple example.
Type: <class 'spacy.tokens.doc.Doc'>

Document properties:
Length: 38 tokens
Text: Apple Inc. is an American multinational technology company headquartered in Cupertino, California. 
...
Language: en

First 10 tokens:
   1: 'Apple' (pos: PROPN, lemma: Apple)
   2: 'Inc.' (pos: PROPN, lemma: Inc.)
   3: 'is' (pos: AUX, lemma: be)
   4: 'an' (pos: DET, lemma: an)
   5: 'American' (pos: ADJ, lemma: american)
   6: 'multinational' (pos: ADJ, lemma: multinational)
   7: 'technology' (pos: NOUN, lemma: technology)
   8: 'company' (pos: NOUN, lemma: company)
   9: 'headquartered' (pos: VERB, lemma: headquarter)
  10: 'in' (pos: ADP, lemma: in)
  11: 'Cupertino' (pos: PROPN, lemma: Cupertino)
  12: ',' (pos: PUNCT, lemma: ,)
  13: 'California' (pos: PROPN, lemma: California)
  14: '.' (pos: PUNCT, lemma: .)
  15: '


In [None]:
for i, token in enumerate(doc3):
  print(i, token, " - ",token.lemma_)


0 He  -  he
1 was  -  be
2 running  -  run
3 over  -  over
4 the  -  the
5 top  -  top
6 of  -  of
7 the  -  the
8 hill  -  hill
9 and  -  and
10 sits  -  sit
11 on  -  on
12 a  -  a
13 road  -  road
14 for  -  for
15 eating  -  eat
16 his  -  his
17 breakfast  -  breakfast


In [None]:
# =============================================================================
# SECTION 2: TOKENIZATION & TOKEN ATTRIBUTES (10 minutes)
# =============================================================================

print("\n\n2. TOKENIZATION & TOKEN ATTRIBUTES")
print("-" * 35)

text = "The quick brown fox jumps over the lazy dog! It's running at 25.5 mph."
doc = nlp(text)

print(f"Text: {text}")
print(f"\nDetailed Token Analysis:")
print(f"{'Token':<12} {'POS':<8} {'Tag':<6} {'Explanation on Tag': <15} {'Lemma':<30} {'Shape':<8} {'Alpha':<5} {'Stop':<4}")
print("-" * 70)

for token in doc:
    print(f"{token.text:<12} {token.pos_:<8} {token.tag_:<6} {spacy.explain(token.tag_)} {token.lemma_:<12} "
        f"{token.shape_:<8} {token.is_alpha:<5} {token.is_stop}")



# Token properties explanation
print(f"\nToken Properties Explained:")
properties = [
    ("text", "Raw token text"),
    ("pos_", "Part-of-speech tag (simplified)"),
    ("tag_", "Detailed part-of-speech tag"),
    ("lemma_", "Base form of the token"),
    ("shape_", "Word shape (Xxxx, dddd, etc.)"),
    ("is_alpha", "Is the token alphabetic?"),
    ("is_stop", "Is the token a stop word?"),
    ("is_punct", "Is the token punctuation?"),
    ("is_digit", "Is the token a digit?"),
    ("like_num", "Does the token resemble a number?")
]

for prop, desc in properties:
    print(f"  {prop:<12}: {desc}")

# Advanced token filtering
print(f"\nFiltered Tokens:")
# Non-stop, alphabetic words
meaningful_tokens = [token.lemma_.lower() for token in doc
                    if not token.is_stop and token.is_alpha and len(token.text) > 2]
print(f"Meaningful words: {meaningful_tokens}")

# Numbers and quantities
numbers = [token.text for token in doc if token.like_num]
print(f"Numbers found: {numbers}")




2. TOKENIZATION & TOKEN ATTRIBUTES
-----------------------------------
Text: The quick brown fox jumps over the lazy dog! It's running at 25.5 mph.

Detailed Token Analysis:
Token        POS      Tag    Explanation on Tag Lemma                          Shape    Alpha Stop
----------------------------------------------------------------------
The          DET      DT     determiner the          Xxx      1     True
quick        ADJ      JJ     adjective (English), other noun-modifier (Chinese) quick        xxxx     1     False
brown        ADJ      JJ     adjective (English), other noun-modifier (Chinese) brown        xxxx     1     False
fox          NOUN     NN     noun, singular or mass fox          xxx      1     False
jumps        VERB     VBZ    verb, 3rd person singular present jump         xxxx     1     False
over         ADP      IN     conjunction, subordinating or preposition over         xxxx     1     True
the          DET      DT     determiner the          xxx      1   

In [None]:
# =============================================================================
# SECTION 3: PART-OF-SPEECH TAGGING & DEPENDENCY PARSING (12 minutes)
# =============================================================================

print("\n\n3. POS TAGGING & DEPENDENCY PARSING")
print("-" * 35)

sentence = "The beautiful sunset painted the sky with vibrant colors yesterday evening."
doc = nlp(sentence)

print(f"Sentence: {sentence}")

# Part-of-speech tagging
print(f"\nPart-of-Speech Analysis:")
print(f"{'Token':<12} {'POS':<8} {'Tag':<6} {'Description':<20}")
print("-" * 50)

pos_descriptions = {
    'DET': 'Determiner', 'ADJ': 'Adjective', 'NOUN': 'Noun', 'VERB': 'Verb',
    'ADP': 'Adposition', 'ADV': 'Adverb', 'PROPN': 'Proper Noun'
}

for token in doc:
    desc = pos_descriptions.get(token.pos_, token.pos_)
    print(f"{token.text:<12} {token.pos_:<8} {token.tag_:<6} {desc:<20}")

# Dependency parsing
print(f"\nDependency Relations:")
print(f"{'Token':<12} {'Relation':<12} {'Head':<12} {'Children'}")
print("-" * 60)

for token in doc:
    children = [child.text for child in token.children]
    children_str = ", ".join(children) if children else "None"
    print(f"{token.text:<12} {token.dep_:<12} {token.head.text:<12} {children_str}")

# Visualizing dependencies (text-based)
print(f"\nDependency Tree (simplified):")
def print_dependencies(token, depth=0):
    print("  " * depth + f"├─ {token.text} ({token.dep_})")
    for child in token.children:
        print_dependencies(child, depth + 1)

# Find the root token
root = [token for token in doc if token.dep_ == "ROOT"][0]
print_dependencies(root)

# Extract specific grammatical patterns
print(f"\nExtracted Patterns:")
subjects = [token.text for token in doc if token.dep_ in ("nsubj", "nsubjpass")]
objects = [token.text for token in doc if token.dep_ in ("dobj", "pobj")]
verbs = [token.lemma_ for token in doc if token.pos_ == "VERB"]

print(f"Subjects: {subjects}")
print(f"Objects: {objects}")
print(f"Verbs: {verbs}")



3. POS TAGGING & DEPENDENCY PARSING
-----------------------------------
Sentence: The beautiful sunset painted the sky with vibrant colors yesterday evening.

Part-of-Speech Analysis:
Token        POS      Tag    Description         
--------------------------------------------------
The          DET      DT     Determiner          
beautiful    ADJ      JJ     Adjective           
sunset       NOUN     NN     Noun                
painted      VERB     VBD    Verb                
the          DET      DT     Determiner          
sky          NOUN     NN     Noun                
with         ADP      IN     Adposition          
vibrant      ADJ      JJ     Adjective           
colors       NOUN     NNS    Noun                
yesterday    NOUN     NN     Noun                
evening      NOUN     NN     Noun                
.            PUNCT    .      PUNCT               

Dependency Relations:
Token        Relation     Head         Children
------------------------------------------

In [None]:
# =============================================================================
# SECTION 4: NAMED ENTITY RECOGNITION (10 minutes)
# =============================================================================

print("\n\n4. NAMED ENTITY RECOGNITION (NER)")
print("-" * 35)

text = """Apple Inc. was founded by Steve Jobs in Cupertino, California in 1976.
The company is now worth over $2 trillion dollars. Tim Cook became CEO in 2011.
Microsoft, Google, and Amazon are major competitors. The headquarters is located at
One Apple Park Way, and they employ over 100,000 people worldwide."""

doc = nlp(text)

print(f"Text: {text[:100]}...")

# Extract all named entities
print(f"\nNamed Entities Found:")
print(f"{'Entity':<20} {'Label':<12} {'Description':<25}")
print("-" * 60)

entity_descriptions = {
    'PERSON': 'People, including fictional',
    'ORG': 'Companies, agencies, institutions',
    'GPE': 'Countries, cities, states',
    'MONEY': 'Monetary values',
    'DATE': 'Absolute or relative dates',
    'CARDINAL': 'Numerals that do not fall under other types',
    'ORDINAL': 'First, second, etc.',
    'FACILITY': 'Buildings, airports, highways, bridges',
    'LOC': 'Non-GPE locations'
}

for ent in doc.ents:
    desc = entity_descriptions.get(ent.label_, ent.label_)
    print(f"{ent.text:<20} {ent.label_:<12} {desc:<25}")

# Group entities by type
entities_by_type = {}
for ent in doc.ents:
    if ent.label_ not in entities_by_type:
        entities_by_type[ent.label_] = []
    entities_by_type[ent.label_].append(ent.text)

print(f"\nEntities Grouped by Type:")
for label, entities in entities_by_type.items():
    print(f"  {label}: {entities}")

# Custom entity patterns
from spacy.matcher import Matcher

matcher = Matcher(nlp.vocab)

# Pattern for email addresses (simplified)
email_pattern = [{"LIKE_EMAIL": True}]
matcher.add("EMAIL", [email_pattern])

# Pattern for phone numbers (simplified)
phone_pattern = [{"SHAPE": "ddd-ddd-dddd"}]
matcher.add("PHONE", [phone_pattern])

test_text = "Contact us at john.doe@company.com or call 555-123-4567 for support."
test_doc = nlp(test_text)
matches = matcher(test_doc)

print(f"\nCustom Pattern Matching:")
print(f"Text: {test_text}")
for match_id, start, end in matches:
    label = nlp.vocab.strings[match_id]
    span = test_doc[start:end]
    print(f"  Found {label}: {span.text}")



4. NAMED ENTITY RECOGNITION (NER)
-----------------------------------
Text: Apple Inc. was founded by Steve Jobs in Cupertino, California in 1976. 
The company is now worth ove...

Named Entities Found:
Entity               Label        Description              
------------------------------------------------------------
Apple Inc.           ORG          Companies, agencies, institutions
Steve Jobs           PERSON       People, including fictional
Cupertino            GPE          Countries, cities, states
California           GPE          Countries, cities, states
1976                 DATE         Absolute or relative dates
over $2 trillion dollars MONEY        Monetary values          
Tim Cook             PERSON       People, including fictional
2011                 DATE         Absolute or relative dates
Microsoft            ORG          Companies, agencies, institutions
Google               ORG          Companies, agencies, institutions
Amazon               ORG          Compan

In [None]:
# =============================================================================
# SECTION 5: SENTENCE SEGMENTATION & TEXT PROCESSING (8 minutes)
# =============================================================================

print("\n\n5. SENTENCE SEGMENTATION & TEXT PROCESSING")
print("-" * 35)

text = """Natural language processing is fascinating! It involves multiple steps.
First, we tokenize the text. Then, we analyze grammar. Finally, we extract meaning.
Dr. Smith published a paper on this topic in 2020. The research was groundbreaking."""

doc = nlp(text)

# Sentence segmentation
print(f"Original text: {text}")
print(f"\nSentences detected: {len(list(doc.sents))}")

for i, sent in enumerate(doc.sents, 1):
    print(f"  {i}: {sent.text.strip()}")

# Sentence-level analysis
print(f"\nSentence Analysis:")
for i, sent in enumerate(doc.sents, 1):
    sent_doc = nlp(sent.text)
    entities = [ent.text for ent in sent_doc.ents]
    nouns = [token.text for token in sent_doc if token.pos_ == "NOUN"]
    verbs = [token.lemma_ for token in sent_doc if token.pos_ == "VERB"]

    print(f"\nSentence {i}: {sent.text[:50]}...")
    print(f"  Entities: {entities if entities else 'None'}")
    print(f"  Nouns: {nouns}")
    print(f"  Verbs: {verbs}")

# Text statistics
print(f"\nText Statistics:")
total_tokens = len(doc)
sentences = list(doc.sents)
avg_sent_length = total_tokens / len(sentences) if sentences else 0

print(f"  Total tokens: {total_tokens}")
print(f"  Total sentences: {len(sentences)}")
print(f"  Average sentence length: {avg_sent_length:.1f} tokens")
print(f"  Unique lemmas: {len(set(token.lemma_ for token in doc))}")



5. SENTENCE SEGMENTATION & TEXT PROCESSING
-----------------------------------
Original text: Natural language processing is fascinating! It involves multiple steps. 
First, we tokenize the text. Then, we analyze grammar. Finally, we extract meaning. 
Dr. Smith published a paper on this topic in 2020. The research was groundbreaking.

Sentences detected: 7
  1: Natural language processing is fascinating!
  2: It involves multiple steps.
  3: First, we tokenize the text.
  4: Then, we analyze grammar.
  5: Finally, we extract meaning.
  6: Dr. Smith published a paper on this topic in 2020.
  7: The research was groundbreaking.

Sentence Analysis:

Sentence 1: Natural language processing is fascinating!...
  Entities: None
  Nouns: ['language', 'processing']
  Verbs: []

Sentence 2: It involves multiple steps. 
...
  Entities: None
  Nouns: ['steps']
  Verbs: ['involve']

Sentence 3: First, we tokenize the text....
  Entities: ['First']
  Nouns: ['text']
  Verbs: ['tokenize']

Sentence