In [11]:
# Information Extraction — Named Entity Recognition (spaCy)
# Goal: Extract structured information from text.

# In this notebook, we'll focus on Named Entity Recognition (NER):

# PERSON (people)
# ORG (organizations)
# GPE (geopolitical entities: cities, states/provinces, countries)
# DATE, TIME, etc.
# Tool: spaCy (industry-standard NLP library)

# Tip: This notebook is written in a high-comment style so you can follow each line.

# Step 0 — Install and load spaCy
# In Google Colab, you may need to install spaCy and download an English model. We use en_core_web_sm (small model) because it is fast and beginner-friendly.

# Install spaCy (quiet mode to reduce output noise)
# If you already have spaCy installed, this will be quick.

!pip -q install spacy

In [12]:


# Download the small English model (this is a separate download from the spaCy library itself).
# Models contain the trained weights spaCy uses for tasks like NER.
# -q asks for quiet output, so the cell prints little more than the success message.
# NOTE(review): `python` is whichever interpreter the shell resolves — presumably the same
# environment as the notebook kernel (as on Colab); confirm if running elsewhere.

!python -m spacy download en_core_web_sm -q


✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [13]:
# Step 1 — Import libraries
# spacy for NLP
# pandas to store extracted entities in a table

import spacy
import pandas as pd


In [14]:
# Step 2 — Load the NLP pipeline
# spacy.load(...) loads the trained model and creates an NLP pipeline. Once loaded, we can process text by calling nlp(text).

# Load the English pipeline (the en_core_web_sm model downloaded earlier).
# `nlp` is the callable used for the rest of the notebook: nlp(text) returns a processed document.
nlp = spacy.load("en_core_web_sm")


In [15]:
# Step 3 — Create example text (our mini corpus)
# In real applications, these could be:

# student feedback comments
# news articles
# emails
# support tickets
# We keep it small so it’s easy to see what is happening.


# Mini corpus: four short sentences, processed one at a time in the extraction step.
# They mix people, organizations, places, dates, and a quantity so that several
# different entity labels have something to match.
texts = [
    "Nina Graham joined Simon Fraser University in Vancouver in 2022.",
    "Apple announced new AI features in June 2024 in California.",
    "I commuted 33 km to teach Python and R on Monday evening.",
    "Microsoft Teams meetings are recorded for students with accommodations."
]


In [16]:
# Step 4 — Extract entities
# Important objects:

# doc = nlp(text) creates a processed document
# doc.ents is the sequence of entity spans spaCy found (each span has .text and .label_)
# We will store results in a list of dictionaries (rows) and convert to a DataFrame.


rows = []  # accumulator: one dict (row) per entity found; converted to a DataFrame afterwards

In [17]:


# Start from an empty list every time this cell runs. Without the reset, re-running
# the cell would append the same entities again and duplicate rows in the table.
rows = []

# doc_id starts at 1 so documents are numbered the way a reader would count them.
for doc_id, text in enumerate(texts, start=1):

    # Process the text through the spaCy pipeline
    doc = nlp(text)

    # Each ent is an entity object with .text and .label_
    for ent in doc.ents:
        rows.append({
            "doc_id": doc_id,            # which document this entity came from
            "text": text,                # original text
            "entity_text": ent.text,     # the exact string spaCy marked as an entity
            "entity_label": ent.label_,  # entity type (PERSON, ORG, GPE, DATE, etc.)
        })

In [18]:
# Build the table with an explicit column list so the schema is the same even when
# no entities were found. An empty `rows` would otherwise produce a DataFrame with
# no columns, and the column selections later in the notebook would raise KeyError.
entities_df = pd.DataFrame(
    rows,
    columns=["doc_id", "text", "entity_text", "entity_label"],
)

In [19]:



# Show the full entity table; with this mini corpus it is small enough to display whole.
entities_df


Unnamed: 0,doc_id,text,entity_text,entity_label
0,1,Nina Graham joined Simon Fraser University in ...,Nina Graham,PERSON
1,1,Nina Graham joined Simon Fraser University in ...,Simon Fraser,PERSON
2,1,Nina Graham joined Simon Fraser University in ...,Vancouver,GPE
3,1,Nina Graham joined Simon Fraser University in ...,2022,DATE
4,2,Apple announced new AI features in June 2024 i...,Apple,ORG
5,2,Apple announced new AI features in June 2024 i...,AI,GPE
6,2,Apple announced new AI features in June 2024 i...,June 2024,DATE
7,2,Apple announced new AI features in June 2024 i...,California,GPE
8,3,I commuted 33 km to teach Python and R on Mond...,33 km,QUANTITY
9,3,I commuted 33 km to teach Python and R on Mond...,Python,PERSON


In [20]:
# Step 5 — Understand the output
# Look at entity_label values. Common ones:

# PERSON: person name
# ORG: organization name
# GPE: geopolitical entity (city, state/province, country)
# DATE: date expression
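# NER is not perfect: models make mistakes, so always validate the results for your domain.
# In the table above, for example, "Simon Fraser" is tagged PERSON (the full name
# "Simon Fraser University" should be an ORG), "AI" is tagged GPE, and "Python" is tagged PERSON.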

# Narrow the table to the three columns that matter most when reading the results
entities_df.loc[:, ["doc_id", "entity_text", "entity_label"]]


Unnamed: 0,doc_id,entity_text,entity_label
0,1,Nina Graham,PERSON
1,1,Simon Fraser,PERSON
2,1,Vancouver,GPE
3,1,2022,DATE
4,2,Apple,ORG
5,2,AI,GPE
6,2,June 2024,DATE
7,2,California,GPE
8,3,33 km,QUANTITY
9,3,Python,PERSON
