In [1]:
import pandas as pd
import spacy
from spacy import displacy

In [2]:
# 1. Load the dataset.
# sep='\t' tells pandas the file is tab-separated, the usual layout for .tsv files.
# NOTE(review): read_csv's default NA parsing turns literal cells such as "NA",
# "NULL" or "NaN" into missing values, so a ticker spelled that way would be
# removed by dropna() below — confirm against the data file.
df = pd.read_csv("stocks-1.tsv", sep='\t')


def _clean_unique(column):
    """Return the distinct, trimmed, non-empty string values of a column.

    Missing cells are dropped, the remaining values are coerced to ``str`` and
    stripped of surrounding whitespace, blank results are discarded, and the
    survivors are de-duplicated in order of first appearance.
    """
    trimmed = column.dropna().astype(str).str.strip()
    return trimmed[trimmed != ""].unique()


# 2. Extract the distinct company names and stock symbols from the table.
# Values are normalised before de-duplicating because they are used later as
# exact phrase patterns: stray padding would otherwise yield duplicate entries
# and patterns that cannot match running text, and a blank cell would yield an
# empty pattern.
unique_companies = _clean_unique(df['CompanyName'])
unique_symbols = _clean_unique(df['Symbol'])

In [3]:
# 3. Build the pattern dictionaries consumed by the EntityRuler.
# Every company name becomes a phrase pattern labelled "ORG" (organization);
# every ticker symbol becomes a phrase pattern labelled "STOCK".
company_patterns = [
    dict(label="ORG", pattern=name)
    for name in unique_companies
]
symbol_patterns = [
    dict(label="STOCK", pattern=ticker)
    for ticker in unique_symbols
]

In [4]:

# 4. Assemble the pipeline: a blank English model plus a rule-based entity matcher.
# Starting from spacy.blank("en") means entities come only from our own patterns.
nlp = spacy.blank("en")
# The entity_ruler component tags the spans that match the registered patterns.
ruler = nlp.add_pipe("entity_ruler")
# Register the company patterns followed by the symbol patterns in a single call.
ruler.add_patterns([*company_patterns, *symbol_patterns])


In [5]:

# 5. Sample texts used to exercise the pattern matcher.
# Each paragraph mentions several companies, each followed by its stock symbol
# in parentheses. If the pattern matching works, spaCy should highlight them.

# Paragraph 1
first_paragraph = """Helmerich & Payne (HP) saw its stock rise by 1.5%, 
    fueled by optimistic forecasts in the Energy Equipment & Services sector. 
    In contrast, Check-Cap (CHEK) faced a decline of 2.3% following its announcement 
    of increased costs related to supply chain disruptions.

    Meanwhile, Vallon Pharmaceuticals (VLON) gained 0.8% after strong quarterly earnings, 
    outperforming its peers in the Biotechnology space. 
    Sequans Communications (SQNS) also recorded a modest increase of 0.5%, 
    reflecting investors' confidence in its ability to navigate challenges 
    in the Semiconductors & Semiconductor Equipment industry."""

# Paragraph 2
second_paragraph = """Aemetis (AMTX) saw its stock rise by 1.5%, 
    fueled by optimistic forecasts in the Oil, Gas & Consumable Fuels sector. 
    In contrast, Ferro Corporation (FOE) faced a decline of 2.3% 
    following its announcement of increased costs related to supply chain disruptions.

    Meanwhile, RingCentral (RNG) gained 0.8% after strong quarterly earnings, 
    outperforming its peers in the Software space. 
    ACI Worldwide (ACIW) also recorded a modest increase of 0.5%, 
    reflecting investors' confidence in its ability to navigate challenges 
    in the Software industry."""

# Paragraph 3
third_paragraph = """On a mixed trading day, Par Pacific Holdings (PARR) saw its stock rise by 1.5%, 
    fueled by optimistic forecasts in the Oil, Gas & Consumable Fuels sector. 
    In contrast, Nano Dimension (NNDM) faced a decline of 2.3% 
    following its announcement of increased costs related to supply chain disruptions.

    Meanwhile, Beyond Meat (BYND) gained 0.8% after strong quarterly earnings, 
    outperforming its peers in the Food Products space. 
    Apollo Investment (AINV) also recorded a modest increase of 0.5%, 
    reflecting investors' confidence in its ability to navigate challenges 
    in the Capital Markets industry."""

# Collected in document order; this list is what the pipeline is run over.
paragraphs = [first_paragraph, second_paragraph, third_paragraph]


In [6]:

# 6. Run the pipeline over every paragraph and visualise the entities found.
# nlp.pipe processes the texts as a batch and yields one Doc per paragraph, in
# the same order as the input list. displacy is imported in the first cell
# together with the notebook's other dependencies.
for i, doc in enumerate(nlp.pipe(paragraphs), 1):
    print(f"Paragraph {i}")
    # style="ent" highlights the matched entity spans inline; jupyter=True
    # displays the markup directly in the notebook output.
    displacy.render(doc, style="ent", jupyter=True)

Paragraph 1


Paragraph 2


Paragraph 3
