In [None]:
# Importing the necessary libraries
import pandas as pd
import spacy
from spacy.pipeline import EntityRuler 

In [7]:
# Read the tab-separated stock listing into a DataFrame.
# NOTE(review): pandas' default NA parsing turns strings such as "NA" or
# "NULL" into missing values, so a company or ticker spelled that way would be
# removed by the later dropna() step -- confirm the dataset contains none.
df = pd.read_csv(
    "stocks-1.tsv",
    sep="\t",
)

In [None]:
# For each of the two columns, drop missing entries and keep the distinct
# values (in order of first appearance). Company names come first, ticker
# symbols second, matching the unpacking order on the left-hand side.
unique_companies, unique_symbols = (
    df[column].dropna().unique() for column in ("CompanyName", "Symbol")
)

In [None]:
# Build one entity-ruler pattern dict per distinct value. A plain string
# "pattern" is matched as an exact phrase; "label" is the entity type that the
# match will be tagged with.
company_patterns = [
    dict(label="COMPANY", pattern=company) for company in unique_companies
]
symbol_patterns = [
    dict(label="STOCK", pattern=ticker) for ticker in unique_symbols
]

In [15]:
# Start from an empty English pipeline (tokenizer only, no trained model) and
# attach a rule-based entity ruler as its single component.
nlp = spacy.blank("en")
ruler = nlp.add_pipe("entity_ruler")

# Register the company-name patterns followed by the ticker-symbol patterns.
ruler.add_patterns([*company_patterns, *symbol_patterns])

In [None]:
# Sample texts used to exercise the entity ruler.
# The list holds three strings; each string contains two prose paragraphs
# separated by a blank line and mentions four companies, each followed by its
# ticker symbol in parentheses. The indentation of the second paragraph is
# inside the triple-quoted literal and is therefore part of the text itself.
paragraphs = [
    """Helmerich & Payne (HP) saw its stock rise by 1.5%, fueled by optimistic forecasts in the Energy Equipment & Services sector. In contrast, Check-Cap (CHEK) faced a decline of 2.3% following its announcement of increased costs related to supply chain disruptions.

    Meanwhile, Vallon Pharmaceuticals (VLON) gained 0.8% after strong quarterly earnings, outperforming its peers in the Biotechnology space. Sequans Communications (SQNS) also recorded a modest increase of 0.5%, reflecting investors' confidence in its ability to navigate challenges in the Semiconductors & Semiconductor Equipment industry.""",

    """Aemetis (AMTX) saw its stock rise by 1.5%, fueled by optimistic forecasts in the Oil, Gas & Consumable Fuels sector. In contrast, Ferro Corporation (FOE) faced a decline of 2.3% following its announcement of increased costs related to supply chain disruptions.

    Meanwhile, RingCentral (RNG) gained 0.8% after strong quarterly earnings, outperforming its peers in the Software space. ACI Worldwide (ACIW) also recorded a modest increase of 0.5%, reflecting investors' confidence in its ability to navigate challenges in the Software industry.""",

    """On a mixed trading day, Par Pacific Holdings (PARR) saw its stock rise by 1.5%, fueled by optimistic forecasts in the Oil, Gas & Consumable Fuels sector. In contrast, Nano Dimension (NNDM) faced a decline of 2.3% following its announcement of increased costs related to supply chain disruptions.

    Meanwhile, Beyond Meat (BYND) gained 0.8% after strong quarterly earnings, outperforming its peers in the Food Products space. Apollo Investment (AINV) also recorded a modest increase of 0.5%, reflecting investors' confidence in its ability to navigate challenges in the Capital Markets industry."""
]
# This cell only defines the sample texts; the loop in the following cell runs
# each one through the pipeline and prints the entities it finds.

In [18]:
# Run every sample text through the pipeline and report what the ruler found.
# Paragraph numbering starts at 1 for display purposes.
for i, paragraph in enumerate(paragraphs, start=1):
    doc = nlp(paragraph)  # tokenise the text and apply the entity ruler

    # One "text (LABEL)" line per recognised entity, in document order.
    entity_lines = [f"{ent.text} ({ent.label_})" for ent in doc.ents]

    # Header (preceded by a blank line) followed by the entity lines, each on
    # its own row -- the same stream separate print() calls would produce.
    print(f"\nEntities in Paragraph {i}:", *entity_lines, sep="\n")


Entities in Paragraph 1:
Helmerich & Payne (COMPANY)
HP (STOCK)
Check-Cap (COMPANY)
CHEK (STOCK)
Vallon Pharmaceuticals (COMPANY)
VLON (STOCK)
Sequans Communications (COMPANY)
SQNS (STOCK)

Entities in Paragraph 2:
Aemetis (COMPANY)
AMTX (STOCK)
Ferro Corporation (COMPANY)
FOE (STOCK)
RingCentral (COMPANY)
RNG (STOCK)
ACI Worldwide (COMPANY)
ACIW (STOCK)

Entities in Paragraph 3:
Par Pacific Holdings (COMPANY)
PARR (STOCK)
Nano Dimension (COMPANY)
NNDM (STOCK)
Beyond Meat (COMPANY)
BYND (STOCK)
Apollo Investment (COMPANY)
AINV (STOCK)
