# Entity Recognition

In this notebook, we try 2 different approahces for getting the stock name or ticker from each WSB post. 
1. Named-entity recognition (NER)
2. REGEX

### Named-entity recognition

In [4]:
from transformers import pipeline
from collections import Counter
import pandas as pd
import numpy as np

# Read DF
df = pd.read_csv("cleaned_unlabeled.csv").text_cleaned

# Initialize the NER pipeline (using BERT)
ner_pipeline = pipeline("ner", model="dslim/bert-base-NER")

# Reconstruct and clean entities
def reconstruct_entities(ner_results):
    entities = []
    current_entity = {"entity": None, "text": "", "start": None, "end": None}
    
    for token in ner_results:
        if token["entity"].startswith("B-"):  
            if current_entity["text"]:  
                entities.append(current_entity)
            # Start a new entity
            current_entity = {
                "entity": token["entity"][2:],  
                "text": token["word"].replace("##", ""),  
                "start": token["start"],
                "end": token["end"],
            }
        elif token["entity"].startswith("I-") and current_entity["entity"]: 
            current_entity["text"] += token["word"].replace("##", "")  
            current_entity["end"] = token["end"] 

    # Append the last entity
    if current_entity["text"]:
        entities.append(current_entity)
    
    return entities

# Extract and clean entities from the sentences
all_entities = []
for sentence in df:
    # Skip empty values
    if pd.isna(sentence) or sentence == "":
        continue
        
    try:
        ner_results = ner_pipeline(sentence)
        reconstructed = reconstruct_entities(ner_results)
        # Clean and normalize the text of each entity
        for entity in reconstructed:
            cleaned_text = entity["text"].strip().upper() 
            all_entities.append(cleaned_text)
    except Exception as e:
        print(f"Error processing sentence: {sentence}")
        print(f"Error: {e}")
        continue

# Count occurrences of each entity
entity_counts = Counter(all_entities)

# Get the top 10 entities
top_10_entities = entity_counts.most_common(10)
print("\nTop 10 Entities:")
for entity, count in top_10_entities:
    print(f"{entity}: {count}")

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0



Top 10 Entities:
GM: 1706
AMC: 699
GME: 361
U: 74
G: 60
UMC: 58
AM: 55
T: 53
O: 50
K: 43


### REGEX

In [16]:
import pandas as pd
import re
from tqdm import tqdm

def find_company_mentions(df, companies_list):
    """
    Scan text for company names and tickers.
    """
    # Create patterns for companies
    company_patterns = {}
    for company, ticker in companies_list:
        patterns = [
            rf'\b{re.escape(company)}\b',  # Full company name
            rf'\${ticker}\b',              # $TICKER
            rf'\b{ticker}\b'               # TICKER without $
        ]
        company_patterns[(company, ticker)] = re.compile('|'.join(patterns), re.IGNORECASE)
    
    # Process each row
    results = []
    for idx, row in tqdm(df.iterrows(), total=len(df)):
        text = str(row['text_cleaned'])
        
        # Find all mentioned companies in this text
        mentioned_companies = []
        for (company, ticker), pattern in company_patterns.items():
            if pattern.search(text):
                mentioned_companies.append(ticker)
        
        # Create row with original text and mentioned companies
        row_dict = {
            'original_text': text,
            'mentioned_companies': ','.join(mentioned_companies) if mentioned_companies else ''
        }
        results.append(row_dict)
    
    # Create new DataFrame
    result_df = pd.DataFrame(results)
    
    # Count total mentions for each company
    all_mentions = []
    for row in result_df['mentioned_companies']:
        if row:
            all_mentions.extend(row.split(','))
    mention_counts = pd.Series(all_mentions).value_counts()
    
    print("\nTotal mentions per company:")
    print(mention_counts)
    
    return result_df

# Top 30 companies:
wallstreetbets_companies = [
    ("GameStop", "GME"),
    ("AMC Entertainment", "AMC"),
    ("Tesla", "TSLA"),
    ("NVIDIA", "NVDA"),
    ("AMD", "AMD"),
    ("Apple", "AAPL"),
    ("Microsoft", "MSFT"),
    ("Palantir", "PLTR"),
    ("BlackBerry", "BB"),
    ("Bed Bath & Beyond", "BBBY"),
    ("Virgin Galactic", "SPCE"),
    ("Ford", "F"),
    ("Meta", "META"),
    ("Amazon", "AMZN"),
    ("Alphabet", "GOOGL"),
    ("Rivian", "RIVN"),
    ("Lucid", "LCID"),
    ("SoFi", "SOFI"),
    ("Snapchat", "SNAP"),
    ("Robinhood", "HOOD"),
    ("Coinbase", "COIN"),
    ("Moderna", "MRNA"),
    ("Pfizer", "PFE"),
    ("Zoom", "ZM"),
    ("DraftKings", "DKNG"),
    ("Tilray", "TLRY"),
    ("Sundial Growers", "SNDL"),
    ("NIO", "NIO"),
    ("Plug Power", "PLUG"),
    ("ChargePoint", "CHPT")
]

# Read your CSV
df = pd.read_csv("cleaned_unlabeled.csv")

# Find mentions
result_df = find_company_mentions(df, wallstreetbets_companies)

# Save the results to a CSV file
result_df.to_csv('company_mentions.csv', index=False)



100%|██████████| 51216/51216 [00:24<00:00, 2078.88it/s]



Total mentions per company:
GME      13717
AMC       4872
HOOD      4763
BB        2119
TSLA      1023
PLTR       873
AMZN       520
AAPL       505
SNDL       380
F          372
AMD        347
SPCE       313
MSFT       227
NIO        225
TLRY       194
SOFI       191
NVDA       145
COIN       135
DKNG       119
BBBY       111
PLUG       101
MRNA        97
SNAP        92
PFE         84
ZM          81
GOOGL       39
LCID        23
META        18
RIVN        17
CHPT        16
Name: count, dtype: int64


In [15]:
# See new DF
result_df


Unnamed: 0,original_text,mentioned_companies
0,its not about the money its about sending a me...,
1,math professor scott steiner says the numbers ...,GME
2,exit the system the ceo of nasdaq pushed to ha...,GME
3,new sec filing for gme! can someone less retar...,GME
4,not to distract from gme just thought our amc ...,"GME,AMC"
...,...,...
51211,what i learned investigating sava fud spreader...,
51212,daily popular tickers thread for august 02 202...,"GME,AMD"
51213,hitler reacts to the market being irrational,
51214,daily discussion thread for august 02 2021 you...,
