In [1]:
import os
import pandas as pd
import re
import zipfile
from bs4 import BeautifulSoup
from tqdm import tqdm
import csv
from requests_html import HTMLSession
from time import sleep
import numpy as np
from near_regex import *

# Loading Sentiment Words

In [4]:
import csv

def load_sentiment_words():
    """Load the LM and ML sentiment word lists from files in the working directory.

    Files read:
      * ``ML_negative_unigram.txt`` / ``ML_positive_unigram.txt`` - one entry
        per line; surrounding whitespace is stripped and blank lines skipped.
      * ``LM_MasterDictionary_1993-2021.csv`` - the first row is treated as a
        header. A word (first column) is negative when the second column
        equals ``'2009'`` and positive when the third column equals ``'2009'``.
        Rows with any other value in those columns are left out.

    NOTE(review): only the exact value '2009' is matched; if the CSV marks
    words with other non-zero values they are excluded - confirm this is
    intended.

    Returns:
        dict with keys ``'LM_positive'``, ``'LM_negative'``, ``'ML_positive'``
        and ``'ML_negative'``, each mapping to a list of words in file order.

    Raises:
        FileNotFoundError: if any of the three input files is missing.
    """

    def _read_word_file(path):
        # Same encoding for both ML files so decoding does not depend on the
        # platform default; blank lines would otherwise become '' entries.
        with open(path, "r", encoding="utf-8") as f:
            return [line.strip() for line in f.read().splitlines() if line.strip()]

    ML_negative = _read_word_file('ML_negative_unigram.txt')
    ML_positive = _read_word_file("ML_positive_unigram.txt")

    LM_negative = []
    LM_positive = []
    # Single pass over the CSV fills both LM lists. newline='' is what the csv
    # module expects for files handed to csv.reader.
    with open("LM_MasterDictionary_1993-2021.csv", "r", encoding="utf-8", newline="") as f:
        reader = csv.reader(f)
        next(reader, None)  # Skip header (tolerates an empty file)
        for row in reader:
            if len(row) < 3:
                # Blank or truncated row: nothing to classify.
                continue
            if row[1] == '2009':
                LM_negative.append(row[0])
            if row[2] == '2009':
                LM_positive.append(row[0])

    # Dictionary containing all sentiment words
    return {
        'LM_positive': LM_positive,
        'LM_negative': LM_negative,
        'ML_positive': ML_positive,
        'ML_negative': ML_negative
    }


In [6]:
# Sanity check: call the loader so the notebook displays the returned dictionary of word lists.
load_sentiment_words()

{'LM_positive': ['ABLE',
  'ABUNDANCE',
  'ABUNDANT',
  'ACCLAIMED',
  'ACCOMPLISH',
  'ACCOMPLISHED',
  'ACCOMPLISHES',
  'ACCOMPLISHING',
  'ACCOMPLISHMENT',
  'ACCOMPLISHMENTS',
  'ACHIEVE',
  'ACHIEVED',
  'ACHIEVEMENT',
  'ACHIEVEMENTS',
  'ACHIEVES',
  'ACHIEVING',
  'ADEQUATELY',
  'ADVANCEMENT',
  'ADVANCEMENTS',
  'ADVANCES',
  'ADVANCING',
  'ADVANTAGE',
  'ADVANTAGED',
  'ADVANTAGEOUS',
  'ADVANTAGEOUSLY',
  'ADVANTAGES',
  'ALLIANCE',
  'ALLIANCES',
  'ASSURE',
  'ASSURED',
  'ASSURES',
  'ASSURING',
  'ATTAIN',
  'ATTAINED',
  'ATTAINING',
  'ATTAINMENT',
  'ATTAINMENTS',
  'ATTAINS',
  'ATTRACTIVE',
  'ATTRACTIVENESS',
  'BEAUTIFUL',
  'BEAUTIFULLY',
  'BENEFICIALLY',
  'BENEFITED',
  'BENEFITING',
  'BENEFITTED',
  'BENEFITTING',
  'BETTER',
  'BOLSTERED',
  'BOLSTERING',
  'BOLSTERS',
  'BOOM',
  'BOOMING',
  'BOOST',
  'BOOSTED',
  'BREAKTHROUGH',
  'BREAKTHROUGHS',
  'BRILLIANT',
  'CHARITABLE',
  'COLLABORATE',
  'COLLABORATED',
  'COLLABORATES',
  'COLLABORATING',
 

# ML and LM Word Count

In [9]:
# Report how many words each of the four sentiment lists contains.
sentiment_dicts = load_sentiment_words()
print(
    *(
        f"{label} {len(sentiment_dicts[key])}"
        for label, key in (
            ("LM Positive:", 'LM_positive'),
            ("LM Negative:", 'LM_negative'),
            ("ML Positive:", 'ML_positive'),
            ("ML Negative:", 'ML_negative'),
        )
    ),
    sep="\n",
)

LM Positive: 345
LM Negative: 2305
ML Positive: 75
ML Negative: 94


# Text Cleaning

In [12]:
def clean_html_text(text):
    """Tidy text extracted from HTML before sentiment words are counted.

    Steps, in order:
      1. drop URLs (``http://`` / ``https://`` up to the next whitespace);
      2. delete zero-width spaces, the trademark / registered symbols and the
         crossed-box character;
      3. replace runs of newlines, carriage returns, tabs, digits, hyphens and
         bullet characters with a single space;
      4. squeeze repeated commas into one;
      5. collapse every run of whitespace into one space and trim the ends.

    Whitespace is collapsed last so that characters deleted in the earlier
    steps cannot leave double spaces behind.

    Args:
        text: the raw text to clean.

    Returns:
        The cleaned, stripped string.
    """
    text = re.sub(r'http[s]?://\S+', '', text)  # Remove URLs
    text = text.replace('\u200b', '')  # Remove zero-width space characters
    text = re.sub(r'[\u2122\u00AE]', '', text)  # Remove trademark symbols
    text = text.replace('⌧', '')  # Remove crossed-box characters
    text = re.sub(r'[\n\r\t0-9\-●]+', ' ', text)  # Line breaks, tabs, digits, hyphens, bullets -> space
    text = re.sub(r',+', ',', text)  # Repeated commas -> a single comma
    text = re.sub(r'\s+', ' ', text)  # Collapse whitespace runs (done last on purpose)
    return text.strip()


# Calculating Sentiment

In [15]:
def calculate_sentiment(text, positive_words, negative_words):
    """Count whole-word, case-insensitive occurrences of sentiment words in text.

    Args:
        text: the document text.
        positive_words: iterable of positive entries. Each entry is either a
            plain string (the word itself) or a tuple/list whose first element
            is the word.
        negative_words: iterable of negative entries, same format.

    Returns:
        ``(sentiment_positive, sentiment_negative)`` - the number of tokens in
        ``text`` that appear in the positive / negative list respectively.
        Duplicate entries in a list are counted once.
    """
    # Tokenise once; lower-casing both sides makes 'LOSS' match 'loss'.
    tokens = re.findall(r"\w+", text.lower())

    def _count_hits(entries):
        # Build a lookup set of normalised words for O(1) membership tests.
        vocabulary = set()
        for entry in entries:
            # Indexing a plain string with [0] would yield only its first
            # character, so strings are used as-is and only tuple/list
            # entries are unpacked.
            term = entry if isinstance(entry, str) else entry[0]
            term = str(term).strip().lower()
            if term:
                vocabulary.add(term)
        return sum(1 for token in tokens if token in vocabulary)

    sentiment_positive = _count_hits(positive_words)
    sentiment_negative = _count_hits(negative_words)
    return sentiment_positive, sentiment_negative


# Contextual Sentiment

In [18]:
# Measure contextual sentiment for specific topics
def calculate_contextual_sentiment(text, topic_positive_words, topic_negative_words, max_score=10):
    """Score a document by how often topic-specific words appear in it.

    Matching is case-insensitive and on whole words only.

    Args:
        text: the document text.
        topic_positive_words: list of positive words for the topic.
        topic_negative_words: list of negative words for the topic.
        max_score: upper cap applied to each score (default 10).

    Returns:
        ``(positive_score, negative_score)``, each the number of matches
        capped at ``max_score``; the range is therefore 0 to ``max_score``.
    """
    lowered = text.lower()

    def _capped_hits(words):
        # re.escape keeps words containing regex metacharacters literal.
        terms = [re.escape(word.lower()) for word in words if word]
        if not terms:
            # An empty alternation would match at every word boundary and
            # report a bogus non-zero score.
            return 0
        pattern = r'\b(?:' + '|'.join(terms) + r')\b'
        return min(max_score, len(re.findall(pattern, lowered)))

    positive_score = _capped_hits(topic_positive_words)
    negative_score = _capped_hits(topic_negative_words)

    return positive_score, negative_score


# Processing 10-K Files

In [21]:
def process_10k_files(zipfile_path, sentiment_dictionaries):
    """Score every ``.html`` member of a zip archive and return one row per file.

    For each member whose name ends with ".html" the function decodes it as
    UTF-8, extracts the text with BeautifulSoup, cleans it with
    ``clean_html_text`` and then computes:
      * LM and ML positive / negative counts via ``calculate_sentiment``;
      * the positive share of each dictionary (positive / (positive + negative),
        or 0 when there are no hits);
      * positive / negative scores for three topics (financial, geopolitical,
        competitive) via ``calculate_contextual_sentiment``.

    Args:
        zipfile_path: path to the zip archive.
        sentiment_dictionaries: dict with the keys 'LM_positive',
            'LM_negative', 'ML_positive' and 'ML_negative'.

    Returns:
        pandas.DataFrame with one row per processed file.
    """
    lm_pos_words = sentiment_dictionaries['LM_positive']
    lm_neg_words = sentiment_dictionaries['LM_negative']
    ml_pos_words = sentiment_dictionaries['ML_positive']
    ml_neg_words = sentiment_dictionaries['ML_negative']

    # Topic word lists as (positive words, negative words) pairs.
    financial_words = (
        ['growth', 'outperformance', 'exceeded', 'record'],
        ['underperformance', 'declined', 'weakness', 'loss'],
    )
    geopolitical_words = (
        ['favorable', 'momentum'],
        ['unexpected', 'challenging'],
    )
    competitive_words = (
        ['strength', 'solid'],
        ['pressures', 'challenges'],
    )

    def _positive_share(pos_count, neg_count):
        # Share of positive hits among all hits; 0 when nothing matched.
        combined = pos_count + neg_count
        if combined != 0:
            return pos_count / combined
        return 0

    rows = []
    with zipfile.ZipFile(zipfile_path, 'r') as archive:
        members = archive.namelist()

        for member in tqdm(members, desc="Processing 10-K files"):
            if member.endswith(".html"):
                with archive.open(member) as handle:
                    raw_html = handle.read().decode(encoding="utf-8")

                soup = BeautifulSoup(raw_html, "html.parser")
                document = clean_html_text(soup.get_text())

                # Dictionary-based counts
                lm_pos, lm_neg = calculate_sentiment(document, lm_pos_words, lm_neg_words)
                ml_pos, ml_neg = calculate_sentiment(document, ml_pos_words, ml_neg_words)

                # Topic scores
                fin_pos, fin_neg = calculate_contextual_sentiment(document, *financial_words)
                geo_pos, geo_neg = calculate_contextual_sentiment(document, *geopolitical_words)
                comp_pos, comp_neg = calculate_contextual_sentiment(document, *competitive_words)

                row = {
                    'Firm': member,
                    'LM_Positive': lm_pos,
                    'LM_Negative': lm_neg,
                    'ML_Positive': ml_pos,
                    'ML_Negative': ml_neg,
                    'Financial_Positive_Score': fin_pos,
                    'Financial_Negative_Score': fin_neg,
                    'Geopolitical_Positive_Score': geo_pos,
                    'Geopolitical_Negative_Score': geo_neg,
                    'Competitive_Positive_Score': comp_pos,
                    'Competitive_Negative_Score': comp_neg,
                    'LM_Positive_Ratio': _positive_share(lm_pos, lm_neg),
                    'ML_Positive_Ratio': _positive_share(ml_pos, ml_neg),
                }
                rows.append(row)

    return pd.DataFrame(rows)



# Simulating Stock Price Returns

In [24]:
def simulate_fake_returns(df):
    """Attach three columns of placeholder returns to ``df`` and return it.

    The global NumPy random generator is seeded with 42, then each column is
    filled with ``len(df)`` draws from a normal distribution with mean 0 and
    standard deviation 1. ``df`` is modified in place and also returned.
    """
    np.random.seed(42)
    n_rows = len(df)
    # Column order fixes the order of the random draws:
    # filing day, day t to t+2, day t+3 to t+10.
    for column in ('ret_t0', 'ret_t0_t2', 'ret_t2_t10'):
        df[column] = np.random.normal(loc=0, scale=1, size=n_rows)
    return df


# Output

In [27]:
def main():
    """Run the pipeline end to end and write the result to CSV.

    Loads the sentiment word lists, scores every 10-K in ``10k_files.zip``,
    appends the simulated return columns and saves the table to
    ``output/analysis_sample.csv`` (without the index).
    """
    zipfile_path = '10k_files.zip'
    output_path = 'output/analysis_sample.csv'
    sentiment_dictionaries = load_sentiment_words()

    sentiment_df = process_10k_files(zipfile_path, sentiment_dictionaries)

    sentiment_df = simulate_fake_returns(sentiment_df)

    # to_csv does not create missing directories; make sure the target folder
    # exists so a long run is not lost at the final write.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    sentiment_df.to_csv(output_path, index=False)

if __name__ == "__main__":
    main()



Processing 10-K files: 100%|██████████| 1993/1993 [17:41<00:00,  1.88it/s]


# Failed Attempt

In [None]:
#Failed attempt at the 10-k filing data
#import requests
#from requests_html import HTMLSession
#import time

# Function to get filing date from SEC EDGAR using CIK and Accession Number
#def get_filing_date(cik, accession_number):
    #session = HTMLSession()
   #url = f'https://www.sec.gov/Archives/edgar/data/{cik}/{accession_number}/{accession_number}-index.html'
    
    #try:
        #r = session.get(url)
        #time.sleep(5)  # Adding sleep to avoid rate limiting by SEC
        #if r.status_code == 200:
            # Find the filing date using CSS selector. You can inspect the page to get the correct CSS selector for the filing date.
            #filing_date = r.html.find('#contentDiv > div:nth-child(1) > div.formContent > div:nth-child(1) > div:nth-child(2)', first=True)
            #if filing_date:
                #return filing_date.text.strip()  # Clean the text and return
           # else:
                #print(f"Filing date not found for CIK {cik} and Accession Number {accession_number}")
                #return None
        #else:
            #print(f"Failed to fetch page for CIK {cik}, Status code: {r.status_code}")
            #return None
    #except Exception as e:
       #print(f"Error fetching filing date for CIK {cik}, Accession Number {accession_number}: {e}")
        #return None

# Function to get all filing dates for a list of companies
#def get_all_filing_dates(cik_list, accession_number_list):
    #filing_dates = []
    #for cik, accession_number in zip(cik_list, accession_number_list):
        #filing_date = get_filing_date(cik, accession_number)
        #filing_dates.append({
            #'CIK': cik,
            #'Accession_Number': accession_number,
            #'Filing_Date': filing_date
        #})
    
    #return filing_dates

# Example CIKs and Accession Numbers (replace with actual data)
#cik_list = ['1800', '2488']  # List of CIKs
#accession_number_list = ['0001104659-22-025141', '0001193125-19-071675']  # Corresponding Accession Numbers

# Get filing dates
#filing_dates = get_all_filing_dates(cik_list, accession_number_list)

# Print or save results
#for filing in filing_dates:
    #print(filing)