In [None]:
!git clone https://github.com/donbowen/Class-Notes-1045

In [None]:
import fnmatch
import glob
import os
import re
from time import sleep
from zipfile import ZipFile

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from near_regex import NEAR_regex  # copy this file into the asgn folder
from tqdm import tqdm  # progress bar on loops

# if you have tqdm issues, run this in terminal or with ! trick
# jupyter nbextension enable --py widgetsnbextension
# jupyter labextension install @jupyter-widgets/jupyterlab-manager
#
# if that fails, you can disable the progress bar by removing the tqdm(...) wrapper around the loops

os.makedirs("output", exist_ok=True)


def _read_word_list(path):
    """Return the non-blank, whitespace-stripped lines of a one-word-per-line file."""
    with open(path, 'r', encoding='utf-8') as fh:
        stripped = (line.strip() for line in fh)
        return [word for word in stripped if word]


# Load sentiment dictionaries.
# Both ML (BHR) unigram files are read the same way, as plain text. Reading
# them with pd.read_csv would turn tokens such as "null" or "na" into float
# NaN, and keeping blank lines would put empty strings into the word lists.
BHR_negative = _read_word_list('inputs/ML_negative_unigram.txt')
BHR_positive = _read_word_list('inputs/ML_positive_unigram.txt')
LM = pd.read_csv('inputs/LM_MasterDictionary_1993-2021.csv')
LM_negative = LM.query('Negative > 0')['Word'].to_list()
LM_positive = LM.query('Positive > 0')['Word'].to_list()

# Define topics for the contextual sentiment analysis:
# topic name -> words that signal the topic in the text
topics = {
    'competition': ['compete', 'competitor', 'competition'],
    'innovation': ['innovate', 'innovation'],
    'strategy': ['strategy'],
}

# Define helper functions
def get_sentiment(text, positive_words, negative_words):
    """Return the net sentiment of ``text`` per word.

    The score is ``(positive hits - negative hits) / number of words``, where
    the number of words is ``len(text.split())`` and a hit is one regex match
    of any dictionary word. Matching is done on lower-cased text against
    lower-cased dictionary words, so upper-case dictionaries still match.

    Args:
        text: Document text.
        positive_words: Iterable of positive dictionary words.
        negative_words: Iterable of negative dictionary words.

    Returns:
        The score as a float, or 0.0 when ``text`` contains no words.
    """
    n_words = len(text.split())
    if n_words == 0:
        # Nothing to score; also avoids dividing by zero below.
        return 0.0

    lowered = text.lower()

    def count_hits(words):
        cleaned = (str(w).strip().lower() for w in words)
        alternatives = '|'.join(w for w in cleaned if w)
        if not alternatives:
            # An empty group would match at every position.
            return 0
        # NOTE(review): NEAR_regex is assumed to treat each list element as a
        # separate term that must occur near the others (confirm in
        # near_regex.py). Passing every dictionary word as its own element
        # would then require all of them to co-occur, so the words are
        # combined into ONE alternation group: a match is any single word.
        return len(re.findall(NEAR_regex([f'({alternatives})']), lowered))

    return (count_hits(positive_words) - count_hits(negative_words)) / n_words

def get_contextual_sentiment(text, topics):
    """Return per-word rates of LM sentiment words found near topic words.

    For every topic two values are produced: the number of regex matches
    pairing a topic word with an ``LM_positive`` word (respectively an
    ``LM_negative`` word), each divided by ``len(text.split())``. Matching is
    done on lower-cased text against lower-cased words.

    Args:
        text: Document text.
        topics: Mapping of topic name to an iterable of topic words.

    Returns:
        Dict with keys ``'<topic>_positive'`` and ``'<topic>_negative'`` for
        each topic. All values are 0.0 when ``text`` contains no words.
    """
    tones = ('positive', 'negative')
    n_words = len(text.split())
    if n_words == 0:
        # Nothing to score; also avoids dividing by zero below.
        return {f'{topic}_{tone}': 0.0 for topic in topics for tone in tones}

    def as_group(words):
        # Combine a word list into one regex alternation group ('' if empty).
        cleaned = (str(w).strip().lower() for w in words)
        alternatives = '|'.join(w for w in cleaned if w)
        return f'({alternatives})' if alternatives else ''

    lowered = text.lower()
    # The sentiment groups do not depend on the topic, so build them once.
    sentiment_groups = dict(zip(tones, (as_group(LM_positive), as_group(LM_negative))))

    results = {}
    for topic, words in topics.items():
        topic_group = as_group(words)
        for tone in tones:
            sentiment_group = sentiment_groups[tone]
            if topic_group and sentiment_group:
                # NOTE(review): NEAR_regex is assumed to match its list
                # elements occurring near each other (confirm in
                # near_regex.py). Exactly two elements are passed: "any topic
                # word" and "any sentiment word".
                pattern = NEAR_regex([topic_group, sentiment_group])
                hits = len(re.findall(pattern, lowered))
            else:
                hits = 0
            results[f'{topic}_{tone}'] = hits / n_words
    return results

# Load sample firms: the entries of the download directory, one per firm
sample_firms = os.listdir('/content/sec-edgar-filings')

output_columns = [
    'Security', 'URL', 'Length', 'Unique Words',
    'LM Positive', 'LM Negative', 'ML Positive', 'ML Negative',
    'competition_positive', 'competition_negative',
    'innovation_positive', 'innovation_negative',
    'strategy_positive', 'strategy_negative',
]

# Build each dictionary regex once, outside the firm loop.
# NOTE(review): NEAR_regex is assumed to treat every list element as a separate
# term that must occur near the others (confirm in near_regex.py), so each
# dictionary is passed as ONE alternation group: a match is any single word.
# Words are lower-cased because the text is lower-cased before matching.
dictionary_regexes = {
    column: re.compile(NEAR_regex(
        ['(' + '|'.join(str(w).lower() for w in words if str(w).strip()) + ')']
    ))
    for column, words in [
        ('LM Positive', LM_positive),
        ('LM Negative', LM_negative),
        ('ML Positive', BHR_positive),
        ('ML Negative', BHR_negative),
    ]
}

# Rows are collected in a list and turned into a DataFrame once at the end.
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
rows = []

# Loop over sample firms
with ZipFile('10k_files/10k_files.zip', 'r') as zipfolder:
    # before the loop, get list of files in zipped folder
    file_list = zipfolder.namelist()
    for firm in tqdm(sample_firms, total=len(sample_firms)):
        # Archive members whose path contains this firm's folder name
        matching_files = fnmatch.filter(file_list, f"*{firm}*.txt")
        if not matching_files:
            continue
        file_name = matching_files[0]
        with zipfolder.open(file_name) as file:
            content = file.read()

        # Extract the text, drop non-alphabetic characters, collapse whitespace
        text = BeautifulSoup(content, 'html.parser').get_text()
        text = re.sub(r'[^a-zA-Z\s]+', '', text)
        text = ' '.join(text.split())
        words = text.split()
        lowered = text.lower()  # used for dictionary matching only

        # `firm` is a plain folder name, not a table row, so there is no
        # Security/URL field to look up. The folder name identifies the firm,
        # and the archive member path is recorded in 'URL' so the row can be
        # traced back to its source filing.
        record = {
            'Security': firm,
            'URL': file_name,
            'Length': len(words),
            'Unique Words': len(set(words)),
        }
        for column, regex in dictionary_regexes.items():
            record[column] = len(regex.findall(lowered))
        record.update(get_contextual_sentiment(lowered, topics))
        rows.append(record)

results = pd.DataFrame(rows, columns=output_columns)

# Save the results to a CSV file
results.to_csv('output/results.csv', index=False)


In [None]:
def _read_word_list(path):
    """Return the non-blank, whitespace-stripped lines of a one-word-per-line file."""
    with open(path, 'r', encoding='utf-8') as fh:
        stripped = (line.strip() for line in fh)
        return [word for word in stripped if word]


# Load the sentiment dictionaries.
# Both ML (BHR) unigram files are read the same way, as plain text. Reading
# them with pd.read_csv would turn tokens such as "null" or "na" into float
# NaN, and keeping blank lines would put empty strings into the word lists.
BHR_negative = _read_word_list('inputs/ML_negative_unigram.txt')
BHR_positive = _read_word_list('inputs/ML_positive_unigram.txt')
LM = pd.read_csv('inputs/LM_MasterDictionary_1993-2021.csv')
LM_negative = LM.query('Negative > 0')['Word'].to_list()
LM_positive = LM.query('Positive > 0')['Word'].to_list()

# Define lists of positive and negative words for each topic
topic1_positive = ['happy', 'joy', 'excellent', 'positive']
topic1_negative = ['sad', 'anger', 'terrible', 'negative']
topic2_positive = ['love', 'enjoy', 'fantastic', 'good']
topic2_negative = ['hate', 'dislike', 'worst', 'bad']
topic3_positive = ['excited', 'wonderful', 'amazing', 'great']
topic3_negative = ['nervous', 'boring', 'awful', 'horrible']

# Create a function to measure sentiment in a document using the LM and BHR dictionaries
def measure_sentiment(text, positive_words, negative_words):
    """Return the positive and negative shares of the sentiment words in ``text``.

    ``text`` is split on whitespace and each token is compared, ignoring case,
    with the two word collections. Tokens are matched whole, so punctuation
    attached to a token (``"good,"``) prevents a match; clean the text first.

    Args:
        text: Document text.
        positive_words: Iterable of positive dictionary words.
        negative_words: Iterable of negative dictionary words.

    Returns:
        Tuple ``(positive_share, negative_share)`` where each share is that
        tone's token count divided by the combined count. ``(0.0, 0.0)`` is
        returned when no token matches either collection.
    """
    # Sets give constant-time membership tests; the original scanned a list
    # once per token.
    positive = {str(w).lower() for w in positive_words}
    negative = {str(w).lower() for w in negative_words}
    tokens = text.lower().split()

    num_positive_words = sum(token in positive for token in tokens)
    num_negative_words = sum(token in negative for token in tokens)
    total = num_positive_words + num_negative_words
    if total == 0:
        # No sentiment words at all: report zero shares instead of raising
        # ZeroDivisionError.
        return 0.0, 0.0
    return num_positive_words / total, num_negative_words / total

# Create a function to measure contextual sentiment in a document
def measure_contextual_sentiment(text, topic_positive, topic_negative):
    """Return the positive and negative shares of a topic's word matches in ``text``.

    Each word list is combined into one regex alternation group, wrapped with
    ``NEAR_regex`` and counted with ``re.findall``. Matching is done on
    lower-cased text against lower-cased words.

    Args:
        text: Document text.
        topic_positive: Iterable of positive words for the topic.
        topic_negative: Iterable of negative words for the topic.

    Returns:
        Tuple ``(positive_share, negative_share)`` where each share is that
        list's match count divided by the combined count. ``(0.0, 0.0)`` is
        returned when nothing matches.
    """
    lowered = text.lower()

    def count_hits(words):
        cleaned = (str(w).strip().lower() for w in words)
        alternatives = '|'.join(w for w in cleaned if w)
        if not alternatives or not lowered.strip():
            # An empty group would match at every position; empty text has
            # nothing to match.
            return 0
        return len(re.findall(NEAR_regex(['(' + alternatives + ')']), lowered))

    num_positive_words = count_hits(topic_positive)
    num_negative_words = count_hits(topic_negative)
    total = num_positive_words + num_negative_words
    if total == 0:
        # No matches at all: report zero shares instead of raising
        # ZeroDivisionError.
        return 0.0, 0.0
    return num_positive_words / total, num_negative_words / total

# Load the initial dataset of sample firms and create the sentiment measurements
sample_firms = pd.read_csv('inputs/sample firms.csv')

# Lower-cased dictionary sets, built once before the loop. The filing text is
# lower-cased below, so the dictionaries must be too.
# NOTE(review): the LM master dictionary appears to list words in upper case,
# which would never match otherwise. Please confirm.
dictionary_sets = {
    'lm': ({str(w).lower() for w in LM_positive}, {str(w).lower() for w in LM_negative}),
    'bhr': ({str(w).lower() for w in BHR_positive}, {str(w).lower() for w in BHR_negative}),
}
topic_word_lists = {
    'topic1': (topic1_positive, topic1_negative),
    'topic2': (topic2_positive, topic2_negative),
    'topic3': (topic3_positive, topic3_negative),
}

for index, row in tqdm(sample_firms.iterrows(), total=sample_firms.shape[0]):
    # Load the corresponding 10-K from the firm's zip archive
    with ZipFile(f'10k_files/{row["ticker"]}.zip', 'r') as zipfolder:
        # Get a list of possible files for this firm
        firm_folder = f"sec-edgar-filings/{row['cik']}/10-K/*/*.htm*"
        possible_files = fnmatch.filter(zipfolder.namelist(), firm_folder)
        if not possible_files:
            continue
        fpath = possible_files[0]  # The first match is the path to the file
        with zipfolder.open(fpath) as report_file:
            html = report_file.read().decode(encoding="utf-8")

    # Clean the text: drop hidden divs, lower-case, replace punctuation with
    # spaces, collapse whitespace. measure_sentiment compares whole
    # whitespace-separated tokens, so "Good," would not match "good".
    soup = BeautifulSoup(html, features='lxml-xml')
    for div in soup.find_all("div", {'style': 'display:none'}):
        div.decompose()
    text = soup.get_text().lower()
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)

    # Measure sentiment and contextual sentiment
    measurements = {}
    for prefix, (positive, negative) in dictionary_sets.items():
        pos_share, neg_share = measure_sentiment(text, positive, negative)
        measurements[f'{prefix}_positive_sentiment'] = pos_share
        measurements[f'{prefix}_negative_sentiment'] = neg_share
    for prefix, (positive, negative) in topic_word_lists.items():
        pos_share, neg_share = measure_contextual_sentiment(text, positive, negative)
        measurements[f'{prefix}_positive_sentiment'] = pos_share
        measurements[f'{prefix}_negative_sentiment'] = neg_share

    # Save the sentiment measurements to the dataframe
    for column, value in measurements.items():
        sample_firms.loc[index, column] = value
