## Setup

In [34]:
# import fnmatch
import glob
import os
import re
# from time import sleep

# Load 2022 returns
from io import BytesIO
from urllib.request import urlopen
from zipfile import ZipFile

# Identify ourselves when requesting EDGAR pages
from urllib.request import Request

# import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm

from near_regex import NEAR_regex

In [35]:
# Paths and file handling
input_dir = 'inputs'
output_dir = 'output'
# Both folders are written to below (the S&P 500 table is cached in input_dir),
# so make sure they exist before any cell tries to save into them.
os.makedirs(input_dir, exist_ok=True)
os.makedirs(output_dir, exist_ok=True)

# Inputs
topic_path = input_dir + '/topic_list.csv'
sp500_path = input_dir + '/s&p500_2022.csv'
firm_10k_path = "10k_files/sec-edgar-filings"

# Outputs
sentiment_save_path = output_dir + '/ticker_sentiments.csv'
# Used by the returns cell when it saves `final_return`; it was previously undefined.
returns_save_path = output_dir + '/ticker_returns.csv'

## Load 2022 returns

In [5]:
# Download the zipped CRSP file into memory, then read the first archive
# member as a Stata dataset.
url = "https://github.com/LeDataSciFi/data/raw/main/Stock%20Returns%20(CRSP)/crsp_2022_only.zip"
with urlopen(url) as response:
    zip_bytes = response.read()
data = BytesIO(zip_bytes)

with ZipFile(data) as archive:
    first_member = archive.namelist()[0]
    with archive.open(first_member) as stata_file:
        stock_rets = pd.read_stata(stata_file)

stock_rets

Unnamed: 0,ticker,date,ret
0,JJSF,2021-12-01,-0.011276
1,JJSF,2021-12-02,0.030954
2,JJSF,2021-12-03,0.000287
3,JJSF,2021-12-06,0.014362
4,JJSF,2021-12-07,0.012459
...,...,...,...
2594044,TSLA,2022-12-23,-0.017551
2594045,TSLA,2022-12-27,-0.114089
2594046,TSLA,2022-12-28,0.033089
2594047,TSLA,2022-12-29,0.080827


In [None]:
# TODO: load the firm table once near the top so the returns and sentiment
# sections share it instead of each re-reading it.
# Build the firm table; the Wikipedia constituents list is cached on first use.
sp500_is_cached = os.path.exists(sp500_path)
if not sp500_is_cached:
    url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
    wiki_tables = pd.read_html(url)
    # Table [0] is the constituents list; [1] shows updates
    wiki_tables[0].to_csv(sp500_path, index=False)

sp500_full = pd.read_csv(sp500_path)
sp500 = sp500_full[['Symbol', 'Security', 'CIK']]

In [None]:
# Get the filing date for each 10-K from the filing's EDGAR index page.
#
# Uses urlopen + BeautifulSoup (both imported at the top) rather than
# requests_html's HTMLSession, which this notebook never imports.
# NOTE(review): SEC asks automated clients to send an identifying User-Agent;
# set the SEC_USER_AGENT environment variable to "<name> <email>" before running.
sec_headers = {
    'User-Agent': os.environ.get('SEC_USER_AGENT', 'research-notebook admin@example.com')
}

# Collected per row label and written back in one step after the loop,
# so sp500 is not mutated while it is being iterated.
filing_dates = {}

for i, tic, cik in tqdm(zip(sp500.index, sp500['Symbol'], sp500['CIK']), total=len(sp500)):
    filing_dir = f'{firm_10k_path}/{tic}/10-K/'

    if not os.path.exists(filing_dir):
        print(f'Error finding accession number for {tic}')
        continue

    # Each sub-folder is named after an accession number; skip hidden files
    # (e.g. .DS_Store) and take the first name in sorted order.
    accessions = sorted(name for name in os.listdir(filing_dir) if not name.startswith('.'))
    if not accessions:
        print(f'Error finding accession number for {tic}')
        continue
    accession = accessions[0]

    url = f'https://www.sec.gov/Archives/edgar/data/{cik}/{accession}-index.html'
    try:
        with urlopen(Request(url, headers=sec_headers)) as response:
            index_page = BeautifulSoup(response.read(), 'html.parser')
    except OSError as err:  # URLError/HTTPError are OSError subclasses
        print(f'Error downloading filing index for {tic}: {err}')
        continue

    # The first element with class "info" is read as the filing date,
    # as in the original lookup.
    info = index_page.find(class_='info')
    if info is None:
        print(f'Error finding filing date for {tic}')
        continue
    filing_dates[i] = info.get_text(strip=True)

# Rows without a date (missing folder, failed download) are left as NaN.
sp500['filing_date'] = pd.Series(filing_dates, dtype='object')

sp500

In [None]:
# Based on filing date, add return from t to t+2 and from t+3 to t+10.
# t is the first trading day on or after the filing date; both windows are
# inclusive. Returns are compounded per firm.
id_cols = ['Symbol', 'Security', 'CIK']

combined_return = sp500.merge(
        stock_rets.rename(columns={'ticker':'Symbol'}),
        on='Symbol',
        how='left',
        validate='1:m') # TODO: include more?

# filing_date is scraped as text, so parse both sides and compare real dates.
combined_return['filing_date'] = pd.to_datetime(combined_return['filing_date'])
combined_return['date'] = pd.to_datetime(combined_return['date'])

# Keep each firm's first 11 trading days from the filing date, in date order.
# Sorting first makes head(11) independent of the row order in stock_rets.
combined_return = (
    combined_return[combined_return['filing_date'] <= combined_return['date']]
    .sort_values(['Symbol', 'date'])
    .groupby('Symbol')
    .head(11)
    .copy()
)
combined_return['agg_ret'] = 1 + combined_return['ret']

# event_day counts trading days since t within each firm: 0 = t, 10 = t+10.
combined_return['event_day'] = combined_return.groupby('Symbol').cumcount()
in_short_window = combined_return['event_day'] <= 2   # t .. t+2
in_long_window = ~in_short_window                     # t+3 .. t+10

# Running compounded return inside each window, restarted for every firm.
# The previous version called cumprod() on the stacked rows of all firms,
# so one firm's returns leaked into the next firm's product.
combined_return['prod2'] = (
    combined_return['agg_ret'].where(in_short_window)
    .groupby(combined_return['Symbol']).cumprod() - 1
)
combined_return['prod10'] = (
    combined_return['agg_ret'].where(in_long_window)
    .groupby(combined_return['Symbol']).cumprod() - 1
)

# The value on a window's last day is that window's total return.
# Firms with fewer than 3 trading days are dropped; firms with fewer than 11
# keep their row but get a missing prod10.
final_return = combined_return.loc[
    combined_return['event_day'] == 2, id_cols + ['filing_date', 'prod2']]
final_return = final_return.merge(
        combined_return.loc[combined_return['event_day'] == 10, id_cols + ['prod10']],
        on=id_cols,
        validate='1:1',
        how='left') # TODO: include more?

final_return.to_csv(returns_save_path, index=False)

## Load sentiment dictionaries

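TODO: justify that the words flagged with positive values (`Negative > 0` / `Positive > 0`) in the 1993-2021 LM dictionary are still relevant for the 10-Ks analysed here.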

In [3]:
# ML Dictionaries: one word per line.
# Blank lines are skipped: an empty entry would become an empty alternative
# in the sentiment regex, which matches at every position in the text.
with open(f'{input_dir}/ML_negative_unigram.txt', 'r') as file:
    BHR_negative = [word for word in (line.strip() for line in file) if word]
with open(f'{input_dir}/ML_positive_unigram.txt', 'r') as file:
    BHR_positive = [word for word in (line.strip() for line in file) if word]

In [4]:
# LM Dictionaries
# Keep the words whose Negative / Positive column holds a value above zero.
LM = pd.read_csv('inputs/LM_MasterDictionary_1993-2021.csv')
LM_negative, LM_positive = (
    LM.loc[LM[flag] > 0, 'Word'].tolist() for flag in ('Negative', 'Positive')
)

## Load each firm and add sentiment variables

TODO: load and clean

For each firm:

- [ ] load the corresponding 10-K. Clean the text.

- [ ] Create the sentiment measurements, and save those new measurements to the correct row and column in the dataframe.

- [ ] Bonus: Save the total length of the document (# of words)

- [ ] Bonus: Save the # of unique words (similar to total length)

- [ ] Calculate returns from t to t+2 inclusive

- [ ] Calculate returns from t+3 to t+10 inclusive

- [ ] Download 2021 accounting data (2021 ccm_cleaned.dta) from the data repo (possibly useful in analysis) and add to dataset

In [6]:
# Gather sentiments into regex
def _word_list_regex(words):
    """Build one lowercase, whole-word alternation pattern from a word list.

    Each word is escaped and the group is wrapped in word boundaries, so a
    dictionary word is counted only when it appears as a complete token
    (e.g. "loss" no longer matches inside "glossary"). Non-string and empty
    entries are ignored. An empty list yields a pattern that never matches.
    """
    escaped = [
        re.escape(word.strip().lower())
        for word in words
        if isinstance(word, str) and word.strip()
    ]
    if not escaped:
        return r'(?!)'  # matches nothing, so the count is 0
    return r'\b(' + '|'.join(escaped) + r')\b'


BHR_negative_regex = _word_list_regex(BHR_negative)
BHR_positive_regex = _word_list_regex(BHR_positive)
LM_negative_regex = _word_list_regex(LM_negative)
LM_positive_regex = _word_list_regex(LM_positive)

In [7]:
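# # Topic regex (old hard-coded term lists, kept for reference; the next cell reads the terms from topic_list.csv instead)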
# def NEAR_regex_helper(topic_list, valence_list, max_words_between=5):
#     topic_regex = '(' + '|'.join(topic_list).lower() + ')'
#     valence_regex = '(' + '|'.join(valence_list).lower() + ')'
#     return NEAR_regex([topic_regex, valence_regex], max_words_between=max_words_between)

# # Socially responsible investing
# esg_topics = ['esg', 'sustainable', 'sustainability', 'impact invest',
#               'clean energy', 'gender', 'diversity', 'inclusion',
#               'microfinance', 'ethical', 'cdfi']
# esg_negative = ['limited', 'limit', 'underdeveloped', 'underdevelop',
#                 'bureaucratic', 'slow', 'insufficient']
# esg_positive = ['ethical', 'sustainable', 'profitable', 'profit',
#                 'innovative', 'innovation', 'transformative', 'transform']
# esg_negative_regex = NEAR_regex_helper(esg_topics, esg_negative)
# esg_positive_regex = NEAR_regex_helper(esg_topics, esg_positive)

# # Ecommerce
# ecom_topics = ['online', 'digital payment', 'logistics', 'delivery', 
#                'mobile commerce', 'social commerce', 'dropship',
#                'drop ship', 'social media']
# ecom_negative = ['risky', 'unsustainable',  'unsustained', 'monopoly',
#                  'monopolistic', 'unethical']
# ecom_positive = ['convenienent', 'convenienence', 'accessible', 'access',
#                  'innovative', 'innovation', 'profitable', 'profit',
#                  'efficient']
# ecom_negative_regex = NEAR_regex_helper(ecom_topics, ecom_negative)
# ecom_positive_regex = NEAR_regex_helper(ecom_topics, ecom_positive)

# # Biotech and healthcare
# bio_topics = ['gene', 'biopharm', 'telemedic', 'personalized medic',
#               'medical device', 'vaccine', 'precision medic', 'organ',
#               'regenerative medic', 'prosthetic', 'clinic', 'fda',
#               'health']
# bio_negative = ['risky', 'expensive', 'slow', 'controversial', 'unethical']
# bio_positive = ['new', 'safe', 'innovative', 'innovation',
#                 'transformative', 'transform', 'life', 'lives']
# bio_negative_regex = NEAR_regex_helper(bio_topics, bio_negative)
# bio_positive_regex = NEAR_regex_helper(bio_topics, bio_positive)

In [29]:
# Topic regex
def NEAR_regex_helper(topic_list, valence_list, max_words_between=5):
    """Build a NEAR_regex pattern pairing topic terms with valence terms.

    Each term collection is joined with '|', lowercased, and wrapped in
    parentheses; the two groups are then handed to NEAR_regex together with
    max_words_between. Returns whatever NEAR_regex returns.
    """
    groups = [
        f"({'|'.join(terms).lower()})"
        for terms in (topic_list, valence_list)
    ]
    return NEAR_regex(groups, max_words_between=max_words_between)

# Read in file
# Fail here if the topic list is missing. Printing and carrying on would leave
# every topic regex undefined (or stale from an earlier run) until the long
# sentiment loop hits it.
if not os.path.exists(topic_path):
    raise FileNotFoundError(f'Cannot find path {topic_path} to topic list')

topic_df = pd.read_csv(topic_path)


def _topic_terms(term_type):
    """Return the `term` values of topic_df whose `type` equals term_type.

    Missing terms are dropped. Raises ValueError when nothing is left, because
    an empty group would give a regex that matches regardless of the text.
    """
    terms = topic_df.loc[topic_df['type'] == term_type, 'term'].dropna()
    if terms.empty:
        raise ValueError(f'No terms of type {term_type!r} found in {topic_path}')
    return terms


# ESG
esg_topic = _topic_terms('esg_topic')
esg_negative = _topic_terms('esg_negative')
esg_positive = _topic_terms('esg_positive')

# Ecommerce
ecom_topic = _topic_terms('ecom_topic')
ecom_negative = _topic_terms('ecom_negative')
ecom_positive = _topic_terms('ecom_positive')

# Bio
bio_topic = _topic_terms('bio_topic')
bio_negative = _topic_terms('bio_negative')
bio_positive = _topic_terms('bio_positive')

# Generate regex
esg_negative_regex = NEAR_regex_helper(esg_topic, esg_negative)
esg_positive_regex = NEAR_regex_helper(esg_topic, esg_positive)
ecom_negative_regex = NEAR_regex_helper(ecom_topic, ecom_negative)
ecom_positive_regex = NEAR_regex_helper(ecom_topic, ecom_positive)
bio_negative_regex = NEAR_regex_helper(bio_topic, bio_negative)
bio_positive_regex = NEAR_regex_helper(bio_topic, bio_positive)

In [23]:
# esg_negative_regex
# # esg_positive_regex
# # ecom_negative_regex
# # ecom_positive_regex
# # bio_negative_regex
# # bio_positive_regex

'(?:\\b(esg|sustainable|sustained|sustainability|impact investing|impact invest|clean energy|clean investing|clean invest|diversity|equity|inclusion|microfinance|ethics|ethically|cdfi)\\b(?: +[^ \\n]*){0,5} *\\b(limit|limited|underdevelop|underdeveloped|bureaucratic|slow|slowly|insufficient|behind|poor|faulty|behind)\\b)|(?:\\b(limit|limited|underdevelop|underdeveloped|bureaucratic|slow|slowly|insufficient|behind|poor|faulty|behind)\\b(?: +[^ \\n]*){0,5} *\\b(esg|sustainable|sustained|sustainability|impact investing|impact invest|clean energy|clean investing|clean invest|diversity|equity|inclusion|microfinance|ethics|ethically|cdfi)\\b)'

In [8]:
# Reload the firm table for the sentiment pass, fetching and caching the
# Wikipedia constituents list only when no local copy exists.
if os.path.exists(sp500_path):
    pass  # cached copy is already on disk
else:
    url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
    constituents = pd.read_html(url)[0]  # [1] shows updates
    constituents.to_csv(sp500_path, index=False)

firm_columns = ['Symbol', 'Security', 'CIK']
sp500 = pd.read_csv(sp500_path)[firm_columns]

In [24]:
def add_sentiment(df, i, sentiment_name, search, text, word_count):
    """Store the per-word frequency of `search` matches in df.loc[i, sentiment_name].

    Parameters
    ----------
    df : DataFrame modified in place.
    i : row label in `df` to write to.
    sentiment_name : column to write (created if it does not exist).
    search : regex pattern passed to re.findall.
    text : document text to search.
    word_count : number of words in the document, used as the denominator.

    When word_count is 0 the value is set to NaN instead of raising
    ZeroDivisionError, so one empty document cannot abort a long run.
    """
    if word_count:
        score = len(re.findall(search, text)) / word_count
    else:
        score = float('nan')
    df.loc[i, sentiment_name] = score

In [None]:
# Output column -> regex whose match count (divided by the document's word
# count) is stored in that column. Insertion order fixes the column order of
# the saved CSV: valence dictionaries first, then topic valence.
sentiment_regexes = {
    'bhr_negative': BHR_negative_regex,
    'bhr_positive': BHR_positive_regex,
    'lm_negative': LM_negative_regex,
    'lm_positive': LM_positive_regex,
    'esg_negative': esg_negative_regex,
    'esg_positive': esg_positive_regex,
    'ecom_negative': ecom_negative_regex,
    'ecom_positive': ecom_positive_regex,
    'bio_negative': bio_negative_regex,
    'bio_positive': bio_positive_regex,
}

# Iterate over (row label, ticker) so every .loc write uses the same label
# the ticker was read from.
for i, tic in tqdm(sp500['Symbol'].items(), total=len(sp500)):
    ticker_dir = f'{firm_10k_path}/{tic}'

    # Check existence of path
    if not os.path.exists(ticker_dir):
        print(f'Cannot find 10-K for ticker {tic}')
        continue

    # If several HTML files match, later ones overwrite earlier ones.
    for path in glob.glob(f'{ticker_dir}/*/*/*.html'):
        # Open and clean the 10-K
        with open(path, 'rb') as report_file:
            html = report_file.read()
        # NOTE(review): 'lxml-xml' is an XML parser; confirm it is intended
        # for these .html filings rather than 'lxml' / 'html.parser'.
        soup = BeautifulSoup(html, 'lxml-xml')
        # Drop divs hidden with display:none before extracting text
        for div in soup.find_all("div", {'style': 'display:none'}):
            div.decompose()
        lower = soup.get_text().lower()
        no_punc = re.sub(r'\W', ' ', lower)
        cleaned = re.sub(r'\s+', ' ', no_punc)

        # Add word count and unique word count
        word_list = re.findall(r'\w+', cleaned)
        word_count = len(word_list)
        if word_count == 0:
            # Nothing to score; skipping avoids dividing by zero
            print(f'No text extracted from {path}; skipping')
            continue
        sp500.loc[i, 'unique_word_count'] = len(set(word_list))
        sp500.loc[i, 'word_count'] = word_count

        # Gather valence and topic valence variables
        for column, pattern in sentiment_regexes.items():
            add_sentiment(sp500, i, column, pattern, cleaned, word_count)

    # Save intermittently (once per processed firm, not once per file)
    if i % 25 == 0:
        sp500.to_csv(sentiment_save_path, index=False)

sp500.to_csv(sentiment_save_path, index=False)
sp500

 13%|████▊                                 | 64/503 [14:43<1:42:36, 14.02s/it]

Cannot find 10-K for ticker BRK.B


 16%|██████                                | 80/503 [18:04<1:37:50, 13.88s/it]

Cannot find 10-K for ticker BF.B


 34%|████████████▍                        | 169/503 [38:07<1:17:33, 13.93s/it]

Cannot find 10-K for ticker ELV


 39%|██████████████▍                      | 197/503 [45:40<1:35:45, 18.78s/it]

Cannot find 10-K for ticker FRC


 42%|███████████████▋                     | 213/503 [48:56<1:04:28, 13.34s/it]

Cannot find 10-K for ticker GEHC


 71%|██████████████████████████▏          | 356/503 [1:19:36<20:09,  8.23s/it]

## Add 10-K sentiment data

TODO: load and clean
