## Setup

In [1]:
import fnmatch
import glob
import os
import re
from time import sleep
from zipfile import ZipFile
from urllib.request import urlopen
from io import BytesIO
# import sys

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from near_regex import NEAR_regex  # copy this file into the asgn folder
from tqdm import tqdm  # progress bar on loops

In [2]:
# File handling
output_dir = 'output'
os.makedirs(output_dir, exist_ok=True)

## Load sentiment dictionaries

TODO: justify that positive values as of 2021 are relevant.

In [3]:
# ML Dictionaries
with open('inputs/ML_negative_unigram.txt', 'r') as file:
    BHR_negative = [line.strip() for line in file]
with open('inputs/ML_positive_unigram.txt', 'r') as file:
    BHR_positive = [line.strip() for line in file]

In [4]:
# LM Dictionaries
LM = pd.read_csv('inputs/LM_MasterDictionary_1993-2021.csv')
LM_negative = LM.query('Negative > 0')['Word'].to_list()
LM_positive = LM.query('Positive > 0')['Word'].to_list()

## Load 2022 returns

In [5]:
# stock_path = "inputs/crsp_2022.zip"

# if not os.path.exists(stock_path):
url = "https://github.com/LeDataSciFi/data/raw/main/Stock%20Returns%20(CRSP)/crsp_2022_only.zip"
with urlopen(url) as request:
    data = BytesIO(request.read())

with ZipFile(data) as archive:
    with archive.open(archive.namelist()[0]) as stata:
        stock_rets = pd.read_stata(stata)

stock_rets

Unnamed: 0,ticker,date,ret
0,JJSF,2021-12-01,-0.011276
1,JJSF,2021-12-02,0.030954
2,JJSF,2021-12-03,0.000287
3,JJSF,2021-12-06,0.014362
4,JJSF,2021-12-07,0.012459
...,...,...,...
2594044,TSLA,2022-12-23,-0.017551
2594045,TSLA,2022-12-27,-0.114089
2594046,TSLA,2022-12-28,0.033089
2594047,TSLA,2022-12-29,0.080827


## Load each firm and add sentiment variables

TODO: load and clean

For each firm, 

- [ ] load the corresponding 10-K. Clean the text.

- [ ] Create the sentiment measurements, and save those new measurements to the correct row and column in the dataframe.

- [ ] Bonus: Save the total length of the document (# of words)

- [ ] Bonus: Save the # of unique words (similar to total length)

- [ ] Calculate returns from t to t+2 inclusive

- [ ] Calculate returns from t+3 to t+10 inclusive

- [ ] Download 2021 accounting data (2021 ccm_cleaned.dta) from the data repo (possibly useful in analysis) and add to dataset

In [20]:
# Gather sentiments into regex
BHR_negative_regex = '(' + '|'.join(BHR_negative).lower() + ')'
BHR_positive_regex = '(' + '|'.join(BHR_positive).lower() + ')'
LM_negative_regex = '(' + '|'.join(LM_negative).lower() + ')'
LM_positive_regex = '(' + '|'.join(LM_positive).lower() + ')'

In [23]:
# Topic regex

# Socially responsible investing
esg_topics = ['esg', 'sustainability', 'impact invest', 'ethical', 
              'clean energy', 'gender', 'diversity', 'inclusion',
              'microfinance', 'cdfi']
esg_negative = ['limited', 'limit', 'underdeveloped', 'underdevelop',
                'bureaucratic', 'slow', 'insufficient']
esg_positive = ['ethical', 'sustainable', 'profitable', 'profit',
                'innovative', 'innovation', 'transformative', 'transform']

# Ecommerce
ecom_topics = ['online', 'digital payment', 'logistics', 'delivery', 
               'mobile commerce', 'social commerce', 'dropship',
               'drop ship', 'social media']
ecom_negative = ['risky', 'unsustainable',  'unsustained', 'monopoly',
                 'monopolistic', 'unethical']
ecom_positive = ['convenienent', 'convenienence', 'accessible', 'access',
                 'innovative', 'innovation', 'profitable', 'profit',
                 'efficient']

# Biotech and healthcare
bio_topics = ['gene', 'biopharm', 'telemedic', 'personalized medic',
              'medical device', 'vaccine', 'precision medic', 'organ',
              'regenerative medic', 'prosthetic', 'clinic', 'fda',
              'health']
bio_negative = ['risky', 'expensive', 'slow', 'controversial', 'unethical']
bio_positive = ['new', 'safe', 'innovative', 'innovation',
                'transformative', 'transform', 'life', 'lives']

In [21]:
sp500_path = "inputs/s&p500_2022.csv"

if not os.path.exists(sp500_path):
    url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
    pd.read_html(url)[0].to_csv(sp500_path, index=False)  # [1] shows updates

sp500 = pd.read_csv(sp500_path)[['Symbol', 'Security', 'CIK']]

In [22]:
firm_10k_path = "10k_files/sec-edgar-filings"

for i in tqdm(range(len(sp500[:3]))):    # TODO
    tic = sp500['Symbol'].iloc[i]
    
    # Check existence of path
    if not os.path.exists(fr'{firm_10k_path}/{tic}'):
        print(f'Cannot find 10-K for ticker {tic}')
        continue
    
    # Open 10-K
    for path in glob.glob(fr'{firm_10k_path}/{tic}/*/*/*.html'): #'/*/*.xml'):
        with open(path, 'rb') as report_file:
            html = report_file.read()
        soup = BeautifulSoup(html, 'lxml-xml')
        for div in soup.find_all("div", {'style': 'display:none'}):
            div.decompose()
        lower = soup.get_text().lower()
        no_punc = re.sub(r'\W', ' ', lower)
        cleaned = re.sub(r'\s+', ' ', no_punc)
    
        # Gather sentiment variables
        sp500.loc[i, 'word_count'] = len(re.findall(r'\w+', cleaned))
        sp500.loc[i, 'bhr_negative'] = \
                len(re.findall(BHR_negative_regex, cleaned)) / \
                sp500.loc[i, 'word_count']
        sp500.loc[i, 'bhr_positive'] = \
                len(re.findall(BHR_positive_regex, cleaned)) / \
                sp500.loc[i, 'word_count']
        sp500.loc[i, 'lm_negative'] = \
                len(re.findall(LM_negative_regex, cleaned)) / \
                sp500.loc[i, 'word_count']
        sp500.loc[i, 'lm_positive'] = \
                len(re.findall(LM_positive_regex, cleaned)) / \
                sp500.loc[i, 'word_count']
        
    # if os.path.exists()
    # print(i, sp500['Symbol'].iloc[i])
sp500

Unnamed: 0,Symbol,Security,CIK,word_count,bhr_negative,bhr_positive,lm_negative,lm_positive
0,MMM,3M,66740,76432.0,0.044314,0.041750,0.043097,0.015125
1,AOS,A. O. Smith,91142,33810.0,0.032919,0.038391,0.034102,0.013162
2,ABT,Abbott,1800,52061.0,0.039492,0.039761,0.036246,0.010584
3,ABBV,AbbVie,1551152,,,,,
4,ACN,Accenture,1467373,,,,,
...,...,...,...,...,...,...,...,...
498,YUM,Yum! Brands,1041061,,,,,
499,ZBRA,Zebra Technologies,877212,,,,,
500,ZBH,Zimmer Biomet,1136869,,,,,
501,ZION,Zions Bancorporation,109380,,,,,


In [None]:
# TODO: Get list of firms this way or just with os.list dir
# firm_10k_path = "10k_files/sec-edgar-filings"

# if not os.path.exists(firm_10k_path):
#     print("Cannot find 10-K files.")
# else:
#     firms = os.listdir(firm_10k_path)
    
#     for firm in firms[:3]: # TODO
#         for path in glob.glob(fr'{firm_10k_path}/{firm}/*/*/*.html'): #'/*/*.xml'):
#             with open(path, 'rb') as report_file:
#                 html = report_file.read()
#             soup = BeautifulSoup(html, 'lxml-xml')
#             for div in soup.find_all("div", {'style': 'display:none'}):
#                 div.decompose()
#             lower = soup.get_text().lower()
#             no_punc = re.sub(r'\W', ' ', lower)
#             cleaned = re.sub(r'\s+', ' ', no_punc)
            
#             print(path)
            

# # Create the pandas set
# for firm in firms:
    

## Add 10-K sentiment data

TODO: load and clean
