In [65]:
import pandas as pd
from time import gmtime, strftime
import time
import os
from tqdm import tqdm
import pickle
from scipy.stats import linregress
from collections import defaultdict

from utils.crawler import Scrape10K, Scrape10Q, ConvertHTML
from utils.preprocessing import get_texts

In [3]:
# Load the ESG score table from the "data" sheet of the Excel workbook.
df = pd.read_excel("data/esg_score.xlsx", sheet_name = "data")

In [4]:
# Tickers ("Company" column) of every row whose sector is Energy.
energy_tickers = df.loc[df["sector"] == "Energy", "Company"]

In [30]:
# Ticker plus the three ESG component scores for the Energy-sector rows.
esg_energy = df.loc[
    df["sector"] == "Energy",
    ["Company", "socialScore", "governanceScore", "environmentScore"],
]

In [5]:
# Number of Energy-sector companies selected.
len(energy_tickers)

20

In [6]:
# Ticker lookup table; the CIK loop below reads its `ticker` and `secfilings` columns.
ticker_library = pd.read_csv(os.path.join("data", "tickers.csv"))

  exec(code_obj, self.user_global_ns, self.user_ns)


In [7]:
# Map each Energy ticker to its CIK using the ticker library. The CIK is taken
# as the last 10 characters of the matching row's `secfilings` value.
energy_cik = []
for ticker in energy_tickers:
    try:
        # First library row whose `ticker` column matches this ticker.
        secfilings = ticker_library[ticker_library.ticker == ticker].secfilings.values[0]
        energy_cik.append(secfilings[-10:])
    except (IndexError, TypeError):
        # IndexError: the ticker is not in the library.
        # TypeError: `secfilings` is not sliceable (e.g. a missing value).
        # Append an empty placeholder so energy_cik keeps one entry per ticker.
        # Anything else (Ctrl-C, a missing column, ...) is allowed to surface.
        energy_cik.append('')

# Analysis based on Energy sector

## Scrape Energy sector

In [8]:
# EDGAR URL templates handed to the scraper helpers (the %s placeholders are
# presumably filled in by Scrape10K / Scrape10Q -- confirm in utils.crawler).
browse_url_base_10k = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=%s&type=10-K'
filing_url_base = 'http://www.sec.gov/Archives/edgar/data/%s/%s-index.html'
doc_url_base = 'http://www.sec.gov/Archives/edgar/data/%s/%s/%s'

browse_url_base_10q = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=%s&type=10-Q&count=1000'

# Initialize log file
# (log file name = the UTC time at which this scraping session starts)
t = strftime("%Y_%m_%d_%H_%M_%S", gmtime())
log_file_name = t + ".txt"
log_file_path = os.path.join("log", log_file_name)

# Create the log directory on first use; open() would otherwise fail on a
# fresh checkout. Opening in append mode creates the (empty) log file, and the
# `with` block closes it on exit, so no explicit close() is needed.
os.makedirs(os.path.dirname(log_file_path), exist_ok=True)
with open(log_file_path, 'a') as log_file:
    pass

# Iterate over CIKs and scrape 10-Ks, then 10-Qs, for each company
for cik in tqdm(energy_cik):
    # Pause before each request batch (presumably to throttle traffic to sec.gov).
    time.sleep(5)
    Scrape10K(browse_url_base=browse_url_base_10k,
              filing_url_base=filing_url_base,
              doc_url_base=doc_url_base,
              cik=cik,
              log_file_name=log_file_path)

    time.sleep(5)
    Scrape10Q(browse_url_base=browse_url_base_10q,
              filing_url_base=filing_url_base,
              doc_url_base=doc_url_base,
              cik=cik,
              log_file_name=log_file_path)
    

#return to the main menu

  0%|          | 0/20 [00:00<?, ?it/s]

Already scraped CIK 0000858470


  5%|▌         | 1/20 [00:10<03:10, 10.00s/it]

Already scraped CIK 0000858470
Already scraped CIK 0000101778


  5%|▌         | 1/20 [00:19<06:07, 19.35s/it]


KeyboardInterrupt: 

In [9]:
# Root folders of the downloaded filings, one per form type.
dir_10k = os.path.join("data", "10k")
dir_10q = os.path.join("data", "10q")

# Clean the HTML filings of every CIK: all 10-Ks first, then all 10-Qs.
for filings_dir in (dir_10k, dir_10q):
    for cik in tqdm(energy_cik):
        ConvertHTML(cik, filings_dir)

100%|██████████| 20/20 [12:47<00:00, 38.39s/it]
100%|██████████| 20/20 [13:43<00:00, 41.16s/it]


In [89]:
# Peek at the first CIK in the list.
energy_cik[:1]

['0000858470']

In [11]:
# get_texts is a project helper (utils.preprocessing); later cells read the
# "docs" and "counters" entries of its return value.
ret = get_texts(energy_cik, energy_tickers)

0it [00:00, ?it/s]

data/10k/0000858470/pickle/agg_texts.pkl
data/10k/0000858470/pickle/token_counter.pkl


1it [01:01, 61.55s/it]

data/10q/0000858470/pickle/agg_texts.pkl
data/10q/0000858470/pickle/token_counter.pkl
data/10k/0000101778/pickle/agg_texts.pkl
data/10k/0000101778/pickle/token_counter.pkl
data/10q/0000101778/pickle/agg_texts.pkl
data/10q/0000101778/pickle/token_counter.pkl


2it [03:09, 100.32s/it]

data/10k/0000093410/pickle/agg_texts.pkl
data/10k/0000093410/pickle/token_counter.pkl
data/10q/0000093410/pickle/agg_texts.pkl
data/10q/0000093410/pickle/token_counter.pkl


3it [05:38, 122.93s/it]

data/10k/0000821189/pickle/agg_texts.pkl
data/10k/0000821189/pickle/token_counter.pkl


4it [06:56, 104.89s/it]

data/10q/0000821189/pickle/agg_texts.pkl
data/10q/0000821189/pickle/token_counter.pkl
data/10k/0000006769/pickle/agg_texts.pkl
data/10k/0000006769/pickle/token_counter.pkl
data/10q/0000006769/pickle/agg_texts.pkl
data/10q/0000006769/pickle/token_counter.pkl


5it [08:44, 106.03s/it]

data/10k/0000797468/pickle/agg_texts.pkl
data/10k/0000797468/pickle/token_counter.pkl


6it [11:37, 128.78s/it]

data/10q/0000797468/pickle/agg_texts.pkl
data/10q/0000797468/pickle/token_counter.pkl
data/10k/0001163165/pickle/agg_texts.pkl
data/10k/0001163165/pickle/token_counter.pkl
data/10q/0001163165/pickle/agg_texts.pkl
data/10q/0001163165/pickle/token_counter.pkl


7it [12:34, 105.31s/it]

data/10k/0000034088/pickle/agg_texts.pkl
data/10k/0000034088/pickle/token_counter.pkl


8it [13:25, 88.15s/it] 

data/10q/0000034088/pickle/agg_texts.pkl
data/10q/0000034088/pickle/token_counter.pkl
data/10k/0001038357/pickle/agg_texts.pkl
data/10k/0001038357/pickle/token_counter.pkl
data/10q/0001038357/pickle/agg_texts.pkl
data/10q/0001038357/pickle/token_counter.pkl


9it [14:58, 89.54s/it]

data/10k/0001021860/pickle/agg_texts.pkl
data/10k/0001021860/pickle/token_counter.pkl


10it [15:33, 72.94s/it]

data/10q/0001021860/pickle/agg_texts.pkl
data/10q/0001021860/pickle/token_counter.pkl
data/10k/0001534701/pickle/agg_texts.pkl
data/10k/0001534701/pickle/token_counter.pkl


11it [16:51, 74.36s/it]

data/10q/0001534701/pickle/agg_texts.pkl
data/10q/0001534701/pickle/token_counter.pkl
data/10k/0001090012/pickle/agg_texts.pkl
data/10k/0001090012/pickle/token_counter.pkl
data/10q/0001090012/pickle/agg_texts.pkl
data/10q/0001090012/pickle/token_counter.pkl


12it [17:37, 65.68s/it]

data/10k/0000004447/pickle/agg_texts.pkl
data/10k/0000004447/pickle/token_counter.pkl


13it [18:53, 68.76s/it]

data/10q/0000004447/pickle/agg_texts.pkl
data/10q/0000004447/pickle/token_counter.pkl
data/10k/0001035002/pickle/agg_texts.pkl
data/10k/0001035002/pickle/token_counter.pkl
data/10q/0001035002/pickle/agg_texts.pkl
data/10q/0001035002/pickle/token_counter.pkl


14it [20:31, 77.79s/it]

data/10k/0001510295/pickle/agg_texts.pkl
data/10k/0001510295/pickle/token_counter.pkl


15it [21:41, 75.36s/it]

data/10q/0001510295/pickle/agg_texts.pkl
data/10q/0001510295/pickle/token_counter.pkl
data/10k/0001039684/pickle/agg_texts.pkl
data/10k/0001039684/pickle/token_counter.pkl
data/10q/0001039684/pickle/agg_texts.pkl
data/10q/0001039684/pickle/token_counter.pkl


16it [23:36, 87.25s/it]

data/10k/0000045012/pickle/agg_texts.pkl
data/10k/0000045012/pickle/token_counter.pkl
data/10q/0000045012/pickle/agg_texts.pkl
data/10q/0000045012/pickle/token_counter.pkl


17it [24:31, 77.53s/it]

data/10k/0000087347/pickle/agg_texts.pkl
data/10k/0000087347/pickle/token_counter.pkl


18it [24:58, 62.40s/it]

data/10q/0000087347/pickle/agg_texts.pkl
data/10q/0000087347/pickle/token_counter.pkl
data/10k/0000107263/pickle/agg_texts.pkl
data/10k/0000107263/pickle/token_counter.pkl
data/10q/0000107263/pickle/agg_texts.pkl
data/10q/0000107263/pickle/token_counter.pkl


19it [28:30, 107.37s/it]

data/10k/0001506307/pickle/agg_texts.pkl
data/10k/0001506307/pickle/token_counter.pkl
data/10q/0001506307/pickle/agg_texts.pkl
data/10q/0001506307/pickle/token_counter.pkl


20it [31:39, 94.98s/it] 


In [13]:
# Documents returned by get_texts; passed to CountVectorizer.fit_transform in a later cell.
docs = ret["docs"]

In [15]:
# Per-ticker counters: indexed by ticker and queried with .keys()/.get() in later cells.
counters = ret["counters"]

## Regression of E, S, G scores on an indicator function (occurrence of word_i)

Company-word table: an entry is 1 if the word occurs in that company's documents, else 0

In [22]:
# Concatenate the vocabulary of every Energy company (duplicates kept here).
energy_dictionary = [
    word
    for ticker in energy_tickers.values
    for word in counters[ticker].keys()
]

In [24]:
# Collapse the word list to a set of distinct words and report its size.
energy_dictionary = set(energy_dictionary)
print("Number of distinct words: {}".format(len(energy_dictionary)))

Number of distinct words: 30198


In [33]:
# Preview the social score column of the Energy-sector table.
esg_energy["{}Score".format("social")]

1      14.01
3      10.27
4      10.67
7      11.06
8       8.88
10     10.85
18      9.83
19     10.57
35      9.14
38     11.64
39      8.08
47      8.32
68      6.36
86      6.87
96      6.71
135     8.13
164     9.62
172     9.82
236     8.52
290     6.09
Name: socialScore, dtype: float64

In [50]:
# Drop the empty-string token if present. discard() (unlike remove()) does not
# raise KeyError when '' is absent, so this cell can be re-run safely.
energy_dictionary.discard('')

In [78]:
# Total occurrences of each dictionary word, summed over all Energy companies.
word_count = defaultdict(int)
for word in energy_dictionary:
    word_count[word] = sum(counters[ticker].get(word, 0) for ticker in energy_tickers)

# Keep the 5000 most frequent words. Ties are broken alphabetically: the
# dictionary is a set of strings, whose iteration order can differ between
# kernels, so sorting on the count alone would make the cut-off at 5000
# non-reproducible.
top_k_word = [
    word
    for word, _ in sorted(word_count.items(), key=lambda item: (-item[1], item[0]))
][:5000]

In [79]:
# Empty result table: one row per top-k word, one slope column per ESG score type.
df_ind = pd.DataFrame(
    index=top_k_word,
    columns=["social_beta", "governance_beta", "environment_beta"],
)


In [80]:
# Preview the first rows of the beta table.
df_ind.head()

Unnamed: 0,social_beta,governance_beta,environment_beta
million,,,
oper,,,
ga,,,
financi,,,
compani,,,


In [82]:
# Whether a word occurs in a company's counter does not depend on the score
# type, so build each word's 0/1 indicator vector (one entry per Energy
# company, in energy_tickers order) once instead of once per score type.
indicators = {
    word: [1 if word in counters[ticker] else 0 for ticker in energy_tickers]
    for word in top_k_word
}

for typ in ["social", "governance", "environment"]:
    score = esg_energy["{}Score".format(typ)]
    slopes = []
    for word in top_k_word:
        inds = indicators[word]
        if len(set(inds)) < 2:
            # Constant regressor (word present in every company, or in none):
            # the slope is undefined. Record NaN explicitly; SciPy versions
            # that validate x raise ValueError here instead of returning NaN.
            slopes.append(float("nan"))
            continue
        # Simple OLS of the score on the indicator; only the slope is kept.
        slopes.append(linregress(inds, score).slope)
    # Positional assignment: slopes is in the same order as df_ind's index.
    df_ind["{}_beta".format(typ)] = slopes

In [85]:
# Display the beta table.
df_ind

Unnamed: 0,social_beta,governance_beta,environment_beta
million,,,
oper,,,
ga,,,
financi,,,
compani,,,
...,...,...,...
mpl,-0.868889,0.390556,3.712778
ryan,0.432941,-0.892353,-1.304118
taae,-2.696842,-1.072105,-1.061579
ware,0.558889,2.340556,5.629444


In [86]:
# Rank words by social beta first (then governance, then environment), largest first.
# The sorted frame is only displayed; df_ind itself is not reassigned.
df_ind.sort_values(by=["social_beta", "governance_beta", "environment_beta"], ascending=False)

Unnamed: 0,social_beta,governance_beta,environment_beta
cog,4.987368,2.422632,7.706842
ding,4.987368,2.422632,7.706842
schroeder,4.987368,2.422632,7.706842
contentscabot,4.987368,2.422632,7.706842
mead,4.987368,2.422632,7.706842
...,...,...,...
fewer,,,
pure,,,
quickli,,,
attest,,,


In [61]:
# Dump the vocabulary to dict.txt, one word per line. The words are sorted so
# the file is identical across runs (set iteration order is not stable), and
# the encoding is fixed so non-ASCII tokens do not depend on the platform default.
with open("dict.txt", 'w', encoding="utf-8") as f:
    f.write('\n'.join(sorted(energy_dictionary)))

In [None]:
# Persist the regression betas as CSV.
# NOTE(review): the target path has no ".csv" extension -- confirm that is intended.
df_ind.to_csv(os.path.join("data", "reg_result"))

In [None]:
# NOTE(review): `ticker` is not assigned in this cell; it is whatever value an
# earlier loop over the tickers left behind.
counters[ticker]

In [None]:
# NOTE(review): CountVectorizer (presumably scikit-learn's) and stop_words are
# not imported/defined in the visible part of this notebook -- confirm they
# exist before running this cell.
cv = CountVectorizer(max_df=0.8, stop_words=stop_words, max_features=10000)
word_count_vector = cv.fit_transform(docs)

In [91]:
import shutil

In [93]:
# Delete the data/<10k|10q>/<cik>/pickle directories for every CIK except the
# first one. The isdir() guard makes the cell re-runnable: shutil.rmtree
# raises if the directory has already been removed.
for cik in energy_cik[1:]:
    print(cik)
    for filing_type in ("10k", "10q"):
        pkl_path = os.path.join("data", filing_type, cik, "pickle")
        if os.path.isdir(pkl_path):
            shutil.rmtree(pkl_path)

0000101778
0000093410
0000821189
0000006769
0000797468
0001163165
0000034088
0001038357
0001021860
0001534701
0001090012
0000004447
0001035002
0001510295
0001039684
0000045012
0000087347
0000107263
0001506307
