In [25]:
import pandas as pd
from time import gmtime, strftime
import time
import os
from tqdm import tqdm
import pickle
from scipy.stats import linregress
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

from utils.crawler import Scrape10K, Scrape10Q, ConvertHTML
from utils.preprocessing import get_texts, stop_words

In [2]:
# Read the ESG score table from the "data" sheet of the workbook.
df = pd.read_excel(io="data/esg_score.xlsx", sheet_name="data")

In [3]:
# "Company" column for the rows whose sector is Energy (used as tickers below).
energy_tickers = df.loc[df["sector"] == "Energy", "Company"]

In [4]:
# Company identifier plus the three ESG component scores, Energy rows only.
esg_columns = ["Company", "socialScore", "governanceScore", "environmentScore"]
esg_energy = df.loc[df["sector"] == "Energy", esg_columns]

In [135]:
# Report how many Energy-sector companies were selected.
print("Number of companies in Energy sector: %d" % len(energy_tickers))

Number of companies in Energy sector: 20


In [6]:
# Ticker lookup table; later cells read its `ticker` and `secfilings` columns.
tickers_csv_path = os.path.join("data", "tickers.csv")
ticker_library = pd.read_csv(tickers_csv_path)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [7]:
# Build the list of CIK numbers for the Energy tickers, in the same order as
# `energy_tickers`. A ticker with no usable match gets an empty-string CIK.
energy_cik = []
for ticker in energy_tickers:
    try:
        # For a given ticker, find its CIK through the ticker library:
        # the CIK is taken as the last 10 characters of the `secfilings` value.
        energy_cik.append(ticker_library[ticker_library.ticker == ticker].secfilings.values[0][-10:])
    except (IndexError, TypeError):
        # IndexError: ticker not present in the library.
        # TypeError: `secfilings` is missing (NaN) and cannot be sliced.
        # Anything else (e.g. a missing column, Ctrl-C) is a real error and is
        # deliberately no longer swallowed.
        energy_cik.append('')

# Analysis based on Energy sector

## Scrape Energy sector

In [8]:
# EDGAR URL templates, filled in with %-formatting by the scraper helpers.
browse_url_base_10k = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=%s&type=10-K'
filing_url_base = 'http://www.sec.gov/Archives/edgar/data/%s/%s-index.html'
doc_url_base = 'http://www.sec.gov/Archives/edgar/data/%s/%s/%s'

browse_url_base_10q = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=%s&type=10-Q&count=1000'

# Initialize log file
# (log file name = the time we initiate scraping session)
t = strftime("%Y_%m_%d_%H_%M_%S", gmtime())
log_file_name = t + ".txt"
log_file_path = os.path.join("log", log_file_name)

# The log directory may not exist on a fresh checkout; create it so that
# opening the log file cannot fail with FileNotFoundError.
os.makedirs("log", exist_ok=True)

# Touch the log file; the `with` block closes it on exit.
with open(log_file_path, 'a'):
    pass

# Iterate over CIKs and scrape 10-Ks and 10-Qs
for cik in tqdm(energy_cik):
    if not cik:
        # Tickers without a known CIK are stored as '' by the lookup cell;
        # there is nothing to request from EDGAR for them.
        continue

    # Pause before each request batch to stay polite to the SEC servers.
    time.sleep(5)
    Scrape10K(browse_url_base=browse_url_base_10k,
              filing_url_base=filing_url_base,
              doc_url_base=doc_url_base,
              cik=cik,
              log_file_name=log_file_path)

    time.sleep(5)
    Scrape10Q(browse_url_base=browse_url_base_10q,
              filing_url_base=filing_url_base,
              doc_url_base=doc_url_base,
              cik=cik,
              log_file_name=log_file_path)
    

#return to the main menu

  0%|          | 0/20 [00:00<?, ?it/s]

Already scraped CIK 0000858470


  5%|▌         | 1/20 [00:10<03:10, 10.00s/it]

Already scraped CIK 0000858470
Already scraped CIK 0000101778


  5%|▌         | 1/20 [00:19<06:07, 19.35s/it]


KeyboardInterrupt: 

## Transform from HTML to txt

In [9]:
# Clean the scraped HTML filings, one form type at a time:
# every company's 10-Ks first, then every company's 10-Qs.
dir_10k = os.path.join("data", "10k")
dir_10q = os.path.join("data", "10q")

for filings_dir in (dir_10k, dir_10q):
    # Iterate over CIKs and clean HTML filings
    for cik in tqdm(energy_cik):
        ConvertHTML(cik, filings_dir)

100%|██████████| 20/20 [12:47<00:00, 38.39s/it]
100%|██████████| 20/20 [13:43<00:00, 41.16s/it]


## Get the preprocessed texts

In [8]:
# Get the preprocessed texts for the Energy companies.
# NOTE(review): get_texts is a project helper (utils.preprocessing); the cells
# below read the "docs" and "counters" keys of its result - confirm the full
# return schema against that module.
ret = get_texts(energy_cik, energy_tickers)

20it [24:52, 74.62s/it] 


In [9]:
# Later cells pass `docs` to CountVectorizer and index it by position
# alongside `energy_tickers`.
# NOTE(review): presumably one preprocessed text per company, in ticker order - confirm.
docs = ret["docs"]

In [10]:
# Later cells look this up by ticker and use the result as a word -> count mapping
# (`.keys()`, `.get(word, 0)`, `word in ...`).
counters = ret["counters"]

# Regression of E, S, G score on indicator function (occurrence of word_i)

Company - word table: 1 if word occurs in a certain doc, else 0

In [11]:
# Collect every word seen for any Energy company (duplicates are kept here;
# they are removed when the list is turned into a set).
energy_dictionary = [
    word
    for symbol in energy_tickers.values
    for word in counters[symbol].keys()
]

In [12]:
# Deduplicate the vocabulary and report its size.
energy_dictionary = set(energy_dictionary)
print("Number of distinct words: %d" % len(energy_dictionary))

Number of distinct words: 35938


In [14]:
# Drop the empty-string token if present. `discard` (unlike `remove`) does not
# raise KeyError when the token is absent, so this cell is safe to re-run.
energy_dictionary.discard('')

In [15]:
# Total count of each vocabulary word, summed over all Energy companies.
word_count = defaultdict(int)
for token in energy_dictionary:
    word_count[token] += sum(counters[symbol].get(token, 0) for symbol in energy_tickers)

# Keep the 5000 words with the highest total count (most frequent first).
ranked_words = sorted(word_count.items(), key=lambda pair: pair[1], reverse=True)
top_k_word = [token for token, _ in ranked_words][:5000]

In [17]:
# Empty result table for the indicator-mode regressions:
# one row per top-k word, one slope column per ESG component.
df_ind = pd.DataFrame(
    index=top_k_word,
    columns=["social_beta", "governance_beta", "environment_beta"],
)


In [19]:
# For each ESG component, regress the companies' scores on a 0/1 indicator
# ("does this word occur for the company?") and keep the slope, one simple
# regression per (component, word) pair.
for typ in ["social", "governance", "environment"]:
    score = esg_energy["{}Score".format(typ)]
    slopes = []
    for word in top_k_word:
        inds = [1 if word in counters[ticker] else 0 for ticker in energy_tickers]
        if len(set(inds)) < 2:
            # The word occurs for every company (or for none): the regressor is
            # constant and the slope is undefined. Older SciPy divides by zero
            # here (RuntimeWarning + NaN); newer releases are believed to raise
            # ValueError instead. Record NaN explicitly in both cases.
            slopes.append(float("nan"))
            continue
        slopes.append(linregress(inds, score).slope)
    # Assign by word label rather than by position so the slopes stay attached
    # to the right words even if df_ind has been re-ordered (e.g. sorted).
    df_ind["{}_beta".format(typ)] = pd.Series(slopes, index=top_k_word)

  slope = ssxym / ssxm
  slope_stderr = np.sqrt((1 - r**2) * ssym / ssxm / df)


In [133]:
# Rank words by slope, largest first: social, then governance, then environment.
df_ind = df_ind.sort_values(
    by=["social_beta", "governance_beta", "environment_beta"],
    ascending=False,
)

In [136]:
# Preview the five top-ranked words.
df_ind.head(n=5)

Unnamed: 0,social_beta,governance_beta,environment_beta
cog,4.987368,2.422632,7.706842
dinge,4.987368,2.422632,7.706842
schroeder,4.987368,2.422632,7.706842
extraordinary,3.39451,1.61,3.300196
insignificant,3.39451,1.61,3.300196


In [None]:
# Persist the indicator-mode regression slopes.
reg_result_ind_path = os.path.join("data", "reg_result_ind.csv")
df_ind.to_csv(reg_result_ind_path)

In [22]:
# Dump the vocabulary, one word per line.
# - explicit UTF-8 so the file does not depend on the platform's default encoding
#   (filing text may contain non-ASCII characters);
# - sorted so the file is identical from run to run (set iteration order is not).
with open("dict_lemmatize.txt", 'w', encoding="utf-8") as f:
    f.write('\n'.join(sorted(energy_dictionary)))

# Regression of E, S, G score on tfidf score for each word in each doc

In [32]:
# Bag-of-words counts over the company documents: at most 10000 features,
# ignoring stop words and terms whose document frequency exceeds 0.8.
cv = CountVectorizer(
    max_features=10000,
    max_df=0.8,
    stop_words=stop_words,
)
word_count_vector = cv.fit_transform(docs)



In [33]:
# Learn IDF weights (smoothed) from the count matrix.
tfidf_transformer = TfidfTransformer(use_idf=True, smooth_idf=True)
tfidf_transformer.fit(word_count_vector)

TfidfTransformer()

In [43]:
# Vocabulary kept by the CountVectorizer (at most `max_features` = 10000 words),
# in column order of the count matrix.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on old installations.
# The result is converted to a plain list so downstream cells see the same type
# as before.
if hasattr(cv, "get_feature_names_out"):
    feature_names = list(cv.get_feature_names_out())
else:
    feature_names = cv.get_feature_names()

In [46]:
# Report the size of the TF-IDF vocabulary.
print("Number of words: %d" % len(feature_names))

Number of words: 10000


In [None]:
# NOTE(review): this cell repeats the indicator-mode regression that already
# ran earlier in the notebook (it never executed in the saved run); it looks
# like a copy-paste leftover in the TF-IDF section - confirm whether it can be
# deleted.
#
# By this point df_ind has been sorted, so its row order no longer matches
# top_k_word. The slopes are therefore assigned by word label, not by position;
# a positional list assignment here would attach slopes to the wrong words.
for typ in ["social", "governance", "environment"]:
    score = esg_energy["{}Score".format(typ)]
    slopes = []
    for word in top_k_word:
        inds = [1 if word in counters[ticker] else 0 for ticker in energy_tickers]
        if len(set(inds)) < 2:
            # Constant regressor (word occurs for all companies or for none):
            # the slope is undefined, so record NaN instead of calling linregress.
            slopes.append(float("nan"))
            continue
        slopes.append(linregress(inds, score).slope)
    df_ind["{}_beta".format(typ)] = pd.Series(slopes, index=top_k_word)

In [109]:
# Company x word table of TF-IDF scores; cells for words a company's document
# does not contain are left unset here.
df_doc_word = pd.DataFrame(index=energy_tickers, columns=feature_names)

for pos, symbol in tqdm(enumerate(energy_tickers)):
    doc_counts = cv.transform([docs[pos]])
    # COO view of the single-row TF-IDF matrix: `.col` holds the word indices,
    # `.data` the matching TF-IDF scores.
    sparse_row = tfidf_transformer.transform(doc_counts).tocoo()

    for col_idx, weight in zip(sparse_row.col, sparse_row.data):
        df_doc_word.at[symbol, feature_names[col_idx]] = weight

20it [00:18,  1.11it/s]


In [117]:
# Words that never received a TF-IDF score for a company count as 0.
df_doc_word = df_doc_word.fillna(value=0)

In [76]:
# Empty result table for the TF-IDF-mode regressions:
# one row per vectorizer feature, one slope column per ESG component.
df_tfidf = pd.DataFrame(
    index=feature_names,
    columns=["social_beta", "governance_beta", "environment_beta"],
)


In [118]:
# For each ESG component, regress the companies' scores on each word's TF-IDF
# column and keep the slope, one simple regression per (component, word) pair.
for typ in ["social", "governance", "environment"]:
    score = esg_energy["{}Score".format(typ)]
    slopes = []
    for word in feature_names:
        tfidfs = df_doc_word[word].values.astype(float)
        if len(set(tfidfs.tolist())) < 2:
            # Constant column: the slope is undefined, so record NaN instead of
            # calling linregress on a degenerate regressor.
            slopes.append(float("nan"))
            continue
        slopes.append(linregress(tfidfs, score).slope)
    # Assign by word label rather than by position so the slopes stay attached
    # to the right words even if df_tfidf has been re-ordered (e.g. sorted).
    df_tfidf["{}_beta".format(typ)] = pd.Series(slopes, index=feature_names)

Use just the top 5000 words, same as the 5000 words in indicator mode

In [123]:
# Words scored in TF-IDF mode that are not among the indicator-mode top-k words.
tfidf_vocab = set(feature_names)
indicator_vocab = set(top_k_word)
word_to_drop = list(tfidf_vocab - indicator_vocab)

In [131]:
# Rank words by slope, largest first: social, then governance, then environment.
df_tfidf = df_tfidf.sort_values(
    by=["social_beta", "governance_beta", "environment_beta"],
    ascending=False,
)

In [132]:
# Save only the rows for words shared with the indicator-mode vocabulary.
df_tfidf_shared = df_tfidf.drop(word_to_drop)
df_tfidf_shared.to_csv(os.path.join("data", "reg_result_tfidf.csv"))

## Appendix

In [91]:
import shutil

In [None]:
# Delete the per-company "pickle" directories under data/10k and data/10q.
# NOTE(review): the slice skips the first CIK - presumably on purpose (keep one
# company's pickles?); confirm before relying on this cell.
for cik in energy_cik[1:]:
    for form_dir in ("10k", "10q"):
        pkl_path = os.path.join("data", form_dir, cik, "pickle")
        # rmtree raises if the directory is absent (never created, already
        # removed, or the CIK is the '' placeholder); skip those instead of
        # aborting the clean-up half-way through.
        if os.path.isdir(pkl_path):
            shutil.rmtree(pkl_path)