In [28]:
import pandas as pd
import numpy as np
import os
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

from utils.preprocessing import get_texts
from utils.preprocessing import get_texts, stop_words

In [107]:
df = pd.read_excel("data/esg_score.xlsx", sheet_name = "data")

In [4]:
energy_tickers = df[df["sector"] == "Energy"]["Company"]
esg_energy = df[df["sector"] == "Energy"][["Company", "socialScore", "governanceScore", "environmentScore"]]

In [7]:
env_score = esg_energy["environmentScore"]

In [118]:
upper_score = np.quantile(env_score, 0.7)
lower_score = np.quantile(env_score, 0.3)

In [119]:
good_companies = esg_energy[esg_energy["environmentScore"] > upper_score]["Company"].values
bad_companies = esg_energy[esg_energy["environmentScore"] < lower_score]["Company"].values

In [120]:
good_companies

array(['COG', 'MRO', 'CVX', 'EOG', 'APA', 'OXY'], dtype=object)

In [121]:
bad_companies

array(['NOV', 'OKE', 'HAL', 'SLB', 'WMB', 'KMI'], dtype=object)

In [47]:
esg_energy[esg_energy["environmentScore"] > upper_score]

Unnamed: 0,Company,socialScore,governanceScore,environmentScore
1,COG,14.01,9.28,23.39
3,MRO,10.27,8.7,23.76
4,CVX,10.67,10.21,20.29
8,APA,8.88,7.96,21.98
10,OXY,10.85,6.75,20.0


In [122]:
bad_companies_score = esg_energy[esg_energy["environmentScore"] > upper_score]["environmentScore"].values
good_companies_score = esg_energy[esg_energy["environmentScore"] < lower_score]["environmentScore"].values

In [123]:
avg_bad = np.mean(bad_companies_score)
avg_good = np.mean(good_companies_score)
print(avg_bad, avg_good)

21.515 9.834999999999999


In [125]:
print(upper_score, lower_score)

19.376 14.691


In [128]:
ticker_library = pd.read_csv(os.path.join("data", "tickers.csv"))
good_cik = []
bad_cik = []
for ticker in good_companies:    
    try:
        # for a given ticker, find its cik number through th ticker library
        good_cik.append(ticker_library[ticker_library.ticker == ticker].secfilings.values[0][-10:])
    except:
        # if could not find cik, give it a empty cik
        good_cik.append('')

for ticker in bad_companies:    
    try:
        # for a given ticker, find its cik number through th ticker library
        bad_cik.append(ticker_library[ticker_library.ticker == ticker].secfilings.values[0][-10:])
    except:
        # if could not find cik, give it a empty cik
        bad_cik.append('')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [129]:
ret_good = get_texts(good_cik, good_companies)

6it [00:02,  2.61it/s]


In [130]:
ret_bad = get_texts(bad_cik, bad_companies)

6it [00:04,  1.49it/s]


In [131]:
good_docs = ret_good["docs"]
bad_docs = ret_bad["docs"]

In [155]:
cv = CountVectorizer(max_df=0.8, stop_words=stop_words, max_features=500, ngram_range=(1,3))
word_count_vector = cv.fit_transform(good_docs + bad_docs)



In [156]:
feature_names = cv.get_feature_names()

In [157]:
good_docs_list = []
bad_docs_list = []
for doc in good_docs:
    li = doc.split(' ')
    good_docs_list.append(set(li))
for doc in bad_docs:
    li = doc.split(' ')
    bad_docs_list.append(set(li))

In [158]:
d = {"word": [], "good_score": [], "bad_score": [], "good_score_all": []
    , "bad_score_all": [], "good_nums": [], "bad_nums": []}

for word in feature_names[:100]:
    good_sum = bad_sum = good_num = bad_num = 0

    for i, doc_set in enumerate(good_docs):
        if word in doc_set:
            good_num += 1
            good_sum += good_companies_score[i]
    for i, doc_set in enumerate(bad_docs):
        if word in doc_set:
            bad_num += 1
            bad_sum += bad_companies_score[i]
    
    # print("word: {}".format(word))
    d["word"].append(word) 
    
    if good_num:
        d["good_score"].append(good_sum / good_num)
    else:
        d["good_score"].append(0)
    if bad_num:
        d["bad_score"].append(bad_sum / bad_num)
    else:
        d["bad_score"].append(0)

    d["good_score_all"].append(good_sum / len(good_docs_list))
    d["bad_score_all"].append(bad_sum / len(bad_docs_list))

    d["good_nums"].append(good_num)
    d["bad_nums"].append(bad_num)


In [159]:
df = pd.DataFrame(data=d)


In [160]:
df.head()

Unnamed: 0,word,good_score,bad_score,good_score_all,bad_score_all,good_nums,bad_nums
0,accompanying note consolidated,9.773333,23.76,4.886667,3.96,3,1
1,accompanying note integral,10.5575,21.695,7.038333,7.231667,4,2
2,acmp,0.0,21.98,0.0,3.663333,0,1
3,activity cash flow,10.975,21.066,7.316667,17.555,4,5
4,adjusted ebitda,0.0,22.2825,0.0,14.855,0,4


In [161]:
df["diff"] = abs(df["good_nums"] - df["bad_nums"])

In [164]:
df.sort_values("diff", ascending=False).head(60)

Unnamed: 0,word,good_score,bad_score,good_score_all,bad_score_all,good_nums,bad_nums,diff
53,capital exploration,9.835,0.0,9.835,0.0,6,0,6
64,commodity derivative instrument,9.835,23.76,9.835,3.96,6,1,5
92,dd rate,10.242,23.76,8.535,3.96,5,1,4
72,condensate natural,9.835,22.87,9.835,7.623333,6,2,4
52,cameron,8.98,21.066,1.496667,17.555,1,5,4
74,condensate ngls,9.835,22.87,9.835,7.623333,6,2,4
90,cubic foot natural,9.835,20.99,9.835,6.996667,6,2,4
73,condensate natural gas,9.835,22.87,9.835,7.623333,6,2,4
4,adjusted ebitda,0.0,22.2825,0.0,14.855,0,4,4
45,brent,9.294,19.98,7.745,6.66,5,2,3


In [166]:
len(df)

100

In [165]:
df.to_csv("energy_good_vs_bad_uni_bi_tri.csv")

In [100]:
comp = pd.read_csv("data/sp500_component_stocks.csv")


In [140]:

comp[comp['A'].isin(good_companies)]["Agilent Technologies Inc."]

44                   Apache Corporation
111         Cabot Oil & Gas Corporation
128                 Chevron Corporation
163                  EOG Resources Inc.
322            Marathon Oil Corporation
358    Occidental Petroleum Corporation
Name: Agilent Technologies Inc., dtype: object

In [141]:
comp[comp['A'].isin(bad_companies)]["Agilent Technologies Inc."]

215            Halliburton Company
274      Kinder Morgan Inc Class P
343    National Oilwell Varco Inc.
354                     ONEOK Inc.
411                Schlumberger NV
487        Williams Companies Inc.
Name: Agilent Technologies Inc., dtype: object