In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from tqdm import tqdm

from utils.preprocessing import get_texts
from utils.preprocessing import get_texts, stop_words
import random
from sklearn.metrics import confusion_matrix 
import numpy as np

In [None]:
# Read the "data" sheet of the ESG score workbook and keep only the rows that
# have no missing value in any column.
df_esg_score = (
    pd.read_excel("data/esg_score.xlsx", sheet_name="data")
    .dropna(axis=0, how="any")
)

In [None]:
# Sectors to sweep. Missing values are removed explicitly rather than by
# slicing off the last entry: positional slicing only works when NaN happens
# to be listed last, and silently discards a real sector when the column
# contains no NaN at all.
sectors = df_esg_score["sector"].dropna().unique().tolist()

# Score columns analysed for every sector.
score_type = ["environmentScore", "socialScore", "governanceScore"]

In [None]:
# For every (sector, score type) pair:
#   1. label companies "bad" / "good" by comparing their score with a quantile,
#   2. split each group into a training part and a held-out validation part,
#   3. save the validation companies to data/validation_data/,
#   4. fetch the training companies' texts and count n-grams over them,
#   5. save per-n-gram statistics to data/training_goodvbad/.
#
# NOTE(review): `random` is never seeded in this cell, so the train/validation
# split differs between runs unless a seed is set elsewhere.

# alpha value sets the threshold for good and bad scores
alpha = 0.5
# share of each group sampled for training; the remainder is held out
train_fraction = 0.7
# TODO: Modify here for different ngram range
n_min = 2
n_max = 3

# Column order of the per-n-gram statistics table written at the end.
stat_columns = ["word", "good_score", "bad_score", "good_score_all",
                "bad_score_all", "count", "good_nums", "bad_nums"]

# Output directories and the ticker lookup table do not depend on the loop
# variables, so they are prepared once up front.
validation_path = os.path.join("data", "validation_data")
goodvbad_path = os.path.join("data", "training_goodvbad")
os.makedirs(validation_path, exist_ok=True)
os.makedirs(goodvbad_path, exist_ok=True)

ticker_library = pd.read_csv(os.path.join("data", "tickers.csv"))


def _lookup_ciks(companies, library):
    """Return one CIK string per entry of `companies`, in the same order.

    The CIK is taken as the last 10 characters of the first matching
    `secfilings` value in `library`. An empty string is used when the ticker
    has no match or the stored value is not sliceable (e.g. a missing value).
    A missing `ticker` / `secfilings` column is NOT swallowed: it raises.
    """
    ciks = []
    for ticker in companies:
        matches = library.loc[library["ticker"] == ticker, "secfilings"].values
        try:
            ciks.append(matches[0][-10:])
        except (IndexError, TypeError):
            # if could not find cik, give it an empty cik
            ciks.append("")
    return ciks


def _scores_in_order(companies, frame, column):
    """Return the `column` values of `frame` for `companies`, keeping the order
    of `companies` so that position i lines up with the i-th company."""
    values = []
    for company in companies:
        values.extend(frame.loc[frame["Company"] == company, column].tolist())
    return np.asarray(values)


def _word_stats(word, docs, scores):
    """Return (number of docs containing `word`, sum of those docs' scores)."""
    hits = 0
    total = 0
    for i, doc in enumerate(docs):
        if word in doc:
            hits += 1
            total += scores[i]
    return hits, total


# iterate through sectors
for s in tqdm(sectors):
    sector_rows = df_esg_score[df_esg_score["sector"] == s]
    esgs = sector_rows[["Company", "socialScore", "governanceScore", "environmentScore"]]

    # iterate through scores
    for t in score_type:
        score = esgs[t]

        upper_score = np.quantile(score, 1 - alpha)
        lower_score = np.quantile(score, alpha)

        # Strictly above the upper quantile -> "bad"; strictly below the lower
        # quantile -> "good". Companies exactly on a threshold are in neither.
        bad_companies = esgs.loc[esgs[t] > upper_score, "Company"].tolist()
        good_companies = esgs.loc[esgs[t] < lower_score, "Company"].tolist()

        # training set
        train_bad = random.sample(bad_companies, int(len(bad_companies) * train_fraction))
        train_good = random.sample(good_companies, int(len(good_companies) * train_fraction))

        # validation set: everything that was not sampled for training
        train_bad_set = set(train_bad)
        train_good_set = set(train_good)
        validate_bad = [c for c in bad_companies if c not in train_bad_set]
        validate_good = [c for c in good_companies if c not in train_good_set]

        # The two groups can differ in size; wrapping them in Series pads the
        # shorter column with NaN instead of raising on unequal lengths.
        validation = pd.DataFrame({
            "good": pd.Series(validate_good, dtype=object),
            "bad": pd.Series(validate_bad, dtype=object),
        })

        # save the list of good and bad companies
        validation.to_csv("data/validation_data/{}_{}_{}.csv".format(s[:8], t[:3], alpha))

        # Scores of the training companies for the score type being analysed
        # (column `t` only), ordered like train_good / train_bad.
        good_companies_score_training = _scores_in_order(train_good, esgs, t)
        bad_companies_score_training = _scores_in_order(train_bad, esgs, t)

        # for a given ticker, find its cik number through the ticker library
        good_cik = _lookup_ciks(train_good, ticker_library)
        bad_cik = _lookup_ciks(train_bad, ticker_library)

        ret_good = get_texts(good_cik, train_good)
        ret_bad = get_texts(bad_cik, train_bad)

        # NOTE(review): the statistics below pair docs[i] with scores[i]; this
        # assumes get_texts returns one entry per company, in input order --
        # confirm against utils.preprocessing.get_texts.
        good_docs = ret_good["docs"]
        bad_docs = ret_bad["docs"]

        if len(good_docs) == 0 or len(bad_docs) == 0:
            # Without documents on both sides the per-group averages are
            # undefined (division by zero), so skip this combination.
            tqdm.write("Skipping {} / {}: no training documents for one of the groups".format(s, t))
            continue

        cv = CountVectorizer(max_df=0.7, stop_words=stop_words, max_features=200, ngram_range=(n_min, n_max))
        word_count_vector = cv.fit_transform(good_docs + bad_docs)

        # get_feature_names() was removed from recent scikit-learn releases;
        # prefer the replacement and fall back for older installations.
        get_names = getattr(cv, "get_feature_names_out", None) or cv.get_feature_names
        feature_names = list(get_names())
        # total occurrences of every n-gram over all documents
        count_feature = np.asarray(word_count_vector.sum(axis=0)).ravel()

        records = []
        for feature_idx, word in enumerate(feature_names):
            good_num, good_sum = _word_stats(word, good_docs, good_companies_score_training)
            bad_num, bad_sum = _word_stats(word, bad_docs, bad_companies_score_training)

            records.append({
                "word": word,
                # mean score over the documents that contain the n-gram
                "good_score": good_sum / good_num if good_num else 0,
                "bad_score": bad_sum / bad_num if bad_num else 0,
                # same sums, averaged over every document of the group
                "good_score_all": good_sum / len(good_docs),
                "bad_score_all": bad_sum / len(bad_docs),
                "count": count_feature[feature_idx],
                "good_nums": good_num,
                "bad_nums": bad_num,
            })

        # Build the table once, after the loop; explicit columns keep the
        # frame well-formed even when there are no features.
        df = pd.DataFrame(records, columns=stat_columns)
        df["diff"] = abs(df["good_nums"] - df["bad_nums"])
        df = df.sort_values("diff", ascending=False)

        df.round(2).to_csv("data/training_goodvbad/{}_{}_{}_n{}-{}.csv".format(s[:8], t[:3], alpha, n_min, n_max))