In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import json

import tensorflow as tf
import tensorflow.keras as keras
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import LabelBinarizer


# Input data files are available in the read-only "../input/" directory
# (this notebook reads them from /kaggle/input/aws-esg/ and /kaggle/input/companies/).
        

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Webscraper

In [None]:
from bs4 import BeautifulSoup
import requests


def get_tickers():
    """Fetch the ticker symbols listed on Wikipedia's S&P 500 companies page.

    Downloads the "List of S&P 500 companies" page, parses its HTML tables
    with pandas and returns the ``Symbol`` column of the first table.

    Returns:
        pandas.Series: the ``Symbol`` column of the first table on the page.

    Raises:
        requests.RequestException: if the page cannot be fetched (timeout,
            connection error or a non-success HTTP status).
    """
    from io import StringIO  # local import: only this helper needs it

    response = requests.get(
        'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies',
        timeout=10)
    # Fail with a clear HTTP error instead of trying to parse an error page.
    response.raise_for_status()
    # pandas >= 2.1 deprecates passing literal HTML to read_html; a file-like
    # wrapper is accepted by both old and new versions.
    sp_data = pd.read_html(StringIO(response.text))
    ticker_df = sp_data[0]
    return ticker_df['Symbol']


def level(x):
    """Translate a numeric controversy score into a descriptive label.

    Values equal to 0, 1, 2 and 3 each have their own label; anything else
    falls through to 'High Controversy'.
    """
    labelled_scores = (
        (0, 'No Controversy'),
        (1.0, 'Little Controversy'),
        (2.0, 'Moderate Controversy'),
        (3.0, 'Relatively High Controversy'),
    )
    # Compare with == (rather than a dict lookup) so that unhashable or
    # otherwise unusual inputs are treated exactly like the if/elif chain did.
    for known_score, label in labelled_scores:
        if x == known_score:
            return label
    return 'High Controversy'


def web_scraper(ticker):
    """Scrape the ESG figures from a ticker's Yahoo Finance sustainability page.

    Args:
        ticker: ticker symbol; it is inserted into the page URL and used as
            the index label of the returned row.

    Returns:
        pandas.DataFrame: one row indexed by ``ticker`` with the columns
        'Total ESG Score', 'Environment Score', 'Social Score',
        'Governance Score' and 'Controversy Score'. An empty DataFrame is
        returned when any of those figures is missing from the page or is
        not numeric.

    Raises:
        requests.RequestException: if the HTTP request fails or times out.
    """
    url = 'https://finance.yahoo.com/quote/'+ticker+'/sustainability?p='+ticker
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
    web_data = requests.get(url, headers=headers, timeout=5).text

    soup = BeautifulSoup(web_data, 'html.parser')
    # The elements are located purely by their CSS class strings, so a change
    # in the page markup makes the lookups below come back empty.
    esg_score = soup.find("div", {'class': 'Fz(36px) Fw(600) D(ib) Mend(5px)'})
    controversy_score = soup.find('div', {'class': 'D(ib) Fz(36px) Fw(500)'})
    scores = soup.find_all(
        'div', {'class': 'D(ib) Fz(23px) smartphone_Fz(22px) Fw(600)'})

    # All five figures are required; previously a page with fewer than three
    # pillar scores raised IndexError when the row was built.
    if esg_score is None or controversy_score is None or len(scores) < 3:
        return pd.DataFrame()

    try:
        # The page's total is subtracted from 100 and each pillar score from
        # 33.3 (presumably so that a higher stored value is better - TODO confirm).
        datapoint = 100 - int(esg_score.text)
        controversy_datapoint = int(controversy_score.text)
        elements = [round(33.3 - float(score.text), 2) for score in scores[:3]]
    except ValueError:
        # Non-numeric text is treated like missing data.
        return pd.DataFrame()

    df = pd.DataFrame({'Total ESG Score': datapoint,
                       'Environment Score': elements[0],
                       'Social Score': elements[1],
                       'Governance Score': elements[2],
                       'Controversy Score': controversy_datapoint},
                      index=[ticker])
    return df


def get_esg(ticker):
    """Return the ESG row for ``ticker`` by delegating to ``web_scraper``."""
    return web_scraper(ticker)

# Data Processing

In [None]:
# Path of the ESG news CSV under /kaggle/input.
dirname = '/kaggle/input/aws-esg/ADX_ESG_NEWS_2020-09-01.csv'

# Read the file and keep only the rows whose totalCountDailyScore is non-zero.
df = pd.read_csv(dirname).loc[lambda frame: frame['totalCountDailyScore'].ne(0)]

In [None]:
# One (ticker, companyName) pair per distinct ticker, in first-seen order.
unique = df.drop_duplicates(subset=['ticker'])
unique = unique.loc[:, ['ticker', 'companyName']]

ticker = list(unique['ticker'])
companyName = list(unique['companyName'])

result = list(zip(ticker, companyName))

# ticker -> [companyName, [scores...]] for every company with scraped scores.
companies_dict = {}

# n is one more than the number of companies stored so far; it is printed as
# a rough progress indicator for each ticker whose page could be fetched.
n = 1
for x in result:
    try:
        score = get_esg(x[0]).values.tolist()
    except (requests.RequestException, ValueError, IndexError) as err:
        # A single unreachable or malformed page must not abort the whole
        # scrape and discard everything collected so far.
        print('skipping', x[0], '-', err)
        continue
    print(n)
    if len(score) == 0:
        # Empty frame: the page had no usable ESG data for this ticker.
        continue
    n += 1
    score = score[0]
    companies_dict[x[0]] = [x[1], score]

with open('companies.json', 'w', encoding='utf-8') as f:
    json.dump(companies_dict, f, ensure_ascii=False, indent=4)

In [None]:
# Pair each row's date with its daily score.
# NOTE(review): this cell originally read from `df_new`, which is first
# assigned in a later cell, so it raised NameError on a fresh
# Restart & Run All. `df` (the filtered news frame) is the frame in scope at
# this point - confirm it is the intended source.
daily_score = list(df['totalCountDailyScore'])
date = list(df['date'])

result = list(zip(date, daily_score))

In [None]:
# Load companies.json and keep only its top-level keys, which are matched
# against the ticker column below.
with open('/kaggle/input/companies/companies.json', 'r') as f:
    data = list(json.load(f))
print(len(data))

# Rows of df whose ticker appears among those keys.
boolean_series = df['ticker'].isin(data)
filtered_df = df.loc[boolean_series]

In [None]:
# Build, for every ticker in `data`, its company name and its list of
# (date, rescaled score) pairs, then write the result to time.json.
x = {}
print(len(data))

columns = ['ticker', 'companyName', 'date', 'totalCountDailyScore']
# Split the frame once instead of re-scanning every row for each ticker.
by_ticker = {
    key: frame
    for key, frame in df.loc[df['ticker'].isin(data), columns].groupby('ticker', sort=False)
}

for name in data:
    rows = by_ticker.get(name)
    if rows is None:
        # Ticker listed in `data` but absent from df; indexing [0] into its
        # empty selection used to raise IndexError.
        continue
    # Maps a raw score s to int((1 - (s + 1) / 3.5) * 100); astype(int)
    # truncates toward zero.
    scaled = ((1 - ((rows['totalCountDailyScore'] + 1) / 3.5)) * 100).astype(int)
    # assign() returns a new frame, so nothing is written into a slice of df
    # (which triggered SettingWithCopyWarning).
    df_new = rows.assign(totalCountDailyScore=scaled)
    date = df_new['date'].values.tolist()
    score = df_new['totalCountDailyScore'].values.tolist()
    combined = list(zip(date, score))

    x[name] = {
        "companyName": df_new['companyName'].values[0],
        "values": combined
    }

print('done')

with open('time.json', 'w', encoding='utf-8') as f:
    json.dump(x, f, ensure_ascii=False, indent=4)

In [None]:
# Reload time.json from the input dataset.
with open('/kaggle/input/companies/time.json', 'r') as f:
    data = json.load(f)

names = list(data)

# For each company keep only the second element of every entry under
# "values", dropping the first one.
x = {}
for name in names:
    esg = [entry[1] for entry in data[name]['values']]
    x[name] = esg

In [3]:
from sklearn.preprocessing import MinMaxScaler

# Company whose series is modelled, and the number of consecutive past values
# that make up one training sample.
TICKER = 'GOOGL'
LOOKBACK = 60

scaler = MinMaxScaler(feature_range = (0, 1))

with open('/kaggle/input/companies/time.json', 'r') as f:
    data = json.load(f)

# Column vector holding the second element of every entry under "values";
# the scaler is fitted on it and reused later to undo the scaling.
esg = np.array([entry[1] for entry in data[TICKER]['values']]).reshape(-1, 1)
esg_scaled = scaler.fit_transform(esg)

if len(esg_scaled) <= LOOKBACK:
    # Without this check the reshape below failed with an opaque IndexError.
    raise ValueError(
        '{} has {} values; more than {} are needed to build a training window'
        .format(TICKER, len(esg_scaled), LOOKBACK))

# Sliding windows: each sample is LOOKBACK consecutive scaled values and its
# label is the value that immediately follows them.
features_set = []
labels = []
for i in range(LOOKBACK, len(esg_scaled)):
    features_set.append(esg_scaled[i-LOOKBACK:i, 0])
    labels.append(esg_scaled[i, 0])

features_set, labels = np.array(features_set), np.array(labels)
# Add a trailing axis of size 1: (samples, LOOKBACK) -> (samples, LOOKBACK, 1).
features_set = np.reshape(features_set, (features_set.shape[0], features_set.shape[1], 1))

# Creating Model

In [4]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout

# Four stacked 50-unit LSTM layers, each followed by 20% dropout, feeding a
# single-unit dense output. Every LSTM except the last returns its full
# sequence so that the next LSTM receives one input per timestep.
model = Sequential([
    LSTM(units=50, return_sequences=True, input_shape=(features_set.shape[1], 1)),
    Dropout(0.2),
    LSTM(units=50, return_sequences=True),
    Dropout(0.2),
    LSTM(units=50, return_sequences=True),
    Dropout(0.2),
    LSTM(units=50),
    Dropout(0.2),
    Dense(units = 1),
])
model.compile(optimizer = 'adam', loss = 'mean_squared_error')

In [5]:
# Train on the windowed samples for 30 epochs with mini-batches of 32.
model.fit(
    x=features_set,
    y=labels,
    batch_size=32,
    epochs=30,
)

In [6]:
# Build the single inference window from the most recent scaled values.
# The original sliced test_inputs[i-64:i] with i == 60, i.e. [-4:60], which
# kept only the last 4 values instead of a full window.
lookback = features_set.shape[1]  # same window length the model was trained on

test_inputs = esg_scaled[-lookback:]
test_inputs = test_inputs.reshape(-1, 1)

# One sample containing every value of the window, oldest first.
test_features = [test_inputs[:, 0]]

In [7]:
# Stack the windows into an array and add a trailing axis of size 1:
# (windows, timesteps) -> (windows, timesteps, 1).
test_features = np.array(test_features)
n_windows, n_steps = test_features.shape[0], test_features.shape[1]
test_features = test_features.reshape(n_windows, n_steps, 1)

In [8]:
# Run the model on the prepared window(s) and undo the min-max scaling so the
# result is expressed in the same range as the values the scaler was fitted on.
predictions = scaler.inverse_transform(model.predict(test_features))
predictions