In [20]:
import pandas as pd 
import string
from nltk.tokenize import word_tokenize 
import re
from nltk.corpus import stopwords
import nltk
from nltk.stem import WordNetLemmatizer

In [21]:
nltk.download('stopwords')
nltk.download('punkt') 
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/owenwong/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/owenwong/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/owenwong/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [22]:
train = pd.read_csv('../../data/processed/train.csv')
test = pd.read_csv('../../data/processed/test.csv')

train

Unnamed: 0,symbol,year,quarter,transcript_esg,esg_score,esg_risk_level
0,A,2022,1,"Thank you Emily, and welcome everyone to Agile...",15.0,Low
1,A,2022,3,"Thank you, Hannah, and welcome, everyone, to A...",15.0,Low
2,AAPL,2022,1,"Good day, and welcome to the Apple Q1 FY 2022 ...",17.0,Low
3,AAPL,2022,2,"Good day, and welcome to the Apple Q2 FY 2022 ...",17.0,Low
4,AAPL,2022,3,"Good day, and welcome to the Apple Q3 FY 2022 ...",17.0,Low
...,...,...,...,...,...,...
603,WYNN,2022,4,Here we are three years in the global pandemic...,26.0,
604,YUM,2022,1,Welcome to the Q1 2022 Yum! Brands Earnings co...,21.0,Medium
605,YUM,2022,2,"Before we get started, I would like to remind ...",21.0,Medium
606,ZTS,2022,1,"Thank you, operator. Good morning, everyone, a...",18.0,Low


In [23]:
len(train['transcript_esg'][1].split(" "))

962

In [24]:
stop_words = set(stopwords.words('english'))

In [25]:
def convert_to_lowercase(msg):
    '''
    aim: change all words to lower case
    '''
    return msg.lower()


def remove_punctuation(msg):
    '''
    aim: remove all the punctuation from the tweet given
    Punctuations are characters other than alphaters and digits.
    '''
    return msg.translate(str.maketrans('', '', string.punctuation))

def remove_stopwords(msg):
    '''
    aim: remove all stopwords in the tweets
    '''
    word_tokens = word_tokenize(msg)
    filtered_tweet = [word for word in word_tokens if word not in stop_words]
    return ' '.join(filtered_tweet)

def remove_urls(msg):
    '''
    aim: remove all the urls contained inside the tweets
    '''
    return re.sub(r'http\S+|www\S+|https\S+', '', msg, flags=re.MULTILINE)

def remove_numbers(msg): 
    return re.sub(r'\d+', '', msg) 

def clean_transcript(msg): 
    if not isinstance(msg, str): 
        return ""
    msg = remove_numbers(msg) 
    msg = convert_to_lowercase(msg)
    msg = remove_urls(msg)
    msg = remove_punctuation(msg)
    msg = remove_stopwords(msg)
    return msg

In [26]:
train['transcript_esg'] = train['transcript_esg'].apply(clean_transcript)
test['transcript_esg'] = test['transcript_esg'].apply(clean_transcript)
train.head(5)

Unnamed: 0,symbol,year,quarter,transcript_esg,esg_score,esg_risk_level
0,A,2022,1,thank emily welcome everyone agilents conferen...,15.0,Low
1,A,2022,3,thank hannah welcome everyone agilents confere...,15.0,Low
2,AAPL,2022,1,good day welcome apple q fy earnings conferenc...,17.0,Low
3,AAPL,2022,2,good day welcome apple q fy earnings conferenc...,17.0,Low
4,AAPL,2022,3,good day welcome apple q fy earnings conferenc...,17.0,Low


In [27]:
len(train['transcript_esg'][1].split(" "))

538

In [28]:
def lemmatization(tweet):
    '''
    aim: perform lemmatization on the text
    '''
    words = nltk.word_tokenize(tweet)
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(lemmatized_words)

lemmatizer = WordNetLemmatizer() 

# Applying stemming and lemmatization to compare the output
train['transcript_esg'] = train['transcript_esg'].apply(lemmatization)
train

Unnamed: 0,symbol,year,quarter,transcript_esg,esg_score,esg_risk_level
0,A,2022,1,thank emily welcome everyone agilents conferen...,15.0,Low
1,A,2022,3,thank hannah welcome everyone agilents confere...,15.0,Low
2,AAPL,2022,1,good day welcome apple q fy earnings conferenc...,17.0,Low
3,AAPL,2022,2,good day welcome apple q fy earnings conferenc...,17.0,Low
4,AAPL,2022,3,good day welcome apple q fy earnings conferenc...,17.0,Low
...,...,...,...,...,...,...
603,WYNN,2022,4,three year global pandemic later wynn la vega ...,26.0,
604,YUM,2022,1,welcome q yum brand earnings conference call n...,21.0,Medium
605,YUM,2022,2,get started would like remind conference call ...,21.0,Medium
606,ZTS,2022,1,thank operator good morning everyone welcome z...,18.0,Low


In [29]:
train = train.dropna(subset=['transcript_esg', 'esg_score'])
test = test.dropna(subset=['transcript_esg', 'esg_score'])

### Importing Libraries and Loading the Tokenizer 

In [30]:
from transformers import AutoTokenizer

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained('nbroad/ESG-BERT')

### Encoding the Text data 

In [31]:
# padding ensures all sequences are the same length, truncation cuts off texts longer than BERT's maximum input length, and return_tensors specifies tensor output.

def encode_texts(texts):
    return tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt")

# Encode the datasets
train_encodings = encode_texts(train['transcript_esg'].tolist())
test_encodings = encode_texts(test['transcript_esg'].tolist())

### Dataset Preparation

In [32]:
import torch
from torch.utils.data import Dataset 

class ESGDataset(Dataset):
    def __init__(self, encodings, scores):
        self.encodings = encodings
        self.scores = scores

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.scores[idx], dtype=torch.float)
        return item

    def __len__(self):
        return len(self.scores)

train_dataset = ESGDataset(train_encodings, train['esg_score'].tolist())
test_dataset = ESGDataset(test_encodings, test['esg_score'].tolist())

### Load and Modify the Model

In [33]:
from transformers import BertForSequenceClassification

# Load the model
model = BertForSequenceClassification.from_pretrained('nbroad/ESG-BERT')  

# For regression, we need to remove the classification head that outputs logits for classes
# We set it to 1 so it only output 1 single continuous value 
model.classifier = torch.nn.Linear(model.config.hidden_size, 1)
model.num_labels = 1

### Training Setup 

In [34]:
from transformers import AdamW
from torch.utils.data import DataLoader
from tqdm import tqdm
import torch

from sklearn.metrics import mean_squared_error

# Load optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# DataLoader
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)




BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

### Training Loop

In [35]:
# Training loop
model.train()
for epoch in range(3):  # number of epochs
    print(f"Epoch {epoch+1}")
    # Wrap the train_loader with tqdm for a progress bar
    progress_bar = tqdm(train_loader, desc="Training", leave=False)
    for batch in progress_bar:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        progress_bar.set_postfix({'loss': loss.item()})

Epoch 1


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
                                                

KeyboardInterrupt: 

### Save and Evaluate the Model

In [None]:
# Save the model
# It is a good practice to save a checkpoint at the end of training that includes
# the model state, optimizer state, and any other relevant information.
model_save_path = "../../model/trained_ESG_BERT.pth"  # Define your path here
torch.save({
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
}, model_save_path)
print(f"Model saved to {model_save_path}")

model.eval()
predictions = []

# Wrap the test loader in tqdm for a progress bar
test_loader = DataLoader(test_dataset, batch_size=16)
progress_bar = tqdm(test_loader, desc="Evaluating", leave=False)
with torch.no_grad():
    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits.squeeze(-1).cpu().numpy()
        predictions.extend(logits)

test['predicted_esg_score'] = predictions
test

In [None]:
mse = mean_squared_error(test['esg_score'], test['predicted_esg_score'])
print(f"Test MSE: {mse}")

In [None]:
test.to_csv('test_with_predictions.csv', index=False)
print("Results saved with predictions.")

In [None]:
mean_esg_score = test['esg_score'].mean()
min_esg_score = test['esg_score'].min()
max_esg_score = test['esg_score'].max()

print("Mean ESG Score:", mean_esg_score)
print("Range of ESG Scores:", max_esg_score - min_esg_score)

Mean ESG Score: 20.692307692307693
Range of ESG Scores: 27.0


## TO-DO 

1. Currently the MSE is not good(78.X), perhaps can make it better by hyperparameter finetuning (on lr, batch size, num of epoches, optimizer, parameter, regularization, etc. )