In [3]:
from pathlib import Path

import pandas as pd
import torch
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, BertForSequenceClassification, AdamW

### Load the data

In [4]:
# Read the train and test CSVs (in that order) from the processed-data folder,
# then show the training frame as the cell output.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../data/processed/train_esg_shortened.csv',
        '../../data/processed/test_esg_shortened.csv',
    )
)

train

Unnamed: 0,symbol,year,quarter,transcript_esg,esg_score,esg_risk_level
0,A,2022,1,thank emily welcome everyone agilents conferen...,15.0,Low
1,A,2022,3,thank hannah welcome everyone agilents confere...,15.0,Low
2,AAPL,2022,1,good day welcome apple q fy earnings conferenc...,17.0,Low
3,AAPL,2022,2,good day welcome apple q fy earnings conferenc...,17.0,Low
4,AAPL,2022,3,good day welcome apple q fy earnings conferenc...,17.0,Low
...,...,...,...,...,...,...
542,WMT,2023,3,followed questionandanswer session turn call d...,25.0,Medium
543,YUM,2022,1,welcome q yum brand earnings conference call n...,21.0,Medium
544,YUM,2022,2,get started would like remind conference call ...,21.0,Medium
545,ZTS,2022,1,thank operator good morning everyone welcome z...,18.0,Low


In [5]:
# Collect the distinct values of the 'esg_risk_level' column and count them
unique_classes = train['esg_risk_level'].unique()
num_classes = len(unique_classes)

# One print call, one message per line
print(
    f"Number of unique ESG risk levels: {num_classes}",
    f"Unique classes: {unique_classes}",
    sep="\n",
)

Number of unique ESG risk levels: 5
Unique classes: ['Low' 'Medium' 'Severe' 'High' 'Negligible']


### Load tokenizer and encode texts

In [3]:
# Load tokenizer and encode texts
tokenizer = AutoTokenizer.from_pretrained('nbroad/ESG-BERT')


def encode_texts(texts):
    """Tokenize `texts` with the module-level `tokenizer`.

    The call enables padding and truncation, caps sequences at 512 tokens,
    and asks for PyTorch tensors ("pt") in the returned encoding.
    """
    tokenizer_options = {
        "padding": True,
        "truncation": True,
        "max_length": 512,
        "return_tensors": "pt",
    }
    return tokenizer(texts, **tokenizer_options)

### Encode and handle labels

In [4]:
# Encode and handle labels
# The encoder is fitted on the training labels only and then reused for test.
# NOTE: the label column is overwritten with integer codes, so re-running this
# cell without reloading the data would refit the encoder on those codes.
label_encoder = LabelEncoder()
train['esg_risk_level'] = label_encoder.fit_transform(train['esg_risk_level'])
test['esg_risk_level'] = label_encoder.transform(test['esg_risk_level'])

# Replace missing transcripts with empty strings. The result is assigned back
# explicitly: `df[col].fillna(..., inplace=True)` is chained assignment, which
# under pandas Copy-on-Write modifies a temporary and leaves the frame unchanged.
train['transcript_esg'] = train['transcript_esg'].fillna('')
test['transcript_esg'] = test['transcript_esg'].fillna('')

# Tokenize every transcript of each split in a single call.
train_encodings = encode_texts(train['transcript_esg'].tolist())
test_encodings = encode_texts(test['transcript_esg'].tolist())


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


### Dataset Preparation

In [5]:
class ESGDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with class labels.

    `encodings` is a mapping from field name to an indexable tensor; `labels`
    is converted to a `torch.long` tensor when the dataset is built.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Stored as int64 up front so no per-item conversion is needed later.
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Copy each encoded field for this row, then attach its label.
        sample = {}
        for field_name, field_values in self.encodings.items():
            sample[field_name] = field_values[idx].clone().detach()
        sample['labels'] = self.labels[idx]
        return sample

### Prepare datasets

In [6]:
# Pair each split's encodings with its 'esg_risk_level' column as labels.
train_dataset = ESGDataset(
    encodings=train_encodings,
    labels=train['esg_risk_level'].tolist(),
)
test_dataset = ESGDataset(
    encodings=test_encodings,
    labels=test['esg_risk_level'].tolist(),
)

### Model setup with class weights to handle imbalance


In [7]:
# Number of target classes, taken from the fitted label encoder.
num_classes = len(label_encoder.classes_)

# Inverse-frequency class weights, rescaled so that they sum to num_classes.
# value_counts() is computed once and then looked up per encoded class id.
label_counts = train['esg_risk_level'].value_counts()
class_weights = torch.tensor(
    [1.0 / label_counts[i] for i in range(num_classes)],
    dtype=torch.float,  # Explicitly define as float
)
class_weights = class_weights / class_weights.sum() * num_classes

# Load the checkpoint with a classification head sized for num_classes;
# ignore_mismatched_sizes lets the differently-shaped checkpoint head be skipped.
model = BertForSequenceClassification.from_pretrained(
    'nbroad/ESG-BERT',
    num_labels=num_classes,
    ignore_mismatched_sizes=True,
)
# Replace the head with a freshly constructed linear layer of the same shape.
model.classifier = torch.nn.Linear(model.config.hidden_size, num_classes)

# torch.optim.AdamW is used instead of the deprecated transformers.AdamW.
# eps and weight_decay are passed explicitly to match the defaults of the
# transformers implementation (torch's own defaults are 1e-8 and 0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# Weighted cross-entropy; the weights are placed on the CPU here.
criterion = torch.nn.CrossEntropyLoss(weight=class_weights.to(torch.device("cpu")))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nbroad/ESG-BERT and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([26]) in the checkpoint and torch.Size([5]) in the model instantiated
- classifier.weight: found shape torch.Size([26, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Data loader

In [8]:
# Target device for the model and the batches.
device = torch.device("cpu")  # Use "cuda" if you have GPU

# Shuffled training batches of 16 examples.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)

# Trailing expression: moves the model to `device` and displays it as the cell output.
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

### Training with class weights

In [9]:

# Training Loop
# The loss module is created with its class weights on the CPU. Moving it to
# `device` keeps those weights next to the logits if `device` is changed to a GPU;
# on CPU this is a no-op.
criterion.to(device)

model.train()
for epoch in range(3):
    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Labels are not passed to the model: the loss is computed outside it
        # so that the class-weighted criterion is used.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits.float(), labels)  # Ensure logits are FloatTensor

        loss.backward()
        optimizer.step()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Training Epoch 1: 100%|██████████| 35/35 [08:51<00:00, 15.18s/it]
Training Epoch 2: 100%|██████████| 35/35 [08:52<00:00, 15.21s/it]
Training Epoch 3: 100%|██████████| 35/35 [09:16<00:00, 15.89s/it]


### Evaluation

In [10]:

def evaluate_and_save_results(model, dataset, data, filename, batch_size=16):
    """Predict labels for `dataset`, print the accuracy and save the results as CSV.

    Args:
        model: Model whose forward output exposes `.logits`.
        dataset: Dataset yielding dicts with 'input_ids' and 'attention_mask'.
        data: DataFrame with one row per dataset item, in the same order, and an
            'esg_risk_level' column comparable with the predicted class ids.
            It is modified in place: a 'predicted_esg_risk_level' column is added.
        filename: Path of the CSV file to write.
        batch_size: Number of items per inference batch (default 16).

    Returns:
        The accuracy of the predictions against data['esg_risk_level'].

    Raises:
        ValueError: If `dataset` and `data` do not have the same length.
    """
    # Fail before running inference rather than at the column assignment below.
    if len(dataset) != len(data):
        raise ValueError(
            f"dataset has {len(dataset)} items but data has {len(data)} rows"
        )

    # Send batches to wherever the model's parameters live, so the function
    # does not depend on a separately maintained global device variable.
    model_device = next(model.parameters()).device

    model.eval()
    loader = DataLoader(dataset, batch_size=batch_size)  # no shuffling: order must match `data`
    all_predictions = []
    with torch.no_grad():
        for batch in loader:
            input_ids = batch['input_ids'].to(model_device)
            attention_mask = batch['attention_mask'].to(model_device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            predictions = outputs.logits.argmax(dim=-1).cpu().numpy()
            all_predictions.extend(predictions)

    # Convert numerical predictions back to label names using the inverse transformation of the label encoder
    predicted_labels = label_encoder.inverse_transform(all_predictions)

    # Append predictions to the DataFrame
    data['predicted_esg_risk_level'] = predicted_labels

    # Calculate accuracy (both sides are class ids here, not label names)
    accuracy = accuracy_score(data['esg_risk_level'], all_predictions)
    print(f"Accuracy for {filename}: {accuracy}")

    # Save to CSV
    data.to_csv(filename, index=False)
    print(f"Results saved to {filename}")
    return accuracy

### Compute accuracies

In [11]:
# Evaluate on both splits; a copy of each frame is passed so the originals
# do not receive the prediction column.
train_accuracy = evaluate_and_save_results(
    model=model,
    dataset=train_dataset,
    data=train.copy(),
    filename='../../data/processed/train_results.csv',
)
test_accuracy = evaluate_and_save_results(
    model=model,
    dataset=test_dataset,
    data=test.copy(),
    filename='../../data/processed/test_results.csv',
)

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Accuracy for ../../data/processed/train_results.csv: 0.5137111517367459
Results saved to ../../data/processed/train_results.csv
Accuracy for ../../data/processed/test_results.csv: 0.463768115942029
Results saved to ../../data/processed/test_results.csv


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


### Save the model

In [12]:
# Save the model
model_save_path = "../../model/trained_ESG_BERT.pth"

# Create the destination folder if needed so the save cannot fail on a
# missing directory after training has already completed.
Path(model_save_path).parent.mkdir(parents=True, exist_ok=True)

# Store model and optimizer state together in one checkpoint file.
torch.save(
    {
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    },
    model_save_path,
)