In [135]:
import pandas as pd
import numpy as np
import os
import torch
import torch.nn as nn
import shutil
import sys  
from transformers import BertTokenizer, BertModel, BertForSequenceClassification, AdamW
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report

In [136]:
# Load the data
news_data_cleaned = pd.read_csv('news_data.csv')

# Clean the data
news_data_cleaned['text'] = news_data_cleaned['text'].fillna('')
news_data_cleaned['categories'] = news_data_cleaned['categories'].fillna('')

# Split categories into lists
news_data_cleaned['categories_split'] = news_data_cleaned['categories'].apply(lambda x: x.split(','))

# One-hot encode the categories using MultiLabelBinarizer
mlb = MultiLabelBinarizer()
categories_onehot = mlb.fit_transform(news_data_cleaned['categories_split'])
categories_df = pd.DataFrame(categories_onehot, columns=mlb.classes_)

categories = [
    'Greenhouse gas emissions', 'Carbon footprint', 'Exposure to companies active in the fossil fuel sector',
    'Exposure to fossil fuels through real estate assets', 'Non-renewable energy consumption and production',
    'Exposure to energy-inefficient real estate assets', 'Biodiversity/Nature Capital', 'Water', 'Pollution',
    'Waste', 'Chemical', 'Land/agricultural practice', 'Oceans/seas practices', 'Raw materials consumption',
    'UNGC principles', 'OECD guidelines', 'Workplace safety', 'Supplier management', 'Grievance mechanism',
    'Whistleblower protection', 'Discrimination', 'Human Rights', 'Gender', 'Exposure to controversial weapons',
    'Excessive CEO pay ratio', 'Corruption and bribery', 'Management structure', 'Employee relations',
    'Remuneration of staff', 'Tax compliance'
]

full_data = pd.concat([news_data_cleaned[['text']], categories_df], axis=1)

for category in categories:
    if category not in full_data.columns:
        full_data[category] = 0

In [137]:
categories_df = pd.DataFrame(categories_onehot, columns=mlb.classes_)
most_common_tags = pd.DataFrame(categories_df.sum(axis=0)).reset_index()
most_common_tags.columns = ['Tag', 'count']
most_common_tags = most_common_tags[most_common_tags['count'] > 0]

# Calculate class weights for non-zero categories
most_common_tags['class_weight'] = len(categories_df) / most_common_tags['count']
class_weight = {}
filtered_categories = []

for index, label in enumerate(categories):
    if label in most_common_tags['Tag'].values:
        class_weight[index] = most_common_tags[most_common_tags['Tag'] == label]['class_weight'].values[0]
        filtered_categories.append(label)  
filtered_data = full_data[['text'] + filtered_categories]
print(most_common_tags.head())



                                                 Tag  count  class_weight
0                        Biodiversity/Nature Capital      8     31.375000
1                                   Carbon footprint      7     35.857143
2                                           Chemical     20     12.550000
3                             Corruption and bribery      8     31.375000
4   Exposure to companies active in the fossil fu...      2    125.500000


In [138]:
# Hyperparameters
MAX_LEN = 256
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 32
EPOCHS = 5
LEARNING_RATE = 1e-05
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [140]:
# Custom Dataset Class for filtered categories
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, max_len, active_categories):
        self.tokenizer = tokenizer
        self.df = df
        self.text = df['text']
        self.targets = self.df[active_categories].values
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs["token_type_ids"].flatten(),
            'targets': torch.FloatTensor(self.targets[index]),
            'text': text  
        }
        
# Split the full dataset into 80% train, 10% validation, and 10% test
train_data, remaining_data = train_test_split(filtered_data, test_size=0.2, random_state=42)
val_data, test_data = train_test_split(remaining_data, test_size=0.5, random_state=42)

# Reset the index for all the datasets
train_data = train_data.reset_index(drop=True)
val_data = val_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

# Create Dataset instances using filtered categories
train_dataset = CustomDataset(train_data, tokenizer, MAX_LEN, filtered_categories)
valid_dataset = CustomDataset(val_data, tokenizer, MAX_LEN, filtered_categories)
test_dataset = CustomDataset(test_data, tokenizer, MAX_LEN, filtered_categories)

# DataLoaders for batching
train_data_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)
val_data_loader = DataLoader(valid_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0)
test_data_loader = DataLoader(test_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0)

# Check the sizes of your training, validation, and test sets
print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(valid_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

Train dataset size: 200
Validation dataset size: 25
Test dataset size: 26


In [142]:
# Define BERT-based model class
class BERTClass(torch.nn.Module):
    def __init__(self, num_active_categories):
        super(BERTClass, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = torch.nn.Dropout(0.3)
        self.linear = torch.nn.Linear(768, num_active_categories)  
    
    def forward(self, input_ids, attention_mask, token_type_ids):
        _, output = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, return_dict=False)
        output = self.dropout(output)
        output = self.linear(output)
        return output

# Create model with the number of output categories
num_active_categories = len(filtered_categories)  
model = BERTClass(num_active_categories).to(device)


In [143]:
num_active_categories = len(filtered_categories)
model = BERTClass(num_active_categories)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = model.to(device)
class_weight_tensor = torch.FloatTensor(list(class_weight.values())).to(device)

# Define the loss function with class weights
def loss_fn(outputs, targets):
    return nn.BCEWithLogitsLoss(weight=class_weight_tensor)(outputs, targets)

# Define the optimizer
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)


In [144]:
def load_ckp(checkpoint_fpath, model, optimizer):
    """
    checkpoint_path: path to save checkpoint
    model: model that we want to load checkpoint parameters into       
    optimizer: optimizer we defined in previous training
    """
    checkpoint = torch.load(checkpoint_fpath)
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    valid_loss_min = checkpoint['valid_loss_min']
    return model, optimizer, checkpoint['epoch'], valid_loss_min.item()

def save_ckp(state, is_best, checkpoint_path, best_model_path):
    """
    state: checkpoint we want to save
    is_best: is this the best checkpoint; min validation loss
    checkpoint_path: path to save checkpoint
    best_model_path: path to save best model
    """
    f_path = checkpoint_path
    torch.save(state, f_path)
    if is_best:
        best_fpath = best_model_path
        shutil.copyfile(f_path, best_fpath)

In [145]:
def train_model(n_epochs, training_loader, validation_loader, model, 
                optimizer, checkpoint_path, best_model_path):
   
    # Initialize tracker for minimum validation loss
    valid_loss_min = np.Inf

    for epoch in range(1, n_epochs+1):
        train_loss = 0
        valid_loss = 0

        model.train()
        print(f'############# Epoch {epoch}: Training Start #############')

        for batch_idx, data in enumerate(training_loader):
            ids = data['input_ids'].to(device, dtype=torch.long)
            mask = data['attention_mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.float)

            outputs = model(ids, mask, token_type_ids)

            optimizer.zero_grad()  
            loss = loss_fn(outputs, targets)
            
            loss.backward()  
            optimizer.step() 
            
            train_loss += ((1 / (batch_idx + 1)) * (loss.item() - train_loss))  

        print(f'############# Epoch {epoch}: Training End #############')
        print(f'############# Epoch {epoch}: Validation Start #############')

        # Validation phase
        model.eval()  

        val_targets = []  
        val_outputs = []  

        with torch.no_grad():  #
            for batch_idx, data in enumerate(validation_loader):
                ids = data['input_ids'].to(device, dtype=torch.long)
                mask = data['attention_mask'].to(device, dtype=torch.long)
                token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
                targets = data['targets'].to(device, dtype=torch.float)

                outputs = model(ids, mask, token_type_ids)
                loss = loss_fn(outputs, targets)
                valid_loss += ((1 / (batch_idx + 1)) * (loss.item() - valid_loss))  

                val_targets.extend(targets.cpu().detach().numpy().tolist())  
                val_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist()) 

        print(f'############# Epoch {epoch}: Validation End #############')

        # Calculate average losses
        train_loss /= len(training_loader)
        valid_loss /= len(validation_loader)
        print(f'Epoch: {epoch} \tAverage Training Loss: {train_loss:.6f} \tAverage Validation Loss: {valid_loss:.6f}')

        # Create checkpoint variable and add important data
        checkpoint = {
            'epoch': epoch + 1,
            'valid_loss_min': valid_loss,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict()
        }
        
        # Save checkpoint
        save_ckp(checkpoint, False, checkpoint_path, best_model_path)
        
        # Save the model if validation loss has decreased
        if valid_loss <= valid_loss_min:
            print(f'Validation loss decreased ({valid_loss_min:.6f} --> {valid_loss:.6f}). Saving model ...')
            save_ckp(checkpoint, True, checkpoint_path, best_model_path)
            valid_loss_min = valid_loss

        print(f'############# Epoch {epoch} Done #############\n')

    return model

In [146]:
ckpt_path = "/Users/nicholastristan/Documents/multilabel_classification/curr_ckpt.pth"
best_model_path = "/Users/nicholastristan/Documents/multilabel_classification/best_model.pt"
directory = os.path.dirname(ckpt_path)
if not os.path.exists(directory):
    os.makedirs(directory)

In [104]:
trained_model = train_model(EPOCHS, train_data_loader, val_data_loader, model, optimizer, ckpt_path, best_model_path)



############# Epoch 1: Training Start #############
############# Epoch 1: Training End #############
############# Epoch 1: Validation Start #############
############# Epoch 1: Validation End #############
Epoch: 1 	Average Training Loss: 4.574385 	Average Validation Loss: 27.138155
Validation loss decreased (inf --> 27.138155). Saving model ...
############# Epoch 1 Done #############

############# Epoch 2: Training Start #############
############# Epoch 2: Training End #############
############# Epoch 2: Validation Start #############
############# Epoch 2: Validation End #############
Epoch: 2 	Average Training Loss: 3.593827 	Average Validation Loss: 21.578285
Validation loss decreased (27.138155 --> 21.578285). Saving model ...
############# Epoch 2 Done #############

############# Epoch 3: Training Start #############
############# Epoch 3: Training End #############
############# Epoch 3: Validation Start #############
############# Epoch 3: Validation End #############
Ep

In [152]:
def get_predictions(model, data_loader, k=3):
    model.eval()  
    all_outputs = []
    all_targets = []

    with torch.no_grad():  
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch.get('token_type_ids', None)
            if token_type_ids is not None:
                token_type_ids = token_type_ids.to(device)
            
            targets = batch['targets'].to(device)
            outputs = model(input_ids, attention_mask, token_type_ids)
            logits = outputs[0] if isinstance(outputs, tuple) else outputs
            probs = torch.sigmoid(logits)
            all_outputs.append(probs.cpu().numpy())
            all_targets.append(targets.cpu().numpy())

    return np.vstack(all_outputs), np.vstack(all_targets)

def precision_at_k(y_true, y_pred, k=3):
    top_k_preds = np.argsort(-y_pred, axis=1)[:, :k] 
    precision_scores = []

    for i in range(y_true.shape[0]):
        top_k_labels = top_k_preds[i]
        true_labels = np.where(y_true[i] == 1)[0]

        correct = np.intersect1d(top_k_labels, true_labels).shape[0]
        precision_scores.append(correct / k)

    return np.mean(precision_scores)

def recall_at_k(y_true, y_pred, k=3):
    top_k_preds = np.argsort(-y_pred, axis=1)[:, :k]
    recall_scores = []

    for i in range(y_true.shape[0]):
        top_k_labels = top_k_preds[i]
        true_labels = np.where(y_true[i] == 1)[0]

        correct = np.intersect1d(top_k_labels, true_labels).shape[0]
        recall_scores.append(correct / len(true_labels) if len(true_labels) > 0 else 0)

    return np.mean(recall_scores)

def ndcg_at_k(y_true, y_pred, k=3):
    def dcg_at_k(relevance_scores):
        return np.sum((2 ** relevance_scores - 1) / np.log2(np.arange(1, len(relevance_scores) + 1) + 1))

    ndcg_scores = []

    for i in range(y_true.shape[0]):
        top_k_preds = np.argsort(-y_pred[i])[:k]
        relevance_scores = y_true[i, top_k_preds]
        
        dcg = dcg_at_k(relevance_scores)
        ideal_dcg = dcg_at_k(np.sort(relevance_scores)[::-1])

        ndcg_scores.append(dcg / ideal_dcg if ideal_dcg > 0 else 0)

    return np.mean(ndcg_scores)

y_pred, y_true = get_predictions(model, val_data_loader, k=3)
precision_k = precision_at_k(y_true, y_pred, k=3)
recall_k = recall_at_k(y_true, y_pred, k=3)
ndcg_k = ndcg_at_k(y_true, y_pred, k=3)
print(f'Precision@3: {precision_k:.4f}')
print(f'Recall@3: {recall_k:.4f}')
print(f'nDCG@3: {ndcg_k:.4f}')


Precision@3: 0.0933
Recall@3: 0.1300
nDCG@3: 0.1809
