In [3]:
import pandas as pd
import numpy as np
import os
import torch
import torch.nn as nn
import shutil
import sys  
from transformers import BertTokenizer, BertModel, BertForSequenceClassification, AdamW
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report

In [4]:
# Load the data
news_data_cleaned = pd.read_csv('news_data.csv')

# Clean the data
news_data_cleaned['text'] = news_data_cleaned['text'].fillna('')
news_data_cleaned['categories'] = news_data_cleaned['categories'].fillna('')

# Split categories into lists
news_data_cleaned['categories_split'] = news_data_cleaned['categories'].apply(lambda x: x.split(','))

# One-hot encode the categories using MultiLabelBinarizer
mlb = MultiLabelBinarizer()
categories_onehot = mlb.fit_transform(news_data_cleaned['categories_split'])
categories_df = pd.DataFrame(categories_onehot, columns=mlb.classes_)

categories = [
    'Greenhouse gas emissions', 'Carbon footprint', 'Exposure to companies active in the fossil fuel sector',
    'Exposure to fossil fuels through real estate assets', 'Non-renewable energy consumption and production',
    'Exposure to energy-inefficient real estate assets', 'Biodiversity/Nature Capital', 'Water', 'Pollution',
    'Waste', 'Chemical', 'Land/agricultural practice', 'Oceans/seas practices', 'Raw materials consumption',
    'UNGC principles', 'OECD guidelines', 'Workplace safety', 'Supplier management', 'Grievance mechanism',
    'Whistleblower protection', 'Discrimination', 'Human Rights', 'Gender', 'Exposure to controversial weapons',
    'Excessive CEO pay ratio', 'Corruption and bribery', 'Management structure', 'Employee relations',
    'Remuneration of staff', 'Tax compliance'
]

full_data = pd.concat([news_data_cleaned[['text']], categories_df], axis=1)

for category in categories:
    if category not in full_data.columns:
        full_data[category] = 0

FileNotFoundError: [Errno 2] No such file or directory: 'news_data.csv'

In [547]:
# Hyperparameters
MAX_LEN = 256
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 32
EPOCHS = 5
LEARNING_RATE = 1e-05
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [537]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.df = df
        self.title = df['text']  
        self.targets = self.df[categories].values
        self.max_len = max_len

    def __len__(self):
        return len(self.title)

    def __getitem__(self, index):
        title = str(self.title[index])
        title = " ".join(title.split())

        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs["token_type_ids"].flatten(),
            'targets': torch.FloatTensor(self.targets[index]),
            'text': title  
        }

In [538]:
# Split the full dataset into 80% train, 10% validation, and 10% test
train_data, remaining_data = train_test_split(full_data, test_size=0.2, random_state=42)
val_data, test_data = train_test_split(remaining_data, test_size=0.5, random_state=42)

# Reset the index for all the datasets
train_data = train_data.reset_index(drop=True)
val_data = val_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

# Create Dataset instances
train_dataset = CustomDataset(train_data, tokenizer, MAX_LEN)
valid_dataset = CustomDataset(val_data, tokenizer, MAX_LEN)
test_dataset = CustomDataset(test_data, tokenizer, MAX_LEN)

# DataLoaders for batching
train_data_loader = torch.utils.data.DataLoader(train_dataset, 
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,  # Keep shuffling for training
    num_workers=0
)

val_data_loader = torch.utils.data.DataLoader(valid_dataset, 
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,  # No need to shuffle validation/test sets
    num_workers=0
)

test_data_loader = torch.utils.data.DataLoader(test_dataset, 
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=0
)

# Check the sizes of your training, validation, and test sets
print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(valid_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

Train dataset size: 200
Validation dataset size: 25
Test dataset size: 26


In [None]:
# Compute class weights
y_integers = train_data[categories].values.flatten()
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_integers), y=y_integers)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

In [539]:
# Define BERT-based model class
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = torch.nn.Dropout(0.3)
        self.linear = torch.nn.Linear(768, len(categories))
    
    def forward(self, input_ids, attention_mask, token_type_ids):
        _, output = self.bert(
            input_ids=input_ids, 
            attention_mask=attention_mask, 
            token_type_ids=token_type_ids, 
            return_dict=False
        )
        output = self.dropout(output)
        output = self.linear(output)
        return output

In [540]:
model = BERTClass()
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = model.to(device)

def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [541]:
def load_ckp(checkpoint_fpath, model, optimizer):
    """
    checkpoint_path: path to save checkpoint
    model: model that we want to load checkpoint parameters into       
    optimizer: optimizer we defined in previous training
    """
    checkpoint = torch.load(checkpoint_fpath)
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    valid_loss_min = checkpoint['valid_loss_min']
    return model, optimizer, checkpoint['epoch'], valid_loss_min.item()

def save_ckp(state, is_best, checkpoint_path, best_model_path):
    """
    state: checkpoint we want to save
    is_best: is this the best checkpoint; min validation loss
    checkpoint_path: path to save checkpoint
    best_model_path: path to save best model
    """
    f_path = checkpoint_path
    torch.save(state, f_path)
    if is_best:
        best_fpath = best_model_path
        shutil.copyfile(f_path, best_fpath)

In [548]:
def train_model(n_epochs, training_loader, validation_loader, model, 
                optimizer, checkpoint_path, best_model_path):
   
    # Initialize tracker for minimum validation loss
    valid_loss_min = np.Inf

    for epoch in range(1, n_epochs+1):
        train_loss = 0
        valid_loss = 0

        model.train()
        print(f'############# Epoch {epoch}: Training Start #############')

        for batch_idx, data in enumerate(training_loader):
            ids = data['input_ids'].to(device, dtype=torch.long)
            mask = data['attention_mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.float)

            outputs = model(ids, mask, token_type_ids)

            optimizer.zero_grad()  
            loss = loss_fn(outputs, targets)
            
            loss.backward()  
            optimizer.step() 
            
            train_loss += ((1 / (batch_idx + 1)) * (loss.item() - train_loss))  

        print(f'############# Epoch {epoch}: Training End #############')
        print(f'############# Epoch {epoch}: Validation Start #############')

        # Validation phase
        model.eval()  # Set model to evaluation mode

        val_targets = []  
        val_outputs = []  

        with torch.no_grad():  # Disable gradient calculation for validation
            for batch_idx, data in enumerate(validation_loader):
                ids = data['input_ids'].to(device, dtype=torch.long)
                mask = data['attention_mask'].to(device, dtype=torch.long)
                token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
                targets = data['targets'].to(device, dtype=torch.float)

                outputs = model(ids, mask, token_type_ids)
                loss = loss_fn(outputs, targets)
                valid_loss += ((1 / (batch_idx + 1)) * (loss.item() - valid_loss))  # Update average validation loss

                val_targets.extend(targets.cpu().detach().numpy().tolist())  # Store true labels
                val_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())  # Store model predictions

        print(f'############# Epoch {epoch}: Validation End #############')

        # Calculate average losses
        train_loss /= len(training_loader)
        valid_loss /= len(validation_loader)

        # Print training/validation statistics
        print(f'Epoch: {epoch} \tAverage Training Loss: {train_loss:.6f} \tAverage Validation Loss: {valid_loss:.6f}')

        # Create checkpoint variable and add important data
        checkpoint = {
            'epoch': epoch + 1,
            'valid_loss_min': valid_loss,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict()
        }
        
        # Save checkpoint
        save_ckp(checkpoint, False, checkpoint_path, best_model_path)
        
        # Save the model if validation loss has decreased
        if valid_loss <= valid_loss_min:
            print(f'Validation loss decreased ({valid_loss_min:.6f} --> {valid_loss:.6f}). Saving model ...')
            save_ckp(checkpoint, True, checkpoint_path, best_model_path)
            valid_loss_min = valid_loss

        print(f'############# Epoch {epoch} Done #############\n')

    return model

In [543]:
ckpt_path = "/Users/nicholastristan/Documents/multilabel_classification/curr_ckpt.pth"
best_model_path = "/Users/nicholastristan/Documents/multilabel_classification/best_model.pt"
directory = os.path.dirname(ckpt_path)
if not os.path.exists(directory):
    os.makedirs(directory)

In [549]:
trained_model = train_model(EPOCHS, train_data_loader, val_data_loader, model, optimizer, ckpt_path, best_model_path)



############# Epoch 1: Training Start #############
############# Epoch 1: Training End #############
############# Epoch 1: Validation Start #############
############# Epoch 1: Validation End #############
Epoch: 1 	Average Training Loss: 0.075976 	Average Validation Loss: 0.499702
Validation loss decreased (inf --> 0.499702). Saving model ...
############# Epoch 1 Done #############

############# Epoch 2: Training Start #############
############# Epoch 2: Training End #############
############# Epoch 2: Validation Start #############
############# Epoch 2: Validation End #############
Epoch: 2 	Average Training Loss: 0.070458 	Average Validation Loss: 0.467732
Validation loss decreased (0.499702 --> 0.467732). Saving model ...
############# Epoch 2 Done #############

############# Epoch 3: Training Start #############
############# Epoch 3: Training End #############
############# Epoch 3: Validation Start #############
############# Epoch 3: Validation End #############
Epoch: 

In [556]:
# Function to make predictions on the test data
def predict_on_test_data(test_data_loader, model, device, threshold=0.38):
    model.eval() 
    predictions = []
    true_labels = []
    text_examples = []

    with torch.no_grad():  
        for batch in test_data_loader:
            ids = batch['input_ids'].to(device, dtype=torch.long)
            mask = batch['attention_mask'].to(device, dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            targets = batch['targets'].to(device, dtype=torch.float)

            # Make predictions
            outputs = model(ids, mask, token_type_ids)

            # Apply sigmoid to get probabilities and threshold to make binary predictions
            probs = torch.sigmoid(outputs).cpu().detach().numpy()
            preds = (probs >= threshold).astype(int)

            # Store predictions and true labels
            predictions.extend(preds)
            true_labels.extend(targets.cpu().detach().numpy())
            text_examples.extend(batch['text'])  # Assuming 'text' is part of the dataset

    return text_examples, predictions, true_labels

text_examples, predictions, true_labels = predict_on_test_data(test_data_loader, model, device)

for text, pred in zip(text_examples, predictions):
    predicted_categories = [categories[i] for i, value in enumerate(pred) if value == 1]
    print(f"Text: {text}\nPredicted Categories: {predicted_categories}\n")


Text: nvidia sued author ai use copyrighted work march reuters nvidia nvdao open new tab whose chip power artificial intelligence sued three author said used copyrighted book without permission train nemo open new tab ai platform brian keene abdi nazemian stewart onan said work part dataset book helped train nemo simulate ordinary written language taken october due reported copyright infringement proposed class action filed friday night san francisco federal court author said takedown reflects nvidias admitted trained nemo dataset thereby infringed copyright seeking unspecified damage people united state whose copyrighted work helped train nemos socalled large language model last three year among work covered lawsuit keenes novel ghost walk nazemians novel like love story onans novella last night lobster lawsuit drag nvidia growing body litigation writer well new york time generative ai creates new content based input text image sound nvidia tout nemo fast affordable way adopt generati