In [29]:
import gc
import torch


# Clear GPU memory (if applicable)
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()


In [1]:
import os
import random
import functools
import csv
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
import evaluate
import re
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix, classification_report, balanced_accuracy_score, accuracy_score

from scipy.stats import pearsonr
from datasets import Dataset, DatasetDict
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)

df = pd.read_csv('filtered_labeled.csv')

def clean_text(text):
    cleaned_text = re.sub(r'(\n[a-zA-Z])', '', text)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    return cleaned_text

df['text'] = df['text'].apply(clean_text)

df['Institutional_Form'] = df['Institutional_Form'].astype('category')
df['Institutional_Form_category'] = df['Institutional_Form'].cat.codes

category_map = {code: category for code, category in enumerate(df['Institutional_Form'].cat.categories)}

df_train, df_val = train_test_split(df, train_size=0.8, test_size=0.2, random_state=42)


def generate_features_with_prompt(df):
    # Define the instruction
    instruction = (
        "You are a contract classification assistant. Your task is to classify the contract text "
        "into one of the predefined categories. Here are the criteria for each category:\n"
        "- Joint Operations: Partnership arrangements to jointly produce services with one or more organizations.\n"
        "- New Joint Entities: Two or more organizations creating a separate new entity to manage or govern a shared asset or service.\n"
        "- Resource Sharing: Sharing of information, personnel, equipment, etc., between governments or community organizations to provide services.\n"
        "- Service Contracts: Agreements with outside entities, public or private, for provision or support services.\n"
        "Analyze the given text carefully and respond with the appropriate category."
    )
    df['Institutional_Form'] = df['Institutional_Form'].astype(str)
    # Generate the `input` column by combining instruction, text, and category
    df['input'] = (
        instruction
        + "\n\nContract Text: "
        + df['text']
        + "\n\nInstitutional Form Category: "
        + df['Institutional_Form']
    )

    # Add the numerical category column if not already present
    if 'Institutional_Form_category' not in df.columns:
        df['Institutional_Form'] = df['Institutional_Form'].astype('category')
        df['Institutional_Form_category'] = df['Institutional_Form'].cat.codes

    return df

generate_features_with_prompt(df_train)
generate_features_with_prompt(df_val)

dataset_train = Dataset.from_pandas(df_train.reset_index(drop=True))
dataset_val = Dataset.from_pandas(df_val.reset_index(drop=True))

dataset = DatasetDict({
    'train': dataset_train,
    'val': dataset_val,
})

#Since our classes are not balanced let's calculate class weights based on inverse value counts
#Convert to pytorch tensor since we will need it
df_train.Institutional_Form_category.value_counts(normalize=True)
class_weights=(1/df_train.Institutional_Form.value_counts(normalize=True).sort_index()).tolist()
class_weights=torch.tensor(class_weights)
class_weights=class_weights/class_weights.sum()

model_name = "meta-llama/Llama-3.1-8B"

#Quantization Config (for QLORA)
quantization_config = BitsAndBytesConfig(
    load_in_4bit = True, # enable 4-bit quantization
    bnb_4bit_quant_type = 'nf4', # information theoretically optimal dtype for normally distributed weights
    bnb_4bit_use_double_quant = True, # quantize quantized weights //insert xzibit meme
    bnb_4bit_compute_dtype = torch.bfloat16 # optimized fp format for ML
)

#
lora_config = LoraConfig(
    r = 16, # the dimension of the low-rank matrices
    lora_alpha = 8, # scaling factor for LoRA activations vs pre-trained weight activations
    target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj'],
    lora_dropout = 0.05, # dropout probability of the LoRA layers
    bias = 'none', # wether to train bias weights, set to 'none' for attention layers
    task_type = 'SEQ_CLS'
)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    num_labels=len(category_map)
)

#prepare_model_for_kbit_training() function to preprocess the quantized model for training.
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

#Must use .cache = False as below or it crashes from my experience
model.config.pad_token_id = tokenizer.pad_token_id
model.config.use_cache = False
model.config.pretraining_tp = 1

def llama_preprocessing_function(examples):
    return tokenizer(examples['input'], truncation=True, max_length=512)

tokenized_datasets = dataset.map(llama_preprocessing_function, batched=True)
tokenized_datasets = tokenized_datasets.rename_column("Institutional_Form_category", "label")
tokenized_datasets.set_format("torch")

#Purpose: Automatically pads text data to the longest sequence in a batch
collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    try:
        # it's a classification task, take the argmax
        predictions_processed = np.argmax(predictions, axis=1)

        # Calculate Pearson correlation
        pearson, _ = pearsonr(predictions_processed, labels)

        return {'pearson': pearson}
    except Exception as e:
        print(f"Error in compute_metrics: {e}")
        return {'pearson': None}
    
#We will have a custom loss function that deals with the class weights and have class weights as additional argument in constructor
class CustomTrainer(Trainer):
    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        # Ensure label_weights is a tensor
        if class_weights is not None:
            self.class_weights = torch.tensor(class_weights, dtype=torch.float32).to(self.args.device)
        else:
            self.class_weights = None

    def compute_loss(self, model, inputs, num_items_in_batch=None, return_outputs=False):
        # Extract labels and convert them to long type for cross_entropy
        labels = inputs.pop("labels").long()

        # Forward pass
        outputs = model(**inputs)

        # Extract logits assuming they are directly outputted by the model
        logits = outputs.get('logits')

        # Compute custom loss with class weights for imbalanced data handling
        if self.class_weights is not None:
            loss = F.cross_entropy(logits, labels, weight=self.class_weights)
        else:
            loss = F.cross_entropy(logits, labels)

        return (loss, outputs) if return_outputs else loss
    
training_args = TrainingArguments(
    output_dir = 'sequence_classification',
    learning_rate = 1e-4,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    num_train_epochs = 3,
    weight_decay = 0.01,
    evaluation_strategy = 'epoch',
    save_strategy = 'epoch',
    load_best_model_at_end = True
)

trainer = CustomTrainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_datasets['train'],
    eval_dataset = tokenized_datasets['val'],
    tokenizer = tokenizer,
    data_collator = collate_fn,
    compute_metrics = compute_metrics,
    class_weights=class_weights,
)

train_result = trainer.train()

def make_predictions(model, df):
    

    sentences = df.input.tolist()

      # Define the batch size
    batch_size = 32  # You can adjust this based on your system's memory capacity

      # Initialize an empty list to store the model outputs
    all_outputs = []

      # Process the sentences in batches
    for i in range(0, len(sentences), batch_size):
          # Get the batch of sentences
        batch_sentences = sentences[i:i + batch_size]

          # Tokenize the batch
        inputs = tokenizer(batch_sentences, return_tensors="pt", padding=True, truncation=True, max_length=512)

          # Move tensors to the device where the model is (e.g., GPU or CPU)
        inputs = {k: v.to('cuda' if torch.cuda.is_available() else 'cpu') for k, v in inputs.items()}

          # Perform inference and store the logits
        with torch.no_grad():
            outputs = model(**inputs)
            all_outputs.append(outputs['logits'])

    final_outputs = torch.cat(all_outputs, dim=0)
    df['predictions']=final_outputs.argmax(axis=1).cpu().numpy()
    #df['predictions']=df['predictions'].apply(lambda l:category_map[l])

    return df

def get_performance_metrics(df):
    y_test = df.Institutional_Form_category
    y_pred = df.predictions
    print(f"comparing test {y_test} and pred {y_pred}")

    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    print("Balanced Accuracy Score:", balanced_accuracy_score(y_test, y_pred))
    print("Accuracy Score:", accuracy_score(y_test, y_pred))
    
df_val = make_predictions(model, df_val)
get_performance_metrics(df_val)


  from .autonotebook import tqdm as notebook_tqdm
`low_cpu_mem_usage` was None, now default to True since model is quantized.
Loading checkpoint shards: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 4/4 [00:04<00:00,  1.13s/it]
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.1-8B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 498/498 [00:00<00:00, 1633.00 examples/s]
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 125/125 [00:00<00:00, 1430.36 examples/s]
  super().__init__(*args, **kwargs)
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  self.class_weights = torch.tensor(class_weights, dtype=torch.float32).to(self.args.device)
Failed to detect 

  return fn(*args, **kwargs)


KeyboardInterrupt: 

In [8]:
def make_predictions(model, df):
    

    sentences = df.input.tolist()

      # Define the batch size
    batch_size = 32  # You can adjust this based on your system's memory capacity

      # Initialize an empty list to store the model outputs
    all_outputs = []

      # Process the sentences in batches
    for i in range(0, len(sentences), batch_size):
          # Get the batch of sentences
        batch_sentences = sentences[i:i + batch_size]

          # Tokenize the batch
        inputs = tokenizer(batch_sentences, return_tensors="pt", padding=True, truncation=True, max_length=512)

          # Move tensors to the device where the model is (e.g., GPU or CPU)
        inputs = {k: v.to('cuda' if torch.cuda.is_available() else 'cpu') for k, v in inputs.items()}

          # Perform inference and store the logits
        with torch.no_grad():
            outputs = model(**inputs)
            all_outputs.append(outputs['logits'])

    final_outputs = torch.cat(all_outputs, dim=0)
    df['predictions']=final_outputs.argmax(axis=1).cpu().numpy()
    #df['predictions']=df['predictions'].apply(lambda l:category_map[l])

    return df

def get_performance_metrics(df):
    y_test = df.Institutional_Form_category
    y_pred = df.predictions
    print(f"comparing test {y_test} and pred {y_pred}")

    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    print("Balanced Accuracy Score:", balanced_accuracy_score(y_test, y_pred))
    print("Accuracy Score:", accuracy_score(y_test, y_pred))
    
df_val = make_predictions(model, df_val)
get_performance_metrics(df_val)


comparing test 249    0
558    3
174    1
280    0
110    1
      ..
6      3
104    3
114    3
355    3
132    3
Name: Institutional_Form_category, Length: 125, dtype: int8 and pred 249    3
558    3
174    3
280    3
110    3
      ..
6      3
104    1
114    1
355    1
132    3
Name: predictions, Length: 125, dtype: int64
Confusion Matrix:
[[ 3  5  6  7]
 [ 7  2  4  8]
 [ 2  1  2  5]
 [27 15 13 18]]

Classification Report:
              precision    recall  f1-score   support

           0       0.08      0.14      0.10        21
           1       0.09      0.10      0.09        21
           2       0.08      0.20      0.11        10
           3       0.47      0.25      0.32        73

    accuracy                           0.20       125
   macro avg       0.18      0.17      0.16       125
weighted avg       0.31      0.20      0.23       125

Balanced Accuracy Score: 0.17116764514024788
Accuracy Score: 0.2


In [1]:
import os
import random
import functools
import csv
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
import evaluate
import re
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix, classification_report, balanced_accuracy_score, accuracy_score

from scipy.stats import pearsonr
from datasets import Dataset, DatasetDict
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)

df = pd.read_csv('filtered_labeled.csv')

def clean_text(text):
    cleaned_text = re.sub(r'(\n[a-zA-Z])', '', text)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    return cleaned_text

df['text'] = df['text'].apply(clean_text)

df['Institutional_Form'] = df['Institutional_Form'].astype('category')
df['Institutional_Form_category'] = df['Institutional_Form'].cat.codes

category_map = {code: category for code, category in enumerate(df['Institutional_Form'].cat.categories)}

df_train, df_val = train_test_split(df, train_size=0.8, test_size=0.2, random_state=42)


def generate_features_with_prompt(df):
    # Define the instruction
    instruction = (
        "You are a contract classification assistant. Your task is to classify the contract text "
        "into one of the predefined categories. Here are the criteria for each category:\n"
        "- Joint Operations: Partnership arrangements to jointly produce services with one or more organizations.\n"
        "- New Joint Entities: Two or more organizations creating a separate new entity to manage or govern a shared asset or service.\n"
        "- Resource Sharing: Sharing of information, personnel, equipment, etc., between governments or community organizations to provide services.\n"
        "- Service Contracts: Agreements with outside entities, public or private, for provision or support services.\n"
        "Analyze the given text carefully and respond with the appropriate category."
    )
    df['Institutional_Form'] = df['Institutional_Form'].astype(str)
    # Generate the `input` column by combining instruction, text, and category
    df['input'] = (
        instruction
        + "\n\nContract Text: "
        + df['text']
        + "\n\nInstitutional Form Category: "
        + df['Institutional_Form']
    )

    # Add the numerical category column if not already present
    if 'Institutional_Form_category' not in df.columns:
        df['Institutional_Form'] = df['Institutional_Form'].astype('category')
        df['Institutional_Form_category'] = df['Institutional_Form'].cat.codes

    return df

generate_features_with_prompt(df_train)
generate_features_with_prompt(df_val)

dataset_train = Dataset.from_pandas(df_train.reset_index(drop=True))
dataset_val = Dataset.from_pandas(df_val.reset_index(drop=True))

dataset = DatasetDict({
    'train': dataset_train,
    'val': dataset_val,
})

#Since our classes are not balanced let's calculate class weights based on inverse value counts
#Convert to pytorch tensor since we will need it
df_train.Institutional_Form_category.value_counts(normalize=True)
class_weights=(1/df_train.Institutional_Form.value_counts(normalize=True).sort_index()).tolist()
class_weights=torch.tensor(class_weights)
class_weights=class_weights/class_weights.sum()

model_name = "meta-llama/Llama-3.1-8B"

#Quantization Config (for QLORA)
quantization_config = BitsAndBytesConfig(
    load_in_4bit = True, # enable 4-bit quantization
    bnb_4bit_quant_type = 'nf4', # information theoretically optimal dtype for normally distributed weights
    bnb_4bit_use_double_quant = True, # quantize quantized weights //insert xzibit meme
    bnb_4bit_compute_dtype = torch.bfloat16 # optimized fp format for ML
)

#
lora_config = LoraConfig(
    r = 16, # the dimension of the low-rank matrices
    lora_alpha = 8, # scaling factor for LoRA activations vs pre-trained weight activations
    target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj'],
    lora_dropout = 0.05, # dropout probability of the LoRA layers
    bias = 'none', # wether to train bias weights, set to 'none' for attention layers
    task_type = 'SEQ_CLS'
)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    num_labels=len(category_map)
)

#prepare_model_for_kbit_training() function to preprocess the quantized model for training.
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

#Must use .cache = False as below or it crashes from my experience
model.config.pad_token_id = tokenizer.pad_token_id
model.config.use_cache = False
model.config.pretraining_tp = 1

def llama_preprocessing_function(examples):
    return tokenizer(examples['input'], truncation=True, max_length=512)

tokenized_datasets = dataset.map(llama_preprocessing_function, batched=True)
tokenized_datasets = tokenized_datasets.rename_column("Institutional_Form_category", "label")
tokenized_datasets.set_format("torch")

#Purpose: Automatically pads text data to the longest sequence in a batch
collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    try:
        # it's a classification task, take the argmax
        predictions_processed = np.argmax(predictions, axis=1)

        # Calculate Pearson correlation
        pearson, _ = pearsonr(predictions_processed, labels)

        return {'pearson': pearson}
    except Exception as e:
        print(f"Error in compute_metrics: {e}")
        return {'pearson': None}
    
#We will have a custom loss function that deals with the class weights and have class weights as additional argument in constructor
class CustomTrainer(Trainer):
    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        # Ensure label_weights is a tensor
        if class_weights is not None:
            self.class_weights = torch.tensor(class_weights, dtype=torch.float32).to(self.args.device)
        else:
            self.class_weights = None

    def compute_loss(self, model, inputs, num_items_in_batch=None, return_outputs=False):
        # Extract labels and convert them to long type for cross_entropy
        labels = inputs.pop("labels").long()

        # Forward pass
        outputs = model(**inputs)

        # Extract logits assuming they are directly outputted by the model
        logits = outputs.get('logits')

        # Compute custom loss with class weights for imbalanced data handling
        if self.class_weights is not None:
            loss = F.cross_entropy(logits, labels, weight=self.class_weights)
        else:
            loss = F.cross_entropy(logits, labels)

        return (loss, outputs) if return_outputs else loss
    


def make_predictions(model, df):
    

    sentences = df.input.tolist()

      # Define the batch size
    batch_size = 32  # You can adjust this based on your system's memory capacity

      # Initialize an empty list to store the model outputs
    all_outputs = []

      # Process the sentences in batches
    for i in range(0, len(sentences), batch_size):
          # Get the batch of sentences
        batch_sentences = sentences[i:i + batch_size]

          # Tokenize the batch
        inputs = tokenizer(batch_sentences, return_tensors="pt", padding=True, truncation=True, max_length=512)

          # Move tensors to the device where the model is (e.g., GPU or CPU)
        inputs = {k: v.to('cuda' if torch.cuda.is_available() else 'cpu') for k, v in inputs.items()}

          # Perform inference and store the logits
        with torch.no_grad():
            outputs = model(**inputs)
            all_outputs.append(outputs['logits'])

    final_outputs = torch.cat(all_outputs, dim=0)
    df['predictions']=final_outputs.argmax(axis=1).cpu().numpy()
    #df['predictions']=df['predictions'].apply(lambda l:category_map[l])

    return df

def get_performance_metrics(df):
    y_test = df.Institutional_Form_category
    y_pred = df.predictions
    print(f"comparing test {y_test} and pred {y_pred}")

    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    print("Balanced Accuracy Score:", balanced_accuracy_score(y_test, y_pred))
    print("Accuracy Score:", accuracy_score(y_test, y_pred))
    
# Define hyperparameter grid
learning_rates = [1e-5, 5e-5, 1e-4]
weight_decays = [0.001, 0.01, 0.1]

# Keep track of results
results = []

for lr in learning_rates:
    for wd in weight_decays:
        print(f"\nTraining with learning_rate={lr}, weight_decay={wd}\n")

        # Update training arguments
        training_args = TrainingArguments(
            output_dir=f'sequence_classification_lr{lr}_wd{wd}',
            learning_rate=lr,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            num_train_epochs=3,
            weight_decay=wd,
            evaluation_strategy='epoch',
            save_strategy='epoch',
            load_best_model_at_end=True,
            logging_dir=f'logs_lr{lr}_wd{wd}',
            logging_steps=10
        )

        # Initialize trainer
        trainer = CustomTrainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_datasets['train'],
            eval_dataset=tokenized_datasets['val'],
            tokenizer=tokenizer,
            data_collator=collate_fn,
            compute_metrics=compute_metrics,
            class_weights=class_weights,
        )

        # Train the model
        train_result = trainer.train()

        # Record training loss and evaluation metrics
        eval_metrics = trainer.evaluate()
        train_loss = train_result.training_loss

        # Make predictions and compute performance metrics
        df_val = make_predictions(model, df_val)
        performance_metrics = get_performance_metrics(df_val)

        # Append the metrics to results
        results.append({
            "learning_rate": lr,
            "weight_decay": wd,
            "train_loss": train_loss,
            "eval_metrics": eval_metrics,
            "performance_metrics": performance_metrics
        })

        # Print results for the current model
        print(f"Results for learning_rate={lr}, weight_decay={wd}")
        print(f"Train Loss: {train_loss}")
        print(f"Evaluation Metrics: {eval_metrics}")
        print(f"Performance Metrics: {performance_metrics}")

# Summarize all results
print("\nSummary of all results:")
for res in results:
    print(res)


  from .autonotebook import tqdm as notebook_tqdm
`low_cpu_mem_usage` was None, now default to True since model is quantized.
Loading checkpoint shards: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 4/4 [00:04<00:00,  1.13s/it]
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.1-8B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 498/498 [00:00<00:00, 1666.63 examples/s]
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 125/125 [00:00<00:00, 1424.16 examples/s]
  super().__init__(*args, **kwargs)
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.



Training with learning_rate=1e-05, weight_decay=0.001



  self.class_weights = torch.tensor(class_weights, dtype=torch.float32).to(self.args.device)
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mmghasemizade97[0m ([33mmghasemizade97-university-of-vermont[0m). Use [1m`wandb login --relogin`[0m to force relogin


  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Pearson
1,No log,3.066216,-0.031263
2,2.927700,2.98562,-0.041582
3,2.740300,2.957715,-0.012257


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


  super().__init__(*args, **kwargs)
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


comparing test 249    0
558    3
174    1
280    0
110    1
      ..
6      3
104    3
114    3
355    3
132    3
Name: Institutional_Form_category, Length: 125, dtype: int8 and pred 249    3
558    1
174    3
280    3
110    3
      ..
6      3
104    0
114    3
355    1
132    3
Name: predictions, Length: 125, dtype: int64
Confusion Matrix:
[[ 7  7  1  6]
 [ 5  9  0  7]
 [ 4  3  0  3]
 [20 30  4 19]]

Classification Report:
              precision    recall  f1-score   support

           0       0.19      0.33      0.25        21
           1       0.18      0.43      0.26        21
           2       0.00      0.00      0.00        10
           3       0.54      0.26      0.35        73

    accuracy                           0.28       125
   macro avg       0.23      0.26      0.21       125
weighted avg       0.38      0.28      0.29       125

Balanced Accuracy Score: 0.2555446836268754
Accuracy Score: 0.28
Results for learning_rate=1e-05, weight_decay=0.001
Train Loss: 2.7909

  self.class_weights = torch.tensor(class_weights, dtype=torch.float32).to(self.args.device)
  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Pearson
1,No log,2.863046,-0.054032
2,2.616800,2.785446,-0.019659
3,2.491900,2.759545,0.0007


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


  super().__init__(*args, **kwargs)
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


comparing test 249    0
558    3
174    1
280    0
110    1
      ..
6      3
104    3
114    3
355    3
132    3
Name: Institutional_Form_category, Length: 125, dtype: int8 and pred 249    3
558    1
174    3
280    3
110    3
      ..
6      3
104    0
114    3
355    1
132    3
Name: predictions, Length: 125, dtype: int64
Confusion Matrix:
[[ 7  8  1  5]
 [ 5  8  1  7]
 [ 4  3  1  2]
 [23 23 10 17]]

Classification Report:
              precision    recall  f1-score   support

           0       0.18      0.33      0.23        21
           1       0.19      0.38      0.25        21
           2       0.08      0.10      0.09        10
           3       0.55      0.23      0.33        73

    accuracy                           0.26       125
   macro avg       0.25      0.26      0.23       125
weighted avg       0.39      0.26      0.28       125

Balanced Accuracy Score: 0.2617906066536203
Accuracy Score: 0.264
Results for learning_rate=1e-05, weight_decay=0.01
Train Loss: 2.5295

  self.class_weights = torch.tensor(class_weights, dtype=torch.float32).to(self.args.device)
  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Pearson
1,No log,2.697294,-0.057587
2,2.382200,2.626969,0.017276
3,2.308700,2.605075,0.039684


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


  super().__init__(*args, **kwargs)
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


comparing test 249    0
558    3
174    1
280    0
110    1
      ..
6      3
104    3
114    3
355    3
132    3
Name: Institutional_Form_category, Length: 125, dtype: int8 and pred 249    3
558    1
174    3
280    3
110    2
      ..
6      3
104    0
114    3
355    1
132    3
Name: predictions, Length: 125, dtype: int64
Confusion Matrix:
[[ 7  8  1  5]
 [ 5  7  3  6]
 [ 4  3  1  2]
 [19 22 17 15]]

Classification Report:
              precision    recall  f1-score   support

           0       0.20      0.33      0.25        21
           1       0.17      0.33      0.23        21
           2       0.05      0.10      0.06        10
           3       0.54      0.21      0.30        73

    accuracy                           0.24       125
   macro avg       0.24      0.24      0.21       125
weighted avg       0.38      0.24      0.26       125

Balanced Accuracy Score: 0.24303652968036527
Accuracy Score: 0.24
Results for learning_rate=1e-05, weight_decay=0.1
Train Loss: 2.33516

  self.class_weights = torch.tensor(class_weights, dtype=torch.float32).to(self.args.device)
  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Pearson
1,No log,2.35245,0.021218
2,2.103200,2.136938,0.063233
3,1.799200,2.089119,0.055747


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


  super().__init__(*args, **kwargs)
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


comparing test 249    0
558    3
174    1
280    0
110    1
      ..
6      3
104    3
114    3
355    3
132    3
Name: Institutional_Form_category, Length: 125, dtype: int8 and pred 249    3
558    2
174    3
280    2
110    2
      ..
6      3
104    0
114    3
355    1
132    3
Name: predictions, Length: 125, dtype: int64
Confusion Matrix:
[[ 7  5  4  5]
 [ 4  7  6  4]
 [ 3  3  2  2]
 [20 14 20 19]]

Classification Report:
              precision    recall  f1-score   support

           0       0.21      0.33      0.25        21
           1       0.24      0.33      0.28        21
           2       0.06      0.20      0.10        10
           3       0.63      0.26      0.37        73

    accuracy                           0.28       125
   macro avg       0.29      0.28      0.25       125
weighted avg       0.45      0.28      0.31       125

Balanced Accuracy Score: 0.2817351598173516
Accuracy Score: 0.28
Results for learning_rate=5e-05, weight_decay=0.001
Train Loss: 1.9240

  self.class_weights = torch.tensor(class_weights, dtype=torch.float32).to(self.args.device)
  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Pearson
1,No log,2.028989,0.09212
2,1.531700,1.851082,0.259632
3,1.250300,1.81914,0.259766


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


SafetensorError: Error while serializing: IoError(Os { code: 122, kind: FilesystemQuotaExceeded, message: "Disk quota exceeded" })

In [1]:
import os
import random
import functools
import csv
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
import evaluate
import re
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix, classification_report, balanced_accuracy_score, accuracy_score

from scipy.stats import pearsonr
from datasets import Dataset, DatasetDict
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)

df = pd.read_csv('filtered_labeled.csv')

def clean_text(text):
    cleaned_text = re.sub(r'(\n[a-zA-Z])', '', text)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    return cleaned_text

df['text'] = df['text'].apply(clean_text)

df['Institutional_Form'] = df['Institutional_Form'].astype('category')
df['Institutional_Form_category'] = df['Institutional_Form'].cat.codes

category_map = {code: category for code, category in enumerate(df['Institutional_Form'].cat.categories)}

df_train, df_val = train_test_split(df, train_size=0.8, test_size=0.2, random_state=42)


def generate_features_with_prompt(df):
    # Define the instruction
    instruction = (
        "You are a contract classification assistant. Your task is to classify the contract text "
        "into one of the predefined categories. Here are the criteria for each category:\n"
        "- Joint Operations: Partnership arrangements to jointly produce services with one or more organizations.\n"
        "- New Joint Entities: Two or more organizations creating a separate new entity to manage or govern a shared asset or service.\n"
        "- Resource Sharing: Sharing of information, personnel, equipment, etc., between governments or community organizations to provide services.\n"
        "- Service Contracts: Agreements with outside entities, public or private, for provision or support services.\n"
        "Analyze the given text carefully and respond with the appropriate category."
    )
    df['Institutional_Form'] = df['Institutional_Form'].astype(str)
    # Generate the `input` column by combining instruction, text, and category
    df['input'] = (
        instruction
        + "\n\nContract Text: "
        + df['text']
        + "\n\nInstitutional Form Category: "
        + df['Institutional_Form']
    )

    # Add the numerical category column if not already present
    if 'Institutional_Form_category' not in df.columns:
        df['Institutional_Form'] = df['Institutional_Form'].astype('category')
        df['Institutional_Form_category'] = df['Institutional_Form'].cat.codes

    return df

generate_features_with_prompt(df_train)
generate_features_with_prompt(df_val)

dataset_train = Dataset.from_pandas(df_train.reset_index(drop=True))
dataset_val = Dataset.from_pandas(df_val.reset_index(drop=True))

dataset = DatasetDict({
    'train': dataset_train,
    'val': dataset_val,
})

#Since our classes are not balanced let's calculate class weights based on inverse value counts
#Convert to pytorch tensor since we will need it
df_train.Institutional_Form_category.value_counts(normalize=True)
class_weights=(1/df_train.Institutional_Form.value_counts(normalize=True).sort_index()).tolist()
class_weights=torch.tensor(class_weights)
class_weights=class_weights/class_weights.sum()

model_name = "meta-llama/Llama-3.1-8B"

#Quantization Config (for QLORA)
quantization_config = BitsAndBytesConfig(
    load_in_4bit = True, # enable 4-bit quantization
    bnb_4bit_quant_type = 'nf4', # information theoretically optimal dtype for normally distributed weights
    bnb_4bit_use_double_quant = True, # quantize quantized weights //insert xzibit meme
    bnb_4bit_compute_dtype = torch.bfloat16 # optimized fp format for ML
)

#
lora_config = LoraConfig(
    r = 16, # the dimension of the low-rank matrices
    lora_alpha = 8, # scaling factor for LoRA activations vs pre-trained weight activations
    target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj'],
    lora_dropout = 0.05, # dropout probability of the LoRA layers
    bias = 'none', # wether to train bias weights, set to 'none' for attention layers
    task_type = 'SEQ_CLS'
)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    num_labels=len(category_map)
)

#prepare_model_for_kbit_training() function to preprocess the quantized model for training.
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

#Must use .cache = False as below or it crashes from my experience
model.config.pad_token_id = tokenizer.pad_token_id
model.config.use_cache = False
model.config.pretraining_tp = 1

def llama_preprocessing_function(examples):
    return tokenizer(examples['input'], truncation=True, max_length=512)

tokenized_datasets = dataset.map(llama_preprocessing_function, batched=True)
tokenized_datasets = tokenized_datasets.rename_column("Institutional_Form_category", "label")
tokenized_datasets.set_format("torch")

#Purpose: Automatically pads text data to the longest sequence in a batch
collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    try:
        # it's a classification task, take the argmax
        predictions_processed = np.argmax(predictions, axis=1)

        # Calculate Pearson correlation
        pearson, _ = pearsonr(predictions_processed, labels)

        return {'pearson': pearson}
    except Exception as e:
        print(f"Error in compute_metrics: {e}")
        return {'pearson': None}
    
#We will have a custom loss function that deals with the class weights and have class weights as additional argument in constructor
class CustomTrainer(Trainer):
    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        # Ensure label_weights is a tensor
        if class_weights is not None:
            self.class_weights = torch.tensor(class_weights, dtype=torch.float32).to(self.args.device)
        else:
            self.class_weights = None

    def compute_loss(self, model, inputs, num_items_in_batch=None, return_outputs=False):
        # Extract labels and convert them to long type for cross_entropy
        labels = inputs.pop("labels").long()

        # Forward pass
        outputs = model(**inputs)

        # Extract logits assuming they are directly outputted by the model
        logits = outputs.get('logits')

        # Compute custom loss with class weights for imbalanced data handling
        if self.class_weights is not None:
            loss = F.cross_entropy(logits, labels, weight=self.class_weights)
        else:
            loss = F.cross_entropy(logits, labels)

        return (loss, outputs) if return_outputs else loss
    


def make_predictions(model, df):
    

    sentences = df.input.tolist()

      # Define the batch size
    batch_size = 32  # You can adjust this based on your system's memory capacity

      # Initialize an empty list to store the model outputs
    all_outputs = []

      # Process the sentences in batches
    for i in range(0, len(sentences), batch_size):
          # Get the batch of sentences
        batch_sentences = sentences[i:i + batch_size]

          # Tokenize the batch
        inputs = tokenizer(batch_sentences, return_tensors="pt", padding=True, truncation=True, max_length=512)

          # Move tensors to the device where the model is (e.g., GPU or CPU)
        inputs = {k: v.to('cuda' if torch.cuda.is_available() else 'cpu') for k, v in inputs.items()}

          # Perform inference and store the logits
        with torch.no_grad():
            outputs = model(**inputs)
            all_outputs.append(outputs['logits'])

    final_outputs = torch.cat(all_outputs, dim=0)
    df['predictions']=final_outputs.argmax(axis=1).cpu().numpy()
    #df['predictions']=df['predictions'].apply(lambda l:category_map[l])

    return df

def get_performance_metrics(df):
    y_test = df.Institutional_Form_category
    y_pred = df.predictions
    print(f"comparing test {y_test} and pred {y_pred}")

    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    print("Balanced Accuracy Score:", balanced_accuracy_score(y_test, y_pred))
    print("Accuracy Score:", accuracy_score(y_test, y_pred))
    
# Define hyperparameter grid
learning_rates = [2e-4]
weight_decays = [0.01, 0.1, 0.2]

# Keep track of results
results = []

for lr in learning_rates:
    for wd in weight_decays:
        print(f"\nTraining with learning_rate={lr}, weight_decay={wd}\n")

        # Update training arguments
        training_args = TrainingArguments(
            output_dir=f'sequence_classification_lr{lr}_wd{wd}',
            learning_rate=lr,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            num_train_epochs=3,
            weight_decay=wd,
            evaluation_strategy='epoch',
            save_strategy='epoch',
            load_best_model_at_end=True,
            logging_dir=f'logs_lr{lr}_wd{wd}',
            logging_steps=10
        )

        # Initialize trainer
        trainer = CustomTrainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_datasets['train'],
            eval_dataset=tokenized_datasets['val'],
            tokenizer=tokenizer,
            data_collator=collate_fn,
            compute_metrics=compute_metrics,
            class_weights=class_weights,
        )

        # Train the model
        train_result = trainer.train()

        # Record training loss and evaluation metrics
        eval_metrics = trainer.evaluate()
        train_loss = train_result.training_loss

        # Make predictions and compute performance metrics
        df_val = make_predictions(model, df_val)
        performance_metrics = get_performance_metrics(df_val)

        # Append the metrics to results
        results.append({
            "learning_rate": lr,
            "weight_decay": wd,
            "train_loss": train_loss,
            "eval_metrics": eval_metrics,
            "performance_metrics": performance_metrics
        })

        # Print results for the current model
        print(f"Results for learning_rate={lr}, weight_decay={wd}")
        print(f"Train Loss: {train_loss}")
        print(f"Evaluation Metrics: {eval_metrics}")
        print(f"Performance Metrics: {performance_metrics}")

# Summarize all results
print("\nSummary of all results:")
for res in results:
    print(res)


  from .autonotebook import tqdm as notebook_tqdm
`low_cpu_mem_usage` was None, now default to True since model is quantized.
Loading checkpoint shards: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 4/4 [00:04<00:00,  1.10s/it]
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.1-8B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 498/498 [00:00<00:00, 1642.46 examples/s]
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 125/125 [00:00<00:00, 1594.17 examples/s]
  super().__init__(*args, **kwargs)
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.



Training with learning_rate=0.0002, weight_decay=0.01



  self.class_weights = torch.tensor(class_weights, dtype=torch.float32).to(self.args.device)
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mmghasemizade97[0m ([33mmghasemizade97-university-of-vermont[0m). Use [1m`wandb login --relogin`[0m to force relogin


  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Pearson
1,No log,1.862094,0.214477
2,2.093500,1.646135,0.181588
3,1.479100,1.569773,0.294377


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


  super().__init__(*args, **kwargs)
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


comparing test 249    0
558    3
174    1
280    0
110    1
      ..
6      3
104    3
114    3
355    3
132    3
Name: Institutional_Form_category, Length: 125, dtype: int8 and pred 249    1
558    1
174    0
280    1
110    1
      ..
6      1
104    2
114    3
355    0
132    1
Name: predictions, Length: 125, dtype: int64
Confusion Matrix:
[[10  6  2  3]
 [ 6 11  3  1]
 [ 0  7  2  1]
 [13 25 11 24]]

Classification Report:
              precision    recall  f1-score   support

           0       0.34      0.48      0.40        21
           1       0.22      0.52      0.31        21
           2       0.11      0.20      0.14        10
           3       0.83      0.33      0.47        73

    accuracy                           0.38       125
   macro avg       0.38      0.38      0.33       125
weighted avg       0.59      0.38      0.41       125

Balanced Accuracy Score: 0.3821917808219178
Accuracy Score: 0.376
Results for learning_rate=0.0002, weight_decay=0.01
Train Loss: 1.697

  self.class_weights = torch.tensor(class_weights, dtype=torch.float32).to(self.args.device)
  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Pearson
1,No log,1.416458,0.302279
2,1.158900,1.331382,0.412799
3,0.668400,1.206698,0.39026


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


  super().__init__(*args, **kwargs)
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


comparing test 249    0
558    3
174    1
280    0
110    1
      ..
6      3
104    3
114    3
355    3
132    3
Name: Institutional_Form_category, Length: 125, dtype: int8 and pred 249    2
558    3
174    1
280    0
110    3
      ..
6      3
104    2
114    3
355    0
132    3
Name: predictions, Length: 125, dtype: int64
Confusion Matrix:
[[10  4  3  4]
 [ 2 13  3  3]
 [ 0  2  6  2]
 [12  7 11 43]]

Classification Report:
              precision    recall  f1-score   support

           0       0.42      0.48      0.44        21
           1       0.50      0.62      0.55        21
           2       0.26      0.60      0.36        10
           3       0.83      0.59      0.69        73

    accuracy                           0.58       125
   macro avg       0.50      0.57      0.51       125
weighted avg       0.66      0.58      0.60       125

Balanced Accuracy Score: 0.5710697977821266
Accuracy Score: 0.576
Results for learning_rate=0.0002, weight_decay=0.1
Train Loss: 0.8450

  self.class_weights = torch.tensor(class_weights, dtype=torch.float32).to(self.args.device)
  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Pearson
1,No log,1.33407,0.431341
2,0.522500,1.542571,0.461159
3,0.244200,1.386057,0.495781


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


comparing test 249    0
558    3
174    1
280    0
110    1
      ..
6      3
104    3
114    3
355    3
132    3
Name: Institutional_Form_category, Length: 125, dtype: int8 and pred 249    2
558    3
174    1
280    0
110    3
      ..
6      0
104    2
114    3
355    3
132    3
Name: predictions, Length: 125, dtype: int64
Confusion Matrix:
[[11  4  5  1]
 [ 1 12  4  4]
 [ 0  1  6  3]
 [13  4 13 43]]

Classification Report:
              precision    recall  f1-score   support

           0       0.44      0.52      0.48        21
           1       0.57      0.57      0.57        21
           2       0.21      0.60      0.32        10
           3       0.84      0.59      0.69        73

    accuracy                           0.58       125
   macro avg       0.52      0.57      0.51       125
weighted avg       0.68      0.58      0.61       125

Balanced Accuracy Score: 0.5710697977821266
Accuracy Score: 0.576
Results for learning_rate=0.0002, weight_decay=0.2
Train Loss: 0.3437

In [3]:
import os
import random
import functools
import csv
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
import evaluate
import re
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix, classification_report, balanced_accuracy_score, accuracy_score

from scipy.stats import pearsonr
from datasets import Dataset, DatasetDict
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)

df = pd.read_csv('filtered_labeled.csv')

def clean_text(text):
    cleaned_text = re.sub(r'(\n[a-zA-Z])', '', text)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    return cleaned_text

df['text'] = df['text'].apply(clean_text)
df['Institutional_Form'] = df['Institutional_Form'].astype('category')
df['Institutional_Form_category'] = df['Institutional_Form'].cat.codes
category_map = {code: category for code, category in enumerate(df['Institutional_Form'].cat.categories)}

#df_train, df_val = train_test_split(df, train_size=0.8, test_size=0.2, random_state=42)
train_size = 0.7
test_size = 0.15
val_size = 0.15

df_train, df_temp = train_test_split(df, train_size=train_size, random_state=42)
df_val, df_test = train_test_split(df_temp, test_size=test_size / (test_size + val_size), random_state=42)

def generate_features_with_prompt(df):
    # Define the instruction
    instruction = (
        "You are a contract classification assistant. Your task is to classify the contract text "
        "into one of the predefined categories. Here are the criteria for each category:\n"
        "- Joint Operations: Partnership arrangements to jointly produce services with one or more organizations.\n"
        "- New Joint Entities: Two or more organizations creating a separate new entity to manage or govern a shared asset or service.\n"
        "- Resource Sharing: Sharing of information, personnel, equipment, etc., between governments or community organizations to provide services.\n"
        "- Service Contracts: Agreements with outside entities, public or private, for provision or support services.\n"
        "Analyze the given text carefully and respond with the appropriate category."
    )
    df['Institutional_Form'] = df['Institutional_Form'].astype(str)
    # Generate the `input` column by combining instruction, text, and category
    df['input'] = (
        instruction
        + "\n\nContract Text: "
        + df['text']
        + "\n\nInstitutional Form Category: "
        + df['Institutional_Form']
    )

    # Add the numerical category column if not already present
    if 'Institutional_Form_category' not in df.columns:
        df['Institutional_Form'] = df['Institutional_Form'].astype('category')
        df['Institutional_Form_category'] = df['Institutional_Form'].cat.codes

    return df

generate_features_with_prompt(df_train)
generate_features_with_prompt(df_val)

dataset_train = Dataset.from_pandas(df_train.reset_index(drop=True))
dataset_val = Dataset.from_pandas(df_val.reset_index(drop=True))

dataset = DatasetDict({
    'train': dataset_train,
    'val': dataset_val,
})

#Since our classes are not balanced let's calculate class weights based on inverse value counts
#Convert to pytorch tensor since we will need it
df_train.Institutional_Form_category.value_counts(normalize=True)
class_weights=(1/df_train.Institutional_Form.value_counts(normalize=True).sort_index()).tolist()
class_weights=torch.tensor(class_weights)
class_weights=class_weights/class_weights.sum()

model_name = "meta-llama/Llama-3.1-8B"

#Quantization Config (for QLORA)
quantization_config = BitsAndBytesConfig(
    load_in_4bit = True, # enable 4-bit quantization
    bnb_4bit_quant_type = 'nf4', # information theoretically optimal dtype for normally distributed weights
    bnb_4bit_use_double_quant = True, # quantize quantized weights //insert xzibit meme
    bnb_4bit_compute_dtype = torch.bfloat16 # optimized fp format for ML
)

#
lora_config = LoraConfig(
    r = 16, # the dimension of the low-rank matrices
    lora_alpha = 8, # scaling factor for LoRA activations vs pre-trained weight activations
    target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj'],
    lora_dropout = 0.05, # dropout probability of the LoRA layers
    bias = 'none', # wether to train bias weights, set to 'none' for attention layers
    task_type = 'SEQ_CLS'
)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    num_labels=len(category_map)
)

model = prepare_model_for_kbit_training(model)

model = get_peft_model(model, lora_config)

tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)

tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

#Must use .cache = False as below or it crashes from my experience
model.config.pad_token_id = tokenizer.pad_token_id
model.config.use_cache = False
model.config.pretraining_tp = 1

def llama_preprocessing_function(examples):
    return tokenizer(examples['input'], truncation=True, max_length=512)

tokenized_datasets = dataset.map(llama_preprocessing_function, batched=True)
tokenized_datasets = tokenized_datasets.rename_column("Institutional_Form_category", "label")
tokenized_datasets.set_format("torch")

#Purpose: Automatically pads text data to the longest sequence in a batch
collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    try:
        # it's a classification task, take the argmax
        predictions_processed = np.argmax(predictions, axis=1)

        # Calculate Pearson correlation
        pearson, _ = pearsonr(predictions_processed, labels)

        return {'pearson': pearson}
    except Exception as e:
        print(f"Error in compute_metrics: {e}")
        return {'pearson': None}
    
#We will have a custom loss function that deals with the class weights and have class weights as additional argument in constructor
class CustomTrainer(Trainer):
    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        # Ensure label_weights is a tensor
        if class_weights is not None:
            self.class_weights = torch.tensor(class_weights, dtype=torch.float32).to(self.args.device)
        else:
            self.class_weights = None

    def compute_loss(self, model, inputs, num_items_in_batch=None, return_outputs=False):
        # Extract labels and convert them to long type for cross_entropy
        labels = inputs.pop("labels").long()

        # Forward pass
        outputs = model(**inputs)

        # Extract logits assuming they are directly outputted by the model
        logits = outputs.get('logits')

        # Compute custom loss with class weights for imbalanced data handling
        if self.class_weights is not None:
            loss = F.cross_entropy(logits, labels, weight=self.class_weights)
        else:
            loss = F.cross_entropy(logits, labels)

        return (loss, outputs) if return_outputs else loss
    
training_args = TrainingArguments(
    output_dir = 'sequence_classification',
    learning_rate = 1e-3,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    num_train_epochs = 5,
    weight_decay = 0.01,
    evaluation_strategy = 'epoch',
    save_strategy = 'epoch',
    load_best_model_at_end = True
)

trainer = CustomTrainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_datasets['train'],
    eval_dataset = tokenized_datasets['val'],
    tokenizer = tokenizer,
    data_collator = collate_fn,
    compute_metrics = compute_metrics,
    class_weights=class_weights,
)

train_result = trainer.train()

def make_predictions(model, df):
    

    sentences = df.input.tolist()

      # Define the batch size
    batch_size = 32  # You can adjust this based on your system's memory capacity

      # Initialize an empty list to store the model outputs
    all_outputs = []

      # Process the sentences in batches
    for i in range(0, len(sentences), batch_size):
          # Get the batch of sentences
        batch_sentences = sentences[i:i + batch_size]

          # Tokenize the batch
        inputs = tokenizer(batch_sentences, return_tensors="pt", padding=True, truncation=True, max_length=512)

          # Move tensors to the device where the model is (e.g., GPU or CPU)
        inputs = {k: v.to('cuda' if torch.cuda.is_available() else 'cpu') for k, v in inputs.items()}

          # Perform inference and store the logits
        with torch.no_grad():
            outputs = model(**inputs)
            all_outputs.append(outputs['logits'])

    final_outputs = torch.cat(all_outputs, dim=0)
    df['predictions']=final_outputs.argmax(axis=1).cpu().numpy()
    #df['predictions']=df['predictions'].apply(lambda l:category_map[l])

    return df

def get_performance_metrics(df):
    y_test = df.Institutional_Form_category
    y_pred = df.predictions
    print(f"comparing test {y_test} and pred {y_pred}")

    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    print("Balanced Accuracy Score:", balanced_accuracy_score(y_test, y_pred))
    print("Accuracy Score:", accuracy_score(y_test, y_pred))

df_val = make_predictions(model, tokenizer, df_val)

#make_predictions(model,df_val)

get_performance_metrics(df_val)
#df_val

`low_cpu_mem_usage` was None, now default to True since model is quantized.
Loading checkpoint shards: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 4/4 [00:04<00:00,  1.16s/it]
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.1-8B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 436/436 [00:00<00:00, 1715.37 examples/s]
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 93/93 [00:00<00:00, 1739.14 examples/s]
  super().__init__(*args, **kwargs)
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  self.class_weights = torch.tensor(class_weights, dtype=torch.float32).to(self.args.device)
  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Pearson
1,No log,2.466296,0.134326
2,No log,1.928252,0.393512
3,No log,1.875833,0.57748
4,No log,2.508594,0.38923
5,No log,2.662429,0.472553


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


TypeError: make_predictions() takes 2 positional arguments but 3 were given

In [4]:
df_val = make_predictions(model, df_val)

#make_predictions(model,df_val)

get_performance_metrics(df_val)
#df_val

comparing test 56     2
158    3
261    0
238    0
329    2
      ..
72     3
2      1
408    3
318    3
509    3
Name: Institutional_Form_category, Length: 93, dtype: int8 and pred 56     0
158    3
261    0
238    2
329    3
      ..
72     3
2      1
408    3
318    3
509    3
Name: predictions, Length: 93, dtype: int64
Confusion Matrix:
[[ 4  3  6  5]
 [ 1 10  3  1]
 [ 2  1  2  4]
 [ 0  2  5 44]]

Classification Report:
              precision    recall  f1-score   support

           0       0.57      0.22      0.32        18
           1       0.62      0.67      0.65        15
           2       0.12      0.22      0.16         9
           3       0.81      0.86      0.84        51

    accuracy                           0.65        93
   macro avg       0.53      0.49      0.49        93
weighted avg       0.67      0.65      0.64        93

Balanced Accuracy Score: 0.4934640522875817
Accuracy Score: 0.6451612903225806
