In [None]:
# Install Pytorch
%pip install "torch==2.2.2" tensorboard

# Install Hugging Face libraries
%pip install  --upgrade "transformers==4.40.0" "datasets==2.18.0" "accelerate==0.29.3" "evaluate==0.4.1" "bitsandbytes==0.43.1" "huggingface_hub==0.22.2" "trl==0.8.6" "peft==0.10.0"  

In [None]:
!huggingface-cli login --token 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

In [None]:
# Set git's global credential helper to `store` for this environment.
!git config --global credential.helper store  

In [None]:
!pip install peft

## IMPORTS

In [None]:
import os
import random
import functools
import csv
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F


from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix, classification_report, balanced_accuracy_score, accuracy_score

from datasets import Dataset, DatasetDict
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)

from transformers import AutoModelForSequenceClassification
from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training
from transformers import BitsAndBytesConfig
import torch


## LOAD THE DATASET

In [None]:
import pandas as pd

# Location of the uploaded Excel workbook.
file_path = '/kaggle/input/sentiment-a3/selected_comments.xlsx'

# Read the workbook, make the label column categorical and derive the integer
# class id (`target`) from its category codes.
df = pd.read_excel(file_path)
df = df.assign(entities=df['entities'].astype('category'))
df = df.assign(target=df['entities'].cat.codes)

# Keep only the rows from position 12500 onwards and show how many remain.
df = df.iloc[12500:]
len(df)


## We will be creating 9 batches of size 512 for this data.

In [None]:
# Show the label categories; `target` holds the category codes of this column.
df['entities'].cat.categories

In [None]:
# Lookup table from integer class id to its label name.
category_map = dict(enumerate(df['entities'].cat.categories))
category_map

# Split into train/val/test for later comparison.
## For simplicity we split based on time.
### First 60% train
### Next 20% val
### Next 20% test
#### This can be a bit problematic, since the class balance changes over time and some articles on the boundaries between train/val or val/test overlap, but it completely avoids the bias of the stratified sampling that is usually used, since some articles literally cover the same thing, possibly from different sources.<br>
#### Essentially, one needs to ensure that the data in train, val and test are reasonably independent. A stratified split cannot guarantee this, because its random spread would probably place different sources of the same news into train, val and test, thus corrupting the evaluation.  

In [None]:
# Chronological split: first 60% train, next 20% validation, last 20% test.

# Row positions at which the train and the validation partitions end.
train_split, val_split = (int(fraction * len(df)) for fraction in (0.6, 0.8))

# Slice the frame by position; the row order is left untouched.
df_train = df.iloc[:train_split]
df_val = df.iloc[train_split:val_split]
df_test = df.iloc[val_split:]

# Report the size of each partition.
print("Train shape:", df_train.shape)
print("Validation shape:", df_val.shape)
print("Test shape:", df_test.shape)

### PANDAS DATAFRAME TO HUGGINGFACE DATASET

In [None]:
# Build one Hugging Face Dataset per split. The categorical `entities` column
# is dropped; its integer encoding stays available in `target`.
dataset_train, dataset_val, dataset_test = (
    Dataset.from_pandas(frame.drop(columns='entities'))
    for frame in (df_train, df_val, df_test)
)

### DATASET SHUFFLING

In [None]:
# Shuffle the training split only; the validation and test splits are used in their original order.
dataset_train_shuffled = dataset_train.shuffle(seed=42)  # Using a seed for reproducibility

### DATASET DICTIONARY

In [None]:
# Bundle the three splits under the keys used by the rest of the notebook.
dataset = DatasetDict(
    train=dataset_train_shuffled,
    val=dataset_val,
    test=dataset_test,
)
dataset

## CLASS WEIGHTS

In [None]:
# Inverse-frequency class weights computed on the training split, ordered by
# class id. torch.tensor is given a plain Python list, hence the .tolist().
class_weights = torch.tensor(
    df_train['target'].value_counts(normalize=True).sort_index().rdiv(1).tolist()
)
# Rescale so that the weights sum to 1.
class_weights /= class_weights.sum()
class_weights

## MODEL NAME -  "meta-llama/Meta-Llama-3-8B"

#### Meta-LLaMA: 
The model belongs to Meta's Llama family and uses the Llama architecture.
#### 3-8B: 
The "3" denotes the third generation of the family (Llama 3) and "8B" means the model has 8 billion parameters. The name carries no `Instruct` suffix, so this is the base (pre-trained) checkpoint rather than the instruction-tuned one.


In [None]:

# Hugging Face Hub id of the base model; the tokenizer below is loaded from it.
model_name = "meta-llama/Meta-Llama-3-8B"

## Load the tokenizer

### Since LLAMA3 pre-training doesn't have PAD token

* Set the pad_token_id to eos_token_id
* Set pad_token to eos_token

In [None]:
# Load the tokenizer that belongs to the base model.
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)

# Reuse the end-of-sequence token (and its id) as the padding token so that
# batches of different lengths can be padded.
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

# FINE TUNING OF LLAMA

## HELPER FUNCTIONS

#### TRAINER COMPONENTS
* model
* tokenizer
* training arguments
* train dataset
* eval dataset
* Data Collator
* Compute Metrics
* class_weights: to train with a weighted loss, we subclass `Trainer` and define a custom loss that applies these class weights.

#### Create the tokenized LLaMA dataset, which holds our train/val/test splits after tokenization for use during training

In [None]:
MAX_LEN = 512
col_to_delete = ['created_at', 'body']


def llama_preprocessing_function(examples):
    """Tokenize the `body` texts of a batch, truncating each to MAX_LEN tokens."""
    return tokenizer(examples['body'], max_length=MAX_LEN, truncation=True)


# Tokenize every split in batches, drop the raw columns and expose the class
# id under the name `label`.
tokenized_datasets = (
    dataset
    .map(llama_preprocessing_function, batched=True, remove_columns=col_to_delete)
    .rename_column("target", "label")
)
# Have the datasets return torch tensors.
tokenized_datasets.set_format("torch")

## Data Collator
A **data collator** prepares batches of data for training or inference in machine learning, ensuring uniform formatting and adherence to model input requirements. This is especially crucial for variable-sized inputs like text sequences.

### Functions of Data Collator

1. **Padding:** Uniformly pads sequences to the length of the longest sequence using a special token, allowing simultaneous batch processing.
2. **Batching:** Groups individual data points into batches for efficient processing.
3. **Handling Special Tokens:** Adds necessary special tokens to sequences.
4. **Converting to Tensor:** Transforms data into tensors, the required format for machine learning frameworks.

### `DataCollatorWithPadding`

The `DataCollatorWithPadding` specifically manages padding, using a tokenizer to ensure that all sequences are padded to the same length for consistent model input.

- **Syntax:** `collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)`
- **Purpose:** Automatically pads text data to the longest sequence in a batch, crucial for models like BERT or GPT.
- **Tokenizer:** Uses the provided `tokenizer` for sequence processing, respecting model-specific vocabulary and formatting rules.

This collator is commonly used with libraries like Hugging Face's Transformers, facilitating data preprocessing for various NLP models.


In [None]:
# Collator that pads each batch using the tokenizer configured above.
collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

#### COMPUTE METRICS

In [None]:
def compute_metrics(eval_pred):
    """Compute evaluation metrics from a (logits, labels) pair.

    Args:
        eval_pred: pair ``(predictions, labels)`` where ``predictions`` holds
            one row of class scores per example and ``labels`` the true class
            ids.

    Returns:
        dict with the keys ``'balanced_accuracy'`` and ``'accuracy'``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    # scikit-learn metrics expect (y_true, y_pred). Balanced accuracy is not
    # symmetric, so the true labels must come first.
    return {
        'balanced_accuracy': balanced_accuracy_score(labels, predictions),
        'accuracy': accuracy_score(labels, predictions),
    }

## Understanding the Plan
The plan is designed to train a model sequentially on batches of data, due to GPU memory constraints. The idea is to:

1. Train the model on batch 1 and save the model checkpoint.
2. Load the model checkpoint from step 1, train on batch 2, and save the updated checkpoint.
3. Repeat this process sequentially for all batches.

This approach tries to mitigate the GPU memory limitation by only keeping the model, one batch of data, and its computation graph in memory at a time, saving checkpoints after each batch.

### CUSTOM TRAINER

In [None]:
import os
import torch
from transformers import Trainer, TrainingArguments
from torch.nn import functional as F

class CustomTrainer(Trainer):
    """Trainer that can apply per-class weights to the cross-entropy loss.

    Args:
        *args, **kwargs: forwarded unchanged to ``Trainer``.
        class_weights: optional per-class weights (tensor or sequence of
            numbers), one entry per class id. When ``None`` the plain,
            unweighted cross-entropy is used.
    """

    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        if class_weights is not None:
            # as_tensor also accepts lists/arrays; the copy is detached so the
            # caller's tensor is never modified or tracked by autograd.
            self.class_weights = torch.as_tensor(class_weights).clone().detach().to(self.args.device)
        else:
            self.class_weights = None

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run a forward pass and return the (optionally weighted) cross-entropy.

        Args:
            model: the model being trained or evaluated.
            inputs: batch dict; its ``"labels"`` entry is removed before the
                forward pass so the model does not compute its own loss.
            return_outputs: when true, return ``(loss, outputs)``.
            num_items_in_batch: accepted and ignored, so the override keeps
                working with Trainer versions that pass this argument.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs``.
        """
        # cross_entropy needs integer class ids.
        labels = inputs.pop("labels").long()

        outputs = model(**inputs)
        logits = outputs.get('logits')

        weight = None
        if self.class_weights is not None:
            # cross_entropy requires the weights on the same device and in the
            # same dtype as the logits.
            weight = self.class_weights.to(device=logits.device, dtype=logits.dtype)

        loss = F.cross_entropy(logits, labels.to(logits.device), weight=weight)

        return (loss, outputs) if return_outputs else loss


### TRAINING ARGUMENTS

In [None]:
# Shared training configuration, passed to the trainer for every data chunk.
# NOTE(review): eval_steps is not set here — confirm the evaluation interval
# lines up with save_steps, which load_best_model_at_end relies on.
training_args = TrainingArguments(
    output_dir='/kaggle/working/training_metadata',
    learning_rate=1e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy='steps',
    save_strategy='steps',
    save_steps=70,  # Save a checkpoint every 70 steps
    save_total_limit=2,  # Only keep the last 2 checkpoints
    load_best_model_at_end=True,
    logging_dir='logs',
    logging_steps=70,  # Log training metrics every 70 steps
    report_to="tensorboard"
)

###  LATEST CHECKPOINT FUNCTION

In [None]:
def get_latest_checkpoint(checkpoint_dir):
    """Return the most recently modified sub-directory of ``checkpoint_dir``.

    Only directories are considered, so stray files (logs, metadata, ...) that
    happen to be newer can never be mistaken for a checkpoint.

    Args:
        checkpoint_dir: directory that contains one sub-directory per checkpoint.

    Returns:
        Path of the newest checkpoint directory, or ``None`` when
        ``checkpoint_dir`` is not an existing directory or holds no
        sub-directories.
    """
    if not os.path.isdir(checkpoint_dir):
        return None

    entries = (os.path.join(checkpoint_dir, name) for name in os.listdir(checkpoint_dir))
    candidates = [path for path in entries if os.path.isdir(path)]

    # max() with a default covers the "no checkpoints yet" case.
    return max(candidates, key=os.path.getmtime, default=None)

## ACTUAL FINE TUNING BEGINS !!

### 1. SPLITTING THE DATASET INTO CHUNKS

In [None]:
# torch's `Dataset` is deliberately not imported: it would shadow the Hugging
# Face `Dataset` class used earlier in the notebook.
from torch.utils.data import DataLoader, random_split

# Split the training set into chunks that are trained on one after another.
batch_size = 512 # Number of examples per chunk
train_dataset = tokenized_datasets['train']

# Chunk sizes: as many full chunks as fit, plus one smaller chunk for the
# remainder. No empty chunk is added when the dataset size is an exact
# multiple of batch_size, so the sizes always sum to len(train_dataset),
# which random_split requires.
full_chunks, remainder = divmod(len(train_dataset), batch_size)
chunk_lengths = [batch_size] * full_chunks + ([remainder] if remainder else [])
num_batches = len(chunk_lengths)

# A fixed generator seed makes the chunk membership reproducible, so that
# batches[i] refers to the same examples in every kernel session.
batches = random_split(
    train_dataset,
    chunk_lengths,
    generator=torch.Generator().manual_seed(42),
)

print(f"Number of batches: {num_batches}")

### 2. TRAINING


In [None]:
def train_on_batch(model, batch, batch_id, output_dir, training_args):
    """Fine-tune ``model`` on one data chunk and save the resulting model.

    Args:
        model: model to train.
        batch: the chunk of the training set to train on.
        batch_id: identifier of the chunk, used in log messages and in the
            name of the save directory.
        output_dir: parent directory; the model is saved to
            ``<output_dir>/batch_<batch_id>``.
        training_args: ``TrainingArguments`` for the trainer.
    """
    print(f"Training on batch {batch_id}...")

    # The evaluation split, tokenizer, collator, metrics and class weights are
    # taken from the notebook's global state.
    trainer_kwargs = dict(
        model=model,
        args=training_args,
        train_dataset=batch,
        eval_dataset=tokenized_datasets['val'],
        tokenizer=tokenizer,
        data_collator=collate_fn,
        compute_metrics=compute_metrics,
        class_weights=class_weights,
    )
    chunk_trainer = CustomTrainer(**trainer_kwargs)
    chunk_trainer.train()

    # Persist the model trained on this chunk.
    save_path = os.path.join(output_dir, f"batch_{batch_id}")
    chunk_trainer.save_model(save_path)
    print(f"Batch {batch_id} training complete. Checkpoint saved at {save_path}.")


### 3. LOADING THE MODEL

In [None]:
def load_model(latest_checkpoint, quantization_config, lora_config):
    """Load the quantized classifier and wrap it for LoRA training.

    Args:
        latest_checkpoint: path of a checkpoint to load from; when falsy the
            base model "meta-llama/Meta-Llama-3-8B" is loaded instead.
        quantization_config: quantization settings passed to ``from_pretrained``.
        lora_config: LoRA settings passed to ``get_peft_model``.

    Returns:
        The PEFT-wrapped two-label sequence-classification model.
    """
    source = latest_checkpoint or "meta-llama/Meta-Llama-3-8B"

    # Load -> prepare for k-bit training -> attach the LoRA adapter.
    peft_model = get_peft_model(
        prepare_model_for_kbit_training(
            AutoModelForSequenceClassification.from_pretrained(
                source,
                quantization_config=quantization_config,
                num_labels=2,
            )
        ),
        lora_config,
    )

    # A tokenizer local to this function, used only to obtain the padding id.
    local_tokenizer = AutoTokenizer.from_pretrained(source, add_prefix_space=True)
    local_tokenizer.pad_token_id = local_tokenizer.eos_token_id
    local_tokenizer.pad_token = local_tokenizer.eos_token

    # Model-side settings: padding id, no KV cache, tensor-parallel degree 1.
    peft_model.config.pad_token_id = local_tokenizer.pad_token_id
    peft_model.config.use_cache = False
    peft_model.config.pretraining_tp = 1

    print("Tokenizer configured successfully.")
    print("Model loaded and configured successfully.")
    return peft_model


### 4. TESTING 

In [None]:
import torch
import numpy as np
import pandas as pd
import os
from sklearn.metrics import accuracy_score, classification_report


def _save_logits(all_outputs, npy_path, csv_path):
    """Write the collected logits to ``npy_path`` and ``csv_path``; return them as an array."""
    outputs_np = np.array(all_outputs)
    np.save(npy_path, outputs_np)
    pd.DataFrame(outputs_np).to_csv(csv_path, index=False)
    return outputs_np


def test_model(model,batch_size,output_test_dir):
    """Run resumable batched inference on ``df_test`` and report metrics.

    Logits are computed for ``df_test['body']`` in batches of ``batch_size``.
    Progress (the logits so far plus the next start index) is written under
    ``<output_test_dir>/model_checkpoint`` so an interrupted run can resume.
    At the end the logits and predictions are stored in ``df_test`` (columns
    ``logits`` and ``predictions``), written to CSV, and accuracy plus a
    classification report are printed and saved to ``test_metrics.txt``.

    Args:
        model: sequence-classification model to evaluate.
        batch_size: number of texts per forward pass.
        output_test_dir: directory for checkpoints, predictions and metrics.

    Raises:
        ValueError: if the number of logit rows does not match ``len(df_test)``.
    """
    # Define file paths for checkpoints and results
    checkpoint_path = os.path.join(output_test_dir, 'model_checkpoint')
    checkpoint_index_file = os.path.join(checkpoint_path, 'checkpoint_index.txt')
    output_file_path_np = os.path.join(checkpoint_path, 'model_outputs.npy')
    output_file_path_csv = os.path.join(checkpoint_path, 'model_outputs.csv')
    predictions_file_path_csv = os.path.join(output_test_dir, 'predictions_with_logits_without_finetune.csv')

    os.makedirs(checkpoint_path, exist_ok=True)

    sentences = df_test['body'].tolist()

    start_index = 0
    all_outputs = []

    # Restore saved progress, if any.
    if os.path.exists(checkpoint_index_file):
        with open(checkpoint_index_file, 'r') as f:
            start_index = int(f.read().strip())

    if os.path.exists(output_file_path_np):
        # The file holds a plain numeric array, so pickle support is not needed.
        all_outputs = list(np.load(output_file_path_np))
        print(f"Loaded previous outputs with shape {np.array(all_outputs).shape}")

    # The index file and the saved logits can disagree (e.g. the final save of
    # a finished run, or an interruption between the two writes). Resume from
    # the point both agree on so that no row is duplicated or skipped.
    start_index = min(start_index, len(all_outputs), len(sentences))
    all_outputs = all_outputs[:start_index]
    if start_index:
        print(f"Resuming from index {start_index}")

    # Inference mode: disables dropout, which is still active if the model was
    # trained just before this call.
    model.eval()
    # Send the inputs to wherever the model's parameters actually live.
    device = next(model.parameters()).device

    for i in range(start_index, len(sentences), batch_size):
        batch_sentences = sentences[i:i + batch_size]

        inputs = tokenizer(batch_sentences, return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            logits = model(**inputs).logits
        # Cast to float32 first: numpy cannot represent bfloat16 tensors.
        all_outputs.extend(logits.float().cpu().numpy())  # extend keeps one row per sentence

        # Save the outputs and the resume index every 10 batches
        if (i // batch_size) % 10 == 0:
            _save_logits(all_outputs, output_file_path_np, output_file_path_csv)
            next_index = min(i + batch_size, len(sentences))
            with open(checkpoint_index_file, 'w') as f:
                f.write(str(next_index))
            print(f"Checkpoint saved at index {next_index}")

    # Save final results and mark the run as complete, so that a re-run does
    # not recompute or append anything.
    all_outputs_np = _save_logits(all_outputs, output_file_path_np, output_file_path_csv)
    with open(checkpoint_index_file, 'w') as f:
        f.write(str(len(sentences)))
    print("Final results saved.")

    # Validate before touching df_test: assigning a column of the wrong length
    # would fail with a far less helpful message.
    if len(all_outputs_np) != len(df_test):
        raise ValueError(
            f"Length mismatch: len(all_outputs_np) = {len(all_outputs_np)}, len(df_test) = {len(df_test)}"
        )

    predictions = np.argmax(all_outputs_np, axis=1).tolist()
    df_test['logits'] = [list(logit) for logit in all_outputs_np]
    df_test['predictions'] = predictions
    df_test.to_csv(predictions_file_path_csv, index=False)
    print(df_test[['logits', 'predictions']].head())

    # Compare against the true class ids.
    true_labels = df_test['target']

    accuracy = accuracy_score(true_labels, predictions)
    report = classification_report(true_labels, predictions)

    print(f"Accuracy: {accuracy}")
    print(f"Classification Report:\n{report}")

    output_file = os.path.join(output_test_dir, 'test_metrics.txt')

    with open(output_file, "w") as f:
        f.write(f"Accuracy: {accuracy}\n")
        f.write("Classification Report:\n")
        f.write(report)

    print(f"Accuracy and classification report saved to {output_file}")

## DRIVER FUNCTION

In [None]:
def driver(to_train,to_test, batch_id=8):
    """Load the model, then optionally train on one chunk and/or evaluate.

    Args:
        to_train: when true, fine-tune the model on ``batches[batch_id]``.
        to_test: when true, run ``test_model`` on the test split.
        batch_id: index of the training chunk to use; defaults to 8, the
            value that used to be hard-coded.
    """
    # Define directories and configurations
    model_dir = '/kaggle/input/final-model32/final_model'
    output_train_dir = '/kaggle/working/train_data/model_checkpoint'
    output_test_dir = '/kaggle/working/test_data'

    # 4-bit NF4 quantization with double quantization and bfloat16 compute.
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16
    )
    # LoRA adapters on the attention projections, for sequence classification.
    lora_config = LoraConfig(
        r=16,
        lora_alpha=8,
        target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj'],
        lora_dropout=0.05,
        bias='none',
        task_type='SEQ_CLS'
    )

    # Newest entry inside model_dir, or None (load_model then falls back to
    # the base model).
    latest_checkpoint = get_latest_checkpoint(model_dir)
    print(latest_checkpoint)

    # Load the model
    model = load_model(latest_checkpoint, quantization_config, lora_config)

    if to_train:
        train_on_batch(model, batches[batch_id], batch_id, output_train_dir, training_args)

    if to_test:
        # 32 texts per inference batch.
        test_model(model,32,output_test_dir)

In [None]:
# Evaluation-only run: to_train=False skips training, to_test=True scores the test split.
driver(False,True)

# def driver(to_train,to_test):

## Full Data vs. Chunked Data: (ANALYSIS OF OUR APPROACH)
### Full Data: 
When the model is trained on the full dataset, the optimizer updates the model weights based on gradients computed over mini-batches. However, since the entire dataset is shuffled and seen multiple times (in epochs), the model gets repeated exposure to all parts of the data distribution, ensuring that the weights converge to a solution that reflects the entire dataset.
### Chunked Data: 
When you train sequentially on data chunks, the model weights are updated based only on the current chunk. If the data in the chunks are not representative of the full dataset (e.g., they have a different distribution), the updates from the current chunk may overwrite or bias the knowledge learned from earlier chunks.