In [None]:
!pip install accelerate bitsandbytes datasets evaluate trl peft

In [None]:
from datasets import load_from_disk
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    GenerationConfig
)
from tqdm import tqdm
from trl import SFTTrainer
import torch
import time
import pandas as pd
import numpy as np
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import os
import json
from sklearn.model_selection import train_test_split
import re

In [None]:
# BitsAndBytes settings used when loading the base model:
# 4-bit NF4 weights, float16 compute dtype, double quantization disabled.
compute_dtype = torch.float16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=compute_dtype,
)

In [None]:
# Load the pre-trained base model, quantized at load time with `bnb_config`
# and placed on the available devices automatically.
model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"

original_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    # `use_auth_token` is deprecated in transformers; `token=True` is its
    # replacement and likewise uses the locally stored Hugging Face token.
    token=True,
)

In [None]:
# Build the tokenizer that belongs to `model_name`: left-side padding,
# BOS/EOS insertion requested, non-fast implementation requested.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=False,
    padding_side="left",
    add_bos_token=True,
    add_eos_token=True,
    trust_remote_code=True,
)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [None]:

def load_text_files(dataset_dir):
    """Load every ``.txt`` episode found in the season folders of ``dataset_dir``.

    Args:
        dataset_dir: Directory whose immediate sub-directories (one per
            season) contain the episode text files. Files that sit directly
            in ``dataset_dir`` are ignored.

    Returns:
        list[str]: Contents of each ``.txt`` file, ordered by season folder
        name and then by file name so the result is the same on every run.
    """
    texts = []
    # os.listdir() yields entries in arbitrary order; sort for reproducibility.
    for season_folder in sorted(os.listdir(dataset_dir)):
        season_path = os.path.join(dataset_dir, season_folder)
        if not os.path.isdir(season_path):
            continue
        for episode_file in sorted(os.listdir(season_path)):
            if not episode_file.endswith(".txt"):
                continue
            episode_path = os.path.join(season_path, episode_file)
            with open(episode_path, "r", encoding="utf-8") as f:
                texts.append(f.read())

    return texts

def tokenize_dataset(texts, tokenizer, max_length=2048):
    """Tokenize ``texts`` in one call and return one record per text.

    The tokenizer is asked to truncate and pad every text to ``max_length``
    and to return "pt" tensors.

    Args:
        texts: List of text episodes.
        tokenizer: Callable tokenizer whose result exposes ``"input_ids"``.
        max_length: Length every sequence is truncated/padded to.

    Returns:
        list[dict]: One ``{"input_ids": ids}`` dict per entry of the
        tokenizer's ``"input_ids"`` output.
    """
    encoded = tokenizer(
        texts,
        padding="max_length",
        max_length=max_length,
        truncation=True,
        return_tensors="pt",
    )
    records = []
    for ids in encoded["input_ids"]:
        records.append({"input_ids": ids})
    return records

def tokenize_function(examples, max_length=2048):
    """Tokenize the ``"text"`` entry of a batch (for use with ``Dataset.map``).

    Sequences are truncated to ``max_length`` tokens but deliberately NOT
    padded here. Padding every example to a fixed 2048 tokens makes each
    step pay for the full length however short the text is, and leaves
    nothing for length-grouped batching to optimise. Padding is left to the
    data collator, which pads each batch to its own longest sequence
    (assumes the Trainer is given a padding collator such as
    ``DataCollatorForLanguageModeling``).

    Args:
        examples: Batch mapping with a ``"text"`` entry.
        max_length: Maximum number of tokens kept per example.

    Returns:
        The output of the module-level ``tokenizer`` for the batch.
    """
    return tokenizer(examples["text"], truncation=True, max_length=max_length)

def save_splits(dataset_splits, output_dir):
    """Write each split's texts to ``output_dir/<split_name>/`` as ``.txt`` files.

    A file is named after the first word of the text's fifth line. When the
    text has fewer than five lines, or the fifth line is blank, the text's
    index within the split is used instead. Path separators in the name are
    replaced by ``_``. If a name was already written for the same split
    during this call, ``_<index>`` is appended so the earlier text is not
    silently overwritten.

    Args:
        dataset_splits: Mapping of split name -> iterable of text strings.
        output_dir: Root directory for the split folders; created if missing.
    """
    os.makedirs(output_dir, exist_ok=True)

    for split_name, split_data in dataset_splits.items():
        split_dir = os.path.join(output_dir, split_name)
        os.makedirs(split_dir, exist_ok=True)

        used_stems = set()
        for idx, text in enumerate(split_data):
            lines = text.splitlines()
            # str.split() returns [] for a blank line, so check before indexing.
            words = lines[4].split(maxsplit=1) if len(lines) >= 5 else []
            stem = words[0] if words else str(idx)
            # A separator inside the word would point the write at another folder.
            stem = stem.replace("/", "_").replace(os.sep, "_")
            # Keep names unique within this split so no text is lost.
            while stem in used_stems:
                stem = f"{stem}_{idx}"
            used_stems.add(stem)

            file_path = os.path.join(split_dir, f"{stem}.txt")
            with open(file_path, "w", encoding="utf-8") as f:
                f.write(text)

    print(f"Splits saved in {output_dir}")

# 2. Split Dataset
def split_dataset(data, train_size=0.8, val_size=0.1, random_seed=42):
    """Randomly partition ``data`` into a training part and a validation part.

    ``train_size`` and ``random_seed`` are forwarded to ``train_test_split``.

    NOTE(review): ``val_size`` is accepted but never used by this function;
    confirm whether it was meant to be forwarded as ``test_size``.

    Returns:
        tuple: ``(train_data, val_data)``.
    """
    parts = train_test_split(data, random_state=random_seed, train_size=train_size)
    train_part, val_part = parts
    return train_part, val_part

def _read_split_folder(folder, label):
    """Read every ``.txt`` file in ``folder`` as UTF-8; return ``(text, file_name)`` pairs."""
    pairs = []
    # Sorted so the result does not depend on the file system's listing order.
    for file in sorted(os.listdir(folder)):
        if file.endswith(".txt"):
            # Explicit UTF-8: the files are written as UTF-8 by save_splits, and
            # the platform default encoding is not guaranteed to match.
            with open(os.path.join(folder, file), "r", encoding="utf-8") as f:
                pairs.append((f.read(), file))
            print(f"added {file} to {label}")
    return pairs


def load_split(split_dir):
    """Load the saved training and validation texts from ``split_dir``.

    Args:
        split_dir: Directory containing the ``TRAIN_DATA`` and ``VALIDATION``
            sub-folders.

    Returns:
        tuple: ``(train, val)``, each a list of ``(text, file_name)`` tuples
        for the ``.txt`` files of the corresponding sub-folder, ordered by
        file name.
    """
    train = _read_split_folder(os.path.join(split_dir, "TRAIN_DATA"), "train")
    val = _read_split_folder(os.path.join(split_dir, "VALIDATION"), "val")
    return train, val


In [None]:
# Paths
# Folder passed to load_split, which reads its TRAIN_DATA/ and VALIDATION/ sub-folders.
split_dir = "path_to_dataset"

# Each returned list holds (text, file_name) tuples.
train_data, val_data = load_split(split_dir)

In [None]:
from datasets import Dataset


def compute_max_token_length(texts, tokenizer):
    """Find the text whose tokenization is longest.

    Defined here because nothing else in this notebook defines it, so the
    call below would raise NameError on a fresh kernel.

    Args:
        texts: Iterable of strings.
        tokenizer: Callable that, given one string, returns a mapping with an
            ``"input_ids"`` entry.

    Returns:
        tuple: ``(token_count, text)`` for the longest text, or ``(0, "")``
        when ``texts`` is empty.
    """
    longest_len, longest_text = 0, ""
    for text in texts:
        n_tokens = len(tokenizer(text)["input_ids"])
        if n_tokens > longest_len:
            longest_len, longest_text = n_tokens, text
    return longest_len, longest_text


# Sort the (text, file_name) pairs by file name, i.e. by scene name.
# NOTE: the names train_data/val_data are rebound from tuples to plain strings
# below, so re-run the load_split cell before re-running this one.
train_data = sorted(train_data, key=lambda x: x[1])
val_data = sorted(val_data, key=lambda x: x[1])
# Keep only the text of each pair.
train_data = [x[0] for x in train_data]
val_data = [x[0] for x in val_data]

print(f"Train size: {len(train_data)}, Validation size: {len(val_data)}")
len_token, text = compute_max_token_length(train_data + val_data, tokenizer)
print(f"max token length: {len_token}. with text: {text}")

# Wrap the raw texts in datasets with a single "text" column.
train_dataset = Dataset.from_dict({"text": train_data})
val_dataset = Dataset.from_dict({"text": val_data})
print(f"train sample: {train_dataset[0]}")
print(f"val sample: {val_dataset[0]}")
# Tokenize both datasets batch-wise; the tokenizer's output columns are added.
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_val_dataset = val_dataset.map(tokenize_function, batched=True)
print(f"tokenized train size: {len(tokenized_train_dataset)}, tokenized val size: {len(tokenized_val_dataset)}")


Train size: 6285, Validation size: 699
max token length: 508. with text: JERRY SEINFELD: [Pleading] Oh, come on. He's just a lonely old man. All old people steal.
MANAGER: That's right. That's why we stopped carrying batteries. Look, I'll be honest with you, we've had a lot of trouble with theft lately - and my boss says I have to make an example to someone.
JERRY SEINFELD: So it could be anyone?
MANAGER: I.. guess. As long as we catch him in the act.
[Jerry turns to George. George has a huge bundle under his overcoat - and is trying to act innocent]
JERRY SEINFELD: That guy! [Pointing at George] Swarm! Swarm!
[George is instantly surrounded by guards]
GEORGE COSTANZA: No! Jerry!
[End scene]
[End of episode]
train sample: {'text': '[Scene: Comedy club]\nJERRY SEINFELD: You know, why we\'re here? [he means: here in the "Comedy club"] To be out, this is out...and out is one of the single most enjoyable experiences of life. People...did you ever hear people talking about "We should go out

Map:   0%|          | 0/6285 [00:00<?, ? examples/s]

Map:   0%|          | 0/699 [00:00<?, ? examples/s]

tokenized train size: 6285, tokenized val size: 699


In [None]:
# LoRA adapter settings: rank 32, alpha 16, dropout 0.1, applied to the
# modules named q_proj and v_proj; bias terms are not trained.
config = LoraConfig(
    task_type="CAUSAL_LM",
    r=32,  # Rank
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=["q_proj", "v_proj"],
    bias="none",
)

# Gradient checkpointing reduces memory usage during fine-tuning.
original_model.gradient_checkpointing_enable()
original_model.enable_input_require_grads()

# Wrap the base model with the LoRA adapters described by `config`.
peft_model = get_peft_model(original_model, config)

In [None]:
# Print how many parameters of the LoRA-wrapped model are trainable.
peft_model.print_trainable_parameters()

trainable params: 13,631,488 || all params: 8,043,892,736 || trainable%: 0.1695


In [None]:
output_dir = "path_to_checkpoint"

# Notes on what is intentionally NOT passed here:
# - `dropout` is not a TrainingArguments field (passing it raises TypeError);
#   LoRA dropout is configured on LoraConfig via `lora_dropout`.
# - `resume_from_checkpoint` is not passed: the Trainer does not read that
#   field itself. Resuming is requested on the `peft_trainer.train(...)` call.
peft_training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=1,  # One sequence per device per step
    gradient_accumulation_steps=4,  # Effective batch of 4 sequences per optimizer step
    num_train_epochs=5,
    learning_rate=2e-4,  # Initial learning rate
    lr_scheduler_type="linear",  # Linear scheduler for learning rate
    warmup_ratio=0.05,  # Warm up over the first 5% of the training steps
    optim="paged_adamw_8bit",  # Memory-efficient optimizer
    logging_steps=10,  # Logging frequency
    logging_dir=output_dir + "/logs",
    save_strategy="epoch",  # Save a checkpoint after each epoch
    eval_strategy="steps",  # Evaluate every `eval_steps` optimizer steps
    eval_steps=200,
    do_eval=True,  # Enable evaluation
    gradient_checkpointing=True,  # Memory-efficient training
    report_to="none",  # Disable reporting to external services
    overwrite_output_dir=True,  # Overwrite output directory if it exists
    group_by_length=True,  # Batch samples of similar length together to reduce padding
    weight_decay=0.01,  # Regularization to avoid overfitting
)


In [None]:
import transformers
# Switch off the model config's `use_cache` flag before training
# (presumably because caching is not useful with gradient checkpointing — confirm).
peft_model.config.use_cache = False

In [None]:

# Collator for causal language modelling (mlm=False), built on the shared tokenizer.
causal_lm_collator = transformers.DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Trainer that fine-tunes the LoRA-wrapped model on the tokenized splits.
peft_trainer = transformers.Trainer(
    args=peft_training_args,
    model=peft_model,
    processing_class=tokenizer,
    data_collator=causal_lm_collator,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
)

In [None]:
import os
# Set the environment variable
# Requests the `expandable_segments:True` option via PYTORCH_CUDA_ALLOC_CONF.
# NOTE(review): this runs after the model has already been loaded earlier in
# the notebook; the setting may need to be in place before the first CUDA
# allocation to take effect — confirm, and consider moving it to the top.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

In [None]:
# Put the model in training mode. The trailing semicolon stops Jupyter from
# echoing the returned module (its full repr) as the cell output.
peft_model.train();

In [None]:

from transformers.trainer_utils import get_last_checkpoint

# Resume only when output_dir actually contains a saved checkpoint folder.
# A merely non-empty directory (logs, stray files) is not enough:
# `train(resume_from_checkpoint=True)` raises ValueError in that case.
checkpoint_root = peft_training_args.output_dir
last_checkpoint = (
    get_last_checkpoint(checkpoint_root) if os.path.isdir(checkpoint_root) else None
)

if last_checkpoint is not None:
    print(f"Resuming training from {last_checkpoint}")
    peft_trainer.train(resume_from_checkpoint=last_checkpoint)
else:
    peft_trainer.train()  # Start training from scratch if no checkpoint exists
