In [1]:
# Mount Google Drive inside the Colab runtime at /content/drive/ so that the
# project folder referenced by the following cells is reachable.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
# Expose the Drive project folder as /content/Borderline.
# -f/-n replace an existing link instead of failing with "File exists",
# so this cell can be re-run safely within the same runtime.
!ln -sfn "/content/drive/My Drive/Borderline" "/content/Borderline"

In [3]:
%cd Borderline

/content/drive/My Drive/Borderline


Installs

In [None]:
%%capture
# Dependency installation. %%capture hides everything this cell prints,
# including pip errors -- remove it temporarily when debugging a failed install.
# NOTE(review): no package below is version-pinned and unsloth is installed
# straight from its git repository, so the environment can differ between runs.
import torch
# GPU compute capability as (major, minor); only the major part is used below.
major_version, minor_version = torch.cuda.get_device_capability()
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# GPUs with major capability >= 8 additionally get packaging, ninja, einops and
# flash-attn; the remaining packages are the same in both branches.
# NOTE(review): presumably because flash-attn requires Ampere-or-newer GPUs -- confirm.
if major_version >= 8:
    !pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes
else:
    !pip install --no-deps xformers trl peft accelerate bitsandbytes
# No-op statement; it has no effect on the cell.
pass

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import Adam
from torch.utils.data import Subset, DataLoader, random_split, Dataset
from torch.optim import lr_scheduler
import torch.utils.data as data
from torchvision import transforms
from transformers import BertTokenizer, AdamW, BertForSequenceClassification
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from trl import SFTTrainer
from transformers import TrainingArguments

import moviepy.audio as audio
import moviepy.editor as mp

from unsloth import FastLanguageModel

import json
from transformers import AutoTokenizer
from datasets import Dataset
import numpy as np
import os
import time
import copy
import random
import pandas as pd

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


Loading quantized versions of Llama 3

In [None]:
from unsloth import FastLanguageModel
import torch
# Loading configuration. `max_seq_length` is reused later when the trainer is built.
max_seq_length = 2048 # Choose any! Llama 3 is up to 8k
# NOTE(review): None presumably lets unsloth choose the dtype automatically -- confirm against the unsloth docs.
dtype = None
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# Names of 4-bit checkpoints kept for reference. This list is not read anywhere
# in the visible cells; only the `model_name` passed below is actually loaded.
fourbit_models = [
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    "unsloth/llama-2-7b-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",
    "unsloth/gemma-7b-it-bnb-4bit",
    "unsloth/gemma-2b-bnb-4bit",
    "unsloth/gemma-2b-it-bnb-4bit",
    "unsloth/llama-3-8b-bnb-4bit",
]

# Load the base model together with its tokenizer; both names are used by the
# LoRA, prompt-formatting and training cells further down.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-bnb-4bit", # Llama-3 70b also works (just change the model name)
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

In [None]:
# Attach LoRA adapters to the loaded model. The result is assigned back to
# `model`, so re-running this cell without first re-running the loading cell
# passes the already-wrapped model to get_peft_model a second time.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    # Projection modules that receive adapters.
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407, # same value as the trainer seed used later in this notebook
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

In [None]:
import json
from datasets import Dataset
import pyarrow as pa
from typing import Optional

class OppoDataset:
    """Load an Alpaca-style JSON file and expose it as a Hugging Face ``Dataset``.

    The file must contain a JSON array of objects. Every object needs an
    ``"instruction"`` and an ``"output"`` key; ``"input"`` is optional and
    defaults to an empty string.
    """

    def __init__(self, file_path: str):
        """Store the path of the JSON file; nothing is read until it is needed."""
        self.file_path = file_path

    def _load_data(self):
        """Return the decoded content of the JSON file at ``self.file_path``.

        Raises:
            OSError: if the file cannot be opened.
            json.JSONDecodeError: if the file is not valid JSON.
        """
        # Read as UTF-8 explicitly instead of relying on the platform's
        # default encoding, which may differ between machines.
        with open(self.file_path, 'r', encoding='utf-8') as file:
            return json.load(file)

    def _format_data(self, data):
        """Turn a list of records into a column-oriented dict.

        Args:
            data: iterable of dicts with "instruction", "output" and
                optionally "input".

        Returns:
            ``{"instruction": [...], "input": [...], "output": [...]}`` with
            one list element per record, in the original order.

        Raises:
            KeyError: if a record lacks "instruction" or "output".
        """
        formatted_data = {
            "instruction": [],
            "input": [],
            "output": []
        }
        for entry in data:
            formatted_data["instruction"].append(entry["instruction"])
            # Records without extra context may omit "input"; treat that as
            # an empty context instead of failing on the whole file.
            formatted_data["input"].append(entry.get("input", ""))
            formatted_data["output"].append(entry["output"])
        return formatted_data

    def to_dataset(self):
        """Read the file and return its records as a ``datasets.Dataset``."""
        formatted_data = self._format_data(self._load_data())
        # Use the public constructor helper rather than calling Dataset(...)
        # directly on a hand-built Arrow table.
        return Dataset.from_dict(formatted_data)


In [None]:
# Alpaca-style prompt template used to render every training example.
# The three {} placeholders are filled positionally with the instruction,
# the input and the output (see formatting_prompts_func).
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# End-of-sequence token of the loaded tokenizer; it is appended to every
# rendered example, so do not forget this part!
EOS_TOKEN = tokenizer.eos_token
def formatting_prompts_func(examples):
    """Render a batch of examples into full training strings.

    Args:
        examples: batch mapping with parallel "instruction", "input" and
            "output" sequences.

    Returns:
        A dict with a single "text" key holding one rendered prompt per
        example, each terminated by EOS_TOKEN.
    """
    rows = zip(examples["instruction"], examples["input"], examples["output"])
    # EOS_TOKEN closes every sample; without this token generation goes on forever!
    rendered = [
        alpaca_prompt.format(task, context, answer) + EOS_TOKEN
        for task, context, answer in rows
    ]
    return {"text": rendered}


# Location of the instruction data, relative to the current working directory.
datapath = 'data/oppodata.json'
dataset_helper = OppoDataset(datapath)

# Load the records as a Hugging Face dataset and, in the same expression,
# apply the batched prompt formatter to it.
dataset = dataset_helper.to_dataset().map(
    formatting_prompts_func,
    batched=True,
)

# Training

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments

# Supervised fine-tuning setup: trains `model` on the "text" column of `dataset`.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4, # 2 * 4 = 8 examples per optimizer step and device
        warmup_steps = 5,
        # Training length is set by max_steps alone. Do not also pass
        # num_train_epochs: a positive max_steps overrides it. To train for a
        # number of epochs instead, drop max_steps and set num_train_epochs.
        max_steps = 60,
        learning_rate = 2e-4,
        # Use bf16 when the GPU supports it, otherwise fall back to fp16.
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

In [None]:
# Run the fine-tuning loop; the value returned by train() is kept in
# `trainer_stats` for later inspection.
trainer_stats = trainer.train()