In [None]:
!pip install torch
!pip install pandas
!pip install numpy
!pip install json
!pip install datasets
!pip install peft
!pip install -q -U einops
!pip install -q -U bitsandbytes
!pip install transformers==4.37
!pip install accelerate -U
!pip install huggingface_hub

In [None]:
import json
import os

import numpy as np
import pandas as pd
import torch
from datasets import Dataset, load_dataset
from huggingface_hub import notebook_login
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

In [None]:
# Log in to the Hugging Face account so the model can be saved to the Hub later on.
notebook_login()

In [None]:
# Quantization settings handed to `from_pretrained`: 4-bit weights using the
# "nf4" quant type with double quantization enabled, and float16 as the
# compute dtype.
compute_dtype = torch.float16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=compute_dtype,
)

In [None]:
# Load the base checkpoint, applying the quantization settings from `bnb_config`.
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-1_5",
    quantization_config=bnb_config,
)


In [None]:
# Tokenizer matching the base checkpoint loaded above.
tokenizer = AutoTokenizer.from_pretrained(
    "microsoft/phi-1_5",
    trust_remote_code=True,
)

# Reuse the end-of-sequence token as the padding token.
# NOTE(review): presumably the checkpoint defines no pad token of its own — confirm.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# LoRA adapter settings: rank-16 adapters (alpha 16, dropout 0.05) attached to
# the modules named in `target_modules`; bias terms are left untouched.
peft_config = LoraConfig(
        lora_alpha=16,
        lora_dropout=0.05,
        r=16,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules= ["q_proj", "k_proj", "v_proj", "dense"]
)

# Prepare the quantized base model for training before wrapping it with LoRA.
# This is PEFT's documented preparation step for k-bit models; with
# `use_gradient_checkpointing=True` it also switches gradient checkpointing on.
# The previous `model.gradient_checkpointing = True` merely assigned an
# attribute on the wrapper object and did not enable checkpointing.
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)
model = get_peft_model(model, peft_config)

In [None]:
# Hyper-parameters for the Trainer, grouped by concern.
training_arguments = TrainingArguments(
    # Where checkpoints are written, and how often.
    output_dir="./results",
    save_strategy='epoch',
    # Batch sizes; 4 examples x 8 accumulation steps per optimizer update.
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,
    # Optimisation schedule.
    num_train_epochs=2,
    learning_rate=4e-4,
    lr_scheduler_type="cosine",
    warmup_steps=100,
    # Evaluation and logging cadence, measured in steps.
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=200,
    logging_steps=50,
    # Dataset handling.
    remove_unused_columns=True,
)

In [None]:
def tokenize(sample):
    """Tokenize the ``"text"`` field of a batch of examples.

    Sequences are truncated to at most 512 tokens. No padding is applied
    here: with ``batched=True`` in ``Dataset.map`` a ``padding=True`` call
    pads every example to the longest sequence of the whole map batch, which
    wastes compute during training. Padding is left to the data collator,
    which pads each training batch to its own longest sequence.

    Args:
        sample: Mapping with a ``"text"`` entry holding the string(s) to encode.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those strings.
    """
    return tokenizer(sample["text"], truncation=True, max_length=512)

In [None]:
# Read the dataset as JSON Lines: every non-blank line is parsed as its own
# JSON document. Blank lines (for example a trailing newline at the end of the
# file) are skipped instead of crashing `json.loads`, and the file is decoded
# as UTF-8 explicitly rather than with the platform's default encoding.
data_path = 'data.json'
with open(data_path, 'r', encoding='utf-8') as file:
    extracted_objs = [json.loads(line) for line in file if line.strip()]

In [None]:
# Build a DataFrame with one row per entry of `extracted_objs`.
df = pd.DataFrame(extracted_objs)

In [None]:
# Both columns feed the training text, so fail early with a clear message if
# either of them has missing values.
if df[["Prompt", "Completion"]].isna().any().any():
    raise ValueError("'Prompt' and 'Completion' must not contain missing values")

# Build the training text with vectorized string concatenation instead of a
# row-by-row `apply`; the resulting strings are the same.
df["text"] = "Prompt: " + df["Prompt"] + " Completion: " + df["Completion"]

# Preview only the first rows rather than rendering the whole frame.
df.head()

In [None]:
# Shuffle once with a fixed seed so the 60/20/20 train/validate/test split is
# reproducible across kernel restarts, then cut the shuffled frame by position.
# Plain `iloc` slices replace `np.split`, which goes through a deprecated
# pandas code path when applied to a DataFrame.
SPLIT_SEED = 42
shuffled_df = df.sample(frac=1, random_state=SPLIT_SEED)
train_end = int(.6 * len(shuffled_df))
validate_end = int(.8 * len(shuffled_df))

train = shuffled_df.iloc[:train_end]
validate = shuffled_df.iloc[train_end:validate_end]
test = shuffled_df.iloc[validate_end:]

In [None]:
# Convert the training split to a `Dataset` and tokenize it in batches; every
# original column is dropped so that only the tokenizer's outputs remain.
train_data = Dataset.from_pandas(train)
tokenized_train_data = train_data.map(
    tokenize,
    batched=True,
    remove_columns=train_data.column_names,
    desc="Tokenizing data",
)


In [None]:
# Convert the validation split to a `Dataset` and tokenize it in batches; every
# original column is dropped so that only the tokenizer's outputs remain.
validate_data = Dataset.from_pandas(validate)
tokenized_validate_data = validate_data.map(
    tokenize,
    batched=True,
    remove_columns=validate_data.column_names,
    desc="Tokenizing data",
)


In [None]:
# Collator built with mlm=False, i.e. without masked-language-model masking.
causal_lm_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# Wire the model, the training configuration, and both tokenized splits together.
trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=tokenized_train_data,
    eval_dataset=tokenized_validate_data,
    data_collator=causal_lm_collator,
)

In [None]:
# Run training with the Trainer configured above.
trainer.train()

In [None]:
# Local directory to save the model into; fill this in before running the cell.
save_name = ""

# An empty path would otherwise reach `save_pretrained` and fail there with an
# opaque filesystem error, so reject it up front with a clear message.
if not save_name:
    raise ValueError("Set `save_name` to a non-empty directory path before saving the model.")

model.save_pretrained(save_name)

In [None]:
# Hub repository to push the model to; fill this in before running the cell.
repo_name = ""

# An empty repository name cannot be pushed to, so reject it up front with a
# clear message instead of letting the upload call fail.
if not repo_name:
    raise ValueError("Set `repo_name` to a non-empty Hub repository name before pushing the model.")

model.push_to_hub(repo_name)