## Setup

In [1]:
!pip install -r requirements.txt



In [2]:
import numpy as np
import os
import pandas as pd
from rich import inspect

from utils import *

# `setup_rich` is not imported by name; it presumably arrives via the wildcard
# import from `utils` above (TODO confirm). The returned object is used by
# later cells through `console.print(...)`.
console = setup_rich()

In [3]:
# Build the two CSV paths under the data directory, then read each into a frame.
data_dir = ".data/csv/"
filenames = [
    os.path.join(data_dir, f"{stem}.csv")
    for stem in ("discord-4nkq7191ray", "discord-fv8dqxpzsem")
]

# The second file is the training split, the first one the test split.
test_path, train_path = filenames
train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)

# Preview the first rows of the training frame as the cell output.
train_data.head()

Unnamed: 0,text
0,Hehe
1,"20220127_210848.jpg"",""prependIcon"":""attachment..."
2,I think I'm gonna stream playing im excited
3,Time to impulse buy
4,Wait is pokemon diamond out?


## Pipeline demo

In [4]:
# Load the tokenizer and causal-LM weights for the checkpoint named below and
# wrap them in a text-generation pipeline.
import warnings

import torch
# NOTE(review): wildcard import kept on purpose -- later cells (TrainingArguments,
# Trainer) appear to rely on names it provides. Prefer explicit imports once
# the full set of used names is known.
from transformers import *

# Silences every Python warning from this point on.
warnings.filterwarnings("ignore")


model_name = "microsoft/Phi-3-mini-4k-instruct"

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = LlamaTokenizer.from_pretrained(model_name)
# trust_remote_code=True allows code shipped with the checkpoint repo to run.
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    trust_remote_code=True,
    # Use the device selected above instead of a hard-coded "cpu": the LoRA
    # cell later calls `.to(device)` on this same model object, so a pipeline
    # pinned to the CPU would end up feeding CPU tensors to a model that may
    # live on the GPU.
    device=device,
)
pipe

loading file tokenizer.model from cache at C:\Users\nicol\.cache\huggingface\hub\models--microsoft--Phi-3-mini-4k-instruct\snapshots\c1358f8a35e6d2af81890deffbbfa575b978c62f\tokenizer.model
loading file added_tokens.json from cache at C:\Users\nicol\.cache\huggingface\hub\models--microsoft--Phi-3-mini-4k-instruct\snapshots\c1358f8a35e6d2af81890deffbbfa575b978c62f\added_tokens.json
loading file special_tokens_map.json from cache at C:\Users\nicol\.cache\huggingface\hub\models--microsoft--Phi-3-mini-4k-instruct\snapshots\c1358f8a35e6d2af81890deffbbfa575b978c62f\special_tokens_map.json
loading file tokenizer_config.json from cache at C:\Users\nicol\.cache\huggingface\hub\models--microsoft--Phi-3-mini-4k-instruct\snapshots\c1358f8a35e6d2af81890deffbbfa575b978c62f\tokenizer_config.json
loading file tokenizer.json from cache at C:\Users\nicol\.cache\huggingface\hub\models--microsoft--Phi-3-mini-4k-instruct\snapshots\c1358f8a35e6d2af81890deffbbfa575b978c62f\tokenizer.json
Special tokens have 

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

All model checkpoint weights were used when initializing Phi3ForCausalLM.

All the weights of Phi3ForCausalLM were initialized from the model checkpoint at microsoft/Phi-3-mini-4k-instruct.
If your task is similar to the task the model of the checkpoint was trained on, you can already use Phi3ForCausalLM for predictions without further training.
loading configuration file generation_config.json from cache at C:\Users\nicol\.cache\huggingface\hub\models--microsoft--Phi-3-mini-4k-instruct\snapshots\c1358f8a35e6d2af81890deffbbfa575b978c62f\generation_config.json
Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": [
    32000,
    32001,
    32007
  ],
  "pad_token_id": 32000
}



In [5]:
# pipe("Can you explain LangChain?")

In [6]:
# Show an overview of the pipeline object. `inspect` is presumably the one
# imported from `rich` in the setup cell (the later wildcard import from
# `utils` could shadow it -- TODO confirm).
inspect(pipe)

In [7]:
# Draw one random training row and print its "text" value.
sample = train_data.sample(n=1)["text"].iloc[0]
print(sample)


GOODY TWO SHOES


In [8]:
from datasets import Dataset

# Wrap both pandas frames as `Dataset` objects (training frame first, then test).
train_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_data, test_data)
)

# Show the training dataset summary as the cell output.
train_dataset

In [9]:
# Draw one random training row, show its raw text, and encode it to token ids.
sample = train_data.sample(1)["text"].values[0]

# NOTE(review): assumes `console` is a rich Console (it comes from
# `setup_rich()`) -- confirm. Chat messages can contain square brackets and
# `:name:` codes, which rich would otherwise interpret as style markup and
# emoji; disabling both prints the text exactly as it will be tokenized.
console.print(sample, markup=False, emoji=False)

# return_tensors="pt" makes the tokenizer return a PyTorch tensor.
input_ids = tokenizer.encode(sample, return_tensors="pt")
input_ids

In [10]:
# Plain Python list holding every value of the training frame's "text" column.
texts = train_data["text"].to_list()

In [11]:
# Tokenize just the first five messages as a quick look at the tokenizer output.
preview_texts = texts[:5]
texts_encodings = tokenizer(preview_texts, padding=True, truncation=True)

In [12]:
# Show which fields the tokenizer call above returned.
texts_encodings.keys()

In [13]:
# Upper bound, in tokens, applied to every example. With padding="max_length"
# and no explicit max_length the tokenizer pads each row to the model's maximum
# input length, which is very wasteful for short chat messages.
# Tune this value to the length distribution of the data.
MAX_SEQ_LENGTH = 256


def tokenize_function(examples, max_length=MAX_SEQ_LENGTH):
    """Tokenize one batch of rows from the dataset's "text" column.

    Args:
        examples: Batch passed in by `Dataset.map(..., batched=True)`; only
            its "text" entry is read.
        max_length: Number of tokens every sequence is padded or truncated to.

    Returns:
        The tokenizer's output for the batch.
    """
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )


# Replace each dataset with its tokenized version (the original columns are kept).
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/51305 [00:00<?, ? examples/s]

Map:   0%|          | 0/4245 [00:00<?, ? examples/s]

In [14]:
# inspect(train_dataset)

## Training

In [15]:
# Bare expression: shows the repr of `model` as the cell output. The LoRA
# adapter is attached in the next cell.
model

In [16]:
from peft import LoraConfig, TaskType, get_peft_model

# LoRA adapter settings for causal language modelling; the adapter is attached
# only to the modules named "qkv_proj".
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["qkv_proj"],
    r=4,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

# Wrap the base model with the adapter, move it to the selected device, and
# report how many parameters are trainable.
lora_model = get_peft_model(model, lora_config)
lora_model = lora_model.to(device)
lora_model.print_trainable_parameters()

trainable params: 6,291,456 || all params: 3,827,371,008 || trainable%: 0.1644


In [17]:
# Configure and run fine-tuning of the LoRA-wrapped model.
from transformers import DataCollatorForLanguageModeling

output_dir = ".data/outputs/"

# Request bf16 unless a CUDA device is present that cannot do bf16; asking for
# it unconditionally makes TrainingArguments reject such GPUs.
use_bf16 = (not torch.cuda.is_available()) or torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    eval_strategy="epoch",
    bf16=use_bf16,
    gradient_accumulation_steps=256,
    learning_rate=2e-5,
    output_dir=output_dir,
    push_to_hub=False,
    remove_unused_columns=True,
    report_to="none",
    weight_decay=0.01,
    auto_find_batch_size=True,
)

# The tokenized datasets carry no "labels" column, so without a collator that
# creates them the model returns no loss and the Trainer cannot train or report
# `eval_loss`. With mlm=False the collator copies `input_ids` into `labels`
# and masks padding positions out of the loss.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
)

trainer.train()

PyTorch: setting up devices
Using auto half precision backend
The following columns in the training set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: text. If text are not expected by `PeftModelForCausalLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 51,305
  Num Epochs = 3
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 256
  Gradient Accumulation steps = 256
  Total optimization steps = 600
  Number of trainable parameters = 6,291,456


  0%|          | 0/600 [00:00<?, ?it/s]

You are not running the flash-attention implementation, expect numerical differences.


In [None]:
import math

# Evaluate on the held-out set and report perplexity, i.e. exp(eval loss).
eval_results = trainer.evaluate()
perplexity = math.exp(eval_results["eval_loss"])
console.print(f"Perplexity: {perplexity:.2f}")