## Setup

In [1]:
!pip install -q -U bitsandbytes
!pip install -q -U transformers
!pip install -q -U peft
!pip install -q -U accelerate
!pip install -q -U datasets
# !pip install -q -U pandas # you don't need to install either of these last two libs if you're using Colab
# !pip install -q -U torch

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.2/102.2 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m31.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m731.7/731.7 MB[0m [31m637.5 kB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.6/410.6 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.6/121.6 MB[0m [31m825.2 kB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.5/56.5 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━

In [2]:
import random
import torch
import pandas as pd
from datasets import Dataset
import peft
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
def set_seed(seed=42):
    random.seed(seed)
    torch.manual_seed(seed)

set_seed()

In [3]:
mistral7b = 'mistralai/Mistral-7B-v0.1'
# STEP 1. Check and make sure you're using the right model and notebook here.
model_name = mistral7b

## EDA

In [4]:
df = pd.read_csv("frankenstein_chunks.csv")
df.head()

Unnamed: 0,text
0,﻿The Project Gutenberg eBook of Frankenstein; ...
1,Further corrections by Menno de Leeuw.\n\n\n**...
2,"I am already far north of London, and as I wal..."
3,Its productions and features may be without ex...
4,But supposing all these conjectures to be fals...


In [5]:
print("Dataframe Info:")
print(df.info())
print("\n")
print("Dataframe Description:")
print(df.describe())
print("\n")
print("Number of unique values in each column:")
print(df.nunique())
random_index= random.randint(0, len(df) - 1)
df.loc[random_index, 'text']

Dataframe Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 481 entries, 0 to 480
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    481 non-null    object
dtypes: object(1)
memory usage: 3.9+ KB
None


Dataframe Description:
                                                     text
count                                                 481
unique                                                481
top     ﻿The Project Gutenberg eBook of Frankenstein; ...
freq                                                    1


Number of unique values in each column:
text    481
dtype: int64


'The thatch had fallen in, the walls were unplastered, and the\ndoor was off its hinges. I ordered it to be repaired, bought some\nfurniture, and took possession, an incident which would doubtless have\noccasioned some surprise had not all the senses of the cottagers been\nbenumbed by want and squalid poverty. As it was, I lived ungazed at\nand unmolested, hardly thanked for the pittance of food and clothes\nwhich I gave, so much does suffering blunt even the coarsest sensations\nof men.\n\nIn this retreat I devoted the morning to labour; but in the evening,\nwhen the weather permitted, I walked on the stony beach of the sea to\nlisten to the waves as they roared and dashed at my feet. It was a\nmonotonous yet ever-changing scene. I thought of Switzerland; it was\nfar different from this desolate and appalling landscape. '

In [6]:
df.isnull().sum()

text    0
dtype: int64

In [7]:
# Now we'll quickly convert this to a train/test split
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(df, test_size=0.2)

# STEP 2. Convert the train_df and test_df from Pandas into Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)


## Model Import and Tokenization

In [10]:
quant_config = BitsAndBytesConfig(
  # STEP 3. Pass the appropriate parameters here to 4-bit quantize the model, then instantiate the model and check what it's running on.
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
model = AutoModelForCausalLM.from_pretrained(model_name,quantization_config=quant_config)
print("\n\nModel is running on:" + "\n")
model.device

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]



Model is running on:



device(type='cuda', index=0)

In [14]:
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

# STEP 4. Prepare the model for QLoRA. Configure LoRA for our finetuning run. Then tokenize the data.
model = prepare_model_for_kbit_training(model)
config = LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.5,
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenized_train_dataset= train_dataset.map(lambda examples: tokenizer(examples["text"], padding="longest", truncation=True), batched=True)
tokenized_test_dataset = test_dataset.map(lambda examples: tokenizer(examples["text"], padding="longest", truncation=True), batched=True)

Map:   0%|          | 0/384 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/97 [00:00<?, ? examples/s]

## Base Model Evaluation

In [15]:
def generate_text(prompt):
  device = "cuda"
  inputs = tokenizer(prompt, return_tensors="pt").to(device)
  outputs = model.generate(**inputs, max_new_tokens=100)
  output = tokenizer.decode(outputs[0], skip_special_tokens=True)
  return output

In [16]:
# STEP 5. Generate a completion with the base model for informal evaluation.
base_generation = generate_text("I'm afraid that I've created a")
base_generation

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


'I\'m afraid that I\'ve created a monster.\n\nI\'ve been working on a new project for the past few months, and I\'ve been using a new tool to help me with it. I\'ve been using a tool called "Git" to help me manage my code.\n\nGit is a version control system. It\'s a tool that helps you keep track of changes to your code. It\'s a tool that helps you keep track of changes to your code.\n\nI\'ve'

In [17]:
def calc_perplexity(model):
  total_perplexity = 0
  for row in test_dataset:
    inputs = tokenizer(row['text'], return_tensors="pt")
    input_ids = inputs["input_ids"]
    # Calculate the loss without updating the model
    with torch.no_grad():
        outputs = model(**inputs, labels=input_ids)
    loss = outputs.loss
    # STEP 6. Complete the equation for perplexity.
    perplexity = torch.exp(loss)
    total_perplexity += perplexity

  num_test_rows = len(test_dataset)
  avg_perplexity = total_perplexity / num_test_rows
  return avg_perplexity

base_ppl = calc_perplexity(model)
base_ppl

tensor(8.9044)

## Training

Make sure you can leave your browser open for a while. This took about 15 minutes on a Colab T4 GPU.

In [18]:
import transformers

tokenizer.pad_token = tokenizer.eos_token
model.config.use_cache = False

trainer = transformers.Trainer(
    model=model,
    train_dataset=tokenized_train_dataset,
    args=transformers.TrainingArguments(
        warmup_steps=2,
        fp16=True,
        logging_steps=1,
        save_steps=200,
        output_dir="outputs",
      # STEP 7. Configure the training arguments.
        per_device_train_batch_size=2,
        num_train_epochs=2,
        learning_rate=2e-5,
        optim="paged_adamw_8bit"
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
# STEP 8. Finetune the model.
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
1,2.4396
2,2.5293
3,1.8036
4,2.4867
5,2.1968
6,2.1149
7,1.6191
8,2.0092
9,1.9388
10,2.3472




Step,Training Loss
1,2.4396
2,2.5293
3,1.8036
4,2.4867
5,2.1968
6,2.1149
7,1.6191
8,2.0092
9,1.9388
10,2.3472


TrainOutput(global_step=384, training_loss=1.831227629367883, metrics={'train_runtime': 1316.338, 'train_samples_per_second': 0.583, 'train_steps_per_second': 0.292, 'total_flos': 1.2179460914675712e+16, 'train_loss': 1.831227629367883, 'epoch': 2.0})

## Evaluating the finetuned model

In [19]:
# STEP 9. Generate a completion with the finetuned model and compare it to the base generation.
ft_generation = generate_text("I'm afraid that I've created a")

print("Base model generation: " + base_generation + "\n\n")
print("Finetuned generation: " + ft_generation)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Base model generation: I'm afraid that I've created a monster.

I've been working on a new project for the past few months, and I've been using a new tool to help me with it. I've been using a tool called "Git" to help me manage my code.

Git is a version control system. It's a tool that helps you keep track of changes to your code. It's a tool that helps you keep track of changes to your code.

I've


Finetuned generation: I'm afraid that I've created a monster who will be the means of my own destruction.

I'm afraid, as a father, it is my duty to be careful that I do not destroy my own

offspring.

I'm afraid that I have been the means of destroying my friend.

I'm afraid that I have been the means of destroying my friend.

I'm afraid that I have been the means of destroying my friend.

I'm afraid that I


A little more like the original text, right? Try experimenting with the hyperparameters to see if you can improve performance.

In [20]:
# STEP 10. Calculate the finetuned model's perplexity and compare it to the base model's.
ft_ppl = calc_perplexity(model)
print("Base model perplexity: " + str(base_ppl))
print("Finetuned model perplexity: " + str(ft_ppl))



Base model perplexity: tensor(8.9044)
Finetuned model perplexity: tensor(6.8156)
