## Setup

In [1]:
!pip install -U pip setuptools wheel

!pip install -q bitsandbytes
!pip install -q transformers
!pip install -q peft
!pip install -q accelerate
!pip install -q datasets



In [2]:
import random
import torch
import pandas as pd
from datasets import Dataset
import peft
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
def set_seed(seed=42):
    random.seed(seed)
    torch.manual_seed(seed)

set_seed()

In [3]:
mistral7b = 'mistralai/Mistral-7B-v0.1'
# STEP 1. Check and make sure you're using the right model and notebook here.
model_name = mistral7b

## EDA

In [None]:
df = pd.read_csv("frankenstein_chunks.csv")
df.head()

Unnamed: 0,text
0,﻿The Project Gutenberg eBook of Frankenstein; ...
1,Further corrections by Menno de Leeuw.\n\n\n**...
2,"I am already far north of London, and as I wal..."
3,Its productions and features may be without ex...
4,But supposing all these conjectures to be fals...


In [5]:
print("Dataframe Info:")
print(df.info())
print("\n")
print("Dataframe Description:")
print(df.describe())
print("\n")
print("Number of unique values in each column:")
print(df.nunique())
random_index= random.randint(0, len(df) - 1)
df.loc[random_index, 'text']

Dataframe Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 481 entries, 0 to 480
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    481 non-null    object
dtypes: object(1)
memory usage: 3.9+ KB
None


Dataframe Description:
                                                     text
count                                                 481
unique                                                481
top     International donations are gratefully accepte...
freq                                                    1


Number of unique values in each column:
text    481
dtype: int64


'The thatch had fallen in, the walls were unplastered, and the\ndoor was off its hinges. I ordered it to be repaired, bought some\nfurniture, and took possession, an incident which would doubtless have\noccasioned some surprise had not all the senses of the cottagers been\nbenumbed by want and squalid poverty. As it was, I lived ungazed at\nand unmolested, hardly thanked for the pittance of food and clothes\nwhich I gave, so much does suffering blunt even the coarsest sensations\nof men.\n\nIn this retreat I devoted the morning to labour; but in the evening,\nwhen the weather permitted, I walked on the stony beach of the sea to\nlisten to the waves as they roared and dashed at my feet. It was a\nmonotonous yet ever-changing scene. I thought of Switzerland; it was\nfar different from this desolate and appalling landscape. '

In [6]:
df.isnull().sum()

Unnamed: 0,0
text,0


In [7]:
# Now we'll quickly convert this to a train/test split
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(df, test_size=0.2)

# STEP 2. Convert the train_df and test_df from Pandas into Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

## Model Import and Tokenization

In [9]:
quant_config = BitsAndBytesConfig(
  # STEP 3. Pass the appropriate parameters here to 4-bit quantize the model, then instantiate the model and check what it's running on.
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
model = AutoModelForCausalLM.from_pretrained(model_name,quantization_config=quant_config)
print("\n\nModel is running on:" + "\n")
model.device

Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/291 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]



Model is running on:



device(type='cuda', index=0)

In [10]:
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

# STEP 4. Prepare the model for QLoRA. Configure LoRA for our finetuning run. Then tokenize the data.
model = prepare_model_for_kbit_training(model)
config = LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.05,
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenized_train_dataset= train_dataset.map(lambda samples: tokenizer(samples["text"]), batched=True)
tokenized_test_dataset = test_dataset.map(lambda samples: tokenizer(samples["text"]), batched=True)

tokenizer_config.json:   0%|          | 0.00/996 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Map:   0%|          | 0/384 [00:00<?, ? examples/s]

Map:   0%|          | 0/97 [00:00<?, ? examples/s]

## Base Model Evaluation

In [12]:
def generate_text(prompt):
  device = "cuda"
  inputs = tokenizer(prompt, return_tensors="pt").to(device)
  outputs = model.generate(**inputs, max_new_tokens=100)
  output = tokenizer.decode(outputs[0], skip_special_tokens=True)
  return output

In [13]:
# STEP 5. Generate a completion with the base model for informal evaluation.
base_generation = generate_text("I'm afraid I've created a ")
base_generation

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


"I'm afraid I've created a 2000-level problem with a 100-level solution.\n\nI'm a 2000-level problem.\n\nI'm a 2000-level problem.\n\nI'm a 2000-level problem.\n\nI'm a 2000-level problem.\n\nI'm a 2000-level problem.\n\nI'm a 2"

In [16]:
def calc_perplexity(model):
  total_perplexity = 0
  device = "cuda"
  model.eval()
  for row in test_dataset:
    inputs = tokenizer(row['text'], return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}
    input_ids = inputs["input_ids"].to(device)
    # Calculate the loss without updating the model
    with torch.no_grad():
        outputs = model(**inputs, labels=input_ids)
    loss = outputs.loss
    # STEP 6. Complete the equation for perplexity.
    perplexity = torch.exp(loss)
    total_perplexity += perplexity

  num_test_rows = len(test_dataset)
  avg_perplexity = total_perplexity / num_test_rows
  return avg_perplexity

base_ppl = calc_perplexity(model)
base_ppl

tensor(8.7415, device='cuda:0')

## Training

Make sure you can leave your browser open for a while. This took about 15 minutes on a Colab T4 GPU.

In [17]:
import transformers

tokenizer.pad_token = tokenizer.eos_token
model.config.use_cache = False

trainer = transformers.Trainer(
    model=model,
    train_dataset=tokenized_train_dataset,
    args=transformers.TrainingArguments(
        warmup_steps=2,
        fp16=True,
        logging_steps=1,
        save_steps=200,
        output_dir="outputs",
      # STEP 7. Configure the training arguments.
        per_device_train_batch_size=2,
        num_train_epochs=2,
        learning_rate=2e-5,
        optim="paged_adamw_8bit"
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
# STEP 8. Finetune the model.
trainer.train()

  return fn(*args, **kwargs)


Step,Training Loss
1,2.332661
2,2.359794
3,1.520278
4,2.018794
5,2.204825
6,1.711687
7,1.857488
8,2.04783
9,1.630368
10,2.027582


  return fn(*args, **kwargs)


TrainOutput(global_step=384, training_loss=1.811546140505622, metrics={'train_runtime': 1087.3561, 'train_samples_per_second': 0.706, 'train_steps_per_second': 0.353, 'total_flos': 8212484377706496.0, 'train_loss': 1.811546140505622, 'epoch': 2.0})

## Evaluating the finetuned model

In [19]:
# STEP 9. Generate a completion with the finetuned model and compare it to the base generation.
ft_generation = generate_text("I'm afraid I've created a ")

print("Base model generation: " + base_generation + "\n\n")
print("Finetuned generation: " + ft_generation)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Base model generation: I'm afraid I've created a 2000-level problem with a 100-level solution.

I'm a 2000-level problem.

I'm a 2000-level problem.

I'm a 2000-level problem.

I'm a 2000-level problem.

I'm a 2000-level problem.

I'm a 2


Finetuned generation: I'm afraid I've created a  monster, #
 bekanen, #
 bekanen, #
 bekanen, #
 bekanen, #
 bekanen, #
 bekanen, #
 bekanen, #
 bekanen, #
 bekanen, #
 bekanen, #
 bekanen, #
 bekanen, #
 bekanen, #
 bekanen, #
 bekanen, #
 bekanen, #
 bekanen, #
 bekanen, #
 bekanen, #
 bekan


A little more like the original text, right? Try experimenting with the hyperparameters to see if you can improve performance.

In [20]:
# STEP 10. Calculate the finetuned model's perplexity and compare it to the base model's.
ft_ppl = calc_perplexity(model)
print("Base model perplexity: " + str(base_ppl))
print("Finetuned model perplexity: " + str(ft_ppl))

Base model perplexity: tensor(8.7415, device='cuda:0')
Finetuned model perplexity: tensor(6.5309, device='cuda:0')
