In [None]:
# Mount Google Drive into the Colab runtime; later cells read and write
# files under /content/drive/MyDrive/pokemon/.
from google.colab import drive

mount_point = '/content/drive'
drive.mount(mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import numpy as np
import tensorflow as tf
from gensim.models import Word2Vec
import pandas as pd
import nltk
# Fetch the NLTK "punkt" tokenizer package into the runtime.
nltk.download('punkt')

# Read the Pokemon descriptions CSV from the mounted Drive into a DataFrame.
# Later cells use its 'name' and 'desc' columns.
pokemon_csv_path = '/content/drive/MyDrive/pokemon/poki_data.csv'
pokemon_descriptions = pd.read_csv(pokemon_csv_path)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# !pip uninstall tokenizers
# !pip install tokenizers==0.4.2
from tokenizers import ByteLevelBPETokenizer

# Train a byte-level BPE tokenizer on the Pokemon names and persist it to Drive.
pokemon_names = pokemon_descriptions['name']
name_tokenizer_dir = "/content/drive/MyDrive/pokemon/"

tokenizer = ByteLevelBPETokenizer()

# NOTE(review): byte-level BPE starts from a 256-symbol byte alphabet, so
# vocab_size=100 may leave no room for learned merges -- confirm this is intended.
tokenizer.train_from_iterator(
    pokemon_names,
    vocab_size=100,
    min_frequency=2,
)

# Kept as the cell's last expression so the list of written files
# (vocab.json, merges.txt) is displayed as the cell output.
tokenizer.save_model(name_tokenizer_dir)


['/content/drive/MyDrive/pokemon/vocab.json',
 '/content/drive/MyDrive/pokemon/merges.txt']

In [None]:
from tokenizers import Encoding

# Reload the byte-level BPE tokenizer from the vocab/merges files saved on Drive.
tokenizer = ByteLevelBPETokenizer(
    "/content/drive/MyDrive/pokemon/vocab.json",
    "/content/drive/MyDrive/pokemon/merges.txt",
)


def _encode_name(value):
    """Encode one raw Pokemon name with the byte-level BPE tokenizer.

    The 'name' column is overwritten in place below, so after the first run it
    holds `Encoding` objects instead of strings. Values that are already
    encoded are returned unchanged, which makes this cell safe to re-run.
    Any other non-string value still reaches `tokenizer.encode` and fails
    loudly, exactly as before.
    """
    if isinstance(value, Encoding):
        return value
    return tokenizer.encode(value)


# Tokenize Pokémon names (each cell ends up holding an `Encoding`; token ids
# are available through its `.ids` attribute).
pokemon_descriptions['name'] = pokemon_descriptions['name'].apply(_encode_name)


In [None]:
from transformers import AutoTokenizer

# Pretrained tokenizer used for the free-text descriptions.
description_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')


def _encode_description(value):
    """Encode one raw description string into a list of token ids.

    The 'desc' column is overwritten in place below, so after the first run it
    holds lists of ids instead of strings. Lists are returned unchanged so that
    re-running this cell neither fails nor encodes the ids a second time.
    Any other value is passed to `description_tokenizer.encode` as before.
    """
    if isinstance(value, list):
        return value
    return description_tokenizer.encode(value)


# Tokenize Pokémon descriptions
pokemon_descriptions['desc'] = pokemon_descriptions['desc'].apply(_encode_description)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
import torch
from torch.utils.data import Dataset

class PokemonDataset(Dataset):
    """Map-style dataset pairing per-field encodings with labels.

    Args:
        encodings: mapping of field name -> indexable sequence; element `idx`
            of every sequence must be convertible with `torch.tensor`.
        labels: indexable sequence of labels; its length defines the dataset
            size.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The dataset size is driven by the labels alone.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict of tensors for sample `idx`, plus a "labels" tensor."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Convert the tokenized 'name' and 'desc' columns to plain Python lists
# inputs = pokemon_descriptions['name'].tolist()
# targets = pokemon_descriptions['desc'].tolist()

# # Encode the inputs


# Create the dataset
# dataset = PokemonDataset(inputs, targets)


In [None]:
import pickle

# Save the dataset to a file using pickle
file_path = '/content/drive/MyDrive/pokemon/pokemon_dataset.pkl'

# Serialize BEFORE opening the target file. Opening with 'wb' truncates the
# file immediately, so if `dataset` is undefined in this session or cannot be
# pickled, the previously saved copy on Drive would otherwise be wiped.
payload = pickle.dumps(dataset)

with open(file_path, 'wb') as file:
    file.write(payload)

print(f"Dataset saved to {file_path}")


Dataset saved to /content/drive/MyDrive/pokemon/pokemon_dataset.pkl


In [None]:
import pickle

file_path = '/content/drive/MyDrive/pokemon/pokemon_dataset.pkl'

# Read the pickled bytes from Drive and rebuild the object.
# Unpickling can execute arbitrary code, so only load files you created yourself.
with open(file_path, 'rb') as pkl_file:
    dataset = pickle.loads(pkl_file.read())

# The restored object is now available as `dataset`.


In [None]:
!pip -qqq install bitsandbytes accelerate
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Hugging Face model name
model_name = "meta-llama/Llama-2-7b-chat-hf"
use_flash_attention = False

# BitsAndBytesConfig int-4 config
bnb_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_use_double_quant=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load model
model = AutoModelForCausalLM.from_pretrained(
   model_name,
   token="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",
   quantization_config=bnb_config,
   use_cache=False,
   use_flash_attention_2=use_flash_attention,
   device_map="auto",
   torch_dtype=torch.float16
)

model.config.pretraining_tp = 1

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name,   token="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")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [None]:
from transformers import TrainingArguments

# Hyperparameters for the fine-tuning run, collected in one place so they are
# easy to scan and tweak before the TrainingArguments object is built.
# NOTE(review): with lr_scheduler_type="constant" the warmup_ratio may have no
# effect -- confirm whether "constant_with_warmup" was intended.
training_options = dict(
    output_dir="finetuned-llama-7b-chat-hf-med",
    # schedule
    num_train_epochs=3,
    learning_rate=2e-4,
    lr_scheduler_type="constant",
    warmup_ratio=0.03,
    # batching / memory
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
    fp16=True,
    # optimizer
    optim="paged_adamw_32bit",
    max_grad_norm=0.3,
    # logging / checkpoints
    logging_steps=10,
    save_strategy="epoch",
    disable_tqdm=False,
)

args = TrainingArguments(**training_options)


In [None]:
# !pip install trl
# !pip install peft
from trl import SFTTrainer
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model

# LoRA config based on QLoRA paper
peft_config = LoraConfig(
    lora_alpha=32,
    lora_dropout=0.1,
    r=16,
    bias="none",
    task_type="CAUSAL_LM",
)

# Attach the LoRA adapters exactly once. `model` is rebound to the wrapped
# model, so without this guard every re-run of the cell would prepare and wrap
# the already-wrapped model again.
if not isinstance(model, PeftModel):
    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, peft_config)

max_seq_length = 1024  # max sequence length for model and packing of the dataset

# `model` already carries the adapters, so `peft_config` is deliberately NOT
# passed to SFTTrainer as well: the trainer expects either a base model plus a
# config, or an already-wrapped model, not both.
# NOTE(review): the recorded run of this cell ended in a ValueError whose
# message was truncated. `dataset` here is whatever was unpickled earlier;
# verify that it has the format SFTTrainer expects when packing=True (for
# example a text column or a formatting function) before training.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    packing=True,
    args=args,
)


ValueError: ignored

In [None]:
# Run the fine-tuning loop; the returned training summary is kept for inspection.
train_output = trainer.train()

# Persist the trained model using the trainer's configured output location.
trainer.save_model()
