In [1]:
%pip install accelerate peft bitsandbytes transformers trl jsonlines pandas numpy python-dotenv numba



In [2]:
import os
import torch
import pandas as pd
import numpy as np
from accelerate import Accelerator
from datasets import load_dataset, Dataset, DatasetDict, load_from_disk
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
    Trainer
)
from peft import LoraConfig, PeftModel, PeftConfig, get_peft_model, prepare_model_for_kbit_training
import time
from dotenv import load_dotenv
import random
import torch
# Seed every RNG library imported above from a single constant. Seeding only
# Python's `random` does not touch torch's generator, which is what
# `model.generate` draws from whenever sampling is enabled.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Set before any tokenizer is created.
# NOTE(review): presumably here to silence the `tokenizers` fork/parallelism warning — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Identifier of the fine-tuned LoRA adapter that the loading cell passes to
# PeftConfig.from_pretrained / PeftModel.from_pretrained.
lora_model = "Llama-2-7b-chat-hf-text-to-sql"

# Load environment variables from a .env file, if one is found. Kept as the
# cell's last expression so its boolean result is displayed.
load_dotenv()

False

In [3]:
from google.colab import userdata
# Hugging Face access token, read from the Colab secret named 'token'.
# It is passed as `token=` to the model/tokenizer loaders and to the Hub
# client in later cells, so it never has to be hard-coded in the notebook.
my_token = userdata.get('token')

In [4]:
# Load the base model and attach the fine-tuned LoRA adapter to it.
#
# The base weights are loaded WITHOUT bitsandbytes quantization: a later cell
# calls `merge_and_unload()` and saves the result for llama.cpp conversion.
# NOTE(review): merging into / exporting quantized bnb weights is assumed to be
# unsuitable for that flow — confirm before wiring `quant_config` in.
compute_dtype = torch.float16

# Defined for reference only; it is not passed to `from_pretrained` in this cell.
# BitsAndBytesConfig has no `bnb_8bit_*` options (the 4-bit settings are spelled
# `bnb_4bit_*`), so `load_in_8bit` is the only setting that applies to 8-bit loading.
quant_config = BitsAndBytesConfig(load_in_8bit=True)

# The adapter config records which base checkpoint the adapter belongs to.
lora_config = PeftConfig.from_pretrained(lora_model)
base_model = AutoModelForCausalLM.from_pretrained(
    lora_config.base_model_name_or_path,
    token=my_token,
)
tokenizer = AutoTokenizer.from_pretrained(
    lora_config.base_model_name_or_path,
    token=my_token,
)

# PeftModel.from_pretrained wraps the base model AND loads the trained adapter
# weights in one step. Do not call get_peft_model() first: that creates a new,
# untrained adapter, and loading on top of it wraps an already-wrapped model.
model = PeftModel.from_pretrained(base_model, lora_model)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [5]:
# Smoke-test the loaded model on one text-to-SQL prompt (schema + question).
prompt = """Below are sql tables schemas paired with instruction that describes a task. Using valid SQLite, write a response that appropriately completes the request for the provided tables. ### Instruction: What\'s the average crowd size when the Home team is melbourne? ### Input: CREATE TABLE table_78356 (\n    "Home team" text,\n    "Home team score" text,\n    "Away team" text,\n    "Away team score" text,\n    "Venue" text,\n    "Crowd" real,\n    "Date" text\n) ### Response:"""

# Tokenize to PyTorch tensors, then move them to the device the model lives on.
inputs = tokenizer(prompt, return_tensors="pt")
inputs = inputs.to(model.device)
print(inputs["input_ids"].shape)

# Generation settings collected in one place so they are easy to tweak.
generation_kwargs = {"max_new_tokens": 200, "top_k": 5}
output_sequences = model.generate(**inputs, **generation_kwargs)

# Decode the full sequences (prompt + completion) back to text.
decoded = tokenizer.batch_decode(output_sequences, skip_special_tokens=True)
print(decoded)

torch.Size([1, 132])
['Below are sql tables schemas paired with instruction that describes a task. Using valid SQLite, write a response that appropriately completes the request for the provided tables. ### Instruction: What\'s the average crowd size when the Home team is melbourne? ### Input: CREATE TABLE table_78356 (\n    "Home team" text,\n    "Home team score" text,\n    "Away team" text,\n    "Away team score" text,\n    "Venue" text,\n    "Crowd" real,\n    "Date" text\n) ### Response: The average crowd size for when the Home team is Melbourne is 50000.']


In [9]:
# Fold the LoRA weights into the base model and save the merged checkpoint to
# the directory that the GGUF conversion cell reads from.
#
# The merge is guarded so this cell can be re-run: merge_and_unload() hands back
# the underlying transformers model, which no longer has a `merge_and_unload`
# method, so an unguarded second run would raise AttributeError.
if isinstance(model, PeftModel):
    model = model.merge_and_unload()
model.save_pretrained("Llama-2-7b-chat-hf-text-to-sql-lora")

In [12]:
# Write the tokenizer files into the same directory as the merged weights.
# The call stays the last expression so the list of written files is displayed.
merged_model_dir = "Llama-2-7b-chat-hf-text-to-sql-lora"
tokenizer.save_pretrained(merged_model_dir)

('Llama-2-7b-chat-hf-text-to-sql-lora/tokenizer_config.json',
 'Llama-2-7b-chat-hf-text-to-sql-lora/special_tokens_map.json',
 'Llama-2-7b-chat-hf-text-to-sql-lora/tokenizer.model',
 'Llama-2-7b-chat-hf-text-to-sql-lora/added_tokens.json',
 'Llama-2-7b-chat-hf-text-to-sql-lora/tokenizer.json')

In [10]:
import locale


def _utf8_preferred_encoding(do_setlocale=True):
    """Report UTF-8 as the preferred encoding, ignoring the real locale.

    Mirrors the signature of the standard `locale.getpreferredencoding`, so
    callers that pass the optional `do_setlocale` argument (positionally or by
    keyword) keep working instead of raising TypeError.
    """
    return "UTF-8"


# NOTE(review): presumably a workaround for Colab shell commands (`!...`)
# failing when the runtime does not report a UTF-8 locale — confirm it is still needed.
locale.getpreferredencoding = _utf8_preferred_encoding

In [13]:
!git clone https://github.com/ggerganov/llama.cpp.git
!pip install -r llama.cpp/requirements.txt
!python llama.cpp/convert.py Llama-2-7b-chat-hf-text-to-sql-lora --outfile /content/Llama-2-7b-chat-hf-text-to-sql-lora.gguf --outtype q8_0

fatal: destination path 'llama.cpp' already exists and is not an empty directory.
Loading model file Llama-2-7b-chat-hf-text-to-sql-lora/model-00001-of-00006.safetensors
Loading model file Llama-2-7b-chat-hf-text-to-sql-lora/model-00001-of-00006.safetensors
Loading model file Llama-2-7b-chat-hf-text-to-sql-lora/model-00002-of-00006.safetensors
Loading model file Llama-2-7b-chat-hf-text-to-sql-lora/model-00003-of-00006.safetensors
Loading model file Llama-2-7b-chat-hf-text-to-sql-lora/model-00004-of-00006.safetensors
Loading model file Llama-2-7b-chat-hf-text-to-sql-lora/model-00005-of-00006.safetensors
Loading model file Llama-2-7b-chat-hf-text-to-sql-lora/model-00006-of-00006.safetensors
params = Params(n_vocab=32000, n_embd=4096, n_layer=32, n_ctx=4096, n_ff=11008, n_head=32, n_head_kv=32, n_experts=None, n_experts_used=None, f_norm_eps=1e-05, rope_scaling_type=None, f_rope_freq_base=10000.0, f_rope_scale=None, n_orig_ctx=None, rope_finetuned=None, ftype=<GGMLFileType.MostlyQ8_0: 7>,

In [20]:
!pip install -q huggingface_hub
from huggingface_hub import create_repo, HfApi
from google.colab import userdata

# Hub coordinates, derived once so the repo id and file name cannot drift apart.
username = "lyum"
lora_model = "Llama-2-7b-chat-hf-text-to-sql-lora"
repo_id = f"{username}/{lora_model}-GGUF"
gguf_filename = f"{lora_model}.gguf"

api = HfApi(token=my_token)

# Create the (empty) model repo; exist_ok=True means an existing repo is accepted.
create_repo(repo_id=repo_id, repo_type="model", exist_ok=True, token=my_token)

# Upload the converted GGUF file under the same name it has locally. Kept as the
# last expression so the resulting commit info is displayed.
api.upload_file(
    path_or_fileobj=gguf_filename,
    path_in_repo=gguf_filename,
    repo_id=repo_id,
    token=my_token,
)

Llama-2-7b-chat-hf-text-to-sql-lora.gguf:   0%|          | 0.00/7.16G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/lyum/Llama-2-7b-chat-hf-text-to-sql-lora-GGUF/commit/f2c50f3d369d66de4472568d6c7eeeef69a75346', commit_message='Upload Llama-2-7b-chat-hf-text-to-sql-lora.gguf with huggingface_hub', commit_description='', oid='f2c50f3d369d66de4472568d6c7eeeef69a75346', pr_url=None, pr_revision=None, pr_num=None)