- https://colab.research.google.com/drive/1Ys44kVvmeZtnICzWz0xgpRnrIOjZAuxp?usp=sharing#scrollTo=LjY75GoYUCB8 — Unsloth Google Colab tutorial for Llama models

- https://medium.com/@scholarly360/llama3-complete-guide-on-colab-4d933465a841

In [1]:
# Install necessary libraries into the notebook runtime.
# NOTE(review): no versions are pinned here, so a fresh runtime may resolve
# different releases than the ones shown in the saved install log.
!pip install unsloth transformers torch

# Import libraries
import torch
import json
from transformers import AutoTokenizer
from unsloth import FastLanguageModel
from google.colab import files

# Mount Google Drive at /content/drive; the dataset path used in a later cell
# lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

# Configuration for loading the LLaMA model and tokenizer
max_seq_length = 2048
dtype = None  # NOTE(review): None presumably lets Unsloth choose the dtype — confirm
load_in_4bit = True

# Load the model and tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B-bnb-4bit",  # Use the LLaMA model (quantized to 4-bit)
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit
)

# Attach LoRA adapters.
# NOTE(review): no fine-tuning happens in the visible cells, so these adapters
# are freshly initialised; presumably kept for a later training step — confirm
# whether this call is needed for inference-only runs.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

# Enable faster inference. This is done last, on the final (adapter-wrapped)
# model object that the generation cell calls, because get_peft_model above
# is asked to set the model up with gradient checkpointing; switching to
# inference mode before that call would leave the wrapped model outside of
# inference mode.
FastLanguageModel.for_inference(model)

Collecting unsloth
  Downloading unsloth-2024.10.2-py3-none-any.whl.metadata (56 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.5/56.5 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Collecting unsloth-zoo (from unsloth)
  Downloading unsloth_zoo-2024.10.4-py3-none-any.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.28.post1-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting bitsandbytes (from unsloth)
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting triton>=3.0.0 (from unsloth)
  Downloading triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.3 kB)
Collecting tyro (from unsloth)
  Downloading tyro-0.8.13-py3-none-any.whl.metadata (8.4 kB)
Collecting datasets>=2.16.0 (from unsloth)
  Downloading d

model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

Unsloth: We fixed a gradient accumulation bug, but it seems like you don't have the latest transformers version!
Please update transformers via:
`pip uninstall transformers -y && pip install --upgrade --no-cache-dir "git+https://github.com/huggingface/transformers.git"`
Unsloth 2024.10.2 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [2]:
# Read the conversations to process from the mounted Drive.
# Alternative dataset path kept for reference:
# json_path = "/content/drive/MyDrive/ASAPPDatasets/80_10_10.json"
json_path = "/content/drive/MyDrive/abcd_sample.json"
with open(json_path) as json_file:
    raw_text = json_file.read()
data = json.loads(raw_text)

# train = data['train']

In [13]:

answers = []

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
# Processing all conversations #
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #

def extract_json_object(text):
    """Return the first flat ``{...}`` span found in ``text``.

    The closing brace is only searched for after the opening brace, so a stray
    ``}`` earlier in the text cannot produce a bogus slice.

    Args:
        text: Arbitrary text, e.g. the decoded model completion.

    Returns:
        The substring from the first ``{`` through the next ``}`` (inclusive),
        or ``""`` when no such pair exists. Braces are not balanced: the span
        ends at the first closing brace, which is sufficient for a flat object.
    """
    json_start = text.find('{')
    if json_start == -1:
        return ""
    json_end = text.find('}', json_start)
    if json_end == -1:
        return ""
    return text[json_start:json_end + 1]


def processData(original):
    """Ask the model to extract contact details from one conversation.

    Builds the prompt around ``original``, generates a completion, pulls the
    first ``{...}`` span out of the completion, prints it and appends it to the
    module-level ``answers`` list.

    Args:
        original: The conversation text to extract from.

    Returns:
        The extracted ``{...}`` string, or ``""`` when the completion contains
        no complete brace pair. The same value is appended to ``answers``.
    """
    # Prompt format
    # NOTE(review): the template lines below carry the function's 4-space
    # indentation into the prompt text; left unchanged here so the prompt the
    # model sees stays exactly the same.
    alpaca_prompt = """Extract only the customer's name, email, and phone number from the input. Do not include input in the response. Do not include extra information. Answer in JSON format.

    ### Instruction:
    {}

    ### Input:
    {}

    ### Response:
    {}"""

    # Tokenize the input
    inputs = tokenizer(
        [
            alpaca_prompt.format(
                "Extract the customer's name, email, and phone number from the input.",  # instruction
                original,  # input
                "",  # output - leave this blank for generation!
            )
        ], return_tensors="pt"
    ).to("cuda")  # move to GPU

    # Generate the response
    # NOTE(review): whether temperature has any effect depends on sampling
    # being enabled in the model's generation config — confirm.
    outputs = model.generate(**inputs, max_new_tokens=64, temperature=0.5, use_cache=True)

    # Decode only the newly generated tokens. The returned sequence starts with
    # the prompt tokens, so slicing by prompt length isolates the completion
    # without searching the text for the '### Response:' marker (which could
    # miss, or match inside the conversation itself).
    prompt_length = inputs["input_ids"].shape[1]
    completion = tokenizer.batch_decode(
        outputs[:, prompt_length:], skip_special_tokens=True
    )[0]

    extracted = extract_json_object(completion)

    print(extracted)

    answers.append(extracted)
    return extracted

# Run the extraction over every conversation; each call appends its result
# to the module-level `answers` list.
for convo in data:
    processData(convo['original'])

# Write the predictions as a single valid JSON array. Each answer is parsed
# when possible; model output that is not valid JSON (including the empty
# string) is kept as a plain string so no prediction is dropped and the
# positions still line up with the input conversations.
predictions = []
for answer in answers:
    try:
        predictions.append(json.loads(answer))
    except json.JSONDecodeError:
        predictions.append(answer)

with open('predicted_ans.json', 'w') as f:
    json.dump(predictions, f, indent=2)

# Trigger a browser download of the predictions file.
files.download('predicted_ans.json')


{
      "name": "Crystal Minh",
      "email": "cminh730@email.com",
      "phone": "(977) 625-2661"
    }
{
    		"Name": "Alessandro Phoenix",
    		"Email": "aphoenix939@email.com",
    		"Phone": "7916676427"
    	}
{
    		"name": "HEY HO!",
    		"email": "good afternoon, how can I help you?",
    		"phone": "I've got a promo code and I want to know when they expire."
    	}


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>