In [1]:
import torch
import transformers
from huggingface_hub import login
import os
import dotenv

In [2]:
# Read key/value pairs from a local .env file into the process environment so
# the access token never has to be written into the notebook itself.
dotenv.load_dotenv()

# Authenticate this session against the Hugging Face Hub with the token stored
# under the `hfAccessToken` environment variable.
HF_API_KEY = os.environ.get('hfAccessToken')
login(token=HF_API_KEY)

In [3]:
# Checkpoint used for the smoke test below.
model_id = "meta-llama/Llama-3.1-8B-Instruct"

# Build a text-generation pipeline with bfloat16 weights; `device_map="auto"`
# leaves the placement of the model's modules to the library.
pipeline = transformers.pipeline(
    task="text-generation",
    model=model_id,
    device_map="auto",
    model_kwargs={"torch_dtype": torch.bfloat16},
)

# Sanity check: generate a continuation for a short prompt. This is the last
# expression in the cell, so its value is rendered as the cell output.
pipeline("Hello job description parser")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu and disk.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[{'generated_text': 'Hello job description parser? I am looking for a job. I can write a program that can parse job descriptions to identify the'}]

In [5]:
import json
from datasets import load_dataset

# JSON file holding the fine-tuning examples (relative to the notebook's
# working directory).
dataset_path = "llama_finetune_data.json"

# Parse the file with the generic "json" builder and take its "train" split.
dataset = load_dataset(
    path="json",
    data_files=dataset_path,
    split="train",
)

# Print the first record to eyeball the schema before training.
print(dataset[0])


Generating train split: 0 examples [00:00, ? examples/s]

{'instruction': 'Extract key points from the given job description.', 'input': 'We are looking for hire experts flutter developer. So you are eligible this post then apply your resume.\nJob Types: Full-time, Part-time\nSalary: ₹20,000.00 - ₹40,000.00 per month\nBenefits:\nFlexible schedule\nFood allowance\nSchedule:\nDay shift\nSupplemental Pay:\nJoining bonus\nOvertime pay\nExperience:\ntotal work: 1 year (Preferred)\nHousing rent subsidy:\nYes\nIndustry:\nSoftware Development\nWork Remotely:\nTemporarily due to COVID-19', 'output': "{'Role Overview': 'we are looking for hire experts flutter developer  so you are eligible this post then apply your resume', 'Key Responsibilities': [], 'Qualifications & Skills': ['flutter']}"}


In [9]:
import gc

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

model_name = "meta-llama/Llama-3.1-8B-Instruct"

# Free GPU memory before loading a second copy of the model.
#
# The bf16 text-generation `pipeline` built earlier in this notebook was placed
# on cuda:0 (with part of it offloaded) and is never released. While it is
# alive, `device_map="auto"` sees too little free GPU memory and dispatches
# some modules of the quantized model to CPU/disk, which makes
# `from_pretrained` raise:
#   "ValueError: Some modules are dispatched on the CPU or the disk. ..."
# Dropping the reference and emptying the CUDA cache gives the 4-bit model the
# whole GPU to work with.
# NOTE(review): this removes the global `pipeline`; re-run the pipeline cell if
# a later cell still needs it.
globals().pop("pipeline", None)
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Configure 4-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # Enables 4-bit quantization
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

# Load the quantized model and let the library place it on the available
# devices.
# If the GPU is still too small after the cleanup above, the same ValueError
# will be raised again; the alternative named in that error message is to set
# `llm_int8_enable_fp32_cpu_offload=True` on the quantization config and pass a
# custom `device_map`, keeping the offloaded modules in 32-bit on the CPU.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

print("Model loaded successfully!")


ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 