# **Dataset Generator**

Synthetic data is needed for many reasons, and this tool is designed to generate data on topics supplied by the user.

In [2]:
!pip install -q transformers torch bitsandbytes sentencepiece accelerate

In [1]:
!pip install -U bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl.metadata (2.9 kB)
Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl (69.1 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 69.1/69.1 MB 8.6 MB/s eta 0:00:00
Installing collected packages: bitsandbytes
Successfully installed bitsandbytes-0.45.0


In [3]:
from google.colab import userdata
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TextStreamer
import torch

In [4]:
# Fetch the Hugging Face access token stored under the key HF_TOKEN in Colab's
# `userdata`, then log in to the Hugging Face Hub with it.
hf_token = userdata.get("HF_TOKEN")
login(token=hf_token)

In [5]:
# Hugging Face Hub identifiers of the instruction-tuned models used in this notebook.
LLAMA = "meta-llama/Llama-3.2-1B-Instruct"
GRANITE = "ibm-granite/granite-3.1-8b-instruct"

In [6]:
# System message sent with every request: asks the model for diverse, realistic and
# truthful articles on the requested topic.
system_prompt = (
    "You are a data generator who is able to produce informative articles on a given topic. "
    "Be diverse and produce useful outputs which is good representative of real world. "
    "Do not include false information. Be accurate and only talk about the truth."
)

In [8]:
# Quantization settings: request 8-bit weights (via bitsandbytes) so that a loaded
# model takes up less memory.
quant_config = BitsAndBytesConfig(load_in_8bit=True)

In [15]:
# The request to send to the model, plus the chat-format conversation built from it:
# the system turn first, then the user turn.
user_prompt = "Generate data on the topic of psychology"

messages = [
    dict(role="system", content=system_prompt),
    dict(role="user", content=user_prompt),
]

In [None]:
def generate(model, max_new_tokens=500):
  """Stream a chat completion for the notebook-level `messages` from a Hugging Face model.

  The tokenizer and the weights are both loaded from `model`, the conversation in the
  module-level `messages` list is rendered with the model's chat template, and the
  generated text is printed to the cell output while it is being produced.

  Args:
    model: Hugging Face model identifier, e.g. `LLAMA` or `GRANITE`.
    max_new_tokens: Maximum number of tokens to generate. Defaults to 500, the value
      that was previously hard-coded.

  Returns:
    None. Output is emitted through `TextStreamer`.
  """
  import gc  # local import: only needed for the cleanup in `finally`

  tokenizer = AutoTokenizer.from_pretrained(model)
  # Reuse the end-of-sequence token for padding.
  tokenizer.pad_token = tokenizer.eos_token
  streamer = TextStreamer(tokenizer)

  # Apply the 8-bit config defined above (it was previously never passed to the loader).
  # bitsandbytes 8-bit loading generally expects a GPU, so fall back to a plain load on CPU.
  quantization = quant_config if torch.cuda.is_available() else None

  llm = inputs = outputs = None
  try:
    llm = AutoModelForCausalLM.from_pretrained(
        model, device_map="auto", quantization_config=quantization
    )
    # add_generation_prompt=True appends the assistant header so the model answers the
    # user turn; return_dict=True also yields the attention mask, which matters because
    # the pad token equals the EOS token.
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    ).to(llm.device)  # follow wherever device_map="auto" placed the model
    outputs = llm.generate(**inputs, max_new_tokens=max_new_tokens, streamer=streamer)
  finally:
    # Release references even when loading or generation fails, so GPU memory can be reclaimed.
    del inputs, outputs, llm, tokenizer, streamer
    gc.collect()
    if torch.cuda.is_available():
      torch.cuda.empty_cache()

In [1]:
# Presumably authorizes this Colab runtime against the user's Google account — confirm.
# NOTE(review): the execution counter restarts at 1 here, so this cell and the ones after
# it look like they were run in a different session from the cells above.
from google.colab import auth

auth.authenticate_user()

In [2]:
# Clone the practice-notebooks repository; it lands in ./llm_practice_notebooks.
!git clone https://github.com/melihzgvnc/llm_practice_notebooks.git

Cloning into 'llm_practice_notebooks'...
remote: Enumerating objects: 6, done.
remote: Counting objects: 100% (6/6), done.
remote: Compressing objects: 100% (6/6), done.
remote: Total 6 (delta 0), reused 3 (delta 0), pack-reused 0 (from 0)
Receiving objects: 100% (6/6), done.


In [11]:
# NOTE(review): this cell fails as recorded. `git add` needs a pathspec, and the recorded
# error shows a path under /content/drive/MyDrive that lies outside the cloned repository
# at /content/llm_practice_notebooks. The path also appears to be cut at a space
# ("Colab"), so it probably needs quoting. The file likely has to be copied into the
# repository before it can be staged — confirm the intended file.
!git add

fatal: /content/drive/MyDrive/Colab: '/content/drive/MyDrive/Colab' is outside repository at '/content/llm_practice_notebooks'
