In [1]:
from datasets import load_dataset

# Load the dataset from Hugging Face Hub
dataset = load_dataset("jawerty/html_dataset")

# Splitting the dataset into training and testing sets
# Assuming the dataset has a default split, adjust if it has specific named splits
train_test_split = dataset['train'].train_test_split(test_size=0.2)

# Extract the training and testing datasets
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

# Optionally, you can further split or sample the dataset to match your specific size requirements
# For example, to get 1000 data points for training and 200 for testing:
# train_dataset = train_dataset.select(range(1000))
# test_dataset = test_dataset.select(range(200))

# Displaying the first few examples from the training set
#print("Training Set Examples:", train_dataset[:3])


import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import get_scheduler
from tqdm.auto import tqdm
from torch.optim import AdamW



model_id = "TinyPixel/CodeLlama-7B-Instruct-bf16-sharded"
quantization_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token  # Use this if the tokenizer has an EOS token
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    device_map="auto"
)

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

In [2]:
# def tokenize_function(examples):
#     # Assuming 'prompt' is the field with text data and 'html' is the target
#     # Concatenate them with a special token for separation (e.g., <|endoftext|>)
#     concatenated_examples = [label + "<|endoftext|>" + html for label, html in zip(examples['label'], examples['html'])]
#     return tokenizer(concatenated_examples, padding="max_length", truncation=True, max_length=512)

def tokenize_function(examples):
    # Example adjustment for tokenization
    # Assuming 'input_text' is your input text and 'target_text' is your target text
    inputs = tokenizer(examples['label'], padding='max_length', truncation=True, max_length=512)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['html'], padding='max_length', truncation=True, max_length=512)
    inputs['labels'] = labels['input_ids']
    return inputs

tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)
tokenized_train_dataset = tokenized_train_dataset.remove_columns(["html"])
tokenized_test_dataset = tokenized_test_dataset.remove_columns(["html"])

tokenized_train_dataset = tokenized_train_dataset.remove_columns(["label"])
tokenized_test_dataset = tokenized_test_dataset.remove_columns(["label"])

tokenized_train_dataset.set_format("torch")
tokenized_test_dataset.set_format("torch")

Map:   0%|          | 0/34 [00:00<?, ? examples/s]



Map:   0%|          | 0/9 [00:00<?, ? examples/s]

In [3]:
print(tokenized_train_dataset[0])
# DataLoader
train_dataloader = DataLoader(tokenized_train_dataset, shuffle=True, batch_size=8)
eval_dataloader = DataLoader(tokenized_test_dataset, batch_size=8)
print(type(train_dataloader))

{'input_ids': tensor([   2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
           2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
           2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
           2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
           2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
           2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
           2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
           2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
           2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
           2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
           2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
           2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
           2,    2,    2, 

In [4]:
for batch in train_dataloader:
  break
print({k: v.shape for k, v in batch.items()})

{'input_ids': torch.Size([8, 512]), 'attention_mask': torch.Size([8, 512]), 'labels': torch.Size([8, 512])}


In [5]:
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 2
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [6]:
# outputs = model(**batch)
# print(outputs.loss, outputs.logits.shape)

In [7]:
from peft import prepare_model_for_kbit_training

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [8]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [9]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=16,
    lora_alpha=64,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    bias="none",
    lora_dropout=0.05,  # Conventional
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 40554752 || all params: 3541098752 || trainable%: 1.1452589955898524


In [10]:
from accelerate import FullyShardedDataParallelPlugin, Accelerator
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig

fsdp_plugin = FullyShardedDataParallelPlugin(
    state_dict_config=FullStateDictConfig(offload_to_cpu=True, rank0_only=False),
    optim_state_dict_config=FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=False),
)

accelerator = Accelerator(fsdp_plugin=fsdp_plugin)

In [11]:
model = accelerator.prepare_model(model)

In [12]:
import wandb, os
wandb.login()

wandb_project = "journal-finetune"
if len(wandb_project) > 0:
    os.environ["WANDB_PROJECT"] = wandb_project

[34m[1mwandb[0m: Currently logged in as: [33mmadhavgupta6803[0m ([33mmg_team[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [13]:
if torch.cuda.device_count() > 1: # If more than 1 GPU
    model.is_parallelizable = True
    model.model_parallel = True

In [14]:
model.train()
for epoch in range(num_epochs):
    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch}")
    for batch in progress_bar:
        #print(batch)
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        print(type(outputs))
        loss = outputs.loss
        print(loss)
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.set_postfix(loss=loss.item())

Epoch 0:   0%|          | 0/5 [00:00<?, ?it/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


<class 'transformers.modeling_outputs.CausalLMOutputWithPast'>
tensor(9.3061, device='cuda:0', grad_fn=<NllLossBackward0>)
<class 'transformers.modeling_outputs.CausalLMOutputWithPast'>
tensor(8.5091, device='cuda:0', grad_fn=<NllLossBackward0>)
<class 'transformers.modeling_outputs.CausalLMOutputWithPast'>
tensor(9.4417, device='cuda:0', grad_fn=<NllLossBackward0>)
<class 'transformers.modeling_outputs.CausalLMOutputWithPast'>
tensor(9.3118, device='cuda:0', grad_fn=<NllLossBackward0>)
<class 'transformers.modeling_outputs.CausalLMOutputWithPast'>
tensor(9.6564, device='cuda:0', grad_fn=<NllLossBackward0>)


Epoch 1:   0%|          | 0/5 [00:00<?, ?it/s]

<class 'transformers.modeling_outputs.CausalLMOutputWithPast'>
tensor(9.7435, device='cuda:0', grad_fn=<NllLossBackward0>)
<class 'transformers.modeling_outputs.CausalLMOutputWithPast'>
tensor(8.8045, device='cuda:0', grad_fn=<NllLossBackward0>)
<class 'transformers.modeling_outputs.CausalLMOutputWithPast'>
tensor(8.9436, device='cuda:0', grad_fn=<NllLossBackward0>)
<class 'transformers.modeling_outputs.CausalLMOutputWithPast'>
tensor(8.9134, device='cuda:0', grad_fn=<NllLossBackward0>)
<class 'transformers.modeling_outputs.CausalLMOutputWithPast'>
tensor(10.3114, device='cuda:0', grad_fn=<NllLossBackward0>)


In [15]:
model.eval()
total_eval_loss = 0
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    total_eval_loss += outputs.loss.item()

avg_eval_loss = total_eval_loss / len(eval_dataloader)
print(f"Average evaluation loss: {avg_eval_loss}")

Average evaluation loss: 9.798749446868896


In [16]:
import evaluate

metric = evaluate.load("accuracy")
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    # Convert predictions to the expected integer format
    predictions = torch.argmax(logits, dim=-1).detach().cpu().numpy().astype(int)
    # Ensure references are also in the correct integer format
    references = batch["labels"].detach().cpu().numpy().astype(int)
    # Flatten the 2D arrays to 1D
    predictions_1d = [pred[0] for pred in predictions]
    references_1d = [ref[0] for ref in references]

    metric.add_batch(predictions=predictions_1d, references=references_1d)

accuracy = metric.compute()
print("Accuracy:", accuracy)

Accuracy: {'accuracy': 0.0}


In [None]:
# Assuming 'model' is your fine-tuned model
model_save_path = "./content"
model.save_pretrained(model_save_path)

In [17]:
!pip install flask



In [None]:
from flask import Flask, request, jsonify
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

app = Flask(__name__)

# Load your trained model
model_id = "your_model_id_or_path"
model = AutoModelForCausalLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

def generate_html(prompt):
    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model.generate(**inputs)
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text

@app.route('/generate', methods=['POST'])
def generate():
    data = request.get_json()
    prompt = data.get("prompt")
    if not prompt:
        return jsonify({'error': 'No prompt provided'}), 400

    try:
        html_code = generate_html(prompt)
        return jsonify({'html_code': html_code})
    except Exception as e:
        return jsonify({'error': str(e)}), 500

if __name__ == '__main__':
    app.run(debug=True)
