In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

In [None]:
!pip install -U transformers

In [None]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2  trl==0.4.7

In [None]:
!pip install -U accelerate peft bitsandbytes

In [2]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
    Trainer
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer



In [3]:
# The model that you want to train from the Hugging Face hub
model_name ="bigscience/bloomz-7b1"

# The instruction dataset to use

# Fine-tuned model name
new_model = "bigscience/bloomz-7b1"
# LoRA attention dimension
lora_r = 16
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.05
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

use_nested_quant = False

output_dir = "./results"

# Number of training epochs
num_train_epochs = 1
fp16 = False
bf16 = False

per_device_train_batch_size =1

per_device_eval_batch_size = 4

gradient_accumulation_steps = 8

gradient_checkpointing = True

max_grad_norm = 0.3

learning_rate = 5e-5

weight_decay = 0.001

optim = "paged_adamw_8bit"

lr_scheduler_type = "constant"

max_steps = -1
warmup_ratio = 0.03
group_by_length = True

save_steps = 100

logging_steps = 25

max_seq_length = False
packing = False
#device_map = {"": 0}

In [4]:
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

In [5]:
# Load dataset (you can process it here)

# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
)
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    target_modules=['query_key_value', 'dense_h_to_4h', 'dense_4h_to_h', 'dense'],
    bias="none",
    task_type="CAUSAL_LM",
)

  return self.fget.__get__(instance, owner)()


In [None]:
model

In [6]:
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)


In [None]:
!pip install -q datasets trl peft bitsandbytes sentencepiece wandb

In [None]:
import pandas as pd
train_df = pd.read_csv("/kaggle/input/expect-answer-true-or-false-arabic/data.csv").iloc[:-5000]
valid_df = pd.read_csv("/kaggle/input/expect-answer-true-or-false-arabic/data.csv").iloc[-5000:]
test_df = pd.read_csv("/kaggle/input/expect-answer-true-or-false-arabic/data.csv").iloc[-5000:]

In [None]:
def chat_Format(context,answer):
   return "Instruction:\ncheck answer is true or false of next quetion using context below:\n"+context+ f".\n#Student answer: "+answer+".\n#Response:"

In [None]:
train_df['input']=chat_Format(train_df['question'],train_df['answer'] )+train_df['label']
valid_df['input']=chat_Format(valid_df['question'],valid_df['answer'] ) +valid_df['label']
valid_df['input2']=chat_Format(valid_df['question'],valid_df['answer'] )

In [None]:
train_df['input']=train_df['input'].apply(lambda x:x.replace('CANNOTANSWER',''))

In [None]:
valid_df['input2']=valid_df['input2'].apply(lambda x:x.replace('CANNOTANSWER',''))

In [None]:
"""m=-12
w=np.zeros(len(train_df))
o=0
for i in train_df['input']:
    t=len(tokenizer(i)['input_ids'])
    w[o]=t
    o+=1
    print(o,end='\r')
m=-12
a=np.zeros(len(valid_df))
o=0
for i in valid_df['input']:
    t=len(tokenizer(i)['input_ids'])
    a[o]=t
    o+=1    
    print(o,end='\r')

train_df=train_df.loc[w<650]
valid_df=valid_df.loc[a<650]   
train_texts=train_df
valid_texts=valid_df"""

In [7]:
train_df = pd.read_csv("/kaggle/input/bloomz-arabi-proote/train.csv")
valid_df = pd.read_csv("/kaggle/input/bloomz-arabi-proote/valid.csv")

In [None]:
#Datasets and Dataloaders
from torch.utils.data import Dataset, DataLoader

class QADataset(Dataset):
    def __init__(self, encodings):
        self.inputs = encodings['input']
        
    def __getitem__(self, idx):
        a=tokenizer(self.inputs[idx] , truncation=True, padding='max_length', return_tensors="pt", max_length=650)
        return {
            
            "input_ids": a["input_ids"][0],
            "attention_mask": a["attention_mask"][0],
            "labels":a['input_ids'][0]
        }
    def __len__(self):
        return len(self.inputs)
train_dataset = QADataset(train_df.iloc[28000:37000].reset_index(drop=True))
val_dataset = QADataset(valid_df.iloc[:100].reset_index(drop=True))


In [None]:

!pip install wandb
import wandb
wandb.login(key="14459c516497ab76a78f7fc1278bfe60d301d250")

In [None]:
train_df.to_csv('train.csv')
valid_df.to_csv('valid.csv')

In [8]:
peftmodel=PeftModel.from_pretrained(model,"/kaggle/input/proote/results/checkpoint-1000",is_trainable=True)
peftmodel.enable_input_require_grads()
peftmodel.train()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): BloomForCausalLM(
      (transformer): BloomModel(
        (word_embeddings): Embedding(250880, 4096)
        (word_embeddings_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
        (h): ModuleList(
          (0-29): 30 x BloomBlock(
            (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
            (self_attention): BloomAttention(
              (query_key_value): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=12288, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=12288, bias=False)
                )
           

In [None]:
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=3,
    gradient_accumulation_steps=3,
    optim=optim,
    save_steps=200,
    logging_steps=15,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16, 
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=False,
    lr_scheduler_type=lr_scheduler_type,
   gradient_checkpointing=gradient_checkpointing
)

trainer = SFTTrainer(
    model=peftmodel,
    train_dataset=train_dataset,
        eval_dataset=val_dataset,

    peft_config=None,
    dataset_text_field="text",
    args=training_arguments,
    packing=False,
)
trainer.train()
trainer.model.save_pretrained(new_model)

In [None]:
#peftmodel.save_pretrained("bloom")


In [None]:
trainer.evaluate()


In [None]:
! pip install evaluate

In [None]:
! pip install rouge_score

In [None]:
#peftmodel=PeftModel.from_pretrained(model,"/kaggle/input/bloomz-arabi-proote/results/checkpoint-1000")


In [9]:
#tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.padding_side = "left"

In [10]:
peftmodel.eval()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): BloomForCausalLM(
      (transformer): BloomModel(
        (word_embeddings): Embedding(250880, 4096)
        (word_embeddings_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
        (h): ModuleList(
          (0-29): 30 x BloomBlock(
            (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
            (self_attention): BloomAttention(
              (query_key_value): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=12288, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=12288, bias=False)
                )
           

In [33]:
from transformers import logging

# Disable transformers library warnings
logging.set_verbosity_error()
#import evaluate
import numpy as np
from datasets import load_from_disk
from tqdm import tqdm

# Metric
#metric= evaluate.load("rouge")
#metric2= evaluate.load("bleu")

predictions, references = [] , []
o=0
s=0
l=1000
step=4
for i in range(0,l,step):
        inp2=valid_df['input2'].iloc[i:i+step]

        w=tokenizer(inp2.tolist(), add_special_tokens=True,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',    max_length=650

        )
        d=tokenizer.batch_decode(peftmodel.generate(input_ids=w['input_ids'].cuda(),attention_mask=w['attention_mask'].cuda(),num_beams=2,max_new_tokens=3),skip_special_tokens=True)
        for o in range(len(d)):  

            e=d[o][d[o].find(f'\n#Response:')+len(f'\n#Response:'):]     
            c=(e+' ,').split()[0].strip().lower().strip(':').strip('.')
            if(c=='\ntrue'):
                c='true'
            elif(c=='خط'):
                c='خطأ'
            a=valid_df['label'].iloc[i+o].split()[0]
            s+=int((e+' ,').split()[0].strip().lower()==valid_df['label'].iloc[i+o].split()[0].strip().lower())
            print(f'{i} : {s/(i+o+1)} ',end='\r')
            predictions+=[c]
            references+=[valid_df['label'].iloc[i+o]]


KeyboardInterrupt: 

In [None]:
predictions

In [None]:
references

In [None]:
print(f"accuracy : {s/(i+1)}")

In [None]:
peft_model_id="results"
trainer.model.save_pretrained(peft_model_id)
tokenizer.save_pretrained(peft_model_id)

In [16]:
context ="""تقنية النانو أحد األساليب المبتكرة لدراسة المادة وطرق تغييرها عند مستوى النانو؛ من أجل إنتاج
ًّ مواد أخرى متطورة تخدم البشرية فى مختلف مجاالت الحياة، والنانو وحدة قياس دقيقة جدا ؛ فالنانو الواحد
يعادل واحدا على المليون من المليمتر ؛ لذلك تستحيل رؤية األشياء المقاسة بالنانو بواسطة العين المجردة،
أو حتى بمكبرات الرؤية البدائية، وهى تستخدم فى القياس الذرى لتحديد األحجام الخاصة بجزئيات المادة
المتواجدة بها."""

In [17]:
quetion=" لماذا لا يمكن رؤية النانو؟"

In [18]:
answer="لانها  شديدة كبيرة جدا"

In [19]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,HfArgumentParser,TrainingArguments,pipeline, logging, TextStreamer


In [20]:
peftmodel.eval()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): BloomForCausalLM(
      (transformer): BloomModel(
        (word_embeddings): Embedding(250880, 4096)
        (word_embeddings_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
        (h): ModuleList(
          (0-29): 30 x BloomBlock(
            (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
            (self_attention): BloomAttention(
              (query_key_value): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=12288, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=12288, bias=False)
                )
           

In [21]:
def chat_Format(context,question,answer):
   return "Instruction:\ncheck answer is true or false of next quetion using context below:\nContext "+context+"\nQuestion "+question + f".\n#Student answer: "+answer+".\n#Response:"

In [39]:
        inp2=chat_Format(context,quetion,answer)
        streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

        w=tokenizer(inp2, add_special_tokens=True,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt'

        )
        for i in tokenizer.batch_decode(model.generate(input_ids=w['input_ids'].cuda(),attention_mask=w['attention_mask'].cuda(),num_beams=7,num_return_sequences=2,max_new_tokens=60 ),skip_special_tokens=True):
                print(i)

Instruction:
check answer is true or false of next quetion using context below:
Context تقنية النانو أحد األساليب المبتكرة لدراسة المادة وطرق تغييرها عند مستوى النانو؛ من أجل إنتاج
ًّ مواد أخرى متطورة تخدم البشرية فى مختلف مجاالت الحياة، والنانو وحدة قياس دقيقة جدا ؛ فالنانو الواحد
يعادل واحدا على المليون من المليمتر ؛ لذلك تستحيل رؤية األشياء المقاسة بالنانو بواسطة العين المجردة،
أو حتى بمكبرات الرؤية البدائية، وهى تستخدم فى القياس الذرى لتحديد األحجام الخاصة بجزئيات المادة
المتواجدة بها.
Question  لماذا لا يمكن رؤية النانو؟.
#Student answer: لانها  شديدة كبيرة جدا.
#Response:خطأ الجواب هو لا يمكن رؤية النانو بواسطة العين المجردة ، أو حتى بمكبرات الرؤية البدائية.وذلك لأن النانو واحد يعادل واحد على المليون من المليمتر.لذلك ، لا يمكن رؤية النانو بواسطة العين المجردة ، أو حتى بمكبرات الرؤية البدائية ، لأنها
Instruction:
check answer is true or false of next quetion using context below:
Context تقنية النانو أحد األساليب المبتكرة لدراسة المادة وطرق تغييرها عند مستوى النانو؛ من أجل إنتاج
ًّ

In [24]:
s=peftmodel( input_ids=w['input_ids'].cuda(),attention_mask=w['attention_mask'].cuda())['logits'][0][-1]

In [27]:
e=(s/s.sum())
(s[16068]/(s[170089]+s[16068]))**.5

tensor(0.7065, device='cuda:0', dtype=torch.float16, grad_fn=<PowBackward0>)

In [28]:
s.argmax()

tensor(170089, device='cuda:0')

In [26]:
tokenizer.batch_decode([16068])

['خط']

In [None]:
model.generate(input_ids=w['input_ids'].cuda(),attention_mask=w['attention_mask'].cuda(),streamer=streamer,max_new_tokens=30 )