In [None]:
!pip install datasets bitsandbytes accelerate peft

In [None]:
!pip install --upgrade accelerate
!pip install bitsandbytes transformers_stream_generator

In [None]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets

In [None]:
import torch
import pandas as pd
from datasets import Dataset, load_dataset
from peft import  LoraConfig, get_peft_model
from peft import prepare_model_for_kbit_training
import bitsandbytes as bnb
from transformers import GPT2LMHeadModel, BitsAndBytesConfig
from transformers import GPT2Tokenizer, AutoModelForCausalLM
import transformers

In [None]:
# Load the query list from the Kaggle input directory into a DataFrame.
queries_path = '/kaggle/input/list-of-arabic-instruction-query/list_of_queries.json'
queries_data = pd.read_json(queries_path)
# Rename whatever the first column is called to 'queries', the name every later cell uses.
old_column_name = queries_data.columns[0] 
new_column_name = 'queries'
queries_data = queries_data.rename(columns={old_column_name: new_column_name})

In [None]:
# Display the first column name; after the rename above it should read 'queries'.
queries_data.columns[0]

In [None]:
import re

def max_len(text):
    """Return the length of the longest word-like token in ``text``.

    The text is split on whitespace and on a fixed set of Latin and Arabic
    punctuation marks; pieces that are empty after stripping are ignored.

    Args:
        text: The string to analyse.

    Returns:
        int: Character count of the longest token, or 0 when ``text`` contains
        no token at all (empty string, or only whitespace/punctuation).
    """
    pattern = r'[\s\n،؛؟\.،:؛!"\'()&*+,،\-./:;<=>؟]'

    # Split the text using the regular expression pattern
    tokens = re.split(pattern, text)

    # default=0 keeps max() from raising ValueError when every piece is empty,
    # which would otherwise abort a whole DataFrame .map() on a single bad row.
    return max((len(token.strip()) for token in tokens if token.strip()), default=0)


def count_words(text):
    """Count the word-like tokens in ``text``.

    Tokens are the pieces left after splitting on whitespace and on a fixed
    set of Latin and Arabic punctuation marks; blank pieces are not counted.

    Args:
        text: The string to analyse.

    Returns:
        int: Number of non-blank tokens (0 if there are none).
    """
    separators = r'[\s\n،؛؟\.،:؛!"\'()&*+,،\-./:;<=>؟]'

    # Tally every piece of the split that still has content after stripping.
    return sum(1 for piece in re.split(separators, text) if piece.strip())


In [None]:
# Add a per-query column holding the length of its longest token (computed by max_len).
# Note: this mutates queries_data in place.
queries_data['max length of words'] = queries_data['queries'].map(max_len)

In [None]:
# Add a per-query column holding its token count (computed by count_words).
# Note: this mutates queries_data in place.
queries_data['count of words'] = queries_data['queries'].map(count_words)

In [None]:
# Frequency table: how many queries share each "longest token length" value.
length_counts = queries_data['max length of words'].value_counts()
length_counts_df = pd.DataFrame({'length': length_counts.index, 'count': length_counts.values})
# Re-order by length so the table and the bar chart read left to right.
length_counts_df = length_counts_df.sort_values(by='length', ascending=True)
# Note: describe() summarises this frequency table, not the raw per-query lengths.
length_counts_df.describe()

In [None]:
import matplotlib.pyplot as plt
# Keep only lengths below 150 so the long tail does not flatten the chart.
# This rebinds length_counts_df, so later cells see the truncated table.
length_counts_df = length_counts_df[length_counts_df['length']<150]
plt.figure(figsize=(10, 6))

# Plot each longest-token length against the number of queries that have it
plt.bar(length_counts_df['length'], length_counts_df['count'], color='skyblue')

# Set labels and title
plt.xlabel('length of Words')
plt.ylabel('Count')
plt.title('Frequency of length of Words')

# Rotate x-axis labels for better readability if necessary
plt.xticks(rotation=45)

# Show plot
plt.show()


In [None]:
# Display the length frequency table (already truncated to lengths below 150 by the plot cell).
length_counts_df

In [None]:
# Frequency table: how many queries share each token-count value.
word_counts = queries_data['count of words'].value_counts()
word_counts_df = pd.DataFrame({'number of words': word_counts.index, 'count': word_counts.values})
# Re-order by token count so the table and the bar chart read left to right.
word_counts_df = word_counts_df.sort_values(by='number of words', ascending=True)
# Note: describe() summarises this frequency table, not the raw per-query counts.
word_counts_df.describe()

In [None]:
import matplotlib.pyplot as plt

# Keep only token counts below 2000 so the long tail does not flatten the chart.
# This rebinds word_counts_df, so later cells see the truncated table.
word_counts_df = word_counts_df[word_counts_df['number of words']<2000]

# Set figure size
plt.figure(figsize=(10, 6))

# Plot the number of words against their counts
plt.bar(word_counts_df['number of words'], word_counts_df['count'], color='skyblue')

# Set labels and title
plt.xlabel('Number of Words')
plt.ylabel('Count')
plt.title('Frequency of Number of Words')

# Rotate x-axis labels for better readability if necessary
plt.xticks(rotation=45)

# Show plot
plt.show()


In [None]:
# Display the token-count frequency table (already truncated to counts below 2000 by the plot cell).
word_counts_df

In [None]:
# Summary statistics of the numeric columns added above (longest token length, token count).
queries_data.describe()

In [None]:
# Fraction of queries with fewer than 300 tokens and a longest token under 14 characters,
# i.e. the share of rows the cleaning filter keeps.
len(queries_data[(queries_data['count of words'] < 300) & (queries_data['max length of words'] < 14)])/len(queries_data)

In [None]:
# Keep only queries with fewer than 300 tokens whose longest token is under 14 characters.
clean_queries_data = queries_data[(queries_data['count of words'] < 300) & (queries_data['max length of words'] < 14)]


In [None]:
# Wrap the retained query strings in a Dataset with a single 'queries' column;
# the helper statistic columns are intentionally left behind.
queries_dataset = Dataset.from_dict({'queries': clean_queries_data['queries'].tolist()})
queries_dataset

In [None]:
# Print the first query as plain text to eyeball the data.
print(queries_dataset['queries'][0])

In [None]:
from torch.utils.data import random_split

# Fixed seed so the four-way partition is reproducible across runs.
seed = 420
generator = torch.Generator().manual_seed(seed)

# The first three parts get size_sample rows each; the fourth absorbs the remainder.
num_sample = 4
length = len(queries_dataset)
size_sample = length // num_sample
part4_length = length - (num_sample - 1) * size_sample
part_lengths = [size_sample] * (num_sample - 1) + [part4_length]

part1, part2, part3, part4 = random_split(
    queries_dataset,
    part_lengths,
    generator=generator,
)

In [None]:
def subset_to_dataset(subset):
    """Materialise a subset view as a standalone ``Dataset``.

    Args:
        subset: Object exposing ``.dataset`` (indexable by integer, each row a
            dict) and ``.indices`` (the row positions to keep, in order).

    Returns:
        Dataset: Built with ``Dataset.from_dict`` from one list per column,
        holding the selected rows in the order given by ``subset.indices``.
    """
    source = subset.dataset

    # Column names are taken from the first row of the underlying dataset.
    columns = {name: [] for name in source[0].keys()}

    for row_index in subset.indices:
        row = source[row_index]
        for name, value in row.items():
            columns[name].append(value)

    return Dataset.from_dict(columns)

In [None]:
# Turn each of the four random_split results into its own standalone Dataset.
part1_of_dataset = subset_to_dataset(part1)
part2_of_dataset = subset_to_dataset(part2)
part3_of_dataset = subset_to_dataset(part3)
part4_of_dataset = subset_to_dataset(part4)

In [None]:
# Preview all four partitions, plus the first query of part 1 as a spot check.
part1_of_dataset, part2_of_dataset, part3_of_dataset, part4_of_dataset, part1_of_dataset['queries'][0]

In [None]:
# 4-bit NF4 quantisation with double quantisation; compute dtype is set to bfloat16.
# NOTE(review): the TrainingArguments cell uses fp16=True while the compute dtype
# here is bfloat16 — confirm this mix is intended for the target GPU.
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
) # setup bits and bytes config

model_checkpoint = 'aubmindlab/aragpt2-medium'
# Load the checkpoint with the quantisation config; device_map='balanced' delegates
# device placement to the loader.
model = transformers.GPT2LMHeadModel.from_pretrained(model_checkpoint, quantization_config=bnb_config, device_map='balanced')
# NOTE(review): pretraining_tp looks like a carry-over from Llama fine-tuning recipes;
# presumably it has no effect on GPT-2 — confirm before relying on it.
model.config.pretraining_tp = 1
# Turn off use_cache on the model config for training.
model.config.use_cache = False
#model = GPT2LMHeadModel.from_pretrained(model_checkpoint, quantization_config=bnb_config, device_map="auto",)

In [None]:
# Display the module tree; useful for picking LoRA target module names such as 'c_attn'.
model

In [None]:
################################################################################
# QLoRA parameters
###########################################################################
# LoRA config based on QLoRA paper
# Adapters are attached only to 'c_attn'; the trailing comment on target_modules
# lists the other module names that were considered.
config = LoraConfig(
    r=8 ,
    lora_alpha=16,
    target_modules=['c_attn'],  #['c_attn', 'c_proj', 'c_fc', 'lm_head']
    lora_dropout=0.06,
    bias="none",
    task_type="CAUSAL_LM",
)
# tokenizer.pad_token_id = 0
# model.gradient_checkpointing_enable()

# prepare model for training
# Note: this cell rebinds `model`, so re-running it without reloading the base
# model would wrap an already-wrapped model again.
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, config)

# Prints the trainable vs. total parameter counts.
model.print_trainable_parameters()

In [None]:
# Tokenizer for the same checkpoint as the model.
# NOTE(review): add_eos_token is an option of Llama-style tokenizers; presumably
# GPT2Tokenizer ignores it — confirm whether EOS is really appended.
tokenizer = GPT2Tokenizer.from_pretrained(model_checkpoint, add_prefix_space=True, add_eos_token=True)
# Reuse EOS as the padding token and pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [None]:
def tokenize_function(examples):
    """Tokenize the ``'queries'`` field with the notebook-level ``tokenizer``.

    Every example is truncated or padded to exactly 400 tokens and the result
    is requested as PyTorch tensors.

    Args:
        examples: Mapping with a ``'queries'`` entry holding the text to encode.

    Returns:
        Whatever the module-level ``tokenizer`` call returns for that text.
    """
    # NOTE(review): with return_tensors='pt' and un-batched Dataset.map, each row
    # presumably carries an extra leading dimension of size 1 — confirm this is
    # what the downstream collator expects.
    return tokenizer(
        examples['queries'],
        padding="max_length",
        max_length=400,
        truncation=True,
        return_tensors='pt',
    )


In [None]:
# Shuffle part 1 with a fixed seed, then tokenize it one example at a time
# (batched mapping is left disabled in the trailing comment).
tokenize_dataset1 = part1_of_dataset.shuffle(seed = 42).map(tokenize_function) #, batched=True)

In [None]:
# Display the tokenized dataset summary (columns and row count).
tokenize_dataset1

In [None]:
# Drop the raw text column; only the tokenizer outputs are kept for training.
d = tokenize_dataset1.remove_columns(['queries'])

split_index = int(len(d) * 0.75)  # 75% for training, 25% for testing

# Split the dataset into training and testing subsets
# (rows were already shuffled when tokenize_dataset1 was built, so a positional cut is used).
train_dataset1 = Dataset.from_dict(d[:split_index])
test_dataset1 = Dataset.from_dict(d[split_index:])

In [None]:
# Display both splits to check their sizes.
train_dataset1, test_dataset1

In [None]:
# Hyperparameters for the single-cell training run on part 1.
# NOTE(review): both warmup_steps and warmup_ratio are set; per the Transformers
# documentation warmup_steps takes precedence — confirm and drop one of them.
# NOTE(review): newer Transformers releases rename evaluation_strategy to
# eval_strategy — confirm against the installed version.
training_args = transformers.TrainingArguments(
    output_dir= './outputOfTrain/',                                 # output directory
    evaluation_strategy="steps",                                   # evaluate on a step schedule (not once per epoch)
    per_device_train_batch_size=16 ,                          # batch size for training
    per_device_eval_batch_size=4,                                  # batch size for evaluation
    num_train_epochs=0.2,                           # fraction of one epoch: a short trial run
    gradient_checkpointing=True,
    optim="paged_adamw_32bit",
    #logging_dir='./logs',                         # directory for storing logs
    save_steps=50,                               # save checkpoint every n steps
    logging_steps=10,                            # log training metrics every n steps
    learning_rate=1e-3,                           # initial learning rate
    weight_decay=0.01,                            # weight decay
    warmup_steps=50,                             # number of warmup steps for learning rate scheduler
    disable_tqdm=True,                           # turn tqdm progress bars OFF
    #report_to=["tensorboard"],                    # report training results to TensorBoard
    fp16=True,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    overwrite_output_dir=True,
    gradient_accumulation_steps=2,
)

In [None]:
# Disabled experiment kept as a bare string literal: wrapping the model in
# torch.nn.DataParallel when several GPUs are present. Nothing inside the
# string executes; the cell only echoes the text.
'''if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    model = torch.nn.DataParallel(model)
torch.cuda.device_count()'''

In [None]:
# Build the Trainer for the LoRA-wrapped model on the part-1 train/test split.
# mlm=False selects the non-masked (causal) language-modeling collator.
trainer = transformers.Trainer(
    model=model,
    train_dataset = train_dataset1,
    eval_dataset = test_dataset1,
    args= training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    ),
)
# model.config.use_cache = False

In [None]:
# Run training; `Training` keeps the value returned by trainer.train().
Training = trainer.train()

In [None]:
# Save the trained model under 'train/arabic-gpt2-m' (relative to the working directory).
trainer.save_model('train/arabic-gpt2-m')

In [None]:
def train_model(model_checkpoint, target_module, dataset,
                output_dir='./outputOfTrain/', save_dir='train/arabic-gpt2-m'):
    """Fine-tune a 4-bit quantised GPT-2 checkpoint with LoRA on ``dataset``.

    The function loads the model and its tokenizer, attaches LoRA adapters,
    tokenizes ``dataset``, splits it 75/25 into train/eval, trains for one
    epoch and saves the result.

    Args:
        model_checkpoint: Identifier or path passed to ``from_pretrained`` for
            both the model and the tokenizer.
        target_module: Name of the module to attach LoRA adapters to, or a
            list/tuple of such names.
        dataset: Dataset with a ``'queries'`` text column.
        output_dir: Directory handed to ``TrainingArguments`` for checkpoints.
        save_dir: Directory the final model is saved to.

    Returns:
        tuple: ``(model, Training)`` — the PEFT-wrapped model and the value
        returned by ``trainer.train()``.
    """
    bnb_config = transformers.BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
    )
    model = transformers.GPT2LMHeadModel.from_pretrained(model_checkpoint, quantization_config=bnb_config, device_map='balanced')
    model.config.pretraining_tp = 1
    model.config.use_cache = False

    tokenizer = GPT2Tokenizer.from_pretrained(model_checkpoint, add_prefix_space=True, add_eos_token=True)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    # Accept either one module name or several; a bare string stays a one-item list.
    if isinstance(target_module, str):
        target_modules = [target_module]
    else:
        target_modules = list(target_module)

    config = LoraConfig(
        r=8 ,
        lora_alpha=16,
        target_modules=target_modules,  #['c_attn', 'c_proj', 'c_fc', 'lm_head']
        lora_dropout=0.06,
        bias="none",
        task_type="CAUSAL_LM",
    )
    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, config)

    # print_trainable_parameters() prints its own report and returns None, so it
    # must not be wrapped in print().
    model.print_trainable_parameters()

    def _tokenize(examples):
        # Tokenize with the tokenizer loaded for *this* checkpoint rather than a
        # notebook-level global, using the same settings as tokenize_function.
        return tokenizer(
            examples['queries'],
            truncation=True,
            return_tensors='pt',
            max_length=400,
            padding="max_length",
        )

    tokenize_dataset = dataset.shuffle(seed = 42).map(_tokenize) #, batched=True)

    d = tokenize_dataset.remove_columns(['queries'])

    split_index = int(len(d) * 0.75)  # 75% for training, 25% for testing

    # Split the (already shuffled) dataset into training and testing subsets
    train_dataset = Dataset.from_dict(d[:split_index])
    test_dataset = Dataset.from_dict(d[split_index:])

    # NOTE(review): warmup_steps and warmup_ratio are both set; per the
    # Transformers documentation warmup_steps takes precedence — confirm.
    training_args = transformers.TrainingArguments(
                        output_dir=output_dir,                        # output directory
                        evaluation_strategy="steps",                  # evaluate on a step schedule
                        per_device_train_batch_size=16 ,              # batch size for training
                        per_device_eval_batch_size=4,                 # batch size for evaluation
                        num_train_epochs=1,                           # number of training epochs
                        gradient_checkpointing=True,
                        optim="paged_adamw_32bit",
                        #logging_dir='./logs',                         # directory for storing logs
                        save_steps=50,                                # save checkpoint every n steps
                        logging_steps=10,                             # log training metrics every n steps
                        learning_rate=1e-3,                           # initial learning rate
                        weight_decay=0.01,                            # weight decay
                        warmup_steps=50,                              # number of warmup steps for learning rate scheduler
                        disable_tqdm=True,                            # turn tqdm progress bars OFF
                        #report_to=["tensorboard"],                    # report training results to TensorBoard
                        fp16=True,
                        max_grad_norm=0.3,
                        warmup_ratio=0.03,
                        overwrite_output_dir=True,
                        gradient_accumulation_steps=2,
                    )

    # Train and evaluate on the splits built from the `dataset` argument, not on
    # the notebook-level train_dataset1 / test_dataset1.
    trainer = transformers.Trainer(
                    model=model,
                    train_dataset = train_dataset,
                    eval_dataset = test_dataset,
                    args= training_args,
                    data_collator=transformers.DataCollatorForLanguageModeling(
                        tokenizer=tokenizer, mlm=False,
                    ),
                )

    Training = trainer.train()

    trainer.save_model(save_dir)

    return model, Training