In [None]:
import warnings

import numpy as np
import pandas as pd
from sklearn.metrics import cohen_kappa_score

# ---- Run configuration ----
# Version tag, used in the logging/output directory names.
VER = 1
# Number of rows used for training (taken right after the validation rows).
TRAIN_SET = 2500
# Number of leading rows held out for validation.
VAL_SET = 100
# Maximum sequence length (in tokens) for a training sample.
MAX_LEN = 1024
# Batch size per device.
BATCH_SIZE = 1
# Number of training epochs.
EPOCHS = 1
# When not None, a previously trained model is loaded instead of training a new one.
LOAD_PATH = None  # e.g. '/kaggle/input/mistral-v0/'

# ---- Training data ----
df = pd.read_csv('/kaggle/input/learning-agency-lab-automated-essay-scoring-4/train.csv')
print('Train shape:', df.shape)
print(df.head())

# Silence library warnings for the remainder of the notebook.
warnings.filterwarnings('ignore')

In [None]:
# Load the tokenizer and base causal LM, then run a short generation as a smoke test.
# NOTE(review): an earlier comment here said this cell was "necessary until the transformers
# package is updated in the Kaggle notebook environment"; what it referred to is not
# visible in this notebook - confirm or drop.

from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
import re

#model_name = 'mistralai/Mistral-7B-Instruct-v0.2' # WHEN INTERNET IS TURNED ON
#model_name = '/kaggle/input/mistral-7b-instruct-v02-fp16'
#model_name = "/kaggle/input/gemma-2/transformers/gemma-2-2b-it/1/"
model_name = "/kaggle/input/gemma-2/transformers/gemma-2-9b-it/2/"

tokenizer = AutoTokenizer.from_pretrained(model_name)
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    attn_implementation='eager'
)

# Fall back to EOS for padding only when the tokenizer ships without a pad token;
# a tokenizer that already defines one keeps it.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

input_text = "Write me a poem about Machine Learning which is 4 lines long."
# Send the inputs to wherever the model's first parameters live instead of assuming
# a literal 'cuda' device (the model is placed with device_map="auto").
input_ids = tokenizer(input_text, return_tensors="pt").to(base_model.device)

outputs = base_model.generate(**input_ids, max_new_tokens=256)
# Decode only the newly generated tokens, i.e. everything after the prompt.
prompt_len = input_ids["input_ids"].shape[1]
print(tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True))


In [None]:
# List the files in the Gemma-2 9B-it checkpoint directory.
!ls /kaggle/input/gemma-2/transformers/gemma-2-9b-it/2

In [None]:
!cd /opt/conda/lib/python3.10/site-packages/transformers/
!cat /opt/conda/lib/python3.10/site-packages/transformers/models/auto/configuration_auto.py

In [None]:
def preprocess(sample, text=False, infer_mode=False, max_seq=MAX_LEN, return_tensors=None):
    """Format one essay as a chat conversation and (optionally) tokenize it.

    Args:
        sample: mapping with a "full_text" entry and, unless ``infer_mode`` is set,
            a "score" entry.
        text: when True, return the formatted chat string instead of tokens.
        infer_mode: when True, omit the assistant answer and end the prompt with the
            template's generation prompt so the model can start its reply directly.
        max_seq: maximum number of tokens kept by the tokenizer.
        return_tensors: forwarded to the tokenizer (e.g. "pt"), or None for plain lists.

    Returns:
        The formatted string when ``text`` is True; otherwise the tokenizer output with
        an extra "labels" entry that is a copy of "input_ids".
    """
    sys_prompt = "Mark this year 10 essay and assign a score of 1,2,3,4,5,6 where 6 is the best. Output only a single number with no explanation:"
    messages = [{"role": "user", "content": sys_prompt + sample["full_text"]}]

    if not infer_mode:
        # Training samples carry the gold score as the assistant turn.
        messages.append({"role": "assistant", "content": str(sample["score"])})

    # At inference time the prompt must end with the assistant-turn header; otherwise
    # the model has to generate that header itself before it can emit the score.
    formatted_sample = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=infer_mode
    )
    if infer_mode:
        # Strip a trailing end-of-sequence marker if the template emitted one.
        formatted_sample = formatted_sample.replace("</s>", "")

    if text:
        # Caller only wants the string; skip tokenization.
        return formatted_sample

    # NOTE(review): truncation is applied to the formatted text as a whole. With the
    # tokenizer's default right-side truncation, a sample longer than max_seq loses its
    # trailing tokens - the score turn (training) or the generation prompt (inference).
    # Consider truncating the essay text instead - confirm against essay lengths.
    tokenized_sample = tokenizer(formatted_sample, padding=True, return_tensors=return_tensors,
                                 truncation=True, add_special_tokens=False, max_length=max_seq)

    # Labels mirror the inputs: tensors are cloned, other containers are copied.
    if return_tensors == "pt":
        tokenized_sample["labels"] = tokenized_sample["input_ids"].clone()
    else:
        tokenized_sample["labels"] = tokenized_sample["input_ids"].copy()

    return tokenized_sample


def evaluate_model(df, model):
    """Score ``model`` on the first ``VAL_SET`` rows of ``df``.

    For each row the essay is formatted as an inference prompt, up to 5 new tokens are
    generated greedily, and the first integer found in the generated text is taken as
    the predicted score. When no integer is found the prediction falls back to 3.

    Args:
        df: DataFrame with "full_text" and "score" columns.
        model: causal LM exposing ``generate`` and ``device``.

    Returns:
        Quadratic-weighted Cohen's kappa between the actual and predicted scores.
    """
    fallback_score = 3
    preds = []

    for i, row in df.iloc[:VAL_SET].iterrows():
        tokenized_sample = preprocess(row, infer_mode=True, max_seq=2048, return_tensors="pt")
        # Move the tensors to the model's device and drop "labels", which is only
        # meaningful for training and should not be forwarded to generate().
        inputs = {
            key: value.to(model.device)
            for key, value in tokenized_sample.items()
            if key != "labels"
        }
        prompt_len = inputs["input_ids"].shape[1]

        generated_ids = model.generate(**inputs,
                                       max_new_tokens=5,
                                       pad_token_id=tokenizer.eos_token_id,
                                       do_sample=False)

        # Decode only the continuation (tokens after the prompt).
        raw_output = tokenizer.decode(generated_ids[0][prompt_len:], skip_special_tokens=True)

        match = re.search(r'\d+', raw_output)
        if match:
            score = int(match.group())
            score_output = score
        else:
            # Nothing parseable: show the raw text in the log, use the fallback score.
            score = fallback_score
            score_output = raw_output

        preds.append(score)

        print(i, 'Predicted:', score_output, ' Actual:', row['score'])
        print(f'predicted scores={preds}, \n',end='')
    evaluation_score = cohen_kappa_score(df.score.values[:VAL_SET], preds, weights="quadratic")

    return evaluation_score


In [None]:
from datasets import Dataset

# Positional split: the first VAL_SET rows are held out for validation and the
# following TRAIN_SET rows are used for training.
train_end = VAL_SET + TRAIN_SET
dataset_v = Dataset.from_pandas(df.iloc[0:VAL_SET])
dataset_t = Dataset.from_pandas(df.iloc[VAL_SET:train_end])

# Tokenize each split with `preprocess`, dropping the raw columns afterwards.
raw_columns = ['essay_id', 'full_text', 'score']
tokenized_dataset_v = dataset_v.map(preprocess, remove_columns=list(raw_columns), num_proc=4)
tokenized_dataset_t = dataset_t.map(preprocess, remove_columns=list(raw_columns), num_proc=4)

In [None]:
# Baseline: validation score of the base model before any training.
baseline_kappa = evaluate_model(df, base_model)
print(baseline_kappa)

In [None]:
!pip install peft==0.10.0

In [None]:
from transformers import Trainer, TrainingArguments, DataCollatorForSeq2Seq
from peft import LoraConfig, get_peft_model, PeftModel

training_args = TrainingArguments(
    per_device_train_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    logging_dir = f'./logs_v{VER}',
    output_dir = f'./output_v{VER}',
    logging_steps=25,
    save_strategy='no',
    # No checkpoints are saved and no evaluation runs during training, so there is
    # no "best" checkpoint that could be reloaded at the end.
    load_best_model_at_end=False,
    logging_first_step=True,
    overwrite_output_dir=True,
    warmup_ratio=0.0,
    learning_rate=5e-4,
    lr_scheduler_type='constant',
    weight_decay=0.01,
    eval_steps=None,
    evaluation_strategy='no',
    report_to='none',
)

peft_config = LoraConfig(
    lora_alpha=16, # scaling factor applied to the LoRA update
    lora_dropout=0.1,
    r=32, # rank of the low-rank update matrices
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj"],
)

if LOAD_PATH:
    # Reuse a previously trained adapter on top of the base model instead of training.
    adapter_model = PeftModel.from_pretrained(base_model, LOAD_PATH)
else:
    # Wrap the base model with fresh, trainable LoRA layers.
    adapter_model = get_peft_model(base_model, peft_config)

# The Trainer is built in both modes so that `trainer.model` is always available.
trainer = Trainer(
    model=adapter_model,
    args=training_args,
    train_dataset=tokenized_dataset_t,
    eval_dataset=tokenized_dataset_v,
    data_collator=DataCollatorForSeq2Seq(tokenizer, padding='longest'),
)

if not LOAD_PATH:
    trainer.train()
    # NOTE(review): the directory name says "2b"; confirm it matches the checkpoint
    # selected in `model_name`.
    trainer.model.save_pretrained("gemma2-2b-it-adapter")

In [None]:
# Merge the adapter into the underlying model, then score the result on the
# validation rows to see whether training has worked.
peft_wrapped = trainer.model
trained_model = peft_wrapped.merge_and_unload()
print(evaluate_model(df, trained_model))

In [None]:
# Experiment: compare parameter counts and saved artefacts of the 2B base model,
# the same model wrapped with an (untrained) LoRA adapter, and the merged model.
#!pip install peft==0.11.0
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForSeq2Seq
from peft import LoraConfig, get_peft_model, PeftModel
import torch

model_name = "/kaggle/input/gemma-2/transformers/gemma-2-2b-it/1/"

# NOTE(review): evaluation is configured per epoch but no eval_dataset is given to the
# Trainer below, and this cell never calls train(); confirm these settings are intended.
training_args = TrainingArguments(
    per_device_train_batch_size=1,
    num_train_epochs=1,
    logging_dir = '.',
    output_dir = '.',
    logging_steps=25,
    save_strategy='epoch',
    load_best_model_at_end=True,
    logging_first_step=True,
    overwrite_output_dir=True,
    warmup_ratio=0.0,
    learning_rate=5e-4,
    lr_scheduler_type='constant',
    weight_decay=0.01,
    eval_steps=None,
    evaluation_strategy='epoch',
    report_to='none',
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    attn_implementation='eager'
)

print('base_model', base_model.num_parameters())
base_model.save_pretrained("gemma2-2b-it")

peft_config = LoraConfig(
    lora_alpha=16, # scaling factor applied to the LoRA update
    lora_dropout=0.1,
    r=32, # rank of the low-rank update matrices
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj"],
)

# create and save an adapter
adapter_model = get_peft_model(base_model, peft_config)
adapter_path = "gemma2-2b-it-adapter"
adapter_model.save_pretrained(adapter_path)
print('adapter', adapter_model.num_parameters())

# create a peft model from a gemma model and adapter
#model = PeftModel.from_pretrained(base_model, adapter_path, peft_config = peft_config)
#print('PeftModel', model.num_parameters())

trainer = Trainer(
    model=adapter_model,
    args=training_args,
    data_collator=DataCollatorForSeq2Seq(tokenizer, padding='longest'),
)

print('TrainerModel', trainer.model.num_parameters())

# merge_and_unload() returns the merged model; keep that return value so the
# parameter count and the saved files describe the merged model rather than the
# PEFT wrapper still held by the trainer.
merged_model = trainer.model.merge_and_unload()

print('TrainerMergedModel', merged_model.num_parameters())

merged_model.save_pretrained('gemma2-2b-it-trained')

# No second adapter save happens here: the adapter was already written to
# `adapter_path` before merging, and this cell defines no `model` variable to save.

In [None]:
# List the contents of the adapter directory
# (assumes the notebook's working directory is /kaggle/working - TODO confirm).
!ls -l /kaggle/working/gemma2-2b-it-adapter