In [2]:
# pip install -r requirements.txt

In [3]:
# Export the Weights & Biases project name for this kernel session.
# (Presumably consumed by the `report_to='wandb'` Trainer integration configured later — confirm.)
%env WANDB_PROJECT=LLM_Science_Exam

env: WANDB_PROJECT=LLM_Science_Exam


In [4]:
import os
from typing import Optional, Union
import pandas as pd, numpy as np, torch
from datasets import Dataset
from dataclasses import dataclass
from transformers import AutoTokenizer
from transformers import EarlyStoppingCallback
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer
from accelerate import notebook_launcher
# Checkpoint identifier handed to both AutoTokenizer.from_pretrained and
# AutoModelForMultipleChoice.from_pretrained further down in the notebook.
# Alternative checkpoint, currently disabled:
# MODEL = 'microsoft/deberta-v3-large'
MODEL ='potsawee/longformer-large-4096-answering-race'

In [5]:
# Evaluation rows: this frame is tokenized and passed to the Trainer as `eval_dataset`.
df_valid = pd.read_csv('train_with_context2.csv')

# Training rows: drop the bookkeeping columns, replace missing text with empty
# strings (so string concatenation in `preprocess` cannot hit NaN), then take a
# fixed-size subset. `random_state` pins the subset so that Restart & Run All
# trains on the same 2000 rows every time.
df_train = (
    pd.read_csv('community_sample.csv')
    .drop(columns=["source", "id"])
    .fillna('')
    .sample(2000, random_state=42)
)

In [6]:
# Bidirectional lookup between answer letters and class indices (A=0 ... E=4).
option_to_index = dict(zip('ABCDE', range(5)))
index_to_option = dict(enumerate('ABCDE'))

def preprocess(example):
    """Tokenize one question as five (context + prompt, option) pairs.

    Reads the 'context', 'prompt', 'A'..'E' and 'answer' fields of `example`
    and relies on the module-level `tokenizer` and `option_to_index`.

    Returns the tokenizer output for the five pairs with an integer 'label'
    entry (the index of the correct option) added.
    """
    # NOTE(review): context and prompt are joined with no separator, so the last
    # word of the context abuts the first word of the prompt — confirm intended.
    question = example['context'] + example['prompt']

    # One candidate answer per letter, in A..E order.
    candidates = []
    for letter in 'ABCDE':
        candidates.append(example[letter])

    # The same question text is repeated once per candidate.
    encoded = tokenizer(
        [question] * 5,
        candidates,
        truncation=True,
        max_length=2048,
    )
    encoded['label'] = option_to_index[example['answer']]
    return encoded

@dataclass
class DataCollatorForMultipleChoice:
    """Pad and batch multiple-choice features into (batch, choices, seq) tensors.

    Each incoming feature is a dict whose values (other than the label) are
    lists with one entry per answer choice. The collator flattens every choice
    into its own row, lets `tokenizer.pad` pad the rows to a common length,
    reshapes the padded tensors back to (batch_size, num_choices, -1) and
    attaches the labels as an int64 tensor under the key 'labels'.

    Attributes:
        tokenizer: object providing a `pad(...)` method; forwarded the options below.
        padding: forwarded to `tokenizer.pad` as `padding`.
        max_length: forwarded to `tokenizer.pad` as `max_length`.
        pad_to_multiple_of: forwarded to `tokenizer.pad` as `pad_to_multiple_of`.
    """
    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = 2048
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """Collate a list of feature dicts into a dict of batched tensors.

        The input dicts are left untouched, so the same features can be
        collated more than once.
        """
        # The label key is decided from the first feature and used for all of them.
        label_name = 'label' if 'label' in features[0].keys() else 'labels'
        # Read (rather than pop) the labels so the caller's dicts are not mutated.
        labels = [feature[label_name] for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]['input_ids'])

        # One flat row per (example, choice), built in a single pass instead of
        # repeatedly concatenating lists; the label is excluded from the rows.
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors='pt',
        )
        # Restore the (batch, choice) structure that was flattened for padding.
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch['labels'] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [7]:
# Tokenizer for the selected checkpoint; `preprocess` reads this module-level name.
tokenizer = AutoTokenizer.from_pretrained(MODEL, max_length=2048, truncation=True)

# Raw text columns are dropped once `preprocess` has turned them into token ids.
raw_columns = ['prompt', 'context', 'A', 'B', 'C', 'D', 'E', 'answer']
map_options = dict(num_proc=48, remove_columns=raw_columns)

# Validation split.
dataset_valid = Dataset.from_pandas(df_valid)
tokenized_dataset_valid = dataset_valid.map(preprocess, **map_options)

# Training split.
dataset = Dataset.from_pandas(df_train)
tokenized_dataset = dataset.map(preprocess, **map_options)


Map (num_proc=48):   0%|          | 0/200 [00:00<?, ? examples/s]

Map (num_proc=48):   0%|          | 0/2000 [00:00<?, ? examples/s]

In [8]:
# Load the pretrained checkpoint with a multiple-choice head.
# (Half-precision loading via torch_dtype=torch.float16 is left disabled.)
model = AutoModelForMultipleChoice.from_pretrained(MODEL)

# Enable activation checkpointing through the model API instead of passing
# `gradient_checkpointing=True` as a config kwarg to from_pretrained, which
# Transformers has deprecated.
model.gradient_checkpointing_enable()

# Make the embedding output require grad. With checkpointing on and the base
# weights frozen (as after LoRA wrapping), checkpointed blocks would otherwise
# receive inputs without grad and adapter weights inside them may get no gradient.
model.enable_input_require_grads()

In [9]:
# Display the device the freshly loaded model currently lives on.
model.device

device(type='cpu')

In [10]:
def map_at_3(predictions, labels):
    """Mean average precision at 3 for single-label multiple choice.

    Args:
        predictions: 2-D array-like of scores, one row per example and one
            column per answer choice (higher means more likely).
        labels: iterable with the index of the correct choice for each row.

    Returns:
        The average over rows of 1/rank when the correct choice is among the
        three highest-scoring columns (rank 1, 2 or 3), and 0 otherwise.
    """
    scores = np.array(predictions)
    # Negating the scores makes argsort return columns in descending score order.
    top_choices = np.argsort(-1 * scores, axis=1)[:, :3]

    total = 0
    for ranked, truth in zip(top_choices, labels):
        gains = []
        for rank, choice in enumerate(ranked, start=1):
            gains.append(1 / rank if truth == choice else 0)
        total += np.sum(gains)
    return total / len(predictions)

def compute_metrics(p):
    """Metric callback: report MAP@3 for an evaluation pass.

    Args:
        p: object exposing `predictions` and `label_ids`, each supporting
            `.tolist()`.

    Returns:
        A dict with the single key "map@3".
    """
    score = map_at_3(p.predictions.tolist(), p.label_ids.tolist())
    return {"map@3": score}

In [21]:
# Hyper-parameters for the Trainer, grouped by concern.
training_args = TrainingArguments(
    # Where checkpoints go and how many are kept.
    output_dir='./checkpoints_v5',
    overwrite_output_dir=True,
    save_strategy="steps",
    save_steps=10,
    save_total_limit=2,
    # Optimisation schedule.
    num_train_epochs=3,
    learning_rate=3e-5,
    lr_scheduler_type='cosine',
    warmup_ratio=0.05,
    weight_decay=0.01,
    # One example per device per step, accumulated over 4 steps per update.
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,
    # Mixed precision for both training and evaluation.
    fp16=True,
    fp16_full_eval=True,
    # Logging and periodic evaluation.
    report_to='wandb',
    logging_steps=10,
    evaluation_strategy='steps',
    eval_steps=10,
    # Model selection.
    load_best_model_at_end=False,
    metric_for_best_model='map@3',
)

In [12]:
from peft import LoraConfig, get_peft_model, TaskType

# Low-rank adapter settings, collected in one place before building the config.
lora_settings = dict(
    task_type=TaskType.SEQ_CLS,
    target_modules=["query", "value"],  # modules that receive adapters
    r=8,  # Rank
    lora_alpha=8,
    lora_dropout=0.01,
    bias="none",
    # fan_in_fan_out=True,
)
lora_config = LoraConfig(**lora_settings)

# Wrap the already-loaded base model with the adapters described above.
peft_model = get_peft_model(model, lora_config)

In [22]:
# Train the LoRA-wrapped model. The notebook builds `peft_model` with
# get_peft_model, so that wrapper (not the bare `model`) is what the Trainer
# should own: checkpoints are then written as adapters by the PEFT wrapper
# rather than as a full model with injected LoRA layers.
trainer = Trainer(
    model=peft_model,
    args=training_args,
    tokenizer=tokenizer,
    # Pads each batch and reshapes it to (batch, choices, seq).
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset_valid,
    # Reports "map@3" at every evaluation.
    compute_metrics=compute_metrics,
    # Early stopping is disabled:
    # callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
)

In [None]:
# Run training in this process.
# The previous form, `notebook_launcher(trainer.train())`, called
# `trainer.train()` eagerly and then handed its return value to
# `notebook_launcher`, which expects a callable — so the launcher never
# launched anything. A real multi-process launch would need the whole
# model/Trainer setup wrapped in a function that is passed uncalled.
trainer.train()

Step,Training Loss,Validation Loss,Map@3
10,1.4363,1.099169,0.7125
20,1.237,1.098353,0.7125
30,1.3246,1.096714,0.718333
40,1.398,1.094891,0.715833
50,1.0393,1.091727,0.716667
60,1.3027,1.089616,0.716667


