In [1]:
import pandas as pd

In [None]:
# Environment setup for the notebook.
# NOTE(review): none of the packages below are version-pinned, so a fresh run may
# resolve to different releases than the ones this notebook was developed with.
!pip install pip3-autoremove
# Remove the preinstalled torch / torchvision / torchaudio (presumably together with
# their now-unused dependencies; `-y` presumably skips the confirmation prompt — confirm).
!pip-autoremove torch torchvision torchaudio -y
# Install the fine-tuning stack used by the cells below.
!pip install transformers bitsandbytes accelerate peft unsloth

In [None]:
import os
import copy
import random
from dataclasses import dataclass

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    BitsAndBytesConfig,
    Gemma2ForSequenceClassification,
    LlamaForSequenceClassification,
    GemmaTokenizerFast,
    Gemma2Config,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    PreTrainedTokenizerBase, 
    EvalPrediction,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
from sklearn.metrics import log_loss, roc_auc_score

from unsloth import FastLanguageModel
from accelerate import Accelerator

In [None]:
def seed_everything(seed):
    """Seed every random number generator used by the notebook.

    Seeds Python's hash randomisation (via ``PYTHONHASHSEED``), the ``random``
    module, NumPy's global generator and PyTorch, then switches PyTorch to
    deterministic algorithms.

    Args:
        seed: Integer seed applied to all generators.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    # With deterministic algorithms enabled, CUDA matrix multiplications raise a
    # RuntimeError unless cuBLAS is given a fixed workspace configuration. The
    # variable has to be in place before the first CUDA call, so set it here;
    # setdefault keeps any value the user already exported.
    os.environ.setdefault("CUBLAS_WORKSPACE_CONFIG", ":4096:8")
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.use_deterministic_algorithms(True)

seed_everything(0)

In [None]:
@dataclass
class Config:
    """Hyperparameters shared by the tokenizer, LoRA and training cells.

    ``lora_alpha`` defaults to twice ``lora_r``. It is resolved per instance in
    ``__post_init__`` so that overriding ``lora_r`` also updates the default
    alpha; pass ``lora_alpha`` explicitly to decouple the two.
    """

    # model
    gemma_dir: str = "unsloth/gemma-2-9b-it-bnb-4bit"  # instruction-tuned checkpoint
    gemma_dir_noit: str = "unsloth/gemma-2-9b-bnb-4bit"  # checkpoint without instruction tuning
    # llama_dir: str = "meta-llama/Llama-3.2-3B-Instruct"

    # tokenizer
    max_length: int = 1024

    # lora
    target_modules: tuple = ("q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj")
    freeze_layers: int = 0
    lora_r: int = 16
    lora_alpha: float = None  # None -> 2 * lora_r, filled in by __post_init__
    lora_dropout: float = 0.05
    lora_bias: str = "none"

    # train
    per_device_train_batch_size: int = 8
    gradient_accumulation_steps: int = 2
    per_device_eval_batch_size: int = 64
    learning_rate: float = 1e-4
    n_epochs: int = 1
    warmup_ratio: float = 0.1
    eval_steps: int = 25
    optim_type: str = "adamw_torch_fused"

    def __post_init__(self):
        # A class-level `lora_r * 2` default would be evaluated once, at class
        # definition time, and would ignore a per-instance lora_r override.
        if self.lora_alpha is None:
            self.lora_alpha = self.lora_r * 2

config = Config()

In [None]:
# Trainer settings; every tunable value is read from `config`.
# NOTE(review): `load_best_model_at_end` is not set here, so the weights left after
# the last training step are the ones used for prediction — confirm this is intended
# given that `metric_for_best_model` / `greater_is_better` are configured.
training_args = TrainingArguments(
    # outputs and reporting
    output_dir="tmp",
    overwrite_output_dir=True,
    report_to="none",
    # optimisation
    optim=config.optim_type,
    learning_rate=config.learning_rate,
    warmup_ratio=config.warmup_ratio,
    num_train_epochs=config.n_epochs,
    per_device_train_batch_size=config.per_device_train_batch_size,
    gradient_accumulation_steps=config.gradient_accumulation_steps,
    fp16=True,
    # logging, evaluation and checkpointing share the same step interval
    logging_steps=config.eval_steps,
    eval_strategy="steps",
    eval_steps=config.eval_steps,
    per_device_eval_batch_size=config.per_device_eval_batch_size,
    save_strategy="steps",
    save_steps=config.eval_steps,
    save_total_limit=2,
    # model selection metric (key returned by compute_metrics)
    metric_for_best_model="auc",
    greater_is_better=True,
    # reproducibility
    full_determinism=True,
)

In [None]:
# LoRA adapter settings for sequence classification, taken from `config`.
lora_config = LoraConfig(
    r=config.lora_r,
    lora_alpha=config.lora_alpha,
    lora_dropout=config.lora_dropout,
    bias=config.lora_bias,
    target_modules=config.target_modules,  # projection layers that receive adapters
    task_type=TaskType.SEQ_CLS,
)

In [None]:
INPUT_DIR = "/kaggle/input/defp2024-kaggle-5"

# Read the train and test CSV files from the input directory.
df_train, df_test = (
    pd.read_csv(f"{INPUT_DIR}/{split_name}.csv", low_memory=False)
    for split_name in ("train", "test")
)
print(df_train.shape, df_test.shape)

## SequenceClassification

In [None]:
# Prompt template fed to the classifier; filled with each row's title and description.
prompt = (
    "Is the following movie appropriate for viewing by young people?\n"
    "Title: {title}\n"
    "Description: {description}"
)

# Iterate over the two columns directly instead of DataFrame.iterrows(), which
# builds a full Series for every row just to read two fields.
df_train["text"] = [
    prompt.format(title=title, description=description)
    for title, description in zip(df_train["title"], df_train["description"])
]

df_test["text"] = [
    prompt.format(title=title, description=description)
    for title, description in zip(df_test["title"], df_test["description"])
]

# The Trainer reads the target from a column named "labels".
df_train["labels"] = df_train["rating_flag"]

In [None]:
def tokenize(row, tokenizer, max_length=None):
    """Tokenize the "text" field of a (batched) dataset row.

    Sequences are truncated to ``max_length`` tokens and are NOT padded here:
    padding is left to the batch collator, so each batch is only padded to its
    own longest sequence instead of the longest sequence of the whole map chunk.

    Args:
        row: Mapping with a "text" entry (a string or a list of strings).
        tokenizer: Callable tokenizer applied to ``row["text"]``.
        max_length: Maximum number of tokens per sequence. Defaults to
            ``config.max_length`` when omitted.

    Returns:
        Whatever the tokenizer returns for the given text.
    """
    if max_length is None:
        max_length = config.max_length
    return tokenizer(row["text"], truncation=True, max_length=max_length)

In [None]:
def prepare_model(lora_config, model_dir=None):
    """Load a Gemma-2 sequence classifier and wrap it with LoRA adapters.

    Args:
        lora_config: PEFT LoRA configuration applied to the loaded model.
        model_dir: Checkpoint name or path to load. Defaults to
            ``config.gemma_dir`` when omitted, which keeps existing callers
            working while allowing other checkpoints (e.g.
            ``config.gemma_dir_noit``) to reuse the same setup.

    Returns:
        The PEFT-wrapped model returned by ``get_peft_model``.
    """
    if model_dir is None:
        model_dir = config.gemma_dir

    model = Gemma2ForSequenceClassification.from_pretrained(
        model_dir,
        num_labels=2,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    # The key/value cache is switched off for training.
    model.config.use_cache = False
    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, lora_config)

    return model

In [None]:
def compute_metrics(eval_preds):
    """Compute ROC AUC from evaluation logits.

    Args:
        eval_preds: Pair ``(logits, labels)``; the logits are a NumPy array
            whose last axis indexes the classes.

    Returns:
        ``{"auc": <ROC AUC of the last class's softmax probability>}``.
    """
    logits, targets = eval_preds
    class_probs = torch.softmax(torch.from_numpy(logits).float(), dim=-1)
    # The last column is the score of the last class.
    last_class_scores = class_probs.numpy()[:, -1]
    return {"auc": roc_auc_score(targets, last_class_scores)}

## Data preparation (データ準備)

In [None]:
# Tokenizer loaded from the instruction-tuned checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(config.gemma_dir)
tokenizer.padding_side = "right"
tokenizer.add_eos_token = True  # append <eos> to the end of each sequence

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Tokenize the labelled training frame and the unlabelled test frame in batches.
ds = Dataset.from_pandas(df_train[["text", "labels"]]).map(
    tokenize, batched=True, fn_kwargs={"tokenizer": tokenizer}
)
ds_test = Dataset.from_pandas(df_test[["text"]]).map(
    tokenize, batched=True, fn_kwargs={"tokenizer": tokenizer}
)

# Hold out 20% of the labelled data for validation.
train_valid_splits = ds.train_test_split(test_size=0.2)
ds_train, ds_valid = train_valid_splits["train"], train_valid_splits["test"]

## gemma-2-9b-it-bnb-4bit + SequenceClassification

In [None]:
# model  = Gemma2ForSequenceClassification.from_pretrained(
#         config.gemma_dir,
#         num_labels=2,
#         torch_dtype=torch.float16,
#         device_map="auto",
#     )
# model.config.use_cache = False
# model = prepare_model_for_kbit_training(model)
# model = get_peft_model(model, lora_config)

# model

In [None]:
# trainer = Trainer(
#         args=training_args,
#         model=model,
#         tokenizer=tokenizer,
#         train_dataset=ds_train,
#         eval_dataset=ds_valid,
#         compute_metrics=compute_metrics,
#         data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
#     )
# trainer.train()

# preds = trainer.predict(ds_valid).predictions
# preds_oof_llama = torch.from_numpy(preds).float().softmax(dim=-1).numpy()[:, -1]

In [None]:
# print(roc_auc_score(ds_valid["labels"], preds_oof_llama))

## gemma-2-9b-bnb-4bit + SequenceClassification

In [None]:
# Load the checkpoint without instruction tuning as a 2-class sequence classifier.
model = Gemma2ForSequenceClassification.from_pretrained(
    config.gemma_dir_noit,
    device_map="auto",
    torch_dtype=torch.float16,
    num_labels=2,
)
model.config.use_cache = False  # the key/value cache is switched off for training
# Prepare the quantized model for training, then attach the LoRA adapters.
model = get_peft_model(prepare_model_for_kbit_training(model), lora_config)

In [None]:
# Fine-tune on the training split, evaluating on the validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds_train,
    eval_dataset=ds_valid,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)
trainer.train()

# Validation-split scores: softmax over the logits, keeping the last class column.
preds = trainer.predict(ds_valid).predictions
preds_oof_llama = torch.softmax(torch.from_numpy(preds).float(), dim=-1).numpy()[:, -1]

In [None]:
# print(roc_auc_score(ds_valid["labels"], preds_oof_llama))

# Test

In [None]:
# Test-set scores: softmax over the logits, keeping the last class column.
preds = trainer.predict(ds_test).predictions
preds_test = torch.softmax(torch.from_numpy(preds).float(), dim=-1).numpy()[:, -1]

In [None]:
# Write the submission file: one row per test show_id with its predicted score.
df_pred = df_test[["show_id"]].assign(pred=preds_test)
df_pred.to_csv("submission_late_gemma_class.csv", index=False)