In [1]:
from __future__ import annotations

TARGET_MODEL = "mistralai/Mistral-7B-v0.1"

DEBUG = False

In [2]:
from pathlib import Path

OUTPUT_DIR = Path("../output")
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

INPUT_DIR = Path("../data/")

In [3]:
import pandas as pd

train_df = pd.read_csv(INPUT_DIR / "train_essays_RDizzl3_seven_v1.csv", sep=',')
test_df = pd.read_csv(INPUT_DIR / "llm-detect-ai-generated-text/test_essays.csv", sep=',')
external_df = pd.read_csv(INPUT_DIR / "daigt-external-dataset/daigt_external_dataset.csv", sep=',')
train_prompts_df = pd.read_csv(INPUT_DIR / "llm-detect-ai-generated-text/train_prompts.csv", sep=',')

# show shape
print(f'train_df.shape: {train_df.shape}')
print(f'test_df.shape: {test_df.shape}')
print(f'external_df.shape: {external_df.shape}')
print(f'train_prompts_df.shape: {train_prompts_df.shape}')

train_df.shape: (15867, 2)
test_df.shape: (3, 3)
external_df.shape: (2421, 4)
train_prompts_df.shape: (2, 4)


In [4]:
train_df = train_df.rename(columns={'generated': 'label'})
test_df = test_df.rename(columns={'generated': 'label'})
external_df = external_df.rename(columns={'generated': 'label'})

In [5]:
train_df.label.value_counts()

label
0    14247
1     1620
Name: count, dtype: int64

In [6]:
train_df.head(3)

Unnamed: 0,text,label
0,Cars. Cars have been around since they became ...,0
1,Transportation is a large necessity in most co...,0
2,"""America's love affair with it's vehicles seem...",0


In [7]:
test_df.head(3)

Unnamed: 0,id,prompt_id,text
0,0000aaaa,2,Aaa bbb ccc.
1,1111bbbb,3,Bbb ccc ddd.
2,2222cccc,4,CCC ddd eee.


In [8]:
external_df.head(3)

Unnamed: 0,id,text,instructions,source_text
0,6060D28C05B6,Some schools in United States ofter classes fr...,\nTask: Write a persuasive essay on whether or...,\nWhen considering the pros and cons of attend...
1,60623DB5DE7A,"Four-day work week, a remarkable idea to conse...",\nTask: Research the advantages and disadvanta...,\nOne of the primary arguments for implementin...
2,607A39D981DE,Students and their families should consider an...,\nTask: \n\n1. Talk to your parents before tak...,\nBefore making any decisions about getting in...


In [9]:
external_df = external_df[["id", "source_text"]]
external_df.columns = ["id", "text"]
external_df['text'] = external_df['text'].str.replace('\n', '')
external_df["label"] = 1

train_df = pd.concat([train_df, external_df])
train_df.reset_index(inplace=True, drop=True)
print(f"Train dataframe has shape: {train_df.shape}")
train_df.head()

Train dataframe has shape: (18288, 3)


Unnamed: 0,text,label,id
0,Cars. Cars have been around since they became ...,0,
1,Transportation is a large necessity in most co...,0,
2,"""America's love affair with it's vehicles seem...",0,
3,How often do you ride in a car? Do you drive a...,0,
4,Cars are a wonderful thing. They are perhaps o...,0,


In [10]:
train_df.value_counts("label")

label
0    14247
1     4041
Name: count, dtype: int64

In [11]:
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
X = train_df.loc[:, train_df.columns != "label"]
y = train_df.loc[:, train_df.columns == "label"]

for i, (train_index, valid_index) in enumerate(skf.split(X, y)):
    train_df.loc[valid_index, "fold"] = i
    
print(train_df.groupby("fold")["label"].value_counts())
train_df.head()

fold  label
0.0   0        2850
      1         808
1.0   0        2850
      1         808
2.0   0        2849
      1         809
3.0   0        2849
      1         808
4.0   0        2849
      1         808
Name: count, dtype: int64


Unnamed: 0,text,label,id,fold
0,Cars. Cars have been around since they became ...,0,,1.0
1,Transportation is a large necessity in most co...,0,,3.0
2,"""America's love affair with it's vehicles seem...",0,,0.0
3,How often do you ride in a car? Do you drive a...,0,,2.0
4,Cars are a wonderful thing. They are perhaps o...,0,,3.0


In [12]:
valid_df = train_df[train_df["fold"] == 0]
train_df = train_df[train_df["fold"] != 0]

In [13]:
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType # type: ignore
from transformers import BitsAndBytesConfig
import torch

peft_config = LoraConfig(
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    target_modules=[
        "q_proj",
        "v_proj"
    ],
)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [14]:
from transformers import AutoTokenizer, LlamaForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained(TARGET_MODEL, use_fast=False)
tokenizer.pad_token = tokenizer.eos_token

In [26]:
base_model = LlamaForSequenceClassification.from_pretrained(
    TARGET_MODEL,
    num_labels=2,
    quantization_config=bnb_config,
    device_map={"":1}
)
base_model.config.pretraining_tp = 1 # 1 is 7b
base_model.config.pad_token_id = tokenizer.pad_token_id

You are using a model of type mistral to instantiate a model of type llama. This is not supported for all configurations of models and can yield errors.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-v0.1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
model = get_peft_model(base_model, peft_config)

In [28]:
model.print_trainable_parameters()

trainable params: 27,271,168 || all params: 7,137,939,456 || trainable%: 0.3820593907822577


In [29]:
# debug
if DEBUG:
    train_df = train_df.sample(300)
    valid_df = valid_df.sample(50)
#train_df = train_df.sample(100)
#valid_df = valid_df.sample(30)
print(train_df.label.value_counts(), valid_df.label.value_counts())

label
0    11397
1     3233
Name: count, dtype: int64 label
0    2850
1     808
Name: count, dtype: int64


In [30]:
# datasets
from datasets import Dataset

# from pandas
train_ds = Dataset.from_pandas(train_df)
valid_ds = Dataset.from_pandas(valid_df)

In [31]:
def preprocess_function(examples, max_length=512):
    return tokenizer(examples["text"], truncation=True, max_length=max_length, padding=True)

In [32]:
train_tokenized_ds = train_ds.map(preprocess_function, batched=True)
valid_tokenized_ds = valid_ds.map(preprocess_function, batched=True)

Map:   0%|          | 0/14630 [00:00<?, ? examples/s]

Map:   0%|          | 0/3658 [00:00<?, ? examples/s]

In [33]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest")

In [34]:
import numpy as np
from sklearn.metrics import accuracy_score, roc_auc_score
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    
    accuracy_val = accuracy_score(labels, predictions)
    roc_auc_val = roc_auc_score(labels, predictions)
    
    return {
        "accuracy": accuracy_val,
        "roc_auc": roc_auc_val,
    }

In [35]:
from transformers import TrainingArguments, Trainer

steps = 5 if DEBUG else 20

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    learning_rate=5e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,
    max_grad_norm=0.3,
    optim='paged_adamw_32bit',
    lr_scheduler_type="cosine",
    num_train_epochs=1,
    weight_decay=0.01,
    evaluation_strategy="steps",
    save_strategy="steps",
    load_best_model_at_end=True,
    push_to_hub=False,
    warmup_steps=steps,
    eval_steps=steps,
    logging_steps=steps,
    report_to='none' # if DEBUG else 'wandb',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized_ds,
    eval_dataset=valid_tokenized_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


ValueError: You can't train a model that has been loaded in 8-bit precision on a different device than the one you're training on. Make sure you loaded the model on the correct device using for example `device_map={'':torch.cuda.current_device()}you're training on. Make sure you loaded the model on the correct device using for example `device_map={'':torch.cuda.current_device() or device_map={'':torch.xpu.current_device()}

In [None]:
from shutil import rmtree

trainer.save_model(output_dir=str(OUTPUT_DIR))

for path in Path(training_args.output_dir).glob("checkpoint-*"):
    if path.is_dir():
        rmtree(path)

In [25]:
del trainer, model, base_model

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): LlamaForSequenceClassification(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): Linear4bit(
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
              )
              (k_proj): Linear4bit(in_features=4096, out_fea

In [None]:
# cuda cache clear
import torch
torch.cuda.empty_cache()

In [None]:
# load model / tokenizer with 4bit bnb

from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType # type: ignore
from transformers import BitsAndBytesConfig
import torch


bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [None]:
from transformers import AutoTokenizer, LlamaForSequenceClassification

base_model = LlamaForSequenceClassification.from_pretrained(
    TARGET_MODEL,
    num_labels=2,
    quantization_config=bnb_config,
    device_map={"":0}
)
base_model.config.pretraining_tp = 1 # 1 is 7b
base_model.config.pad_token_id = tokenizer.pad_token_id

In [None]:
model = PeftModel.from_pretrained(base_model, str(OUTPUT_DIR))

In [None]:
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [None]:
pred_output = trainer.predict(valid_tokenized_ds)
logits = pred_output.predictions

In [None]:
logits = pred_output.predictions

In [None]:
valid_df.label

In [None]:
logits

In [None]:
# from scipy.special import expit as sigmoid
import numpy as np
def sigmoid(x):
    return 1 / (1 + np.exp(-x))  
probs = sigmoid(logits[:, 1])
probs.shape, probs[0:5]

In [None]:
sub = pd.DataFrame()
sub['id'] = valid_df['id']
sub['generated'] = probs
# sub.to_csv('submission.csv', index=False)
sub.head()