# Install modules

In [1]:
!pip install transformers
!pip install datasets
!pip install sentencepiece
!pip install accelerate -U



# Make environment

In [2]:
import os
import json
from google.colab import drive

drive.mount('/content/drive')
os.chdir("./drive/MyDrive/git_project/Kaggle/DetectAIGeneratedTextUsingBERT/notebook")

f = open("../../../kaggle.json", 'r')
json_data = json.load(f)
os.environ['KAGGLE_USERNAME'] = json_data['username']
os.environ['KAGGLE_KEY'] = json_data['key']

# !mkdir ../data
# !kaggle competitions download -c llm-detect-ai-generated-text -p ../data
# !unzip ../data/llm-detect-ai-generated-text -d ../data

model_checkpoint = "../data/deberta-v3-xsmall"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Import Module

In [3]:
import json
import transformers
import datasets
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import AutoModel
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import roc_auc_score


In [9]:
os.listdir("../data")

['wordnet',
 'train_essays.csv',
 'sample_submission.csv',
 'test_essays.csv',
 'train_prompts.csv',
 'llm-detect-ai-generated-text.zip',
 'contractions-0.1.73-py2.py3-none-any.whl',
 'Unidecode-1.3.7-py3-none-any.whl',
 'textsearch-0.0.24-py2.py3-none-any.whl',
 'corpora',
 'daigt']

# Load dataset

https://www.kaggle.com/datasets/thedrcat/daigt-proper-train-dataset

In [4]:
pd.read_csv('../data/train_prompts.csv')


Unnamed: 0,prompt_id,prompt_name,instructions,source_text
0,0,Car-free cities,Write an explanatory essay to inform fellow ci...,"# In German Suburb, Life Goes On Without Cars ..."
1,1,Does the electoral college work?,Write a letter to your state senator in which ...,# What Is the Electoral College? by the Office...


In [5]:
df = pd.read_csv('../data/daigt/train_drcat_04.csv')
df_external = pd.read_csv('../data/daigt/train_drcat_01.csv')
df.head(10)

Unnamed: 0,essay_id,text,label,source,prompt,fold
0,E897534557AF,"In recent years, technology has had a profoun...",1,mistral7binstruct_v2,\nTask: Write an essay discussing the positive...,1
1,DFBA34FFE11D,Should students participate in an extracurricu...,0,persuade_corpus,,2
2,af37ecf5,The electoral college is a symbol of mockery a...,0,train_essays,,5
3,5EC2696BAD78,This is why I think the principle should allow...,0,persuade_corpus,,8
4,llama_70b_v1843,I strongly believe that meditation and mindful...,1,llama_70b_v1,Some schools have implemented meditation and m...,0
5,30D7FD691AE3,"As technology develops more, we need to consid...",0,persuade_corpus,,5
6,8E91F0997B77,The positive attitude is the key of the succes...,0,original_moth,Task: Write an essay discussing why attitude i...,7
7,9124e664,The Electoral College is a process that has be...,0,train_essays,,1
8,9ABF7B48B802,Having school every single day is nobodies fav...,0,original_moth,Task:\nResearch the 3-year high school program...,3
9,E7F9A77683D2,One way school administrators can attempt to c...,1,chat_gpt_moth,Task: Write an essay analyzing the pros and co...,7


# Split with train/valid

use only fold 0 for training

In [6]:
train = df[df.fold != 0].reset_index(drop=True)
valid = df[df.fold == 0].reset_index(drop=True)
train.head()

Unnamed: 0,essay_id,text,label,source,prompt,fold
0,E897534557AF,"In recent years, technology has had a profoun...",1,mistral7binstruct_v2,\nTask: Write an essay discussing the positive...,1
1,DFBA34FFE11D,Should students participate in an extracurricu...,0,persuade_corpus,,2
2,af37ecf5,The electoral college is a symbol of mockery a...,0,train_essays,,5
3,5EC2696BAD78,This is why I think the principle should allow...,0,persuade_corpus,,8
4,30D7FD691AE3,"As technology develops more, we need to consid...",0,persuade_corpus,,5


In [7]:
# Let's try to undersample the persuade_corpus:
not_persuade_df = train[train['source'] != 'persuade_corpus']
persuade_df = train[train['source'] == 'persuade_corpus']
sampled_persuade_df = persuade_df.sample(n=6000, random_state=42)

In [8]:
# Testing idea from discussion with @nbroad about limited characters in human essays
all_human = set(list(''.join(sampled_persuade_df.text.to_list())))
other = set(list(''.join(not_persuade_df.text.to_list())))

In [9]:
chars_to_remove = ''.join([x for x in other if x not in all_human])
print(chars_to_remove)

🏈🌭🌠😄📄🚴🐶💅🍄💯💚🌱👦¬🎾🚪🚔💊♀🧐💥🥕🍟📞👌🍓🌞😱💖🕒😕🛣路❄🍗一🇧🙏👍🐕😹🤛和💸🌃🔋🎨该合📊🍽🔥‘💁ç🍿🇫力注🕵📸🎅😬🌯择💪🧙こ😎🚌🧩🤯🌊🎯🦁せ🏥🔜😍🏻�📉🙃🍣保🕺”👀😻🔍°🤞💇🕰🏄と😲💻😒🤝🤓🏜🤷а🎵🎸🙈🏯😌⚽影🎩😅🛀🔧ê🧘🌷👇り🥔🚨🦄🤖👧🌸🍎ち️💉是🏫有🐻🎬🏦响🐆🌏🔮🥘🥳💭唯护🍖🐠选🥛🧀🐝🥪在中🐱👕💕📖🌅法🐴🏊将🇪🥶🌌🚕…🤔📱💬р司е🏟🧡🐸🔬止🎢📈时–🛠😴🛍🌮🌈🏛🐟😩道🍭う🎃🏨🎣🇸🚑す😡😠的🏡📺み📣😋👯🍕😜😂上🎠🏠😳🎭须🤟é🍝あ🤕🙋🐦🤪о📹‍💤😆👮に🦎☹🧹🏋🍰🦸🗣🏰应ā­🚀😮驾都🛫🍲🗳😖用🧖☀🍳🍔👨🏢🎄🐧😉💦🔭集🎓😁必🌳安💜á□👋🤫🎹🌟🤢🏃👬😓🐰🏏👥🥟🤒😵💀♂👪🏳🚂🌿🧑📚🎶🛑📷💡“🎤🏔🕹⏰🤜😤🦐が🤘🛋🏼🎧🏕👩—🌧👂✨🧬🍋🐒🎉💧🐬🐢🥭🤣🐾は🙌😘使🎊所🙅ん者🙀🧚🏆🥗🌲💆🍷🧠🏽🌽ã🏞🏀🍮📅🎮🚣╯🥦🏖🤗🥜完😨🥤💃😃💼📝禁意ま🍜🍴í🧭💔🌫🌐🤩😔🙄🌴😷💫🥨🤦ü’🥩🇷机🔑部🚗🇺🥑📦👫驶🚫😭手🧦​🛸🍁💨👻🤤💘🌎😊全🧽。🍞💰🥲сÉ🥯🌻🚭🐳


In [10]:
translation_table = str.maketrans('', '', chars_to_remove)
def remove_chars(s):
    return s.translate(translation_table)
not_persuade_df['text'] = not_persuade_df['text'].apply(remove_chars)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  not_persuade_df['text'] = not_persuade_df['text'].apply(remove_chars)


In [11]:
# Let's add a sample of external data to train
df_external0 = df_external[df_external.label == 0.].sample(n=1000, random_state=42)
df_external1 = df_external[df_external.label == 1.].sample(n=1000, random_state=42)

In [12]:
train = pd.concat([not_persuade_df, sampled_persuade_df, df_external0, df_external1]).sample(frac=1, random_state=42).reset_index(drop=True)
train.source.value_counts()

persuade_corpus          7000
llama2_chat              2526
chat_gpt_moth            2503
mistral7binstruct_v1     2179
original_moth            2179
mistral7binstruct_v2     2178
train_essays             1240
llama_70b_v1             1055
falcon_180b_v1            950
darragh_claude_v7         900
darragh_claude_v6         900
radek_500                 450
llammistral7binstruct     328
Name: source, dtype: int64

# To Dataset

In [13]:
ds_train = Dataset.from_pandas(train)
ds_valid = Dataset.from_pandas(valid)

# Load Tokenizer

In [15]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-xsmall", use_fast=True)



# Define preprocessor

In [16]:
def preprocess_function(examples):
    return tokenizer(examples['text'], max_length=128, padding=True, truncation=True)

In [17]:
ds_train_enc = ds_train.map(preprocess_function, batched=True, batch_size=32)
ds_valid_enc = ds_valid.map(preprocess_function, batched=True, batch_size=32)

Map:   0%|          | 0/24388 [00:00<?, ? examples/s]

Map:   0%|          | 0/4421 [00:00<?, ? examples/s]

# Define Model

In [18]:
num_labels = 2
model = AutoModelForSequenceClassification.from_pretrained("microsoft/deberta-v3-xsmall", num_labels=num_labels)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-xsmall and are newly initialized: ['pooler.dense.bias', 'classifier.bias', 'classifier.weight', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Train configs

In [19]:
metric_name = "roc_auc"
model_name = "deberta-xsmall"
train_batch_size = 32
eval_batch_size = 32
grad_acc = 4

In [20]:
num_steps = len(train) // (train_batch_size * grad_acc)
num_steps

190

In [21]:
args = TrainingArguments(
    f"{model_name}-finetuned",
    evaluation_strategy = "steps",
    save_strategy = "steps",
    eval_steps = 230,
    save_steps = 230,
    learning_rate=2e-5,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=eval_batch_size,
    gradient_accumulation_steps=grad_acc,
    num_train_epochs=1,
    weight_decay=0.01,
    load_best_model_at_end=False,
    metric_for_best_model=metric_name,
    report_to='none', # change to wandb after enabling internet access
)

In [22]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    probs = np.exp(logits) / np.sum(np.exp(logits), axis=-1, keepdims=True)
    auc = roc_auc_score(labels, probs[:,1], multi_class='ovr')
    return {"roc_auc": auc}

In [23]:
trainer = Trainer(
    model,
    args,
    train_dataset=ds_train_enc,
    eval_dataset=ds_valid_enc,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Train

In [25]:
trainer.train()

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss


TrainOutput(global_step=190, training_loss=0.1889671526457134, metrics={'train_runtime': 238.6793, 'train_samples_per_second': 102.179, 'train_steps_per_second': 0.796, 'total_flos': 400520543600640.0, 'train_loss': 0.1889671526457134, 'epoch': 1.0})

# Save model

In [26]:
tokenizer.save_vocabulary(model_checkpoint)
model.save_pretrained(model_checkpoint)

# Valid

In [127]:
test_df = pd.read_csv("../data/train_essays.csv")
test_dataset = Dataset.from_pandas(test_df)
test_ds_enc = test_dataset.map(preprocess_function, batched=True)
result = trainer.predict(test_ds_enc)

Map:   0%|          | 0/1378 [00:00<?, ? examples/s]

In [33]:
res = []
for src in valid.source.unique():
    if src in ['train_essays', 'persuade_corpus', 'original_moth']: continue
    test2  = valid[valid['source'].isin([src, 'train_essays'])]
    test_ds2 = Dataset.from_pandas(test2)
    test_ds_enc2 = test_ds2.map(preprocess_function, batched=True)
    eval_result = trainer.evaluate(test_ds_enc2)
    score = eval_result['eval_roc_auc']
    res.append(f'{src}: {score}')

for r in res: print(r)

Map:   0%|          | 0/255 [00:00<?, ? examples/s]

Map:   0%|          | 0/380 [00:00<?, ? examples/s]

Map:   0%|          | 0/381 [00:00<?, ? examples/s]

Map:   0%|          | 0/380 [00:00<?, ? examples/s]

Map:   0%|          | 0/188 [00:00<?, ? examples/s]

Map:   0%|          | 0/243 [00:00<?, ? examples/s]

Map:   0%|          | 0/238 [00:00<?, ? examples/s]

Map:   0%|          | 0/238 [00:00<?, ? examples/s]

Map:   0%|          | 0/381 [00:00<?, ? examples/s]

llama_70b_v1: 0.9969105289174494
mistral7binstruct_v1: 0.9987041947926711
mistral7binstruct_v2: 0.9939075630252101
chat_gpt_moth: 0.9987041947926711
radek_500: 0.9939196832579185
falcon_180b_v1: 0.9970450797141286
darragh_claude_v7: 0.9969002306805075
darragh_claude_v6: 0.9969002306805075
llama2_chat: 0.9987094837935173


In [29]:
test = pd.read_csv('../data/test_essays.csv')
test_ds = Dataset.from_pandas(test)
test_ds_enc = test_ds.map(preprocess_function, batched=True)

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

In [31]:
test_preds = trainer.predict(test_ds_enc)

In [32]:
logits = test_preds.predictions
probs = np.exp(logits) / np.sum(np.exp(logits), axis=-1, keepdims=True)
sub = pd.DataFrame()
sub['id'] = test['id']
sub['generated'] = probs[:,1]
sub.to_csv('submission.csv', index=False)
sub.head()

Unnamed: 0,id,generated
0,0000aaaa,0.607412
1,1111bbbb,0.997028
2,2222cccc,0.995955
