In [20]:
import numpy as np
import pandas as pd

from datasets import Dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

import warnings
warnings.filterwarnings('ignore')

import os

In [21]:
# Load the three competition CSVs (training essays, training prompts, test essays)
# from the Kaggle input directory. Chain `.sample(1000)` onto a frame afterwards
# if a smaller debugging subset is wanted.
df_train_e, df_train_p, df_test_e = (
    pd.read_csv(f'/kaggle/input/llm-detect-ai-generated-text/{stem}.csv')
    for stem in ('train_essays', 'train_prompts', 'test_essays')
)

In [22]:
# Preview the raw training essays; pandas elides the middle rows when rendering.
df_train_e

Unnamed: 0,id,prompt_id,text,generated
0,0059830c,0,Cars. Cars have been around since they became ...,0
1,005db917,0,Transportation is a large necessity in most co...,0
2,008f63e3,0,"""America's love affair with it's vehicles seem...",0
3,00940276,0,How often do you ride in a car? Do you drive a...,0
4,00c39458,0,Cars are a wonderful thing. They are perhaps o...,0
...,...,...,...,...
1373,fe6ff9a5,1,There has been a fuss about the Elector Colleg...,0
1374,ff669174,0,Limiting car usage has many advantages. Such a...,0
1375,ffa247e0,0,There's a new trend that has been developing f...,0
1376,ffc237e9,0,As we all know cars are a big part of our soci...,0


In [23]:
# Show the full instruction text of the first prompt (row label 0).
df_train_p.loc[0, "instructions"]

'Write an explanatory essay to inform fellow citizens about the advantages of limiting car usage. Your essay must be based on ideas and information that can be found in the passage set. Manage your time carefully so that you can read the passages; plan your response; write your response; and revise and edit your response. Be sure to use evidence from multiple sources; and avoid overly relying on one source. Your response should be in the form of a multiparagraph essay. Write your essay in the space provided.'

In [24]:
# Keep only the model input ("text") and the target ("generated"); the id and
# prompt_id columns are not used further on.
df_train_e = df_train_e.loc[:, ["text", "generated"]]
df_train_e

Unnamed: 0,text,generated
0,Cars. Cars have been around since they became ...,0
1,Transportation is a large necessity in most co...,0
2,"""America's love affair with it's vehicles seem...",0
3,How often do you ride in a car? Do you drive a...,0
4,Cars are a wonderful thing. They are perhaps o...,0
...,...,...
1373,There has been a fuss about the Elector Colleg...,0
1374,Limiting car usage has many advantages. Such a...,0
1375,There's a new trend that has been developing f...,0
1376,As we all know cars are a big part of our soci...,0


In [25]:
# Convert the frame to a Hugging Face Dataset. preserve_index=False keeps the
# pandas index out of the features, so a non-default index (e.g. after
# `.sample(...)`) does not show up as an extra `__index_level_0__` column.
df_train_e = Dataset.from_pandas(df_train_e, preserve_index=False)
df_train_e

Dataset({
    features: ['text', 'generated'],
    num_rows: 1378
})

In [26]:
# The Trainer hands the target column to the model under the name "labels",
# so the "generated" column has to be renamed before training.
df_train_e = df_train_e.rename_column(
    original_column_name="generated",
    new_column_name="labels",
)

In [27]:
# Hold out 20% of the essays for evaluation. A fixed seed makes the shuffle
# (and therefore the split) identical on every Restart & Run All.
df_train_e = df_train_e.train_test_split(test_size=0.2, shuffle=True, seed=42)
df_train_e

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 1102
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 276
    })
})

In [28]:
# Hub checkpoint name; the same identifier is used later to load the model weights,
# so the tokenizer vocabulary and the model stay in sync.
# NOTE(review): the name indicates a DistilBERT already fine-tuned on SST-2
# (sentiment) — confirm that starting from a sentiment head is intended here.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"

tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)

In [29]:
def tokenize_function(ds):
    """Tokenize the ``text`` field of ``ds`` with the notebook-level tokenizer.

    Inputs that are too long for the tokenizer are truncated; no padding is
    applied here.

    Args:
        ds: Mapping with a ``"text"`` entry (a batch of rows when used with
            ``map(..., batched=True)``).

    Returns:
        The tokenizer's encoding for the given texts.
    """
    texts = ds["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [30]:
# Tokenize every split of the DatasetDict; batched=True makes `map` pass
# tokenize_function a batch of rows per call rather than one row at a time.
tokenized_df_train_e = df_train_e.map(tokenize_function, batched=True)

Map:   0%|          | 0/1102 [00:00<?, ? examples/s]

Map:   0%|          | 0/276 [00:00<?, ? examples/s]

In [31]:
# Collator that pads the tokenized examples of each batch to a common length at
# batch-assembly time (the tokenization step truncates but does not pad).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [32]:
from transformers import TrainingArguments

# report_to must be the *string* "none" to switch off logging integrations
# (W&B, TensorBoard, ...). Passing the Python value None falls back to the
# library default, which in transformers 4.x reports to every installed
# integration.
training_args = TrainingArguments(output_dir="training_args", report_to="none")

In [33]:
# Load the sequence-classification model from the same checkpoint as the tokenizer.
model = DistilBertForSequenceClassification.from_pretrained(checkpoint)

In [34]:
def compute_metrics(eval_preds):
    """Compute accuracy and binary F1 from model logits and reference labels.

    Returns the same two keys the GLUE/MRPC metric reports (``accuracy`` and
    ``f1``), but computes them with numpy so the function does not depend on
    the ``evaluate`` package, which this notebook never imports.

    Args:
        eval_preds: A ``(logits, labels)`` pair (anything that unpacks into
            two items). ``logits`` has the class scores on the last axis;
            ``labels`` holds the integer class ids.

    Returns:
        dict with float entries ``"accuracy"`` and ``"f1"``. F1 treats class 1
        as the positive class and is 0.0 when there are no positive
        predictions or labels. Accuracy is 0.0 for empty input.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)

    # Guard the empty case so np.mean does not return NaN with a warning.
    accuracy = float(np.mean(predictions == labels)) if labels.size else 0.0

    predicted_pos = predictions == 1
    actual_pos = labels == 1
    true_pos = int(np.sum(predicted_pos & actual_pos))
    false_pos = int(np.sum(predicted_pos & ~actual_pos))
    false_neg = int(np.sum(~predicted_pos & actual_pos))

    # F1 = 2TP / (2TP + FP + FN); defined as 0.0 when the denominator is zero.
    denominator = 2 * true_pos + false_pos + false_neg
    f1 = (2 * true_pos / denominator) if denominator else 0.0

    return {"accuracy": accuracy, "f1": float(f1)}

In [35]:
#tokenized_df_train_e

In [36]:
from transformers import Trainer

# Assemble the Trainer from the model, arguments, tokenized splits, collator
# and metric function built in the earlier cells.
# NOTE(review): compute_metrics is only used when an evaluation pass runs —
# confirm whether evaluation during/after training is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_df_train_e["train"],
    eval_dataset=tokenized_df_train_e["test"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

In [37]:
# Run fine-tuning with the settings in `training_args`; the recorded run below
# finished after 3 epochs (207 steps).
trainer.train()

Step,Training Loss


TrainOutput(global_step=207, training_loss=0.014571232496252383, metrics={'train_runtime': 93.7829, 'train_samples_per_second': 35.252, 'train_steps_per_second': 2.207, 'total_flos': 437937219956736.0, 'train_loss': 0.014571232496252383, 'epoch': 3.0})

In [38]:
# Persist the fine-tuned weights and the tokenizer files side by side in one
# directory; the tokenizer call stays last so its returned file paths are the
# cell output.
model.save_pretrained(save_directory="./my_model")
tokenizer.save_pretrained(save_directory="./my_model")

('./my_model/tokenizer_config.json',
 './my_model/special_tokens_map.json',
 './my_model/vocab.txt',
 './my_model/added_tokens.json')