In [1]:
import pandas as pd

# Data

In [2]:
dataset = r'C:\Datasets\aihumantext\AI_Human.csv'

df = pd.read_csv(dataset)

df = df.sample(100) # sampling 100 because it will take too long to finetune on more; purely for demonstration

df.head()

Unnamed: 0,text,generated
474568,We can claim that the idea of studying genus i...,0.0
334822,"When people become famous, they often face a l...",1.0
143359,"For a while now, driverless cars have a a deba...",0.0
285410,"Hey there! So, like, our society these days s...",1.0
171493,Everyone towns that cars are dangerous. Think ...,0.0


## Prep

In [3]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

# Load SciBERT tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import random
# Prepare datasets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

def random_ablation(text, percentage=0.25):
    words = text.split()
    num_to_remove = int(len(words) * percentage)
    indices_to_remove = random.sample(range(len(words)), num_to_remove)
    new_words = [word for i, word in enumerate(words) if i not in indices_to_remove]
    return ' '.join(new_words)

def ablation(remove_text=False):

    if remove_text:
        train_df['text'] = train_df['text'].apply(random_ablation)
        test_df['text'] = test_df['text'].apply(random_ablation)

train_df.head()

Unnamed: 0,text,generated
287414,Have you ever wondered what life today would b...,0.0
58633,the use technology more useful to read the emo...,0.0
270583,"In my opinion, summer projects should be stude...",1.0
300364,In life the only way Io GEI ahead in life is b...,0.0
139919,I think we should have computers that read emo...,0.0


In [5]:
def encode_data(df):
    return tokenizer(df['text'].tolist(), padding=True, truncation=True, max_length=512, return_tensors="pt")

train_encodings = encode_data(train_df)
test_encodings = encode_data(test_df)

# Encode labels
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_df['generated'])
test_labels = label_encoder.transform(test_df['generated'])

class SciBERTDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = SciBERTDataset(train_encodings, train_labels)
test_dataset = SciBERTDataset(test_encodings, test_labels)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)


# SciBERT

In [6]:
model = AutoModelForSequenceClassification.from_pretrained('allenai/scibert_scivocab_uncased', num_labels=2)

# If you're using a GPU, move the model to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31090, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [7]:
print(device)

cuda


## Training

In [8]:
from tqdm import tqdm
# Training loop
optimizer = AdamW(model.parameters(), lr=2e-5)

for epoch in range(3):  # Number of epochs
    for batch in tqdm(train_loader):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    print(f"Epoch {epoch+1}, Loss: {loss.item()}")

  0%|          | 0/5 [00:00<?, ?it/s]

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
100%|██████████| 5/5 [00:22<00:00,  4.52s/it]


Epoch 1, Loss: 0.5651410818099976


100%|██████████| 5/5 [00:23<00:00,  4.66s/it]


Epoch 2, Loss: 0.24152451753616333


100%|██████████| 5/5 [00:27<00:00,  5.46s/it]

Epoch 3, Loss: 0.15473172068595886





In [9]:
model.eval()
with torch.no_grad():
    total, correct = 0, 0
    for batch in test_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        predictions = torch.argmax(outputs.logits, dim=1)
        correct += (predictions == batch['labels']).sum().item()
        total += batch['labels'].size(0)
    print(f'Test Accuracy: {correct / total:.2f}')

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Test Accuracy: 0.90


In [10]:
from sklearn.metrics import classification_report
model.eval()
predictions = []
true_labels = []

with torch.no_grad():
    for batch in test_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        predictions.extend(torch.argmax(outputs.logits, dim=1).tolist())
        true_labels.extend(batch['labels'].tolist())

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


In [11]:
predictions

[0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0]

In [12]:
true_labels

[0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0]

In [13]:
print(classification_report(true_labels, predictions))

              precision    recall  f1-score   support

           0       0.93      0.93      0.93        15
           1       0.80      0.80      0.80         5

    accuracy                           0.90        20
   macro avg       0.87      0.87      0.87        20
weighted avg       0.90      0.90      0.90        20

