# Data

In [1]:
import pandas as pd

dataset = r'C:\Datasets\aihumantext\AI_Human.csv'

df = pd.read_csv(dataset)

df.head()

Unnamed: 0,text,generated
0,Cars. Cars have been around since they became ...,0.0
1,Transportation is a large necessity in most co...,0.0
2,"""America's love affair with it's vehicles seem...",0.0
3,How often do you ride in a car? Do you drive a...,0.0
4,Cars are a wonderful thing. They are perhaps o...,0.0


In [2]:
len(df)

487235

In [3]:
df = df.sample(100)

In [4]:
df

Unnamed: 0,text,generated
153680,The students of today are the future of tomorr...,0.0
308883,Some schools require students to complete summ...,0.0
242498,Do you think failure lead you to do better? In...,0.0
273403,"Kind, caring,nice. Yhose words explain the mea...",0.0
42632,"Dear State Senator,\n\nI am writing to express...",1.0
...,...,...
203357,"As an eighth grade student, I believe that req...",1.0
11866,The Electoral College should be abolished beca...,0.0
259647,Believe it or not cars are getting old. Its no...,0.0
239809,There are Mary types of educational activitie...,1.0


## Prep

In [5]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

# Load SciBERT tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import random
# Prepare datasets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

def random_ablation(text, percentage=0.25):
    words = text.split()
    num_to_remove = int(len(words) * percentage)
    indices_to_remove = random.sample(range(len(words)), num_to_remove)
    new_words = [word for i, word in enumerate(words) if i not in indices_to_remove]
    return ' '.join(new_words)

def ablation(remove_text=False):

    if remove_text:
        train_df['text'] = train_df['text'].apply(random_ablation)
        test_df['text'] = test_df['text'].apply(random_ablation)

train_df.head()

Unnamed: 0,text,generated
83560,Title: The Advantages of Limiting Car Usage\n\...,1.0
369064,I think government should change electoral vot...,0.0
69259,The whole point of school is to get you ready ...,0.0
99928,Seeking multiple opinions when making an impor...,1.0
208956,"My fellow citizens of America, we are in a cri...",0.0


In [7]:
def encode_data(df):
    return tokenizer(df['text'].tolist(), padding=True, truncation=True, max_length=512, return_tensors="pt")

train_encodings = encode_data(train_df)
test_encodings = encode_data(test_df)

# Encode labels
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_df['generated'])
test_labels = label_encoder.transform(test_df['generated'])

class SciBERTDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = SciBERTDataset(train_encodings, train_labels)
test_dataset = SciBERTDataset(test_encodings, test_labels)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)


# SciBERT

In [8]:

model = AutoModelForSequenceClassification.from_pretrained('allenai/scibert_scivocab_uncased', num_labels=2)

# If you're using a GPU, move the model to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31090, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [9]:
print(device)

cuda


## Training

In [10]:
from tqdm import tqdm
# Training loop
optimizer = AdamW(model.parameters(), lr=2e-5)

for epoch in range(3):  # Number of epochs
    for batch in tqdm(train_loader):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    print(f"Epoch {epoch+1}, Loss: {loss.item()}")

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
100%|██████████| 5/5 [00:40<00:00,  8.15s/it]


Epoch 1, Loss: 0.513701856136322


100%|██████████| 5/5 [00:43<00:00,  8.74s/it]


Epoch 2, Loss: 0.37172913551330566


100%|██████████| 5/5 [00:43<00:00,  8.66s/it]

Epoch 3, Loss: 0.27747732400894165





In [11]:
# Evaluation
model.eval()
with torch.no_grad():
    total, correct = 0, 0
    for batch in test_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        predictions = torch.argmax(outputs.logits, dim=1)
        correct += (predictions == batch['labels']).sum().item()
        total += batch['labels'].size(0)
    print(f'Test Accuracy: {correct / total:.2f}')

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Test Accuracy: 0.95


In [14]:
# classification report
from sklearn.metrics import classification_report
model.eval()
predictions = []
true_labels = []

with torch.no_grad():
    for batch in test_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        predictions.extend(torch.argmax(outputs.logits, dim=1).tolist())
        true_labels.extend(batch['labels'].tolist())

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


In [16]:
predictions

[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1]

In [15]:
true_labels

[1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1]

In [18]:
print(classification_report(true_labels, predictions))

              precision    recall  f1-score   support

           0       0.92      1.00      0.96        12
           1       1.00      0.88      0.93         8

    accuracy                           0.95        20
   macro avg       0.96      0.94      0.95        20
weighted avg       0.95      0.95      0.95        20

