In [None]:
import torch
import gc

# Run the garbage collector first: tensors that are only kept alive by
# unreachable reference cycles still occupy their CUDA blocks until they are
# collected, and empty_cache() can only release blocks that are unoccupied.
gc.collect()

# Now release the unoccupied cached blocks held by PyTorch's CUDA allocator.
torch.cuda.empty_cache()


In [3]:
# 📦 Imports
import pandas as pd
import numpy as np
import torch
from datasets import Dataset, DatasetDict
from transformers import LongformerTokenizerFast, LongformerForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# 💻 Pick the compute device: CUDA when torch reports it as available, CPU otherwise
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")

Using device: cuda


In [5]:
# Load the raw CSV. The preview printed by this cell shows two columns:
# `text` and a float `generated` flag (0.0 / 1.0).
df = pd.read_csv("ai-vs-human-text/AI_Human.csv")
print(f"Dataset loaded with {len(df)} samples")
print(f"Sample data:\n{df.head()}")

# Trainer drops every column whose name is not an argument of the model's
# forward(), so a target called `generated` would never reach the model and no
# loss would be computed. The sequence-classification head also needs integer
# labels to select cross-entropy for num_labels=2. Build a separate frame
# (leaving `df` untouched) in which `generated` is replaced by an integer
# `labels` column; astype(int) fails loudly if the column has missing values.
labeled_df = df.assign(labels=df["generated"].astype(int)).drop(columns=["generated"])

dataset = Dataset.from_pandas(labeled_df)

# 80% train; the remaining 20% is split in half into validation and test.
# Both splits use a fixed seed so the partition is reproducible.
train_testval = dataset.train_test_split(test_size=0.2, seed=42)
test_val = train_testval['test'].train_test_split(test_size=0.5, seed=42)

dataset_dict = DatasetDict({
    'train': train_testval['train'],
    'validation': test_val['train'],
    'test': test_val['test']
})

print(f"Train set: {len(dataset_dict['train'])}")
print(f"Validation set: {len(dataset_dict['validation'])}")
print(f"Test set: {len(dataset_dict['test'])}")


Dataset loaded with 487235 samples
Sample data:
                                                text  generated
0  Cars. Cars have been around since they became ...        0.0
1  Transportation is a large necessity in most co...        0.0
2  "America's love affair with it's vehicles seem...        0.0
3  How often do you ride in a car? Do you drive a...        0.0
4  Cars are a wonderful thing. They are perhaps o...        0.0
Train set: 389788
Validation set: 48723
Test set: 48724


In [6]:
# Load the fast Longformer tokenizer from the `allenai/longformer-base-4096` checkpoint.
tokenizer_checkpoint = "allenai/longformer-base-4096"
tokenizer = LongformerTokenizerFast.from_pretrained(tokenizer_checkpoint)
print("Tokenizer loaded successfully")


Tokenizer loaded successfully


In [7]:
def tokenize_function(examples):
    """Tokenize the ``text`` entry of a batch with the module-level ``tokenizer``.

    Sequences are truncated to, and padded up to, a fixed length of 1024 tokens.

    Args:
        examples: Mapping with a ``"text"`` key (the batch passed in by
            ``Dataset.map(..., batched=True)``).

    Returns:
        Whatever the tokenizer call returns for that batch.
    """
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 1024,
    }
    return tokenizer(examples["text"], **encoding_options)

# Tokenize every split in batches; the raw `text` column is dropped from the result.
tokenized_datasets = dataset_dict.map(tokenize_function, batched=True, remove_columns=["text"])

# Make the splits return torch tensors when indexed.
tokenized_datasets.set_format(type="torch")
print("Dataset tokenized successfully")


Map: 100%|██████████| 389788/389788 [02:39<00:00, 2436.87 examples/s]
Map: 100%|██████████| 48723/48723 [00:18<00:00, 2566.18 examples/s]
Map: 100%|██████████| 48724/48724 [00:20<00:00, 2400.76 examples/s]

Dataset tokenized successfully





In [None]:
# Load the pretrained Longformer encoder with a 2-class classification head.
# (The head weights are newly initialized, as the load warning reports.)
model = LongformerForSequenceClassification.from_pretrained(
    "allenai/longformer-base-4096", num_labels=2
)

# Freeze encoder layers 0-5 so their parameters receive no gradient updates;
# layers from index 6 onward stay trainable.
for layer_idx, encoder_layer in enumerate(model.longformer.encoder.layer):
    if layer_idx >= 6:
        continue
    # Module.requires_grad_ flips the flag on every parameter of the layer.
    encoder_layer.requires_grad_(False)
    print(f"Layer {layer_idx} frozen")

model.to(device)
print("Model loaded and configured successfully")


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Layer 0 frozen
Layer 1 frozen
Layer 2 frozen
Layer 3 frozen
Layer 4 frozen
Layer 5 frozen
Model loaded and configured successfully


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [9]:

def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for an evaluation pass.

    Args:
        eval_pred: A pair that unpacks into ``(predictions, labels)``. The
            predicted class is the argmax along axis 1, so ``predictions`` is
            presumably shaped (n_samples, n_classes) — confirm against the caller.

    Returns:
        dict with keys ``"accuracy"`` and ``"f1"`` (F1 uses ``average='weighted'``).
    """
    scores, labels = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return {
        "accuracy": accuracy_score(labels, predicted_classes),
        "f1": f1_score(labels, predicted_classes, average='weighted'),
    }


In [12]:
# `evaluation_strategy` is the deprecated spelling of `eval_strategy`. Use
# whichever field the installed TrainingArguments dataclass declares, so the
# cell works on both older and newer transformers releases.
eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

# NOTE(review): if this cell raises the `accelerate>=0.26.0` ImportError even
# though accelerate is installed, the package was presumably installed after
# the kernel started — restart the kernel and run all cells again.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    # Evaluate and checkpoint once per epoch; the two strategies must match
    # for load_best_model_at_end to work.
    save_strategy="epoch",
    load_best_model_at_end=True,
    # The best checkpoint is chosen by the "f1" value from compute_metrics.
    metric_for_best_model="f1",
    push_to_hub=False,
    report_to="none",
    **{eval_strategy_key: "epoch"},
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
)

print("Trainer setup complete")




ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>=0.26.0'`

In [13]:
!pip list

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Package                  Version
------------------------ -----------
accelerate               1.6.0
aiohappyeyeballs         2.6.1
aiohttp                  3.11.16
aiosignal                1.3.2
asttokens                3.0.0
attrs                    25.3.0
bleach                   6.2.0
certifi                  2025.1.31
charset-normalizer       3.4.1
comm                     0.2.2
datasets                 3.5.0
debugpy                  1.8.13
decorator                5.2.1
dill                     0.3.8
executing                2.2.0
filelock                 3.18.0
frozenlist               1.5.0
fsspec                   2024.12.0
huggingface-hub          0.30.1
idna                     3.10
ipykernel                6.29.5
ipython                  9.0.2
ipython_pygments_lexers  1.1.1
jedi                     0.19.2
Jinja2                   3.1.6
joblib                   1.4.2
jupyter_client           8.6.3
jupyter_core             5.7.2
kaggle                   1.7.4.2
MarkupSafe    

In [None]:
# Fine-tune with the Trainer configured in the earlier cell.
print("Starting training...")
trainer.train()

# Score the held-out test split, which was not used for training or validation.
print("Evaluating on test set...")
test_results = trainer.evaluate(eval_dataset=tokenized_datasets["test"])
print(f"Test results: {test_results}")


In [None]:
# Write the model and the tokenizer into the same directory.
final_model_dir = "./final_model"
model.save_pretrained(final_model_dir)
tokenizer.save_pretrained(final_model_dir)
print("Model and tokenizer saved successfully")
