In [None]:
!pip install datasets

In [4]:
# Install first if needed
# pip install transformers datasets scikit-learn

import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch

# 1. Load your dataset
dataset_path = '/content/drive/MyDrive/plagiarismTrain_dataset.csv'
df = pd.read_csv(dataset_path)

# 2. Train-test split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(),
    df['label'].tolist(),
    test_size=0.2,
    random_state=42,
    stratify=df['label']
)

# 3. Create Huggingface datasets
train_dataset = Dataset.from_dict({'text': train_texts, 'label': train_labels})
val_dataset = Dataset.from_dict({'text': val_texts, 'label': val_labels})
print(len(train_dataset))
print(len(val_dataset))


13131
3283


In [5]:
label_counts = df['label'].value_counts()
print("Class Distribution:")
print(label_counts)

# Calculate percentage for better understanding
label_percentages = label_counts / len(df) * 100
print("\nClass Percentages:")
print(label_percentages)

# Simple check for balance (e.g., if any class has less than 40%)
is_balanced = all(percentage >= 40 for percentage in label_percentages)
print(f"\nIs the dataset balanced? {is_balanced}")


Class Distribution:
label
1    8414
0    8000
Name: count, dtype: int64

Class Percentages:
label
1    51.261119
0    48.738881
Name: count, dtype: float64

Is the dataset balanced? True


In [6]:
# 4. Load pretrained tokenizer
model_checkpoint = "prajjwal1/bert-tiny"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# 5. Tokenization function
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=128)

train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# 6. Load pretrained model for classification
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/285 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Map:   0%|          | 0/13131 [00:00<?, ? examples/s]

Map:   0%|          | 0/3283 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/17.8M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",  # Perform evaluation at the end of each epoch
    save_strategy="epoch",  # Save checkpoints at the end of each epoch
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.3,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,  # Loads the best model at the end
    metric_for_best_model="accuracy",
    report_to="none"  # Disable reporting to any tracking tool (like wandb)
)
# 8. Define evaluation metric
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    preds = pred.predictions.argmax(-1)
    labels = pred.label_ids
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

# 9. Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

# 10. Train the model
trainer.train()

# 11. Evaluate on validation set
eval_results = trainer.evaluate()
print("📊 Evaluation Results:", eval_results)


  trainer = Trainer(


model.safetensors:   0%|          | 0.00/17.7M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2914,0.334676,0.864758,0.798554,0.984551,0.881852
2,0.2356,0.233142,0.913189,0.863684,0.986334,0.920943


📊 Evaluation Results: {'eval_loss': 0.23314225673675537, 'eval_accuracy': 0.9131891562595187, 'eval_precision': 0.8636836628511967, 'eval_recall': 0.9863339275103981, 'eval_f1': 0.9209431345353676, 'eval_runtime': 14.1089, 'eval_samples_per_second': 232.69, 'eval_steps_per_second': 29.13, 'epoch': 2.0}


In [42]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import nltk

# Fix for NLTK
nltk.download('punkt')
nltk.download('wordnet')

# 1. Load your dataset
dataset_path = '/content/drive/MyDrive/plagiarismTrain_dataset.csv'
df = pd.read_csv(dataset_path)

# 2. Implement simple but effective data augmentation
def simple_augment(text, p=0.15):
    """Simple augmentation: randomly delete words, swap word order"""
    words = text.split()
    if len(words) <= 3:  # Skip very short texts
        return text

    # Randomly delete some words (with probability p)
    if random.random() < 0.5 and len(words) > 5:
        words = [w for w in words if random.random() > p]

    # Randomly swap some adjacent words
    if len(words) > 3 and random.random() < 0.5:
        for i in range(len(words) - 2):
            if random.random() < p:
                words[i], words[i+1] = words[i+1], words[i]

    return ' '.join(words)

# 3. Train-test split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(),
    df['label'].tolist(),
    test_size=0.2,
    random_state=42,
    stratify=df['label']
)

# 4. Apply data augmentation to create additional training examples
augmented_train_texts = [simple_augment(text) for text in train_texts]
print(f"Original training examples: {len(train_texts)}")
print(f"After augmentation: {len(train_texts) + len(augmented_train_texts)}")

# 5. Combine original and augmented data
combined_train_texts = train_texts + augmented_train_texts
combined_train_labels = train_labels + train_labels

# 6. Feature extraction using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(combined_train_texts)
X_val = vectorizer.transform(val_texts)

# 7. Train Random Forest classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, combined_train_labels)

# 8. Evaluate the model
val_predictions = rf_model.predict(X_val)
accuracy = accuracy_score(val_labels, val_predictions)
precision, recall, f1, _ = precision_recall_fscore_support(val_labels, val_predictions, average='binary')

print("\n📊 Evaluation Results:")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

# 9. Save the model and vectorizer
import joblib
model_save_path = "./plagiarism_model_rf.pkl"
vectorizer_save_path = "./tfidf_vectorizer.pkl"

joblib.dump(rf_model, model_save_path)
joblib.dump(vectorizer, vectorizer_save_path)
print(f"Model saved to {model_save_path}")
print(f"Vectorizer saved to {vectorizer_save_path}")

# Optional: Test inference on a few examples
test_texts = val_texts[:5]  # Take a few examples from validation set
X_test = vectorizer.transform(test_texts)
test_predictions = rf_model.predict(X_test)

print("\nSample predictions:")
for text, pred in zip(test_texts, test_predictions):
    print(f"Text (truncated): {text[:50]}...")
    print(f"Prediction: {'Plagiarism' if pred == 1 else 'Not Plagiarism'}\n")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Original training examples: 13131
After augmentation: 26262

📊 Evaluation Results:
Accuracy: 0.9823332318001827
Precision: 0.9765395894428153
Recall: 0.9893048128342246
F1 Score: 0.9828807556080283
Model saved to ./plagiarism_model_rf.pkl
Vectorizer saved to ./tfidf_vectorizer.pkl

Sample predictions:
Text (truncated): A few antiquarians, for example, Frank Barlow and ...
Prediction: Plagiarism

Text (truncated): Cardus died on 28 February 1975 at the Nuffield Cl...
Prediction: Not Plagiarism

Text (truncated): Boult was conceived in Chester, Cheshire, in North...
Prediction: Plagiarism

Text (truncated): Analysts loathed the amusement's control plot, man...
Prediction: Plagiarism

Text (truncated): Constant attacks and rumor-spreading amplified the...
Prediction: Not Plagiarism

