## 1. Install Required Libraries

In [10]:
# Install the third-party packages this notebook imports. `%pip` (rather than
# `!pip`) installs into the environment of the running kernel; `-q` keeps the
# installer output quiet.
# NOTE(review): no versions are pinned, so a fresh run may resolve different
# releases than the ones this notebook was developed against — consider pinning.
%pip install -q tf-keras
%pip install -q transformers
%pip install -q datasets
%pip install -q nltk
%pip install -q scikit-learn
%pip install -q matplotlib

## 2. Import Required Libraries

In [11]:
import pandas as pd 
import json
from sklearn.model_selection import train_test_split
import re
from nltk.tokenize import word_tokenize
import warnings
warnings.filterwarnings('ignore')

# Suppress TensorFlow deprecation warnings
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf
tf.get_logger().setLevel('ERROR')

import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import roc_curve, roc_auc_score, classification_report

## 3. Load Dataset from JSONL Files

In [17]:
def _search_tree(root, target_parts, max_depth, skip_dirs):
    """Depth-limited search below `root` for a directory whose trailing path
    components equal `target_parts`. Returns a pathlib.Path or None."""
    import pathlib

    root_str = str(root)
    leaf = target_parts[-1]
    # os.walk does not follow symlinks and ignores unreadable directories by default.
    for current, dirnames, _files in os.walk(root_str):
        rel = os.path.relpath(current, root_str)
        depth = 0 if rel == os.curdir else rel.count(os.sep) + 1
        if depth >= max_depth:
            dirnames[:] = []  # do not descend any further
            continue

        dirnames.sort()  # deterministic result when several folders match
        for name in dirnames:
            if name != leaf:
                continue
            candidate = pathlib.Path(current, name)
            if candidate.parts[-len(target_parts):] == target_parts:
                return candidate

        if depth + 1 >= max_depth:
            dirnames[:] = []
        else:
            # Prune pseudo-filesystems and subtrees that were already searched.
            dirnames[:] = [d for d in dirnames if os.path.join(current, d) not in skip_dirs]
    return None


def get_gdrive_dataset_dir(subpath='dataset', max_depth=8):
    """Locate the dataset folder, independent of working directory.

    Strategy (in order):
    1. Try to mount Google Drive (Colab only) and look for MyDrive/<subpath>.
    2. Check the GDRIVE_DATASET_PATH environment variable.
    3. Check <root>/<subpath> directly for the working directory, the home
       directory, /tmp and up to five ancestors of each.
    4. Walk each of those roots (working directory first) to at most
       `max_depth` levels, looking for a directory whose trailing path
       components match `subpath`.

    Args:
        subpath: Folder name (or relative path such as 'ml/dataset') to find.
        max_depth: Maximum number of path components below a search root that
            a match may have. 0 disables the tree walk.

    Returns:
        The dataset directory as a string.

    Raises:
        RuntimeError: if no matching directory is found.
    """
    import pathlib

    # 1) Try Colab mount if available
    try:
        from google.colab import drive  # type: ignore
    except ImportError:
        drive = None  # not running on Colab
    if drive is not None:
        try:
            drive.mount('/content/drive', force_remount=False)
            candidate = pathlib.Path('/content/drive/MyDrive') / subpath
            if candidate.exists():
                print(f"Using dataset folder (Colab): {candidate}")
                return str(candidate)
            print(f"Colab mounted but {candidate} not found, continuing to local search.")
        except Exception as e:
            print(f"Colab drive mount failed: {e}. Continuing to local search.")

    # 2) Environment variable
    env_path = os.environ.get('GDRIVE_DATASET_PATH')
    if env_path:
        p = pathlib.Path(env_path)
        if p.exists():
            print(f"Using dataset folder (env): {p}")
            return str(p.resolve())
        print(f"GDRIVE_DATASET_PATH set but does not exist: {env_path}")

    # 3) Build an ordered, de-duplicated list of search roots. The working
    #    directory comes first so a project-local folder wins over unrelated
    #    folders of the same name elsewhere on the machine.
    bases = [pathlib.Path.cwd(), pathlib.Path.home(), pathlib.Path('/tmp')]
    search_roots = []
    for base in bases:
        if base not in search_roots:
            search_roots.append(base)
    for base in bases:
        current = base
        for _ in range(5):
            if current.parent == current:  # Reached filesystem root
                break
            current = current.parent
            if current not in search_roots:
                search_roots.append(current)

    target_parts = pathlib.PurePath(subpath).parts
    if target_parts:
        # Cheap direct check before any tree walk.
        for root in search_roots:
            candidate = root / subpath
            try:
                if candidate.is_dir():
                    print(f"Found dataset folder: {candidate}")
                    return str(candidate.resolve())
            except OSError:
                continue

        # 4) Bounded-depth walk of every root.
        skip_dirs = {'/proc', '/sys', '/dev'}
        for root in search_roots:
            found = _search_tree(root, target_parts, max_depth, skip_dirs)
            if found is not None:
                print(f"Found dataset folder: {found}")
                return str(found.resolve())
            # An ancestor's walk reaches less deep into this subtree than we just did.
            skip_dirs.add(str(root))

    # Fallback: print what we searched and raise
    print("\nDataset resolution diagnostics:")
    print(f"Current working directory: {os.getcwd()}")
    print(f"Home directory: {pathlib.Path.home()}")
    print(f"Searched roots: {len(search_roots)}")
    for root in sorted(search_roots)[:10]:
        print(f" - {root}")

    raise RuntimeError(
        "Could not mount Google Drive or locate dataset folder. If running locally, "
        "set the environment variable GDRIVE_DATASET_PATH to the absolute path of your dataset folder, "
        "e.g., GDRIVE_DATASET_PATH=/Users/yourname/path/to/ml/dataset"
    )

def load_jsonl(filepath):
    """Load records from a JSON Lines file.

    Args:
        filepath: Path to a UTF-8 encoded file with one JSON value per line.

    Returns:
        A list with the decoded value of every parseable line, in file order.
        Blank lines are ignored; malformed lines are skipped and their count
        is reported on stdout so silent data loss is visible.
    """
    records = []
    skipped = 0
    # 'utf-8-sig' strips a leading byte-order mark; with plain 'utf-8' the BOM
    # stays glued to the first line, which then fails to parse and is dropped.
    with open(filepath, 'r', encoding='utf-8-sig') as handle:
        for raw_line in handle:
            line = raw_line.strip()
            if not line:
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError:
                skipped += 1
    if skipped:
        print(f"Warning: skipped {skipped} malformed line(s) in {filepath}")
    return records

# Find the dataset folder first (on Colab this attempts a Drive mount)
dataset_dir = get_gdrive_dataset_dir('dataset')

# Read the three splits, in train / val / test order, from that folder
train_data, val_data, test_data = (
    load_jsonl(os.path.join(dataset_dir, split_file))
    for split_file in ('train.jsonl', 'val.jsonl', 'test.jsonl')
)

print(f"Dataset directory: {dataset_dir}")
print(f"Training set size: {len(train_data)}")
print(f"Validation set size: {len(val_data)}")
print(f"Testing set size: {len(test_data)}")


Colab drive mount failed: mount failed. Continuing to local search.

Dataset resolution diagnostics:
Current working directory: /content
Home directory: /root
Searched roots: 4
 - /
 - /content
 - /root
 - /tmp

Dataset resolution diagnostics:
Current working directory: /content
Home directory: /root
Searched roots: 4
 - /
 - /content
 - /root
 - /tmp


RuntimeError: Could not mount Google Drive or locate dataset folder. If running locally, set the environment variable GDRIVE_DATASET_PATH to the absolute path of your dataset folder, e.g., GDRIVE_DATASET_PATH=/Users/yourname/path/to/ml/dataset

## 4. Convert JSONL to DataFrame and Preprocess

In [None]:
# One DataFrame per split, built from the parsed JSONL records
train_df, val_df, test_df = (
    pd.DataFrame(records) for records in (train_data, val_data, test_data)
)

# Peek at the first rows and the class balance of the training split
print("Sample training data:")
print(train_df[['text', 'label']].head())
print(f"\nLabel value counts (Train):")
print(train_df['label'].value_counts())

Sample training data:
                                                text  label
0  මත්ද්‍රව්‍ය ජාවාරමකට සම්බන්ධ පුද්ගලයෙකු පොලිස්...     AI
1  ශ්‍රී ලංකාවේ නව කැබිනට් මණ්ඩලයේ සංශෝධනය පිළිබඳ...     AI
2  2012 පෙබරවාරි මාසයේ වැල්ලම්පිටියේ දී යුද හමුදා...  HUMAN
3  (මනෝප්‍රිය ගුණසේකර)කතෝලික දේවස්ථාන හා තරුපහේ හ...  HUMAN
4  මැතිවරණ කොමිසම වෙත මැතිවරණ ආශ්‍රිත පැමිණිලි 45...     AI

Label value counts (Train):
label
HUMAN    39848
AI       32516
Name: count, dtype: int64


## 5. Map Labels to Numeric Values

In [None]:
# String label -> class index, plus the inverse lookup
label_mapping = {'HUMAN': 0, 'AI': 1}
reverse_mapping = dict(zip(label_mapping.values(), label_mapping.keys()))

# Add the numeric label column to every split (labels missing from the mapping become NaN)
train_df['label_encoded'], val_df['label_encoded'], test_df['label_encoded'] = (
    frame['label'].map(label_mapping) for frame in (train_df, val_df, test_df)
)

# Report how many rows per split could not be mapped
print(f"Train - Unmapped labels: {train_df['label_encoded'].isna().sum()}")
print(f"Val - Unmapped labels: {val_df['label_encoded'].isna().sum()}")
print(f"Test - Unmapped labels: {test_df['label_encoded'].isna().sum()}")

Train - Unmapped labels: 0
Val - Unmapped labels: 0
Test - Unmapped labels: 0


## 6. Text Preprocessing

In [None]:
def expand_contractions(text):
    """Normalise a text sample before tokenization.

    Steps:
    1. Missing values (None / float NaN, e.g. from records without a 'text'
       field) become the empty string; other non-string values are converted
       with str().
    2. The colloquial form "ඔයා" is replaced by the formal "ඔබ". This is a
       plain substring replacement, so it also applies inside longer words.
    3. Code points U+0981–U+0983 are removed.
    4. Runs of whitespace collapse to a single space and the result is stripped.

    Args:
        text: The raw text (str), or a missing value.

    Returns:
        The normalised string.
    """
    if text is None or (isinstance(text, float) and text != text):  # NaN != NaN
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Only real substitutions are listed; identity entries ("මට", "තමා", "දෙයි"
    # mapped to themselves) were no-ops and have been dropped.
    replacements = {
        "ඔයා": "ඔබ",  # colloquial to formal "you"
    }
    for colloquial, formal in replacements.items():
        text = text.replace(colloquial, formal)

    # NOTE(review): U+0981–U+0983 lie in the Unicode Bengali block, not the
    # Sinhala block (U+0D80–U+0DFF), so this is effectively a no-op on Sinhala
    # text. The range is kept as-is on purpose: switching it to the Sinhala
    # signs (U+0D82/U+0D83) would strip meaningful characters — confirm intent.
    text = re.sub(r'[\u0981-\u0983]', '', text)

    # Normalise whitespace last so removing a character cannot leave double spaces.
    return re.sub(r'\s+', ' ', text).strip()

# Run expand_contractions over the raw text of each split and keep the result
# in a new 'expanded_text' column
print("Preprocessing training data...")
train_df['expanded_text'] = train_df['text'].map(expand_contractions)

print("Preprocessing validation data...")
val_df['expanded_text'] = val_df['text'].map(expand_contractions)

print("Preprocessing test data...")
test_df['expanded_text'] = test_df['text'].map(expand_contractions)

print("Preprocessing complete!")


Preprocessing training data...
Preprocessing validation data...
Preprocessing validation data...
Preprocessing test data...
Preprocessing test data...
Preprocessing complete!
Preprocessing complete!


## 7. Load BERT Tokenizer

In [None]:
# Load the tokenizer that matches the MobileBERT checkpoint used for the model.
# NOTE(review): 'google/mobilebert-uncased' appears to be an English, uncased
# checkpoint rather than a multilingual one — confirm that its vocabulary can
# represent the Sinhala text in this dataset (the probe below measures that).
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('google/mobilebert-uncased')
print("MobileBERT tokenizer loaded successfully!")

# Vocabulary-coverage probe: tokenize a small sample of the training text and
# measure how many tokens fall back to the unknown token. A high rate means the
# model sees almost no information from the text.
_PROBE_ROWS = 200
_UNK_WARN_RATE = 0.2
_probe_ids = tokenizer(
    train_df['expanded_text'].head(_PROBE_ROWS).tolist(),
    add_special_tokens=False,
    truncation=True,
    max_length=512,
)['input_ids']
_probe_total = sum(len(ids) for ids in _probe_ids)
_probe_unk = sum(ids.count(tokenizer.unk_token_id) for ids in _probe_ids)
_unk_rate = _probe_unk / _probe_total if _probe_total else 0.0
print(f"Unknown-token rate on {len(_probe_ids)} training samples: {_unk_rate:.1%}")
if _unk_rate > _UNK_WARN_RATE:
    print(
        "WARNING: a large share of tokens map to the unknown token; this tokenizer's "
        "vocabulary does not cover the dataset's script well. Consider a checkpoint "
        "trained on this language (tokenizer and model must be changed together)."
    )

MobileBERT tokenizer loaded successfully!


## 8. Tokenize and Encode Text Data

In [None]:
# Tokenize and encode the text data using tokenizer in batches to avoid memory spikes
print("Tokenizing data in batches...")

def tokenize_in_batches(texts, tokenizer, batch_size=32, max_length=512):
    """Tokenize a sequence of texts in small batches into fixed-width arrays.

    Args:
        texts: Sequence of strings (supports len() and slicing).
        tokenizer: Callable with the Hugging Face tokenizer call signature
            (padding, truncation, max_length, return_tensors keywords) that
            returns a mapping with 'input_ids' and 'attention_mask'.
        batch_size: Number of texts passed to the tokenizer per call (>= 1).
        max_length: Width every row is padded (with 0) or truncated to.

    Returns:
        Dict with 'input_ids' and 'attention_mask', each an int32 numpy array
        of shape (len(texts), max_length). Empty input yields (0, max_length).

    Raises:
        ValueError: if batch_size is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    n_texts = len(texts)
    # Pre-allocate the outputs once: avoids holding a list of per-batch parts
    # plus their concatenated copy, and makes empty input well-defined.
    input_ids = np.zeros((n_texts, max_length), dtype=np.int32)
    attention_mask = np.zeros((n_texts, max_length), dtype=np.int32)

    for start in range(0, n_texts, batch_size):
        batch = list(texts[start:start + batch_size])
        enc = tokenizer(
            batch,
            padding='max_length',   # pad each batch to the fixed max_length
            truncation=True,
            max_length=max_length,
            return_tensors='np'     # numpy directly; no TF tensor round-trip
        )
        ids = np.asarray(enc['input_ids'])
        mask = np.asarray(enc['attention_mask'])

        # Defensive width handling: copy at most max_length columns (truncate)
        # and leave any missing columns at their zero padding.
        width = min(ids.shape[1], max_length)
        stop = start + ids.shape[0]
        input_ids[start:stop, :width] = ids[:, :width]
        attention_mask[start:stop, :width] = mask[:, :width]

    return {'input_ids': input_ids, 'attention_mask': attention_mask}

# Encode the preprocessed text of each split, 32 texts per tokenizer call
train_encodings, val_encodings, test_encodings = (
    tokenize_in_batches(frame['expanded_text'].tolist(), tokenizer, batch_size=32)
    for frame in (train_df, val_df, test_df)
)

print("Tokenization complete!")
print(f"Training encodings shape: {train_encodings['input_ids'].shape}")

Tokenizing data in batches...


: 

## 9. Prepare Labels

In [None]:
# Model inputs: one dict per split carrying both the token ids and the
# attention mask, so the model receives both
train_inputs, val_inputs, test_inputs = (
    {key: encodings[key] for key in ('input_ids', 'attention_mask')}
    for encodings in (train_encodings, val_encodings, test_encodings)
)

# Targets: the integer-encoded label column of each split as a numpy array
train_labels, val_labels, test_labels = (
    np.array(frame['label_encoded'].astype(int).tolist())
    for frame in (train_df, val_df, test_df)
)

print(f"Train labels shape: {train_labels.shape}")
print(f"Val labels shape: {val_labels.shape}")
print(f"Test labels shape: {test_labels.shape}")
print(f"\nLabel distribution (Train): {np.bincount(train_labels)}")
print(f"Label distribution (Val): {np.bincount(val_labels)}")
print(f"Label distribution (Test): {np.bincount(test_labels)}")

Train labels shape: (72364,)
Val labels shape: (9045,)
Test labels shape: (9048,)

Label distribution (Train): [39848 32516]
Label distribution (Val): [4981 4064]
Label distribution (Test): [4982 4066]


## 10. Define BERT Classification Model

In [None]:
# Sequence-classification model built from the same MobileBERT checkpoint as
# the tokenizer, with a two-way head: index 0 = HUMAN, index 1 = AI.
# NOTE(review): the checkpoint name suggests an English uncased model — confirm
# it is appropriate for the language of the dataset.
from transformers import TFAutoModelForSequenceClassification

model = TFAutoModelForSequenceClassification.from_pretrained('google/mobilebert-uncased', num_labels=2)

print("MobileBERT model loaded successfully!")

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


OSError: The paging file is too small for this operation to complete. (os error 1455)

## 11. Compile the Model

In [None]:
# Training configuration: the model outputs raw logits, so the loss is told to
# apply softmax itself (from_logits=True); labels are integer class ids.
# NOTE(review): on TensorFlow releases where tf.keras is Keras 3, transformers'
# TF models may require the legacy tf-keras objects — confirm the optimizer and
# loss classes match the Keras flavour the model was built with.
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)

model.compile(loss=loss, optimizer=optimizer, metrics=[metric])

print("Model compiled successfully!")
print("\nModel Summary:")
model.summary()

## 12. Train the BERT Model

In [None]:
# Fine-tune on the training split and track metrics on the validation split
# after every epoch; the returned History object feeds the plots further down.
print("Training the model...")
history = model.fit(
    x=train_inputs,
    y=train_labels,
    validation_data=(val_inputs, val_labels),
    batch_size=16,
    epochs=3,  # kept small because the training set is large
    verbose=1,
)

print("\nTraining completed!")

## 13. Save the Trained Model

In [None]:
# Persist the fine-tuned weights and config
model.save_pretrained(save_directory="models/bert_multilingual/")
print("Model saved to models/bert_multilingual/")

# Store the tokenizer files in the same folder so the directory is self-contained
tokenizer.save_pretrained(save_directory="models/bert_multilingual/")
print("Tokenizer saved to models/bert_multilingual/")

## 14. Evaluate on Validation Set

In [None]:
# Loss and accuracy on the validation split (progress bar suppressed)
val_loss, val_accuracy = model.evaluate(x=val_inputs, y=val_labels, verbose=0)
print(f'Validation Loss: {val_loss:.4f}')
print(f'Validation Accuracy: {val_accuracy:.4f}')

## 15. Evaluate on Test Set

In [None]:
# Loss and accuracy on the held-out test split (progress bar suppressed)
test_loss, test_accuracy = model.evaluate(x=test_inputs, y=test_labels, verbose=0)
print(f'Test Loss: {test_loss:.4f}')
print(f'Test Accuracy: {test_accuracy:.4f}')

## 16. Plot Training and Validation Accuracy

In [None]:
# Plot per-epoch accuracy and loss for the training and validation splits,
# side by side, and save the figure under results/.
os.makedirs('results', exist_ok=True)  # savefig does not create missing folders

fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(12, 4))

ax_acc.plot(history.history['accuracy'], label='Training Accuracy', linewidth=2)
ax_acc.plot(history.history['val_accuracy'], label='Validation Accuracy', linewidth=2)
ax_acc.set_title('Training and Validation Accuracy', fontsize=12, fontweight='bold')
ax_acc.set_xlabel('Epoch')
ax_acc.set_ylabel('Accuracy')
ax_acc.legend()
ax_acc.grid(True, alpha=0.3)

ax_loss.plot(history.history['loss'], label='Training Loss', linewidth=2)
ax_loss.plot(history.history['val_loss'], label='Validation Loss', linewidth=2)
ax_loss.set_title('Training and Validation Loss', fontsize=12, fontweight='bold')
ax_loss.set_xlabel('Epoch')
ax_loss.set_ylabel('Loss')
ax_loss.legend()
ax_loss.grid(True, alpha=0.3)

fig.tight_layout()
fig.savefig('results/training_history.png', dpi=100, bbox_inches='tight')
plt.show()

print("Training history plot saved!")

## 17. Generate Predictions and Confusion Matrix

In [None]:
# Predict labels for the test set: the class with the larger logit wins
predictions = model.predict(test_inputs)
predicted_labels = np.argmax(predictions.logits, axis=1)

# Confusion matrix; labels are fixed so the matrix is always 2x2 in
# HUMAN (0), AI (1) order, even if one class is never predicted
cm = confusion_matrix(test_labels, predicted_labels, labels=[0, 1])

# Draw on an explicitly created axes: ConfusionMatrixDisplay.plot() opens its
# own figure when no `ax` is given, which left the plt.figure(...) canvas empty
# and ignored the requested size.
os.makedirs('results', exist_ok=True)  # savefig does not create missing folders
fig, ax = plt.subplots(figsize=(8, 6))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['HUMAN', 'AI'])
disp.plot(ax=ax, cmap=plt.cm.Blues, values_format='d')
ax.set_title('Confusion Matrix - BERT Text Classification', fontsize=12, fontweight='bold')
fig.tight_layout()
fig.savefig('results/confusion_matrix.png', dpi=100, bbox_inches='tight')
plt.show()

print("Confusion matrix saved!")

## 18. Calculate ROC Curve and AUC Score

In [None]:
# Probability of the positive class (AI, index 1) from the test-set logits,
# converted to a plain numpy array before it is handed to scikit-learn
probabilities = tf.nn.softmax(predictions.logits, axis=-1).numpy()[:, 1]

# Area under the ROC curve and the curve itself
auc_score = roc_auc_score(test_labels, probabilities)
fpr, tpr, thresholds = roc_curve(test_labels, probabilities)

# Plot the ROC curve against the chance diagonal and save it under results/
os.makedirs('results', exist_ok=True)  # savefig does not create missing folders
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, label=f'AUC = {auc_score:.4f}', linewidth=2, color='blue')
ax.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Random Classifier')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve - BERT Text Classification', fontsize=12, fontweight='bold')
ax.legend(loc='lower right')
ax.grid(True, alpha=0.3)
fig.tight_layout()
fig.savefig('results/roc_curve.png', dpi=100, bbox_inches='tight')
plt.show()

print(f"AUC Score: {auc_score:.4f}")

## 19. Classification Report

In [None]:
# Per-class precision / recall / F1 on the test split, four decimals
report = classification_report(
    y_true=test_labels,
    y_pred=predicted_labels,
    target_names=['HUMAN', 'AI'],
    digits=4,
)

# Framed printout of the report
print(
    "\n" + "="*60,
    "CLASSIFICATION REPORT - TEST SET",
    "="*60,
    report,
    "="*60,
    sep="\n",
)

## 20. Summary of Model Performance

In [None]:
# Collect the headline test metrics (formatted to four decimals) in a small table
summary_data = {
    'Metric': ['Test Accuracy', 'Test Loss', 'AUC Score'],
    'Value': [f'{test_accuracy:.4f}', f'{test_loss:.4f}', f'{auc_score:.4f}']
}

summary_df = pd.DataFrame(summary_data)

print("\n" + "="*50)
print("MODEL PERFORMANCE SUMMARY")
print("="*50)
print(summary_df.to_string(index=False))
print("="*50)

# Save summary; to_csv does not create missing folders, so make sure it exists
os.makedirs('results', exist_ok=True)
summary_df.to_csv('results/model_performance_summary.csv', index=False)
print("\nPerformance summary saved to results/model_performance_summary.csv")