In [1]:
# Environment setup (optional). Uncomment to install heavy optional deps for transformer fine-tuning.
!pip install -q transformers datasets torch sentencepiece pandas



In [1]:
import os
import re
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib

In [13]:
# Load JSONL files from dataset path (file or directory)
# Set `data_path` to a single JSONL file (e.g. './dataset/all_data.jsonl')
# or to a directory containing multiple .jsonl files (e.g. './dataset')
data_path = './dataset'
dfs = []
if os.path.isfile(data_path):
    try:
        df = pd.read_json(data_path, lines=True)
        dfs.append(df)
        print(f'Loaded {len(df)} rows from {data_path}')
    except Exception as e:
        print('Failed to read', data_path, e)
elif os.path.isdir(data_path):
    for fn in os.listdir(data_path):
        if fn.endswith('.jsonl'):
            path = os.path.join(data_path, fn)
            try:
                df = pd.read_json(path, lines=True)
                dfs.append(df)
                print(f'Loaded {len(df)} rows from {fn}')
            except Exception as e:
                print('Failed to read', path, e)
else:
    raise FileNotFoundError(f'No file or directory found at {data_path}')
if not dfs:
    raise FileNotFoundError('No .jsonl files found in dataset path')
data = pd.concat(dfs, ignore_index=True)
# Keep only necessary columns (text, label)
if 'text' not in data.columns or 'label' not in data.columns:
    raise ValueError('Expected columns `text` and `label` in the JSONL records')
data = data[['text', 'label']].dropna().reset_index(drop=True)
data['label'] = data['label'].astype(str)
data.head(3)

Loaded 90457 rows from all_data.jsonl
Loaded 9048 rows from test.jsonl
Loaded 9048 rows from test.jsonl
Loaded 11796 rows from train.jsonl
Loaded 11796 rows from train.jsonl
Loaded 9045 rows from val.jsonl
Loaded 9045 rows from val.jsonl


Unnamed: 0,text,label
0,මත්ද්‍රව්‍ය ජාවාරමකට සම්බන්ධ පුද්ගලයෙකු පොලිස්...,AI
1,ශ්‍රී ලංකාවේ නව කැබිනට් මණ්ඩලයේ සංශෝධනය පිළිබඳ...,AI
2,2012 පෙබරවාරි මාසයේ වැල්ලම්පිටියේ දී යුද හමුදා...,HUMAN


In [14]:
# Quick class distribution check
print(data['label'].value_counts())
print('Total samples:', len(data))

HUMAN    66347
AI       53999
Name: label, dtype: int64
Total samples: 120346


In [17]:
# Basic preprocessing function for Sinhala (remove URLs, normalize whitespace).
# Keep it light — more advanced tokenization can be plugged in as needed.
SINHALA_RANGE = '\u0D80-\u0DFF'
url_re = re.compile(r'https?://\S+|www\.\S+')
def preprocess(text):
    if not isinstance(text, str):
        return ''
    # remove zero-width joiner/control characters often present in copied text
    text = text.replace('\u200d', '')
    text = url_re.sub(' ', text)
    # remove characters that are not Sinhala, digits, ASCII letters, basic punctuation or whitespace
    pattern = r"[^\u0D80-\u0DFF0-9A-Za-z\.,;:!\?\(\)\"'\-\/\s]"
    text = re.sub(pattern, ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text
# Apply preprocessing to a small sample to verify
data['text_clean'] = data['text'].apply(preprocess)
data[['text', 'text_clean']].head(3)

Unnamed: 0,text,text_clean
0,මත්ද්‍රව්‍ය ජාවාරමකට සම්බන්ධ පුද්ගලයෙකු පොලිස්...,මත්ද්රව්ය ජාවාරමකට සම්බන්ධ පුද්ගලයෙකු පොලිස් අ...
1,ශ්‍රී ලංකාවේ නව කැබිනට් මණ්ඩලයේ සංශෝධනය පිළිබඳ...,ශ්රී ලංකාවේ නව කැබිනට් මණ්ඩලයේ සංශෝධනය පිළිබඳ ...
2,2012 පෙබරවාරි මාසයේ වැල්ලම්පිටියේ දී යුද හමුදා...,2012 පෙබරවාරි මාසයේ වැල්ලම්පිටියේ දී යුද හමුදා...


In [18]:
# Baseline: TF-IDF char n-grams + LogisticRegression pipeline
X = data['text_clean'].values
y = data['label'].values
# train/val split (stratify if labels are imbalanced)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(analyzer='char_wb', ngram_range=(3,5), max_features=50000)),
    ('clf', LogisticRegression(max_iter=2000, solver='lbfgs'))
])
pipeline.fit(X_train, y_train)
print('Baseline trained')

Baseline trained


In [19]:
# Evaluate on validation set
y_pred = pipeline.predict(X_val)
print('Accuracy:', accuracy_score(y_val, y_pred))
print('Classification report: ')
print(classification_report(y_val, y_pred))
print('Confusion matrix: ', confusion_matrix(y_val, y_pred))

Accuracy: 0.9977565434150395
Classification report: 
              precision    recall  f1-score   support

          AI       1.00      1.00      1.00     10800
       HUMAN       1.00      1.00      1.00     13270

    accuracy                           1.00     24070
   macro avg       1.00      1.00      1.00     24070
weighted avg       1.00      1.00      1.00     24070

Confusion matrix:  [[10755    45]
 [    9 13261]]


In [20]:
# Save the trained pipeline for inference
out_dir = 'models'
os.makedirs(out_dir, exist_ok=True)
model_path = os.path.join(out_dir, 'tfidf_logreg_sinhala.joblib')
joblib.dump(pipeline, model_path)
print('Saved model to', model_path)

Saved model to models\tfidf_logreg_sinhala.joblib


In [None]:
# Inference helper
def predict_text(texts, model_path=model_path):
    model = joblib.load(model_path)
    clean = [preprocess(t) for t in texts]
    preds = model.predict(clean)
    return preds
# Example
examples = [
    'මෙය කෙටි උදාහරණයක් වෙයි - ලියවූ පරික්ෂණයක්.',
    'මෙම වක්‍ර ලිපිය මොඩල් එකෙන් ජනනය කරන ලදී.'
]
print('Predictions:', predict_text(examples))

Predictions: ['HUMAN' 'HUMAN']


**Optional: Transformer fine-tuning (heavy; requires GPU and may take long)**
Below is a stub showing how to prepare the dataset and fine-tune a multilingual transformer like `xlm-roberta-base` using `datasets` + `transformers`. Uncomment and run only if you have the resources.

In [None]:
# Transformer fine-tune stub (uncomment when ready)
# from datasets import Dataset, DatasetDict
# from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
#
# dtrain = Dataset.from_pandas(pd.DataFrame({'text': X_train, 'label': y_train}))
# dval = Dataset.from_pandas(pd.DataFrame({'text': X_val, 'label': y_val}))
# ds = DatasetDict({'train': dtrain, 'validation': dval})
# tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
# def tokenize_fn(ex):
#     return tokenizer(ex['text'], truncation=True, padding='max_length', max_length=256)
# ds = ds.map(tokenize_fn, batched=True)
# model = AutoModelForSequenceClassification.from_pretrained('xlm-roberta-base', num_labels=2)
# training_args = TrainingArguments(output_dir='transformer_out', evaluation_strategy='epoch', per_device_train_batch_size=8, num_train_epochs=3)
# trainer = Trainer(model=model, args=training_args, train_dataset=ds['train'], eval_dataset=ds['validation'])
# trainer.train()
# trainer.save_model('models/transformer_sinhala')

**Next steps & tips**
- If performance is limited, try: increasing TF-IDF n-gram range, using word-level tokenization, or training a transformer.
- For heavy transformer fine-tuning, use GPU, small batch sizes, and mixed precision where available.
- Consider data augmentation (backtranslation) or more labeled data if classes are imbalanced.