In [None]:
# Imports
import os
from pathlib import Path
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib

# Paths. The notebook is expected to live in `ml/`, so project-level folders sit
# one level up; a `dataset/` folder next to the notebook is also supported.
ROOT = Path('..')
DATA_DIR = ROOT / 'dataset'
MODELS_DIR = ROOT / 'models'
MODELS_DIR.mkdir(parents=True, exist_ok=True)


def resolve_data_dir(candidates, required='train.jsonl'):
    """Pick the dataset directory to read the splits from.

    Parameters
    ----------
    candidates : iterable of path-like
        Directories to probe, in order of preference. Must be non-empty.
    required : str
        File name whose presence marks a directory as usable.

    Returns
    -------
    pathlib.Path
        The first candidate containing `required`. When no candidate contains
        it, the first candidate is returned so the reported (missing) paths
        still point at the preferred location.
    """
    directories = [Path(candidate) for candidate in candidates]
    for directory in directories:
        if (directory / required).exists():
            return directory
    return directories[0]


# `./dataset` stays the preferred location; `DATA_DIR` (previously defined but
# never used) is now consulted as a fallback instead of being ignored.
ACTIVE_DATA_DIR = resolve_data_dir((Path('./dataset'), DATA_DIR))
TRAIN_FILE = ACTIVE_DATA_DIR / 'train.jsonl'
VAL_FILE = ACTIVE_DATA_DIR / 'val.jsonl'
TEST_FILE = ACTIVE_DATA_DIR / 'test.jsonl'

print('Data files:')
for data_file in (TRAIN_FILE, VAL_FILE, TEST_FILE):
    print(data_file.exists(), data_file)

Data files:
False ..\dataset\train.jsonl
False ..\dataset\val.jsonl
False ..\dataset\test.jsonl


In [9]:
# Loader for the dataset splits (JSON Lines; the notebook uses the `text` and `label` fields).
def load_jsonl(path):
    """Read a JSON Lines file into a DataFrame.

    Each line of `path` is parsed as one JSON record and becomes one row.
    """
    records = pd.read_json(path, lines=True)
    return records

def _read_split(split_file):
    """Load one split, or return an empty DataFrame when its file is absent."""
    if split_file.exists():
        return load_jsonl(split_file)
    return pd.DataFrame()


train = _read_split(TRAIN_FILE)
val = _read_split(VAL_FILE)
test = _read_split(TEST_FILE)

# Report the size of every split (an absent split shows up as (0, 0)).
for caption, frame in (('Train shape:', train), ('Val shape:', val), ('Test shape:', test)):
    print(caption, frame.shape)

# Preview the first training rows
display(train.head())

Train shape: (0, 0)
Val shape: (0, 0)
Test shape: (0, 0)


In [10]:
# Show how many training examples carry each label (skipped when nothing was loaded)
if train.empty:
    print('No training data found.')
else:
    print('Train label distribution:')
    label_counts = train['label'].value_counts()
    display(label_counts)

No training data found.


In [None]:
# Lightweight normalisation for Sinhala / news text: only URLs and redundant
# whitespace are touched, every other character is kept as-is.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
_WHITESPACE_PATTERN = re.compile(r'\s+')


def clean_text(s):
    """Strip URLs and collapse whitespace in `s`.

    Returns an empty string for any non-`str` input. For strings, every
    `http(s)://...` or `www....` run is replaced by a space, then each
    whitespace run is collapsed to a single space and the ends are trimmed.
    """
    if isinstance(s, str):
        without_urls = _URL_PATTERN.sub(' ', s)
        return _WHITESPACE_PATTERN.sub(' ', without_urls).strip()
    return ''

# Add a `text_clean` column to every split that was actually loaded.
for df in (train, val, test):
    if not df.empty:
        # Blank out missing texts *before* casting to str: `astype(str)` alone
        # turns NaN/None into the literal strings 'nan'/'None', which would
        # bypass `clean_text`'s non-string guard and end up as real tokens.
        df['text_clean'] = df['text'].fillna('').astype(str).apply(clean_text)

# `text_clean` only exists on non-empty frames, so guard the preview; selecting
# the column on an empty `train` would raise KeyError.
if not train.empty:
    display(train[['text_clean', 'label']].head())
else:
    print('No training data found.')

In [None]:
# Build feature/label arrays per split. Training data is mandatory; the
# validation and test arrays are set to None when their split is empty.
if train.empty:
    raise RuntimeError('No training data found at: {}'.format(TRAIN_FILE))

X_train, y_train = train['text_clean'].values, train['label'].values

X_val = None if val.empty else val['text_clean'].values
y_val = None if val.empty else val['label'].values

X_test = None if test.empty else test['text_clean'].values
y_test = None if test.empty else test['label'].values

In [None]:
# Vectorize with TF-IDF (unigram + bigram). Tune `max_features` as needed.
#
# scikit-learn's default `token_pattern` (r"(?u)\b\w\w+\b") is built on `\w`,
# and Python's `\w` only matches alphanumeric code points. Sinhala dependent
# vowel signs, the virama (U+0DCA) and the zero-width joiner used in conjuncts
# are combining/format characters, so the default pattern cuts Sinhala words
# apart at every such sign and then drops the resulting one-letter pieces.
# The pattern below keeps those characters inside a token while preserving the
# default's "at least two characters" rule.
SINHALA_TOKEN_PATTERN = r'(?u)[\w\u0D81-\u0D83\u0DCA-\u0DDF\u0DF2\u0DF3\u200C\u200D]{2,}'

vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=30000,
    analyzer='word',
    token_pattern=SINHALA_TOKEN_PATTERN,
)
X_train_tfidf = vectorizer.fit_transform(X_train)
print('TF-IDF shape:', X_train_tfidf.shape)

# Reuse the vocabulary fitted on the training split; absent splits stay None.
X_val_tfidf = vectorizer.transform(X_val) if X_val is not None else None
X_test_tfidf = vectorizer.transform(X_test) if X_test is not None else None

In [None]:
# Fit the Logistic Regression classifier (binary HUMAN vs AI).
# `class_weight='balanced'` compensates for imbalanced classes.
logreg_options = {
    'max_iter': 1000,
    'solver': 'liblinear',
    'class_weight': 'balanced',
}
clf = LogisticRegression(**logreg_options)
clf.fit(X_train_tfidf, y_train)

print('Training complete')

# Evaluation helper; the call further down picks validation, then test, then train data.
def eval_and_print(model, X, y, name='data'):
    """Print accuracy, classification report and confusion matrix.

    Parameters
    ----------
    model : fitted estimator exposing `predict`
    X : features passed to `model.predict`
    y : true labels compared against the predictions
    name : label of the evaluated split, used in the printed header
    """
    y_pred = model.predict(X)
    print('--- Evaluation on', name, '---')
    accuracy = accuracy_score(y, y_pred)
    print('Accuracy:', accuracy)
    report = classification_report(y, y_pred)
    print(report)
    print('Confusion matrix:')
    matrix = confusion_matrix(y, y_pred)
    print(matrix)

# Choose the evaluation split by priority: validation, then test, then train.
if X_val_tfidf is not None and y_val is not None:
    eval_features, eval_labels, eval_name = X_val_tfidf, y_val, 'validation'
elif X_test_tfidf is not None and y_test is not None:
    eval_features, eval_labels, eval_name = X_test_tfidf, y_test, 'test'
else:
    eval_features, eval_labels, eval_name = X_train_tfidf, y_train, 'train'

eval_and_print(clf, eval_features, eval_labels, eval_name)

In [None]:
# Persist the fitted vectorizer and classifier under `models/` as joblib files.
# First a single bundle holding both objects...
artifact_path = MODELS_DIR / 'tfidf_logreg_sinhala.joblib'
bundle_contents = {'vectorizer': vectorizer, 'model': clf}
joblib.dump(bundle_contents, artifact_path)
print('Saved model bundle to', artifact_path)

# ...then each object in its own file as well.
for fitted_object, file_name in (
    (vectorizer, 'tfidf_vectorizer.joblib'),
    (clf, 'tfidf_logreg_model.joblib'),
):
    joblib.dump(fitted_object, MODELS_DIR / file_name)
print('Saved separate vectorizer and model files to', MODELS_DIR)

In [None]:
# Example: reload the saved bundle and unpack its two components
bundle = joblib.load(artifact_path)
vec, model = bundle['vectorizer'], bundle['model']

def predict_text(text):
    """Classify one raw text with the reloaded vectorizer and model.

    Returns a `(label, confidence)` tuple. `confidence` is the largest value
    returned by `model.predict_proba`, or None when the model has no
    `predict_proba` method.
    """
    features = vec.transform([clean_text(text)])
    label = model.predict(features)[0]
    if hasattr(model, 'predict_proba'):
        confidence = model.predict_proba(features).max()
    else:
        confidence = None
    return label, confidence

# Run the reloaded pipeline on one Sinhala sentence as a smoke test.
sample = 'මෙම පුවත ප්‍රදේශීය තොරතුරු අඩංගුයි.'
sample_prediction = predict_text(sample)
print('Sample prediction:', sample_prediction)

## Next steps / tips
- Tune the vectorizer (`ngram_range`, `max_features`) and `LogisticRegression` hyperparameters.
- Consider more advanced tokenization for Sinhala (word-segmentation) or using pre-trained embeddings.
- Add cross-validation and a hyperparameter search (GridSearchCV / RandomizedSearchCV).
- If classes are imbalanced, try oversampling / undersampling or stronger class weighting.
- To reproduce training from the command line, execute this notebook non-interactively (e.g. `jupyter nbconvert --to notebook --execute <notebook>.ipynb`) or convert the key cells into a `train.py` script.