## Setup
- Requires the dataset JSONL files (`train.jsonl`, `val.jsonl`, `test.jsonl`) to be present in `ml/dataset/` already.
- Draws on the preprocessing steps from `ml/bert_text_classifier.ipynb` and `My_research_new.ipynb`.
- Uses TensorFlow for the BiLSTM and scikit-learn for evaluation.

In [None]:
# Optional: install dependencies if running in a fresh environment
# !pip install -q pandas scikit-learn tensorflow matplotlib seaborn joblib

In [None]:
import json
import os
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

# Print the installed TensorFlow version so the run can be tied to a specific release
print(tf.__version__)

2.19.0


In [None]:
import shutil

# Google Drive is only available inside Colab; elsewhere the dataset files are
# expected to be in DATA_DIR already, so a missing google.colab is not fatal.
try:
    from google.colab import drive
except ImportError:
    drive = None
    print('google.colab is not available; skipping the Google Drive mount.')
else:
    # Mount Google Drive
    drive.mount('/content/drive')

# Paths and basic settings
DATA_DIR = Path('/dataset')
DATA_DIR.mkdir(parents=True, exist_ok=True)

# Update this path to where your dataset is stored in Google Drive
DRIVE_DATA_DIR = Path('/content/drive/MyDrive/dataset')

# Copy each split from Drive into the local data directory when the Drive folder exists
if DRIVE_DATA_DIR.exists():
    print(f"Copying files from {DRIVE_DATA_DIR} to {DATA_DIR}...")
    for filename in ['train.jsonl', 'val.jsonl', 'test.jsonl']:
        src_file = DRIVE_DATA_DIR / filename
        dst_file = DATA_DIR / filename
        if src_file.exists():
            shutil.copy(src_file, dst_file)
            print(f"Copied {filename}")
        else:
            print(f"Warning: {filename} not found in Drive source.")
else:
    print(f"Drive path {DRIVE_DATA_DIR} not found. Please adjust the path.")

TRAIN_PATH = DATA_DIR / 'train.jsonl'
VAL_PATH = DATA_DIR / 'val.jsonl'
TEST_PATH = DATA_DIR / 'test.jsonl'
MODEL_DIR = Path('ml/models/bilstm_sinhala')
MODEL_DIR.mkdir(parents=True, exist_ok=True)

SEED = 42
MAX_TOKENS = 30000  # vocab size for TextVectorization
SEQ_LEN = 400       # truncate/pad length (tune as needed)
BATCH_SIZE = 64
EPOCHS = 6
EMBED_DIM = 128
LSTM_UNITS = 128

tf.random.set_seed(SEED)
np.random.seed(SEED)

# Validate inputs with a real exception (asserts are stripped under `python -O`)
# and report every missing split at once instead of stopping at the first one.
missing_files = [p.name for p in (TRAIN_PATH, VAL_PATH, TEST_PATH) if not p.exists()]
if missing_files:
    raise FileNotFoundError(
        f"Missing dataset file(s) in {DATA_DIR}: {', '.join(missing_files)}"
    )

AssertionError: Missing train.jsonl

In [None]:
# Load JSONL files into DataFrames
def read_jsonl(path: Path) -> pd.DataFrame:
    """Parse a JSON Lines file (one JSON object per line) into a DataFrame.

    Args:
        path: Location of the ``.jsonl`` file to read.

    Returns:
        A DataFrame with one row per line of the file.
    """
    frame = pd.read_json(path, lines=True)
    return frame

# Read the three splits in a fixed order: train, validation, test
train_df, val_df, test_df = (
    read_jsonl(split_path) for split_path in (TRAIN_PATH, VAL_PATH, TEST_PATH)
)

# One summary line per split: row count and column names
split_frames = {'train': train_df, 'val': val_df, 'test': test_df}
for split_name, frame in split_frames.items():
    print(f'{split_name}: {len(frame):,} rows | columns: {list(frame.columns)}')

# Last expression of the cell, so the notebook renders the first training rows
train_df.head()

In [None]:
# Encode labels
# The encoder is fitted on the training labels only; the other splits reuse its mapping.
label_encoder = LabelEncoder().fit(train_df['label'])


def encode_labels(df: pd.DataFrame) -> np.ndarray:
    """Convert the ``label`` column of ``df`` to integer ids via ``label_encoder``."""
    label_column = df['label']
    return label_encoder.transform(label_column)


y_train, y_val, y_test = (
    encode_labels(split_df) for split_df in (train_df, val_df, test_df)
)

NUM_CLASSES = len(label_encoder.classes_)
print('Classes:', label_encoder.classes_)

In [None]:
# Build TextVectorization for Sinhala text
vectorizer_options = dict(
    max_tokens=MAX_TOKENS,               # cap on vocabulary size
    output_mode='int',                   # emit integer token ids
    output_sequence_length=SEQ_LEN,      # pad/truncate every example to this length
    standardize='lower_and_strip_punctuation',
)
text_vectorizer = tf.keras.layers.TextVectorization(**vectorizer_options)

# Adapt on training text only
training_texts = train_df['text'].values
text_vectorizer.adapt(training_texts)

def make_dataset(texts: pd.Series, labels: np.ndarray, training: bool,
                 shuffle_buffer: int = 10000) -> tf.data.Dataset:
    """Build a batched ``tf.data`` pipeline of (vectorized text, label) pairs.

    Args:
        texts: Raw text examples.
        labels: Integer label ids aligned with ``texts``.
        training: When True, examples are shuffled (seeded with ``SEED``)
            before batching.
        shuffle_buffer: Size of the shuffle buffer used when ``training`` is
            True. Defaults to the previously hard-coded 10000; pass
            ``len(texts)`` for a full shuffle of larger datasets.

    Returns:
        A dataset yielding batches of ``BATCH_SIZE`` examples whose text has
        been passed through ``text_vectorizer``.
    """
    ds = tf.data.Dataset.from_tensor_slices((texts.values, labels))
    if training:
        ds = ds.shuffle(shuffle_buffer, seed=SEED)
    ds = ds.batch(BATCH_SIZE)
    # Vectorize whole batches in parallel; element order stays deterministic by default.
    ds = ds.map(
        lambda x, y: (text_vectorizer(x), y),
        num_parallel_calls=tf.data.AUTOTUNE,
    )
    # Prefetch must come last so the vectorization above overlaps with model execution.
    return ds.prefetch(tf.data.AUTOTUNE)

# Only the training pipeline shuffles; validation and test keep file order
train_ds = make_dataset(texts=train_df['text'], labels=y_train, training=True)
val_ds = make_dataset(texts=val_df['text'], labels=y_val, training=False)
test_ds = make_dataset(texts=test_df['text'], labels=y_test, training=False)

# Pull a single batch to sanity-check the pipeline output shapes
for token_batch, label_batch in train_ds.take(1):
    print('Vectorized batch shape:', token_batch.shape, '| labels shape:', label_batch.shape)

## Model: Embedding + BiLSTM
The network uses a randomly initialized embedding layer, followed by two stacked bidirectional LSTM layers, dropout regularization, and a 128-unit ReLU dense layer. The output layer is a dense softmax over the label set.

In [None]:
def build_model():
    """Create and compile the embedding + stacked-BiLSTM classifier.

    Layer sizes come from the module-level settings ``MAX_TOKENS``,
    ``EMBED_DIM``, ``LSTM_UNITS`` and ``NUM_CLASSES``.

    Returns:
        A compiled ``tf.keras.Model`` named ``bilstm_classifier`` that takes
        integer token ids and outputs softmax scores over ``NUM_CLASSES``.
    """
    layers = tf.keras.layers

    token_ids = tf.keras.Input(shape=(None,), dtype=tf.int64, name='tokens')
    # mask_zero=True makes the embedding emit a mask for id 0
    hidden = layers.Embedding(MAX_TOKENS, EMBED_DIM, mask_zero=True)(token_ids)
    # First BiLSTM returns the full sequence so the second one can consume it
    hidden = layers.Bidirectional(
        layers.LSTM(LSTM_UNITS, return_sequences=True)
    )(hidden)
    hidden = layers.Bidirectional(layers.LSTM(LSTM_UNITS // 2))(hidden)
    hidden = layers.Dropout(0.3)(hidden)
    hidden = layers.Dense(128, activation='relu')(hidden)
    class_probs = layers.Dense(NUM_CLASSES, activation='softmax')(hidden)

    classifier = tf.keras.Model(token_ids, class_probs, name='bilstm_classifier')
    # Sparse loss because the labels are integer ids rather than one-hot vectors
    classifier.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=2e-4),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return classifier


model = build_model()
model.summary()

In [None]:
# Keep only the best model (by validation accuracy) on disk
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
    filepath=str(MODEL_DIR / 'checkpoint.keras'),
    monitor='val_accuracy',
    save_best_only=True,
    mode='max',
)
# Stop after 2 epochs without validation-accuracy improvement and roll back to the best weights
early_stop_cb = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    patience=2,
    restore_best_weights=True,
)
callbacks = [checkpoint_cb, early_stop_cb]

fit_options = dict(
    validation_data=val_ds,
    epochs=EPOCHS,
    callbacks=callbacks,
    verbose=1,
)
history = model.fit(train_ds, **fit_options)

In [None]:
# Plot training curves
fig, ax = plt.subplots(figsize=(8, 4))
# Draw the training curve first, then the validation curve
for metric_key, curve_label in (('accuracy', 'train_acc'), ('val_accuracy', 'val_acc')):
    ax.plot(history.history[metric_key], label=curve_label)
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
ax.set_title('Training vs validation accuracy')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Evaluate on the held-out test set
test_probs = model.predict(test_ds)
# Predicted class id = index of the highest softmax score per example
test_pred = np.argmax(test_probs, axis=1)

print('Test accuracy:', (test_pred == y_test).mean())
# The newline must be an escape sequence: a raw line break inside a
# single-quoted string is a syntax error.
print('\nClassification report')
# Pass the full label id range so target_names always lines up with the rows,
# even when a class is absent from both the test labels and the predictions.
print(classification_report(
    y_test,
    test_pred,
    labels=np.arange(len(label_encoder.classes_)),
    target_names=label_encoder.classes_,
))

In [None]:
# Confusion matrix
# Fix the label order to the encoder's full id range so the matrix always has
# one row/column per class and the tick labels below stay aligned, even when a
# class is missing from the test labels and predictions.
class_ids = np.arange(len(label_encoder.classes_))
cm = confusion_matrix(y_test, test_pred, labels=class_ids)
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=label_encoder.classes_,
            yticklabels=label_encoder.classes_)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion matrix')
plt.show()

In [None]:
# Save model and preprocessing assets
# Keras 3 (bundled with the TensorFlow 2.19 this notebook reports) rejects save
# paths that do not end in `.keras` or `.h5`, so use the native `.keras` format.
model.save(str(MODEL_DIR / 'saved_model.keras'))
joblib.dump(label_encoder, MODEL_DIR / 'label_encoder.joblib')
# Save the vectorizer config to recreate later
vectorizer_config = text_vectorizer.get_config()
vectorizer_weights = text_vectorizer.get_weights()
with open(MODEL_DIR / 'vectorizer_config.json', 'w', encoding='utf-8') as f:
    json.dump(vectorizer_config, f, ensure_ascii=False)
# NOTE(review): on Keras 3 the layer may expose no weights here, which would
# leave this archive empty — confirm; the vocabulary file below does not rely on it.
np.savez_compressed(MODEL_DIR / 'vectorizer_weights.npz', *vectorizer_weights)
# Persist the learned vocabulary explicitly so the vectorizer can be rebuilt
# (e.g. via `set_vocabulary`); ensure_ascii=False keeps Sinhala tokens readable.
with open(MODEL_DIR / 'vectorizer_vocabulary.json', 'w', encoding='utf-8') as f:
    json.dump(text_vectorizer.get_vocabulary(), f, ensure_ascii=False)
print('Saved to', MODEL_DIR)

In [None]:
# Inference helper
def predict_texts(texts):
    """Predict a label for each input text using the trained model.

    Args:
        texts: A single string, or any iterable of strings.

    Returns:
        A list with one decoded label per input text, in input order.
        An empty input yields an empty list.
    """
    if isinstance(texts, str):
        texts = [texts]
    # Materialize so generators and other iterables are accepted
    texts = list(texts)
    if not texts:
        # An empty list would become a float tensor that the vectorizer cannot process
        return []
    ds = tf.data.Dataset.from_tensor_slices(texts).batch(BATCH_SIZE)
    ds = ds.map(text_vectorizer).prefetch(tf.data.AUTOTUNE)
    probs = model.predict(ds)
    preds = np.argmax(probs, axis=1)
    labels = label_encoder.inverse_transform(preds)
    return list(labels)

# Demo: run the inference helper on a couple of sample texts.
# NOTE(review): the list contents below look garbled by the notebook export
# (the original string literals appear to be missing) — restore the sample
# strings before running this cell.
sample_texts = [


180
371
13

,




]
print(predict_texts(sample_texts))