# Training & Modelling

This notebook trains and evaluates multiple models on the cleaned datasets prepared in `data-cleaning-preprocessing.ipynb`.

Models included:
- TF-IDF (char n-grams) + Logistic Regression
- Hybrid: TF-IDF (char + word) + numeric features + Logistic Regression
- Sentence-Transformers embeddings + Logistic Regression
- IndoBERT fine-tuning (optional, lightweight config)
- Google AI Studio Embeddings (API Key) + Logistic Regression
- Vertex AI (Text Embeddings) + Logistic Regression

You can choose between original or 50:50 balanced splits. Save artifacts to `models/`.

In [1]:
# Setup & configuration
import os
import json
import math
import joblib
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

from typing import List, Dict

MODELS_DIR = 'models'
PROCESSED_DIR = 'processed_data'
RANDOM_STATE = 42
os.makedirs(MODELS_DIR, exist_ok=True)

# External services configuration (leave empty to skip)
GOOGLE_AI_STUDIO_API_KEY = os.environ.get('GOOGLE_AI_STUDIO_API_KEY', '#')  # or paste here
VERTEX_PROJECT_ID = os.environ.get('VERTEX_PROJECT_ID', 'n8n-pmld')
VERTEX_LOCATION = os.environ.get('VERTEX_LOCATION', 'us-central1')
VERTEX_USE_GCLOUD_AUTH = bool(os.environ.get('VERTEX_USE_GCLOUD_AUTH', 'true'))  # if true, use ADC

# HuggingFace/transformers models
SENTENCE_TRANSFORMER_MODEL = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
INDOBERT_MODEL = 'indobenchmark/indobert-base-p1'  # for optional finetuning


In [None]:
# Install optional dependencies (run once)
# illustrative only
# !pip install scikit-learn scikit-learn-intelex sentence-transformers transformers torch --quiet
# For Google AI Studio (Gemini Embeddings):
# !pip install google-generativeai --quiet
# For Vertex AI embeddings:
# !pip install google-cloud-aiplatform --quiet


In [6]:
# Data loading

def load_split(use_balanced: bool = True) -> Dict[str, pd.DataFrame]:
    if use_balanced:
        train_path = os.path.join(PROCESSED_DIR, 'train_balanced_50_50.csv')
        test_path = os.path.join(PROCESSED_DIR, 'test_balanced_50_50.csv')
        holdout_path = os.path.join(PROCESSED_DIR, 'holdout_balanced_50_50.csv')
    else:
        train_path = os.path.join(PROCESSED_DIR, 'train_processed.csv')
        test_path = os.path.join(PROCESSED_DIR, 'test_processed.csv')
        holdout_path = os.path.join(PROCESSED_DIR, 'holdout_processed.csv')
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    holdout = pd.read_csv(holdout_path)
    return {'train': train, 'test': test, 'holdout': holdout}

splits = load_split(use_balanced=False)
{ k: (v.shape, v['label'].value_counts().to_dict()) for k, v in splits.items() }


{'train': ((8171, 13), {0: 7454, 1: 717}),
 'test': ((2335, 13), {0: 2126, 1: 209}),
 'holdout': ((1167, 13), {0: 942, 1: 225})}

In [None]:
import plotly.express as px

# Identify numeric features (excluding 'label')
numeric_features = splits['train'].select_dtypes(include='number').columns.tolist()
numeric_features = [col for col in numeric_features if col != 'label']

# Visualize count of each numeric feature, split by label using Plotly
for feature in numeric_features:
    fig = px.histogram(
        splits['train'],
        x=feature,
        color='label',
        barmode='overlay',
        nbins=200,
        title=f"Distribution of {feature} by Label"
    )
    fig.show()

In [11]:
splits['train']

Unnamed: 0,comment,label,comment_clean,has_math_alnum,money_symbol_count,fire_symbol_count,special_char_ratio,number_ratio,stylized_char_ratio,char_count,word_count,avg_word_length,is_very_short
0,aamiin ya rabb,0,aamiin ya rabb,False,0,0,0.000000,0.000000,0.0,14,3,4.666667,False
1,terima kasih mengajak jalan2 virtual raja ampa...,0,terima kasih mengajak jalan2 virtual raja ampa...,False,0,0,0.014493,0.014493,0.0,69,11,6.272727,False
2,bener prabu,0,bener prabu,False,0,0,0.000000,0.000000,0.0,11,2,5.500000,False
3,tonton video ya hehe,0,tonton video ya hehe,False,0,0,0.000000,0.000000,0.0,20,4,5.000000,False
4,coach nova plis suruh pda bljr sepak penalti a...,0,coach nova plis suruh pda bljr sepak penalti a...,False,0,0,0.015152,0.015152,0.0,66,13,5.076923,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8166,keren sih terimakasih papua 🔥🔥,0,keren sih terimakasih papua 🔥🔥,False,0,2,0.066667,0.000000,0.0,30,5,6.000000,False
8167,fulll ngakakkk asw,0,full ngakakk asw,False,0,0,0.000000,0.000000,0.0,18,3,6.000000,False
8168,yep bang ngelantur samping gue ketakutan gue a...,0,yep bang ngelantur samping gue ketakutan gue a...,False,0,0,0.000000,0.000000,0.0,68,11,6.181818,False
8169,menang hoki korsel nyerang goal sayang korsel ...,0,menang hoki korsel nyerang goal sayang korsel ...,False,0,0,0.000000,0.000000,0.0,80,11,7.272727,False


In [7]:
# Metrics and utilities
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split


def evaluate_and_report(y_true, y_pred, title: str):
    print(f"\n=== {title} ===")
    print(classification_report(y_true, y_pred, digits=4))
    return {
        'precision_1': precision_score(y_true, y_pred, pos_label=1),
        'recall_1': recall_score(y_true, y_pred, pos_label=1),
        'f1_1': f1_score(y_true, y_pred, pos_label=1),
    }


def get_xy(df: pd.DataFrame, text_col: str = 'comment_clean'):
    return df[text_col].astype(str).tolist(), df['label'].astype(int).values


results = []


In [16]:
# Model 1: TF-IDF (char) + Logistic Regression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

X_train, y_train = get_xy(splits['train'])
X_test, y_test = get_xy(splits['holdout'])

char_tfidf_lr = Pipeline([
    ('tfidf', TfidfVectorizer(analyzer='char', ngram_range=(2,4), min_df=2)),
    ('clf', LogisticRegression(max_iter=5000, class_weight='balanced', n_jobs=1, C=2.0, solver='liblinear'))
])

char_tfidf_lr.fit(X_train, y_train)
y_pred = char_tfidf_lr.predict(X_test)
metrics = evaluate_and_report(y_test, y_pred, 'Char TF-IDF + LogisticRegression (test)')
results.append({'model':'char_tfidf_lr','metrics':metrics})

joblib.dump(char_tfidf_lr, os.path.join(MODELS_DIR, 'lr_char.pkl'))



=== Char TF-IDF + LogisticRegression (test) ===
              precision    recall  f1-score   support

           0     0.8381    1.0000    0.9119       942
           1     1.0000    0.1911    0.3209       225

    accuracy                         0.8440      1167
   macro avg     0.9190    0.5956    0.6164      1167
weighted avg     0.8693    0.8440    0.7980      1167



['models/lr_char.pkl']

In [17]:
# Model 2: Hybrid (char + word TF-IDF + numeric features)
from sklearn.pipeline import FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

num_features = ['char_count','word_count','avg_word_length','number_ratio','special_char_ratio','stylized_char_ratio','money_symbol_count','fire_symbol_count']

# Prepare dataframes for ColumnTransformer
train_df = splits['train'].copy()
test_df = splits['holdout'].copy()

# ColumnTransformer expects array-like; we will build union of text features and numeric
char_vec = ('char', TfidfVectorizer(analyzer='char', ngram_range=(2,4), min_df=2), 'comment_clean')
word_vec = ('word', TfidfVectorizer(analyzer='word', ngram_range=(1,2), min_df=2), 'comment_clean')

preprocess = ColumnTransformer([
    char_vec,
    word_vec,
    ('num', StandardScaler(with_mean=False), num_features)
], remainder='drop', sparse_threshold=0.3)

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

hybrid_clf = Pipeline([
    ('prep', preprocess),
    ('clf', LogisticRegression(max_iter=5000, class_weight='balanced', C=2.0, solver='liblinear'))
])

hybrid_clf.fit(train_df, y_train)
y_pred_h = hybrid_clf.predict(test_df)
metrics_h = evaluate_and_report(y_test, y_pred_h, 'Hybrid (char+word+numeric) + LogisticRegression (test)')
results.append({'model':'hybrid_lr','metrics':metrics_h})

joblib.dump(hybrid_clf, os.path.join(MODELS_DIR, 'hybrid_model.pkl'))



=== Hybrid (char+word+numeric) + LogisticRegression (test) ===
              precision    recall  f1-score   support

           0     0.9039    0.9989    0.9491       942
           1     0.9921    0.5556    0.7123       225

    accuracy                         0.9135      1167
   macro avg     0.9480    0.7772    0.8307      1167
weighted avg     0.9209    0.9135    0.9034      1167



['models/hybrid_model.pkl']

In [None]:
# Model 3: Sentence-Transformers embeddings + Logistic Regression
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression

st_model = SentenceTransformer(SENTENCE_TRANSFORMER_MODEL)

def embed_texts(texts: List[str], batch_size: int = 64):
    return st_model.encode(texts, batch_size=batch_size, show_progress_bar=True, convert_to_numpy=True, normalize_embeddings=True)

Xtr_emb = embed_texts(splits['train']['comment_clean'].tolist())
Xte_emb = embed_texts(splits['test']['comment_clean'].tolist())

clf_emb = LogisticRegression(max_iter=2000, class_weight='balanced')
clf_emb.fit(Xtr_emb, y_train)
y_pred_emb = clf_emb.predict(Xte_emb)
metrics_emb = evaluate_and_report(y_test, y_pred_emb, 'Sentence-Transformer Embeddings + LogisticRegression (test)')
results.append({'model':'st_embed_lr','metrics':metrics_emb})

joblib.dump({'encoder': SENTENCE_TRANSFORMER_MODEL, 'clf': clf_emb}, os.path.join(MODELS_DIR, 'st_embed_lr.pkl'))


Batches: 100%|██████████| 233/233 [00:24<00:00,  9.44it/s]
Batches: 100%|██████████| 67/67 [00:06<00:00, 10.80it/s]



=== Sentence-Transformer Embeddings + LogisticRegression (test) ===
              precision    recall  f1-score   support

           0     0.9449    0.9280    0.9364      2126
           1     0.9293    0.9459    0.9375      2126

    accuracy                         0.9370      4252
   macro avg     0.9371    0.9370    0.9370      4252
weighted avg     0.9371    0.9370    0.9370      4252



['models/st_embed_lr.pkl']

In [None]:
# Model 4: IndoBERT fine-tuning (optional, quick demo)
# Note: This is a minimal example; for full training use Trainer API with epochs.
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch

indobert_tokenizer = AutoTokenizer.from_pretrained(INDOBERT_MODEL)
indobert_model = AutoModelForSequenceClassification.from_pretrained(INDOBERT_MODEL, num_labels=2)

def tokenize_fn(batch):
    return indobert_tokenizer(batch['comment_clean'], truncation=True, padding='max_length', max_length=128)

# Small subset for quick demo to avoid long runtimes
train_small = splits['train'].sample(n=min(4000, len(splits['train'])), random_state=RANDOM_STATE)
test_small = splits['test'].sample(n=min(2000, len(splits['test'])), random_state=RANDOM_STATE)

import datasets as hfds
train_ds = hfds.Dataset.from_pandas(train_small[['comment_clean','label']].rename(columns={'label':'labels'}))
test_ds = hfds.Dataset.from_pandas(test_small[['comment_clean','label']].rename(columns={'label':'labels'}))

train_ds = train_ds.map(tokenize_fn, batched=True)
test_ds = test_ds.map(tokenize_fn, batched=True)

cols = ['input_ids','attention_mask','labels']
train_ds = train_ds.remove_columns([c for c in train_ds.column_names if c not in cols])
test_ds = test_ds.remove_columns([c for c in test_ds.column_names if c not in cols])
train_ds.set_format('torch')
test_ds.set_format('torch')

args = TrainingArguments(
    output_dir=os.path.join(MODELS_DIR, 'indobert'),
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=100
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    m = evaluate_and_report(labels, preds, 'IndoBERT (eval)')
    return {'precision_1': m['precision_1'], 'recall_1': m['recall_1'], 'f1_1': m['f1_1']}

try:
    trainer = Trainer(model=indobert_model, args=args, train_dataset=train_ds, eval_dataset=test_ds, tokenizer=indobert_tokenizer)
    # Optional training (can be time-consuming). Commented by default.
    # trainer.train()
    eval_metrics = trainer.evaluate()
    results.append({'model':'indobert_finetune_1epoch','metrics':eval_metrics})
except ImportError as e:
    print('Accelerate/Trainer not available, running manual evaluation without Trainer...')
    indobert_model.eval()
    device = torch.device('cpu')
    indobert_model.to(device)
    from torch.utils.data import DataLoader
    loader = DataLoader(test_ds, batch_size=32, shuffle=False)
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].numpy()
            logits = indobert_model(input_ids=input_ids, attention_mask=attention_mask).logits
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            all_preds.append(preds)
            all_labels.append(labels)
    y_pred = np.concatenate(all_preds)
    y_true = np.concatenate(all_labels)
    m = evaluate_and_report(y_true, y_pred, 'IndoBERT (manual eval)')
    results.append({'model':'indobert_manual_eval','metrics':m})

indobert_model.save_pretrained(os.path.join(MODELS_DIR, 'indobert'))
indobert_tokenizer.save_pretrained(os.path.join(MODELS_DIR, 'indobert'))


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 4000/4000 [00:00<00:00, 12821.90 examples/s]
Map: 100%|██████████| 2000/2000 [00:00<00:00, 28255.79 examples/s]


('models/indobert/tokenizer_config.json',
 'models/indobert/special_tokens_map.json',
 'models/indobert/vocab.txt',
 'models/indobert/added_tokens.json',
 'models/indobert/tokenizer.json')

In [None]:
# Model 5: Google AI Studio (Gemini) Embeddings + Logistic Regression
# Requires: `pip install google-generativeai`
try:
    import google.generativeai as genai
    has_genai = True
except Exception:
    has_genai = False

from sklearn.linear_model import LogisticRegression

def embed_with_genai(texts: List[str], api_key: str, model: str = 'text-embedding-004', task_type: str = 'retrieval_document'):
    assert has_genai, 'google-generativeai not installed. Run install cell.'
    assert api_key, 'Provide GOOGLE_AI_STUDIO_API_KEY in env or set GOOGLE_AI_STUDIO_API_KEY variable.'
    genai.configure(api_key=api_key)
    # Batch to avoid payload limits
    embeddings = []
    batch_size = 128
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        resp = genai.embed_content(model=model, content=batch, task_type=task_type)
        vecs = resp['embedding'] if 'embedding' in resp else resp['data'][0]['embedding']
        # API returns list under 'embedding' for batch; normalize shape
        if isinstance(vecs[0], (int, float)):
            vecs = [vecs]
        embeddings.extend(vecs)
    return np.array(embeddings, dtype=np.float32)

if has_genai and GOOGLE_AI_STUDIO_API_KEY:
    Xtr_g = embed_with_genai(splits['train']['comment_clean'].tolist(), GOOGLE_AI_STUDIO_API_KEY)
    Xte_g = embed_with_genai(splits['test']['comment_clean'].tolist(), GOOGLE_AI_STUDIO_API_KEY)
    y_train = splits['train']['label'].astype(int).values
    y_test = splits['test']['label'].astype(int).values
    clf_g = LogisticRegression(max_iter=2000, class_weight='balanced')
    clf_g.fit(Xtr_g, y_train)
    y_pred_g = clf_g.predict(Xte_g)
    metrics_g = evaluate_and_report(y_test, y_pred_g, 'Google AI Studio (Gemini) Embeddings + LogisticRegression (test)')
    results.append({'model':'gemini_embed_lr','metrics':metrics_g})
    joblib.dump({'provider':'google_ai_studio','model':'text-embedding-004','clf': clf_g}, os.path.join(MODELS_DIR, 'gemini_embed_lr.pkl'))
else:
    print('Skipping Google AI Studio embeddings (install google-generativeai and/or set GOOGLE_AI_STUDIO_API_KEY).')


E0000 00:00:1759241581.461673 6163239 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1759241887.394719 6163239 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.



=== Google AI Studio (Gemini) Embeddings + LogisticRegression (test) ===
              precision    recall  f1-score   support

           0     0.9900    0.9741    0.9820      2126
           1     0.9745    0.9901    0.9823      2126

    accuracy                         0.9821      4252
   macro avg     0.9822    0.9821    0.9821      4252
weighted avg     0.9822    0.9821    0.9821      4252



In [None]:
# Model 6: Vertex AI Text Embeddings + Logistic Regression
# Requires: `pip install google-cloud-aiplatform`
try:
    from google.cloud import aiplatform
    has_vertex = True
except Exception:
    has_vertex = False

from sklearn.linear_model import LogisticRegression

VERTEX_EMBED_MODEL = "text-embedding-004"  # for TextEmbeddingModel

def embed_with_vertex(texts: List[str], project_id: str, location: str) -> np.ndarray:
    assert has_vertex, 'google-cloud-aiplatform not installed.'
    assert project_id, 'Set VERTEX_PROJECT_ID.'
    aiplatform.init(project=project_id, location=location)
    # Use TextEmbeddingModel endpoint
    from vertexai.language_models import TextEmbeddingModel
    model = TextEmbeddingModel.from_pretrained(VERTEX_EMBED_MODEL)
    embeddings = []
    batch_size = 128
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        resp = model.get_embeddings(batch)
        for r in resp:
            embeddings.append(r.values)
    return np.array(embeddings, dtype=np.float32)

if has_vertex and VERTEX_PROJECT_ID:
    Xtr_v = embed_with_vertex(splits['train']['comment_clean'].tolist(), VERTEX_PROJECT_ID, VERTEX_LOCATION)
    Xte_v = embed_with_vertex(splits['test']['comment_clean'].tolist(), VERTEX_PROJECT_ID, VERTEX_LOCATION)
    y_train = splits['train']['label'].astype(int).values
    y_test = splits['test']['label'].astype(int).values
    clf_v = LogisticRegression(max_iter=2000, class_weight='balanced')
    clf_v.fit(Xtr_v, y_train)
    y_pred_v = clf_v.predict(Xte_v)
    metrics_v = evaluate_and_report(y_test, y_pred_v, 'Vertex AI Embeddings + LogisticRegression (test)')
    results.append({'model':'vertex_embed_lr','metrics':metrics_v})
    joblib.dump({'provider':'vertex_ai','model':VERTEX_EMBED_MODEL,'clf': clf_v}, os.path.join(MODELS_DIR, 'vertex_embed_lr.pkl'))
else:
    print('Skipping Vertex AI embeddings (install google-cloud-aiplatform and/or set VERTEX_PROJECT_ID).')


In [None]:
# Results Summary & Model Comparison
summary = []
for r in results:
    name = r['model']
    m = r['metrics']
    if isinstance(m, dict) and 'f1_1' in m:
        summary.append({'model': name, 'f1_1': float(m['f1_1']), 'precision_1': float(m['precision_1']), 'recall_1': float(m['recall_1'])})

summary_df = pd.DataFrame(summary).sort_values('f1_1', ascending=False)
print(summary_df)

# Save summary to models/metadata.json
meta_path = os.path.join(MODELS_DIR, 'metadata.json')
with open(meta_path, 'w') as f:
    json.dump({'results': summary}, f, indent=2)
print(f'Metadata saved to {meta_path}')


             model      f1_1  precision_1  recall_1
1    char_tfidf_lr  0.985224     0.998551  0.972248
2        hybrid_lr  0.984256     0.998548  0.970367
0  gemini_embed_lr  0.982268     0.974537  0.990122
3      st_embed_lr  0.937529     0.929298  0.945908
Metadata saved to models/metadata.json


In [5]:
from typing import Union
import os

# Lazy loaders for saved models
char_lr = joblib.load(os.path.join(MODELS_DIR, 'lr_char.pkl')) if os.path.exists(os.path.join(MODELS_DIR, 'lr_char.pkl')) else None





In [6]:
hybrid_lr = joblib.load(os.path.join(MODELS_DIR, 'hybrid_model.pkl')) if os.path.exists(os.path.join(MODELS_DIR, 'hybrid_model.pkl')) else None


In [None]:
# gemini_bundle = joblib.load(os.path.join(MODELS_DIR, 'gemini_embed_lr.pkl')) if os.path.exists(os.path.join(MODELS_DIR, 'gemini_embed_lr.pkl')) else None
# vertex_bundle = joblib.load(os.path.join(MODELS_DIR, 'vertex_embed_lr.pkl')) if os.path.exists(os.path.join(MODELS_DIR, 'vertex_embed_lr.pkl')) else None

# indobert_path = os.path.join(MODELS_DIR, 'indobert')
# indobert_model = None
# indobert_tokenizer = None
# if os.path.exists(indobert_path):
#     from transformers import AutoTokenizer, AutoModelForSequenceClassification
#     indobert_tokenizer = AutoTokenizer.from_pretrained(indobert_path)
#     indobert_model = AutoModelForSequenceClassification.from_pretrained(indobert_path)

# print("Models loaded successfully!")

In [7]:
# Inference 1: Char TF-IDF + Logistic Regression
def classify_char_tfidf(texts: Union[str, List[str]]):
    if isinstance(texts, str):
        texts = [texts]
    if char_lr is None:
        print("Char TF-IDF model not found. Train Model 1 first.")
        return None
    return char_lr.predict(texts).tolist()

# Test
test_texts = ["𝐃 𝐎 𝙍 𝘈 𝟟 gacor banget!", "terima kasih dok"]
print("Char TF-IDF predictions:", classify_char_tfidf(test_texts))


Char TF-IDF predictions: [0, 0]


In [8]:
# Inference 2: Hybrid (char+word+numeric) + Logistic Regression
def classify_hybrid(texts: Union[str, List[str]]):
    if isinstance(texts, str):
        texts = [texts]
    if hybrid_lr is None:
        print("Hybrid model not found. Train Model 2 first.")
        return None
    
    # Create dummy dataframe with required features
    dummy_df = pd.DataFrame({
        'comment_clean': texts, 
        'char_count': [len(t) for t in texts], 
        'word_count': [len(t.split()) for t in texts],
        'avg_word_length': [len(t)/max(1,len(t.split())) for t in texts], 
        'number_ratio': [sum(ch.isdigit() for ch in t)/max(1,len(t)) for t in texts],
        'special_char_ratio': [sum((not ch.isalnum()) and not ch.isspace() for ch in t)/max(1,len(t)) for t in texts],
        'stylized_char_ratio': [0.0 for _ in texts], 
        'money_symbol_count': [0 for _ in texts], 
        'fire_symbol_count': [0 for _ in texts]
    })
    return hybrid_lr.predict(dummy_df).tolist()

# Test
print("Hybrid predictions:", classify_hybrid(test_texts))


Hybrid predictions: [0, 0]


In [9]:
# Inference 3: Sentence-Transformers + Logistic Regression
def classify_sentence_transformer(texts: Union[str, List[str]]):
    if isinstance(texts, str):
        texts = [texts]
    if st_bundle is None:
        print("Sentence-Transformer model not found. Train Model 3 first.")
        return None
    
    try:
        from sentence_transformers import SentenceTransformer
        enc = SentenceTransformer(st_bundle['encoder'])
        X_emb = enc.encode(texts, batch_size=64, show_progress_bar=False, convert_to_numpy=True, normalize_embeddings=True)
        return st_bundle['clf'].predict(X_emb).tolist()
    except Exception as e:
        print(f"Sentence-Transformer prediction failed: {e}")
        return None

# Test
print("Sentence-Transformer predictions:", classify_sentence_transformer(test_texts))


NameError: name 'st_bundle' is not defined

In [None]:
# Inference 4: IndoBERT
def classify_indobert(texts: Union[str, List[str]]):
    if isinstance(texts, str):
        texts = [texts]
    if indobert_model is None or indobert_tokenizer is None:
        print("IndoBERT model not found. Train Model 4 first.")
        return None
    
    try:
        import torch
        indobert_model.eval()
        device = torch.device('cpu')
        indobert_model.to(device)
        
        # Tokenize
        inputs = indobert_tokenizer(texts, truncation=True, padding=True, max_length=128, return_tensors='pt')
        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)
        
        # Predict
        with torch.no_grad():
            logits = indobert_model(input_ids=input_ids, attention_mask=attention_mask).logits
            preds = torch.argmax(logits, dim=1).cpu().numpy().tolist()
        return preds
    except Exception as e:
        print(f"IndoBERT prediction failed: {e}")
        return None

# Test
print("IndoBERT predictions:", classify_indobert(test_texts))


In [None]:
# Inference 5: Google AI Studio (Gemini) + Logistic Regression
def classify_gemini(texts: Union[str, List[str]]):
    if isinstance(texts, str):
        texts = [texts]
    if gemini_bundle is None or not GOOGLE_AI_STUDIO_API_KEY:
        print("Gemini model not found or API key missing. Train Model 5 first.")
        return None
    
    try:
        import google.generativeai as genai
        genai.configure(api_key=GOOGLE_AI_STUDIO_API_KEY)
        embeddings = []
        batch_size = 128
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i+batch_size]
            resp = genai.embed_content(model='text-embedding-004', content=batch, task_type='retrieval_document')
            vecs = resp['embedding'] if 'embedding' in resp else resp['data'][0]['embedding']
            if isinstance(vecs[0], (int, float)):
                vecs = [vecs]
            embeddings.extend(vecs)
        X_emb = np.array(embeddings, dtype=np.float32)
        return gemini_bundle['clf'].predict(X_emb).tolist()
    except Exception as e:
        print(f"Gemini prediction failed: {e}")
        return None

# Test
print("Gemini predictions:", classify_gemini(test_texts))


In [None]:
# Inference 6: Vertex AI + Logistic Regression
def classify_vertex(texts: Union[str, List[str]]):
    if isinstance(texts, str):
        texts = [texts]
    if vertex_bundle is None or not VERTEX_PROJECT_ID:
        print("Vertex AI model not found or project ID missing. Train Model 6 first.")
        return None
    
    try:
        from google.cloud import aiplatform
        from vertexai.language_models import TextEmbeddingModel
        aiplatform.init(project=VERTEX_PROJECT_ID, location=VERTEX_LOCATION)
        model = TextEmbeddingModel.from_pretrained(vertex_bundle['model'])
        embeddings = []
        batch_size = 128
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i+batch_size]
            resp = model.get_embeddings(batch)
            for r in resp:
                embeddings.append(r.values)
        X_emb = np.array(embeddings, dtype=np.float32)
        return vertex_bundle['clf'].predict(X_emb).tolist()
    except Exception as e:
        print(f"Vertex AI prediction failed: {e}")
        return None

# Test
print("Vertex AI predictions:", classify_vertex(test_texts))


In [None]:
# Inference 7: LSTM (Keras)
def classify_lstm(texts: Union[str, List[str]]):
    if isinstance(texts, str):
        texts = [texts]
    if lstm_bundle is None or 'lstm_model' not in globals():
        print("LSTM model not found. Train Model 7 first.")
        return None
    
    try:
        from tensorflow.keras.preprocessing.sequence import pad_sequences
        tk = lstm_bundle['tokenizer']
        max_len = lstm_bundle['max_len']
        seq = pad_sequences(tk.texts_to_sequences(texts), maxlen=max_len)
        preds = (lstm_model.predict(seq)[:,0] >= 0.5).astype(int).tolist()
        return preds
    except Exception as e:
        print(f"LSTM prediction failed: {e}")
        return None

# Test
print("LSTM predictions:", classify_lstm(test_texts))


In [10]:
# Unified Inference: Run all available models
def classify_all(texts: Union[str, List[str]]):
    """
    Run all available models on the input text(s)
    Returns a dictionary with model names as keys and predictions as values
    """
    if isinstance(texts, str):
        texts = [texts]
    
    outputs = {}
    
    # Try each model
    char_preds = classify_char_tfidf(texts)
    if char_preds is not None:
        outputs['char_tfidf_lr'] = char_preds
    
    hybrid_preds = classify_hybrid(texts)
    if hybrid_preds is not None:
        outputs['hybrid_lr'] = hybrid_preds
    
    st_preds = classify_sentence_transformer(texts)
    if st_preds is not None:
        outputs['st_embed_lr'] = st_preds
    
    indobert_preds = classify_indobert(texts)
    if indobert_preds is not None:
        outputs['indobert'] = indobert_preds
    
    gemini_preds = classify_gemini(texts)
    if gemini_preds is not None:
        outputs['gemini_embed_lr'] = gemini_preds
    
    vertex_preds = classify_vertex(texts)
    if vertex_preds is not None:
        outputs['vertex_embed_lr'] = vertex_preds
    
    lstm_preds = classify_lstm(texts)
    if lstm_preds is not None:
        outputs['keras_lstm'] = lstm_preds
    
    return outputs

# Test all models
print("=== All Model Predictions ===")
all_predictions = classify_all(test_texts)
for model_name, predictions in all_predictions.items():
    print(f"{model_name}: {predictions}")


=== All Model Predictions ===


NameError: name 'st_bundle' is not defined

In [None]:
classify_char_tfidf("DORA7 gacor banget!")

[1]