In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix

# Load the preprocessed dataset; the path is relative to the notebook's
# working directory.
DATA_PATH = "./new_preprocessed_final.csv"
df = pd.read_csv(DATA_PATH)

In [2]:
# Pick BERT similarity feature: the first column (in file order) whose
# lower-cased name contains "bert_similarity", or both "cosine" and "bert".
named_columns = ((col, col.lower()) for col in df.columns)
bert_feat = next(
  (
    col
    for col, lowered in named_columns
    if "bert_similarity" in lowered or ("cosine" in lowered and "bert" in lowered)
  ),
  None,
)
# Stop here with an explicit message instead of carrying a None column name forward.
if bert_feat is None:
  raise ValueError("Kolom fitur BERT similarity tidak ditemukan. Pastikan ada 'bert_similarity' di new_preprocessed_final.csv.")

# Pick label: the first column (in file order) whose lower-cased name
# contains "is_clickbait"; any later matching columns are ignored.
label_col = next(
  filter(lambda name: "is_clickbait" in name.lower(), df.columns),
  None,
)
# Stop here with an explicit message instead of carrying a None column name forward.
if label_col is None:
  raise ValueError("Kolom label tidak ditemukan. Pastikan ada kolom is_clickbait_auto / is_clickbait_bert / is_clickbait_final.")

# Keep only rows where both the feature and the label are present.
# LogisticRegression rejects NaN inputs, and a row without a label cannot be
# used for training or evaluation. When the data is complete the mask is
# all-True and X / y are exactly the original columns.
complete_rows = df[bert_feat].notna() & df[label_col].notna()
n_incomplete = int((~complete_rows).sum())
if n_incomplete:
  # Make the filtering visible so the reported support counts can be reconciled.
  print("Dropped", n_incomplete, "row(s) with a missing feature or label value.")

X = df.loc[complete_rows, [bert_feat]]  # list selector keeps X as a one-column DataFrame
y = df.loc[complete_rows, label_col]

# Split: hold out 20% of the rows for evaluation, stratified on y, with a
# fixed random_state so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
  X, y, test_size=0.2, random_state=42, stratify=y
)

# LR pipeline: scale the single feature, then fit a logistic regression.
# class_weight="balanced" is kept as in the original configuration; the
# recorded hold-out split is imbalanced (514 vs 86 rows).
feature_scaler = StandardScaler()
lr_classifier = LogisticRegression(class_weight="balanced", max_iter=200)
pipe = Pipeline(steps=[("scaler", feature_scaler), ("clf", lr_classifier)])

# Fit on the training rows only, then predict labels for the held-out rows.
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

# Report the feature column that was selected, followed by each evaluation
# metric on the held-out rows under its own heading.
print("== Feature used:", bert_feat)

report_sections = (
  ("\n=== Classification Report (BERT-only, LR) ===", classification_report),
  ("\n=== Confusion Matrix ===", confusion_matrix),
)
for heading, metric_fn in report_sections:
  print(heading)
  print(metric_fn(y_test, y_pred))

== Feature used: bert_similarity

=== Classification Report (BERT-only, LR) ===
              precision    recall  f1-score   support

           0       0.89      0.57      0.70       514
           1       0.18      0.57      0.28        86

    accuracy                           0.57       600
   macro avg       0.54      0.57      0.49       600
weighted avg       0.79      0.57      0.64       600


=== Confusion Matrix ===
[[295 219]
 [ 37  49]]
