In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Load the dataset from the CSV file located next to the notebook.
df = pd.read_csv(filepath_or_buffer="./new_preprocessed_final.csv")

In [2]:
# Identify TF-IDF based features
# (cosine similarity + token overlap as lexical TF-IDF signals).
# A column qualifies when its lower-cased name contains "cosine" but not
# "bert", or when it contains "overlap". Both rules live in one condition so
# a column whose name satisfies both is listed only once; a repeated name
# would make df[tfidf_features] return that column twice.
tfidf_features = [
  c for c in df.columns
  if ("cosine" in c.lower() and "bert" not in c.lower()) or "overlap" in c.lower()
]

# Fail early with a clear message instead of fitting on zero feature columns.
if not tfidf_features:
  raise ValueError("Fitur TF-IDF tidak ditemukan. Pastikan ada kolom cosine / overlap.")

# Label column: take the first column whose lower-cased name contains
# "is_clickbait"; any later matching column is ignored.
label_col = next((c for c in df.columns if "is_clickbait" in c.lower()), None)

if label_col is None:
  raise ValueError("Label kolom tidak ditemukan. Pastikan ada kolom is_clickbait_auto / is_clickbait_bert.")

# Feature matrix (selected TF-IDF columns) and target vector (label column)
X = df[tfidf_features]
y = df[label_col]

# Logistic Regression pipeline: standardize the features, then fit a
# class-weighted logistic regression.
pipe = Pipeline(steps=[
  ("scaler", StandardScaler()),
  ("clf", LogisticRegression(class_weight="balanced", max_iter=200)),
])

# Train/test split: 20% held out, stratified on the label, fixed seed
X_train, X_test, y_train, y_test = train_test_split(
  X,
  y,
  test_size=0.2,
  stratify=y,
  random_state=42,
)

# Fit on the training rows, then predict labels for the held-out rows
y_pred = pipe.fit(X_train, y_train).predict(X_test)

# Output evaluation: list the selected feature columns, then print each
# metric under its own heading (heading and value separated by a newline).
print("== Features used:", tfidf_features)
print("\n=== Classification Report ===", classification_report(y_test, y_pred), sep="\n")
print("\n=== Confusion Matrix ===", confusion_matrix(y_test, y_pred), sep="\n")

== Features used: ['cosine_similarity', 'token_overlap']

=== Classification Report ===
              precision    recall  f1-score   support

           0       1.00      0.92      0.96       514
           1       0.67      0.98      0.79        86

    accuracy                           0.93       600
   macro avg       0.83      0.95      0.87       600
weighted avg       0.95      0.93      0.93       600


=== Confusion Matrix ===
[[472  42]
 [  2  84]]
