In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Load the preprocessed dataset into `df`; the later cells look up their
# feature and label columns in this frame. The path is relative ("./"), so the
# CSV must sit in the directory the notebook kernel is running from.
df = pd.read_csv("./new_preprocessed_final.csv")

In [11]:
# --- Experiment: Random Forest trained on the BERT similarity feature alone ---

# Locate the BERT similarity column: the first column whose lower-cased name
# contains "bert_similarity", or contains both "cosine" and "bert".
bert_feat = next(
  (
    col for col in df.columns
    if "bert_similarity" in col.lower()
    or ("cosine" in col.lower() and "bert" in col.lower())
  ),
  None,
)
if bert_feat is None:
  raise ValueError("BERT similarity feature not found.")

# Locate the label column: the first column whose lower-cased name contains
# "is_clickbait".
label_col = next(
  (col for col in df.columns if "is_clickbait" in col.lower()),
  None,
)
if label_col is None:
  raise ValueError("Label column not found.")

# Single-column feature frame (double brackets keep it 2-D) and label series.
X, y = df[[bert_feat]], df[label_col]

# Hold out 20% of the rows for testing; stratifying on y keeps the class ratio
# the same in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
  X, y, stratify=y, test_size=0.2, random_state=42
)

# Build and fit the forest in one step (fit returns the estimator itself).
# class_weight="balanced" reweights classes inversely to their frequency.
rf = RandomForestClassifier(
  random_state=42,
  class_weight="balanced",
  max_depth=None,
  n_estimators=300,
).fit(X_train, y_train)

# Predict on the held-out rows.
y_pred = rf.predict(X_test)

# Report which feature was used and how the model performed.
print("=== Feature Used ===", bert_feat, sep="\n")
print(
  "\n=== Classification Report (RF + BERT Only) ===",
  classification_report(y_test, y_pred),
  sep="\n",
)
print("\n=== Confusion Matrix ===", confusion_matrix(y_test, y_pred), sep="\n")

=== Feature Used ===
bert_similarity

=== Classification Report (RF + BERT Only) ===
              precision    recall  f1-score   support

           0       0.86      0.88      0.87       514
           1       0.16      0.14      0.15        86

    accuracy                           0.77       600
   macro avg       0.51      0.51      0.51       600
weighted avg       0.76      0.77      0.77       600


=== Confusion Matrix ===
[[451  63]
 [ 74  12]]


In [12]:
# --- Experiment: Random Forest trained on BERT similarity + extra features ---

# Locate the BERT similarity column: the first column whose lower-cased name
# contains "bert_similarity", or contains both "cosine" and "bert".
bert_feat = next(
  (
    c for c in df.columns
    if "bert_similarity" in c.lower() or ("cosine" in c.lower() and "bert" in c.lower())
  ),
  None,
)
if bert_feat is None:
  raise ValueError("BERT similarity feature not found.")

# Locate the label column: the first column whose lower-cased name contains
# "is_clickbait".
label_col = next((c for c in df.columns if "is_clickbait" in c.lower()), None)
if label_col is None:
  raise ValueError("Label column not found.")

# Extra features to add on top of the BERT similarity. Only those actually
# present in the data are used; any that are missing are reported instead of
# being dropped silently, so the run cannot quietly degrade to BERT-only.
requested_feats = ["title_length","sensational_word_count","punctuation_count","punctuation_ratio"]
extra_feats = [c for c in requested_feats if c in df.columns]
missing_feats = [c for c in requested_feats if c not in df.columns]
if missing_feats:
  print(f"Skipping extra features not present in the data: {missing_feats}")

# Combine features
feature_cols = [bert_feat] + extra_feats

# Feature and label
X = df[feature_cols]
y = df[label_col]

# Hold out 20% of the rows for testing; stratifying on y keeps the class ratio
# the same in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
  X, y, test_size=0.2, random_state=42, stratify=y
)

# Random Forest model; class_weight="balanced" reweights classes inversely to
# their frequency in the training labels.
rf = RandomForestClassifier(
  n_estimators=300,
  max_depth=None,
  class_weight="balanced",
  random_state=42
)

# Train and predict on the held-out rows
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Results
print("=== Feature Used ===")
print(feature_cols)

# Header names this experiment's actual feature set (it previously repeated
# the "BERT Only" label copied from the single-feature cell).
print("\n=== Classification Report (RF + BERT + Extra Features) ===")
print(classification_report(y_test, y_pred))

print("\n=== Confusion Matrix ===")
print(confusion_matrix(y_test, y_pred))

=== Feature Used ===
['bert_similarity', 'title_length', 'sensational_word_count', 'punctuation_count', 'punctuation_ratio']

=== Classification Report (RF + BERT Only) ===
              precision    recall  f1-score   support

           0       0.86      0.95      0.90       514
           1       0.22      0.09      0.13        86

    accuracy                           0.82       600
   macro avg       0.54      0.52      0.52       600
weighted avg       0.77      0.82      0.79       600


=== Confusion Matrix ===
[[486  28]
 [ 78   8]]
