In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix

# Load the dataset from the CSV in the working directory; every later cell reads from `df`.
df = pd.read_csv(filepath_or_buffer="./new_preprocessed_final.csv")

In [4]:
# Build the feature matrix X and the target y from `df` by matching column names.
# Raises ValueError if the label column or every feature column is missing, so a
# schema mismatch is reported here rather than as an opaque KeyError / sklearn
# error later on.

# Label column: first column whose name contains "is_clickbait" (case-insensitive)
label_col = next((c for c in df.columns if "is_clickbait" in c.lower()), None)
if label_col is None:
  raise ValueError(
    "No label column found: expected a column whose name contains 'is_clickbait'. "
    f"Available columns: {list(df.columns)}"
  )

# TF-IDF / token overlap: non-BERT "cosine" columns plus any "overlap" column
tfidf_feats = [
  c for c in df.columns
  if ("cosine" in c.lower() and "bert" not in c.lower()) or "overlap" in c.lower()
]

# BERT similarity: first column matching "bert_similarity" or a BERT-based cosine
bert_feat = next(
  (
    c for c in df.columns
    if "bert_similarity" in c.lower() or ("cosine" in c.lower() and "bert" in c.lower())
  ),
  None,
)

# Extra linguistic features (only the ones actually present in df)
extra_feats = ["title_length","sensational_word_count","punctuation_count","punctuation_ratio"]
extra_feats = [c for c in extra_feats if c in df.columns]

# Combine all, keeping first-seen order. dict.fromkeys drops a column that matched
# more than one group (e.g. a BERT column whose name also contains "overlap"), and
# the label is filtered out so the target can never leak into X.
candidate_cols = tfidf_feats + ([bert_feat] if bert_feat else []) + extra_feats
feature_cols = [c for c in dict.fromkeys(candidate_cols) if c != label_col]
if not feature_cols:
  raise ValueError(
    "No feature columns found: expected 'cosine'/'overlap'/'bert_similarity' columns "
    f"or one of the extra linguistic features. Available columns: {list(df.columns)}"
  )

X = df[feature_cols]
y = df[label_col]

# Train-test split: hold out 20% of the rows, stratified on y, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
  X,
  y,
  stratify=y,
  test_size=0.2,
  random_state=42,
)

# Logistic Regression pipeline: standardize the features, then fit a
# class-weighted logistic regression
pipeline_steps = [
  ("scaler", StandardScaler()),
  ("clf", LogisticRegression(class_weight="balanced", max_iter=400)),
]
pipe = Pipeline(steps=pipeline_steps)

# fit() returns the fitted pipeline, so the test-set prediction is chained onto it
y_pred = pipe.fit(X_train, y_train).predict(X_test)

# Each section is a heading plus a deferred body, so every heading is printed
# before its body is computed.
report_sections = [
  ("=== Features Used ===", lambda: feature_cols),
  (
    "\n=== Classification Report (LR + TF-IDF + Token + BERT + Extras) ===",
    lambda: classification_report(y_test, y_pred),
  ),
  ("\n=== Confusion Matrix ===", lambda: confusion_matrix(y_test, y_pred)),
]

for heading, build_body in report_sections:
  print(heading)
  print(build_body())

=== Features Used ===
['cosine_similarity', 'token_overlap', 'bert_similarity', 'title_length', 'sensational_word_count', 'punctuation_count', 'punctuation_ratio']

=== Classification Report (LR + TF-IDF + Token + BERT + Extras) ===
              precision    recall  f1-score   support

           0       0.99      0.91      0.95       514
           1       0.65      0.97      0.78        86

    accuracy                           0.92       600
   macro avg       0.82      0.94      0.86       600
weighted avg       0.94      0.92      0.93       600


=== Confusion Matrix ===
[[469  45]
 [  3  83]]
