In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

# Load the merged similarity dataset; the path is relative to the notebook's working directory.
DATA_PATH = "./merged_for_similarity_rearranged_v3.csv"
df = pd.read_csv(DATA_PATH)

In [33]:
# TF-IDF lexical features: every column whose name contains "cosine" (but not
# "bert") or contains "overlap", matched case-insensitively.
tfidf_feats = [
  c for c in df.columns
  if ("cosine" in c.lower() and "bert" not in c.lower()) or "overlap" in c.lower()
]
print(tfidf_feats)
if not tfidf_feats:
  # Fail early with a readable message instead of an error from deep inside scikit-learn.
  raise ValueError("No TF-IDF feature columns (names containing 'cosine' or 'overlap') were found.")

# Label: the first column whose name contains "is_clickbait" (case-insensitive).
label_col = next((c for c in df.columns if "is_clickbait" in c.lower()), None)
if label_col is None:
  # Without this guard, df[None] below would raise an opaque KeyError.
  raise ValueError("No label column containing 'is_clickbait' was found.")

X = df[tfidf_feats]
y = df[label_col]

# Train/test split: 80/20, stratified on the label, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
  X, y, test_size=0.2, random_state=42, stratify=y
)

# Random Forest: 300 unrestricted-depth trees, with class weights balanced to
# compensate for any label imbalance.
rf = RandomForestClassifier(
  n_estimators=300,
  max_depth=None,
  class_weight="balanced",
  random_state=42
)

rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Evaluation output is currently disabled; uncomment to print the report.
# print("== Features used ==")
# print(tfidf_feats)

# print("\n=== Classification Report (Random Forest + TF-IDF) ===")
# print(classification_report(y_test, y_pred))

# print("\n=== Confusion Matrix ===")
# print(confusion_matrix(y_test, y_pred))

['cosine_similarity', 'token_overlap']


In [35]:
# Feature importances of the fitted random forest, one value per training column
# (presumably in the same order as tfidf_feats — verify against X_train.columns).
importances = rf.feature_importances_
print(importances)

[0.51392372 0.48607628]


In [None]:
# # NOTE: this entire cell is commented out (disabled experiment).
# # TF-IDF lexical + extra features
# tfidf_feats = [c for c in df.columns if ("cosine" in c.lower() and "bert" not in c.lower()) or "overlap" in c.lower()]

# # Extra features (exclude BERT)
# extra_feats = ["title_length","sensational_word_count","punctuation_count","punctuation_ratio"]
# extra_feats = [c for c in extra_feats if c in df.columns]

# # Combine features
# feature_cols = tfidf_feats + extra_feats

# # Label
# label_col = None
# for c in df.columns:
#   if "is_clickbait" in c.lower():
#     label_col = c
#     break

# X = df[feature_cols]
# y = df[label_col]

# # Train/test split
# X_train, X_test, y_train, y_test = train_test_split(
#   X, y, test_size=0.2, random_state=42, stratify=y
# )

# scaler = StandardScaler()

# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)

# # Random Forest
# rf = RandomForestClassifier(
#   n_estimators=300,
#   max_depth=None,
#   class_weight="balanced",
#   random_state=42
# )

# # rf.fit(X_train, y_train)
# rf.fit(X_train_scaled, y_train)
# y_pred = rf.predict(X_test_scaled)

# print("== Features used ==")
# print(feature_cols)

# print("\n=== Classification Report (Random Forest + TF-IDF) ===")
# print(classification_report(y_test, y_pred))

# print("\n=== Confusion Matrix ===")
# print(confusion_matrix(y_test, y_pred))