In [None]:
# Import necessary libraries

# Standard library
import os  # Environment-variable lookup (configurable dataset path)

# Utility for analyzing dataset distributions
from collections import Counter  # Count occurrences of elements in a dataset

import pandas as pd  # Data manipulation and analysis
import seaborn as sns  # Data visualization
import matplotlib.pyplot as plt  # Plotting utilities
import numpy as np  # Numerical computations

# Feature selection methods
from sklearn.feature_selection import mutual_info_classif, SelectFromModel, RFE

# Machine learning models
from sklearn.ensemble import RandomForestClassifier  # Random Forest for feature selection and classification
from sklearn.linear_model import LogisticRegression, LassoCV  # Logistic Regression & Lasso for feature selection
from sklearn.svm import LinearSVC, SVC  # Support Vector Machines (linear & non-linear)

# Data preprocessing and model evaluation
from sklearn.model_selection import train_test_split  # Splitting data into training and testing sets
from sklearn.preprocessing import StandardScaler  # Standardization of feature values


In [None]:
# Load the dataset
# The location can be overridden with the REVIEWS_DATASET_PATH environment
# variable so the notebook is not tied to one machine's directory layout.
# When the variable is unset, the original path is used unchanged.
file_path = os.environ.get(
    "REVIEWS_DATASET_PATH",
    "/Users/patrick/Desktop/Dissertation/Fake review project/data/reviews_with_features.csv",
)
reviews_dataset = pd.read_csv(file_path)

In [None]:
# Build the numeric working frame: keep only float64/int64 columns and take a
# copy so later column assignments never touch `reviews_dataset`.
numeric_cols = (
    reviews_dataset
    .select_dtypes(include=['float64', 'int64'])
    .copy()
)

# If the dtype filter dropped 'label' (i.e. it is not float64/int64), carry the
# original column over as-is so the target is always present.
if 'label' not in numeric_cols:
    numeric_cols['label'] = reviews_dataset['label']

In [None]:
# Manually assigned candidate feature groups (Static / RNN / GNN).
# Note that the groups overlap: "category" is in both the static and GNN lists,
# and "spike_day_reviewers" / "rolling_review_count" are in both the RNN and
# GNN lists.

# Static features: general characteristics, excluding time-based and
# network-based features
static_features = [
    "category",
    "rating",
    "text_length",
    "sentiment",
    "readability",
    "text_length_x_readability",
    "log_text_length",
    "length_sentiment_ratio",
    "is_weekend",
    "avg_rating",
    "rating_deviation",
]

# RNN features: time-based features
rnn_features = [
    "spike_day_reviewers",
    "rolling_review_count",
    "rolling_rating_mean",
    "days_since_last_review",
]

# GNN features: network/group-based features
gnn_features = [
    "category",
    "degree_centrality",
    "rolling_review_count",
    "spike_day_reviewers",
]


In [None]:
# Compute Correlation with Label
# Pairwise correlation of every numeric column; only the 'label' column of the
# matrix is used below.
correlation_matrix = numeric_cols.corr()
correlation_threshold = 0.1
correlation_with_label = correlation_matrix['label'].sort_values(ascending=False)

# The label's correlation with itself is trivially 1.0, so it would always pass
# the threshold and show up as a "selected feature". Exclude it before filtering.
feature_correlations = correlation_with_label.drop(labels='label')
strong_correlations = feature_correlations[feature_correlations.abs() > correlation_threshold]
selected_correlation_features = strong_correlations.index.tolist()

# Convert to DataFrame for Display (rows stay sorted by descending correlation)
correlation_df = pd.DataFrame({
    "Feature": selected_correlation_features,
    "Correlation": strong_correlations.values
})

# Print correlation results; the header is derived from the threshold variable
# so the text cannot drift from the value actually applied.
print(f"\nFeatures with Absolute Correlation > {correlation_threshold}:")
for feature, corr_value in zip(correlation_df["Feature"], correlation_df["Correlation"]):
    print(f"{feature}: {corr_value:.4f}")

# Visualize correlation with a bar chart; dashed lines mark +/- threshold
plt.figure(figsize=(8, 5))
sns.barplot(y=correlation_df["Feature"], x=correlation_df["Correlation"], palette="coolwarm")
plt.axvline(x=correlation_threshold, color="gray", linestyle="--", label=f"Threshold ({correlation_threshold})")
plt.axvline(x=-correlation_threshold, color="gray", linestyle="--")
plt.xlabel("Correlation with Label")
plt.ylabel("Feature")
plt.title("Feature Correlation with Label")
plt.legend()
plt.show()


In [None]:
# Compute Mutual Information for ALL Features
# X: every numeric column except the target, with missing values replaced by 0.
# y: the target column taken from the original frame.
X = numeric_cols.drop(columns=['label'], errors='ignore').fillna(0)
y = reviews_dataset['label']

# random_state is fixed because the estimator is randomised; without a seed the
# scores (and therefore which features clear the threshold) can differ between
# runs.
mutual_info_scores = mutual_info_classif(X, y, discrete_features=False, random_state=42)
mutual_info_df = pd.DataFrame({'Feature': X.columns, 'Mutual_Info_Score': mutual_info_scores}).sort_values(by='Mutual_Info_Score', ascending=False)

# Set mutual information threshold and filter once; the same filtered frame
# feeds both the selected-feature list and the printout.
mutual_info_threshold = 0.02
mutual_info_selected = mutual_info_df[mutual_info_df['Mutual_Info_Score'] > mutual_info_threshold]
selected_mutual_info_features = mutual_info_selected['Feature'].tolist()

print(f"\nFeatures with Mutual Information Score > {mutual_info_threshold}:")
for feature, mi_score in zip(mutual_info_selected['Feature'], mutual_info_selected['Mutual_Info_Score']):
    print(f"{feature}: {mi_score:.4f}")

# Plot Mutual Information Scores (all features, not only the selected ones)
plt.figure(figsize=(12, 6))
sns.barplot(x=mutual_info_df['Mutual_Info_Score'], y=mutual_info_df['Feature'], palette='viridis')
plt.xlabel("Mutual Information Score")
plt.ylabel("Feature")
plt.title("Feature Importance Based on Mutual Information")
plt.show()


In [None]:
# Recursive Feature Elimination (RFE) for Each Model
def _rfe_select(estimator, features, target, feature_names, n_features=10):
    """Run RFE around `estimator` and report which features survive.

    Returns a `(fitted_selector, selected_names)` pair, where `selected_names`
    is the list of entries of `feature_names` flagged by `selector.support_`.
    """
    selector = RFE(estimator, n_features_to_select=n_features)
    selector.fit(features, target)
    return selector, feature_names[selector.support_].tolist()


# Standardise every feature once; all three RFE runs share the scaled matrix.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Logistic Regression
lr_rfe = LogisticRegression(max_iter=5000, solver='liblinear')
rfe_selector_lr, rfe_selected_features_lr = _rfe_select(lr_rfe, X_scaled, y, X.columns)

# Random Forest
rf_rfe = RandomForestClassifier(n_estimators=100, random_state=42)
rfe_selector_rf, rfe_selected_features_rf = _rfe_select(rf_rfe, X_scaled, y, X.columns)

# SVM: RFE ranks features via coef_/feature_importances_, and SVC exposes
# coef_ only with a linear kernel.
svm_rfe = SVC(kernel='linear')
rfe_selector_svm, rfe_selected_features_svm = _rfe_select(svm_rfe, X_scaled, y, X.columns)

# Report the surviving features per model
print("\nFeatures Selected by RFE:")
for model_name, chosen in (
    ("Logistic Regression", rfe_selected_features_lr),
    ("Random Forest", rfe_selected_features_rf),
    ("SVM", rfe_selected_features_svm),
):
    print(f"{model_name} ({len(chosen)} features): {chosen}")


In [None]:
# Lasso Regression for Feature Selection
# Cross-validated Lasso (5 folds) over 100 log-spaced alphas in [1e-3, 1e1],
# fitted on the standardised features.
lasso = LassoCV(
    alphas=np.logspace(-3, 1, 100),
    cv=5,
    max_iter=5000,
    tol=1e-4,
    random_state=42,
)
lasso.fit(X_scaled, y)

# Keep the features whose |coefficient| lies strictly above the 60th percentile
# of all |coefficients|.
abs_coefs = np.abs(lasso.coef_)
threshold = np.percentile(abs_coefs, 60)
lasso_selected_features = X.columns[abs_coefs > threshold].tolist()

# Print the features selected by Lasso Regression
print("\nFeatures Selected by Lasso Regression:")
print(f"({len(lasso_selected_features)} features): {lasso_selected_features}")


In [None]:
# Train-Test Split for Feature Importance Analysis
# 70/30 stratified split; only the training portion is used to fit the models
# below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Standardise the training features once; both linear models share the result.
X_train_scaled = StandardScaler().fit_transform(X_train)

# Random Forest Feature Importance (fitted on the unscaled training features)
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)
rf_importances = pd.DataFrame({'Feature': X.columns, 'Importance': rf_model.feature_importances_}).sort_values(by='Importance', ascending=False)
rf_top_features = rf_importances['Feature'].tolist()[:10]

# Logistic Regression Coefficients
# 'saga' is a stochastic solver, so a seed is set for run-to-run reproducibility.
lr_model = LogisticRegression(max_iter=5000, solver='saga', random_state=42)
lr_model.fit(X_train_scaled, y_train)
lr_coefficients = pd.DataFrame({'Feature': X.columns, 'Coefficient': lr_model.coef_[0]})
# Rank by coefficient magnitude: a large negative coefficient is as informative
# as a large positive one, so sorting by the signed value would drop the
# strongest negative predictors from the top 10.
abs_coefficient_order = lr_coefficients['Coefficient'].abs().sort_values(ascending=False).index
lr_coefficients = lr_coefficients.loc[abs_coefficient_order]
lr_top_features = lr_coefficients['Feature'].tolist()[:10]

# SVM Feature Selection: SelectFromModel on the already-fitted linear SVM, with
# its default threshold.
svm_model = LinearSVC(dual=False, max_iter=5000)
svm_model.fit(X_train_scaled, y_train)
selector = SelectFromModel(svm_model, prefit=True)
svm_top_features = X.columns[selector.get_support()].tolist()

# Print Top Features Selected by Random Forest
print("\nTop Features Selected by Random Forest (Feature Importance):")
print(f"({len(rf_top_features)} features): {rf_top_features}")

# Print Top Features Selected by Logistic Regression
print("\nTop Features Selected by Logistic Regression (Coefficient-Based):")
print(f"({len(lr_top_features)} features): {lr_top_features}")

# Print Top Features Selected by SVM
print("\nTop Features Selected by SVM (SelectFromModel):")
print(f"({len(svm_top_features)} features): {svm_top_features}")


In [None]:
# Define Model-Specific Feature Sets
def _keep_selected(candidates, *selections):
    """Return the candidates picked by at least one selection method.

    The result keeps the order of `candidates` (de-duplicated). Building it
    with `list(set(a) & set(b))` would give an arbitrary order for string
    elements that can change between kernel restarts, making the column order
    fed to downstream models non-reproducible.
    """
    picked = set().union(*selections)
    return [name for name in dict.fromkeys(candidates) if name in picked]


# Traditional Models (Static Features Only)
rf_features = _keep_selected(static_features, rfe_selected_features_rf, rf_top_features)
lr_features = _keep_selected(static_features, lasso_selected_features, rfe_selected_features_lr, lr_top_features)
svm_features = _keep_selected(static_features, rfe_selected_features_svm, svm_top_features, selected_mutual_info_features)

# Neural Networks (Time-Based & Graph-Based Features)
# These two assignments narrow the existing candidate lists under the same
# names; re-running the cell yields the same result because filtering an
# already-filtered list against the same selections is a no-op.
gnn_features = _keep_selected(gnn_features, rfe_selected_features_rf, rf_top_features, selected_mutual_info_features)
rnn_features = _keep_selected(rnn_features, rfe_selected_features_lr, lasso_selected_features, selected_mutual_info_features)

# Display Final Feature Sets
print("\nFinal Features Selected for Each Model:")
print(f"Random Forest Features ({len(rf_features)}): {rf_features}")
print(f"Logistic Regression Features ({len(lr_features)}): {lr_features}")
print(f"SVM Features ({len(svm_features)}): {svm_features}")
print(f"GNN Features ({len(gnn_features)}): {gnn_features}")
print(f"RNN Features ({len(rnn_features)}): {rnn_features}")