In [None]:
# Import Dependencies

# General Libraries for numerical computations and data handling
import numpy as np  # Numerical operations
import pandas as pd  # Data manipulation and analysis
import torch  # PyTorch framework for deep learning

# Machine Learning & Feature Selection
from sklearn.model_selection import train_test_split, GridSearchCV  # Data splitting and hyperparameter tuning
from sklearn.preprocessing import StandardScaler, MinMaxScaler  # Feature scaling
from sklearn.ensemble import RandomForestClassifier  # Random Forest classifier
from sklearn.linear_model import LogisticRegression  # Logistic Regression model
from sklearn.svm import SVC  # Support Vector Classifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix  # Model evaluation metrics

# Deep Learning (GNN & RNN)
import torch.nn as nn  # Neural network components in PyTorch
import torch.optim as optim  # Optimization algorithms for training
from torch_geometric.data import Data  # Graph data structure for GNNs
from torch_geometric.nn import GCNConv, SAGEConv  # GNN layers (Graph Convolutional Network & GraphSAGE)

# Recurrent Neural Network (RNN) with Keras
from keras.models import Sequential  # Sequential API for building neural networks
from keras.layers import LSTM, Dense, Bidirectional, Dropout  # LSTM-based architecture layers


In [None]:
# Load the engineered review dataset into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook only runs where it exists.
file_path = "/Users/patrick/Desktop/Dissertation/Fake review project/data/reviews_with_features.csv"
reviews_dataset = pd.read_csv(filepath_or_buffer=file_path)

# Column groups: each list names the reviews_dataset columns fed to one model family.

# Random Forest (RF)
rf_features = [
    'category',
    'text_length_x_readability',
    'text_length',
    'readability',
    'length_sentiment_ratio',
    'sentiment',
    'log_text_length',
]

# Logistic Regression (LR)
lr_features = [
    'category',
    'rating',
    'text_length_x_readability',
    'text_length',
    'readability',
    'length_sentiment_ratio',
    'avg_rating',
    'sentiment',
    'rating_deviation',
]

# Support Vector Machine (SVM)
svm_features = [
    'category',
    'rating',
    'text_length_x_readability',
    'text_length',
    'readability',
    'length_sentiment_ratio',
    'sentiment',
    'rating_deviation',
    'log_text_length',
]

# Graph Neural Network (GNN) node features
gnn_features = ["category", "rolling_review_count"]

# Recurrent Neural Network (RNN) temporal features.
# NOTE(review): the RNN preparation cell assigns a different list to this same name before
# it is ever read, so this definition is currently unused. Confirm which list is intended.
rnn_features = ['rolling_review_count', 'rolling_rating_mean', 'days_since_last_review']


In [None]:
# Logistic Regression Model with Hyperparameter Tuning
print("Training Logistic Regression with Hyperparameter Tuning...")

# Raw (unscaled) LR inputs; missing values are replaced with 0.
lr_feature_frame = reviews_dataset[lr_features].fillna(0)
y = reviews_dataset['label']  # Target variable

# Split BEFORE scaling (80/20, stratified by label) so that the scaler statistics are
# learned from the training rows only and the test rows stay unseen.
X_train_lr_raw, X_test_lr_raw, y_train_lr, y_test_lr = train_test_split(
    lr_feature_frame, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training split only, then apply the same transform to the test split.
scaler_lr = StandardScaler()
X_train_lr = scaler_lr.fit_transform(X_train_lr_raw)
X_test_lr = scaler_lr.transform(X_test_lr_raw)

# Full scaled matrix (all rows, original order); the LR hybrid cell reads X_lr.
X_lr = scaler_lr.transform(lr_feature_frame)

# Define hyperparameter grid for tuning
param_grid = {'C': [0.01, 0.1, 1, 10, 100], 'solver': ['lbfgs', 'liblinear']}

# 5-fold cross-validated grid search over the grid above.
# NOTE(review): the CV folds still share one scaler fitted on the whole training split;
# wrapping scaler + model in a Pipeline would remove that remaining (small) leak.
grid_search = GridSearchCV(
    LogisticRegression(random_state=42, max_iter=1000, class_weight='balanced'),
    param_grid,
    cv=5,
)
grid_search.fit(X_train_lr, y_train_lr)
print("Best Parameters:", grid_search.best_params_)

# Best estimator, refitted by GridSearchCV on the full training split
best_lr_model = grid_search.best_estimator_

# Predict and evaluate on the held-out test split
y_pred_lr = best_lr_model.predict(X_test_lr)
accuracy = accuracy_score(y_test_lr, y_pred_lr)
conf_matrix = confusion_matrix(y_test_lr, y_pred_lr)

# Display evaluation metrics
print(f"Logistic Regression Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test_lr, y_pred_lr))
print(f"False Positives: {conf_matrix[0,1]}, False Negatives: {conf_matrix[1,0]}")


In [None]:
# Random Forest Model
print("Training Random Forest Model...")

# Raw (unscaled) RF inputs. Missing values are replaced with 0, matching the Logistic
# Regression cell, so NaNs cannot reach the model or the hybrid network that reuses X_rf.
rf_feature_frame = reviews_dataset[rf_features].fillna(0)
y = reviews_dataset["label"]  # Target variable

# Split BEFORE scaling (80/20, stratified by label) so the scaler never sees test rows.
X_train_rf_raw, X_test_rf_raw, y_train_rf, y_test_rf = train_test_split(
    rf_feature_frame, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training split only, then apply it to the test split.
scaler_rf = StandardScaler()
X_train_rf = scaler_rf.fit_transform(X_train_rf_raw)
X_test_rf = scaler_rf.transform(X_test_rf_raw)

# Full scaled matrix (all rows, original order); the RF hybrid cell reads X_rf.
X_rf = scaler_rf.transform(rf_feature_frame)

# Initialize and train the Random Forest model with 100 estimators
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X_train_rf, y_train_rf)

# Predict and evaluate on the held-out test split
y_pred_rf = rf_model.predict(X_test_rf)
rf_accuracy = accuracy_score(y_test_rf, y_pred_rf)
conf_matrix_rf = confusion_matrix(y_test_rf, y_pred_rf)

# Display evaluation results
print("Random Forest Training Completed.")
print(f"Random Forest Accuracy: {rf_accuracy:.4f}")
print("\nClassification Report (Random Forest):")
print(classification_report(y_test_rf, y_pred_rf))
print(f"False Positives: {conf_matrix_rf[0,1]}, False Negatives: {conf_matrix_rf[1,0]}")


In [None]:
# Support Vector Machine (SVM) Model
print("Training Support Vector Machine (SVM) Model...")

# Raw (unscaled) SVM inputs. Missing values are replaced with 0, matching the Logistic
# Regression cell; SVC cannot be fitted on NaN values.
svm_feature_frame = reviews_dataset[svm_features].fillna(0)
y = reviews_dataset["label"]  # Target variable

# Split BEFORE scaling (80/20, stratified by label) so the scaler never sees test rows.
X_train_svm_raw, X_test_svm_raw, y_train_svm, y_test_svm = train_test_split(
    svm_feature_frame, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training split only, then apply it to the test split.
scaler_svm = StandardScaler()
X_train_svm = scaler_svm.fit_transform(X_train_svm_raw)
X_test_svm = scaler_svm.transform(X_test_svm_raw)

# Full scaled matrix (all rows, original order), kept under the name this cell always defined.
X_svm = scaler_svm.transform(svm_feature_frame)

# RBF-kernel SVM; probability=True enables predict_proba
svm_model = SVC(kernel="rbf", random_state=42, probability=True)
svm_model.fit(X_train_svm, y_train_svm)

# Predict and evaluate on the held-out test split
y_pred_svm = svm_model.predict(X_test_svm)
svm_accuracy = accuracy_score(y_test_svm, y_pred_svm)
conf_matrix_svm = confusion_matrix(y_test_svm, y_pred_svm)

# Display evaluation results
print("SVM Training Completed.")
print(f"SVM Model Accuracy: {svm_accuracy:.4f}")
print("\nClassification Report (SVM):")
print(classification_report(y_test_svm, y_pred_svm))
print(f"False Positives: {conf_matrix_svm[0,1]}, False Negatives: {conf_matrix_svm[1,0]}")


In [None]:
# Prepare Graph Data for Graph Neural Network (GNN)
# Builds a torch_geometric Data object with one node per dataset row: node features come
# from gnn_features, labels from the 'label' column, and edges from (user_id, category).
print("Preparing Graph Data for GNN...")

# Ensure 'user_id' column exists; if missing, create a unique ID for each row.
# This adds a column to the shared reviews_dataset frame in place.
if 'user_id' not in reviews_dataset.columns:
    total_users = len(reviews_dataset)
    reviews_dataset['user_id'] = np.arange(total_users)

# Construct the 2 x num_rows edge index: row 0 holds user_id values, row 1 holds category values.
# NOTE(review): both rows are interpreted as NODE indices, while the nodes below are dataset
# rows. An edge therefore links "row number user_id" to "row number category" — confirm this
# is the intended graph, and that every value is an integer in [0, number of rows).
edge_index_array = np.vstack([
    reviews_dataset['user_id'].values,
    reviews_dataset['category'].values
])
edge_index = torch.tensor(edge_index_array, dtype=torch.long)

# Min-max scale the numeric columns.
# The scaled values are written back into reviews_dataset, so every later cell that reads
# these two columns sees the scaled version.
# NOTE(review): 'rolling_rating_mean' is scaled here but is not part of gnn_features, and
# 'category' is part of gnn_features but is not scaled — confirm both are intentional.
scaler_gnn = MinMaxScaler()
numerical_gnn_features = ['rolling_review_count', 'rolling_rating_mean']
reviews_dataset[numerical_gnn_features] = scaler_gnn.fit_transform(reviews_dataset[numerical_gnn_features])

# Convert features and labels to PyTorch tensors (float features, integer class labels)
node_features = torch.tensor(reviews_dataset[gnn_features].values, dtype=torch.float)
labels = torch.tensor(reviews_dataset['label'].values, dtype=torch.long)

# Create a PyTorch Geometric Data object for GNN processing
data = Data(x=node_features, edge_index=edge_index, y=labels)

# Debugging output: basic size checks of the assembled graph
print("GNN Data Preparation Completed.")
print(f"Number of Nodes: {data.num_nodes}")
print(f"Number of Edges: {data.num_edges}")
print(f"Node Feature Matrix Shape: {data.x.shape}")
print(f"Labels Shape: {data.y.shape}")


In [None]:
# Graph Neural Network (GNN) model definition
print("\nInitializing GNN Model...")

class EnhancedGNNModel(nn.Module):
    """Three stacked SAGEConv layers producing one output vector per node.

    Layer widths are input_dim -> hidden_dim -> hidden_dim // 2 -> output_dim.
    ReLU and dropout (p=0.4) follow the first two layers; the last layer's
    output is returned without an activation.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        bottleneck_dim = hidden_dim // 2
        self.conv1 = SAGEConv(input_dim, hidden_dim)
        self.conv2 = SAGEConv(hidden_dim, bottleneck_dim)
        self.conv3 = SAGEConv(bottleneck_dim, output_dim)
        self.dropout = nn.Dropout(0.4)

    def forward(self, data):
        """Run the network on a graph object exposing `x` and `edge_index`."""
        edge_index = data.edge_index
        hidden = data.x
        # The two hidden layers share the same conv -> ReLU -> dropout pattern
        for conv in (self.conv1, self.conv2):
            hidden = self.dropout(torch.relu(conv(hidden, edge_index)))
        return self.conv3(hidden, edge_index)

# Initialize model, optimizer, and loss function
hidden_dim = 64
gnn_model = EnhancedGNNModel(input_dim=node_features.size(1), hidden_dim=hidden_dim, output_dim=2)
optimizer_gnn = optim.Adam(gnn_model.parameters(), lr=0.01, weight_decay=1e-4)  # Adam optimizer with weight decay
loss_function = nn.CrossEntropyLoss()  # Cross-entropy loss for classification

print("GNN Model Initialized.")

# Hold out 20% of the nodes for evaluation, with the same split settings as the classical
# models. Without this, the accuracy reported below is a training accuracy and is not
# comparable with the held-out accuracies printed in the comparison table.
# All nodes still take part in message passing; only the held-out labels are hidden
# from the loss.
all_node_labels = data.y.cpu().numpy()
gnn_train_rows, gnn_test_rows = train_test_split(
    np.arange(len(all_node_labels)),
    test_size=0.2,
    random_state=42,
    stratify=all_node_labels,
)
gnn_train_idx = torch.as_tensor(gnn_train_rows, dtype=torch.long, device=data.y.device)

# Train the GNN Model
print("\nTraining GNN Model...")
gnn_model.train()
for epoch in range(100):  # Train for 100 epochs
    optimizer_gnn.zero_grad()
    gnn_output = gnn_model(data)  # Forward pass over the whole graph
    loss_gnn = loss_function(gnn_output[gnn_train_idx], data.y[gnn_train_idx])  # Loss on training nodes only
    loss_gnn.backward()  # Backpropagation
    optimizer_gnn.step()  # Update weights

    # Print progress every 5 epochs (accuracy is measured on the training nodes)
    if (epoch + 1) % 5 == 0 or epoch == 0:
        gnn_predictions = gnn_output[gnn_train_idx].argmax(dim=1).detach().cpu().numpy()
        gnn_accuracy = accuracy_score(all_node_labels[gnn_train_rows], gnn_predictions)
        print(f"Epoch {epoch + 1}: Loss = {loss_gnn.item():.4f}, Accuracy = {gnn_accuracy:.4f}")

print("GNN Training Completed.")

# Extract GNN Embeddings for Hybrid Model (one row per node, all nodes).
# A single eval-mode forward pass serves both the embeddings and the evaluation below.
print("\nExtracting GNN Embeddings...")
gnn_model.eval()
with torch.no_grad():
    gnn_output = gnn_model(data)
gnn_embeddings = gnn_output.cpu().numpy()  # Convert tensor to NumPy array

print("GNN Embeddings Extracted.")

# Evaluate the trained GNN Model
print("\nEvaluating GNN Model...")
gnn_predictions = gnn_output.argmax(dim=1).cpu().numpy()  # Predictions for every node
true_labels = all_node_labels  # True labels for every node

# Metrics are computed on the held-out nodes only
gnn_test_true = true_labels[gnn_test_rows]
gnn_test_pred = gnn_predictions[gnn_test_rows]

# labels=[0, 1] keeps the matrix 2x2 even if one class is absent from the held-out nodes
conf_matrix = confusion_matrix(gnn_test_true, gnn_test_pred, labels=[0, 1])

# Extract false positives and false negatives
false_positives = conf_matrix[0, 1]  # Misclassified as positive
false_negatives = conf_matrix[1, 0]  # Misclassified as negative

# Print final evaluation results
gnn_accuracy = accuracy_score(gnn_test_true, gnn_test_pred)
print(f"GNN Model Accuracy: {gnn_accuracy:.4f}")
print("\nGNN Classification Report:")
print(classification_report(gnn_test_true, gnn_test_pred))
print(f"\nFalse Positives: {false_positives}, False Negatives: {false_negatives}")


In [None]:
# Sequential data preparation for the Recurrent Neural Network (RNN)
print("\nPreparing Sequential Data for RNN...")

# Columns fed to the RNN.
# NOTE(review): this replaces the rnn_features list defined in the feature-group cell with
# different columns — confirm which list is intended.
rnn_features = ['spike_day_reviewers', 'rolling_review_count']

# Min-max scale the RNN columns only; the source frame itself is left untouched here
scaler_rnn = MinMaxScaler()
rnn_feature_frame = reviews_dataset[rnn_features]
temporal_data = scaler_rnn.fit(rnn_feature_frame).transform(rnn_feature_frame)

# Sliding-window builder for RNN training data
def create_sequences(data, labels, seq_length=10):
    """Cut `data` into overlapping fixed-length windows with one label per window.

    Window i covers rows i .. i + seq_length - 1 of `data`; its label is
    `labels[i + seq_length]`, i.e. the label of the row immediately AFTER the
    window, not of the window's last row.

    Args:
        data: indexable sequence of per-row feature vectors.
        labels: indexable sequence of labels, at least as long as `data`.
        seq_length: number of consecutive rows per window.

    Returns:
        Tuple `(sequences, seq_labels)` of NumPy arrays holding
        `len(data) - seq_length` windows and labels. Both arrays are empty
        when `data` has no more than `seq_length` rows.
    """
    window_starts = range(len(data) - seq_length)
    windows = [data[start:start + seq_length] for start in window_starts]
    targets = [labels[start + seq_length] for start in window_starts]
    return np.array(windows), np.array(targets)

# Build the RNN inputs: windows of 10 consecutive scaled rows, each with one label
review_labels = reviews_dataset['label'].values
rnn_input, rnn_labels = create_sequences(data=temporal_data, labels=review_labels, seq_length=10)

# Report the resulting array shapes as a quick sanity check
print(
    f"RNN Input Shape: {rnn_input.shape}",
    f"RNN Labels Shape: {rnn_labels.shape}",
    "RNN Data Preparation Completed.",
    sep="\n",
)


In [None]:
# Define and Train Optimized Recurrent Neural Network (RNN) Model
print("\nInitializing RNN Model...")

from keras.layers import Input

# Define the RNN model with an explicit input layer
rnn_model = Sequential([
    Input(shape=(rnn_input.shape[1], rnn_input.shape[2])),  # Input layer defining sequence length and features
    Bidirectional(LSTM(64, return_sequences=True, recurrent_dropout=0.2)),  # First bidirectional LSTM layer
    LSTM(32, recurrent_dropout=0.2),  # Second LSTM layer
    Dropout(0.3),  # Dropout for regularization
    Dense(32, activation='relu'),  # Fully connected layer with ReLU activation
    Dense(1, activation='sigmoid')  # Output layer for binary classification
])

# Compile the model using Adam optimizer and binary cross-entropy loss
rnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
print("RNN Model Initialized.")

# Hold out the last 20% of the windows for evaluation. Without this, the accuracy reported
# below is measured on the very windows the model was fitted on.
# The split is by position rather than random because neighbouring windows overlap in all
# but one row, so a random split would put near-duplicates of training windows in the
# evaluation set.
# NOTE(review): this assumes row order is time order, as create_sequences already does.
rnn_labels = rnn_labels.reshape(-1)  # Flatten labels array
rnn_split_at = int(len(rnn_input) * 0.8)
rnn_train_input = rnn_input[:rnn_split_at]
rnn_train_labels = rnn_labels[:rnn_split_at]
rnn_test_labels = rnn_labels[rnn_split_at:]

# Train the RNN model on the training windows only
print("\nTraining RNN Model...")
rnn_model.fit(rnn_train_input, rnn_train_labels, epochs=15, batch_size=32, verbose=1)
print("RNN Training Completed.")

# Extract RNN Embeddings for Hybrid Model (one sigmoid output per window, all windows)
print("\nExtracting RNN Embeddings...")
rnn_embeddings = rnn_model.predict(rnn_input)
print("RNN Embeddings Extracted.")

# Evaluate RNN Model
print("\nEvaluating RNN Model...")

# Binary predictions for every window; metrics use the held-out tail only
rnn_predictions = (rnn_embeddings > 0.5).astype(int).flatten()
rnn_test_predictions = rnn_predictions[rnn_split_at:]

# Compute accuracy score on the held-out windows
rnn_accuracy = accuracy_score(rnn_test_labels, rnn_test_predictions)

# Display classification results
print(f"RNN Model Accuracy: {rnn_accuracy:.4f}")
print("\nRNN Classification Report:")
print(classification_report(rnn_test_labels, rnn_test_predictions))

# Compute False Positives & False Negatives on the held-out windows
false_positives = np.sum((rnn_test_predictions == 1) & (rnn_test_labels == 0))  # Incorrectly predicted as positive
false_negatives = np.sum((rnn_test_predictions == 0) & (rnn_test_labels == 1))  # Incorrectly predicted as negative

print(f"False Positives: {false_positives}, False Negatives: {false_negatives}")


In [None]:
# LOGISTIC REGRESSION HYBRID MODEL

# Row alignment.
# create_sequences() pairs window i with the label of dataset row i + seq_length, so
# rnn_embeddings[j] describes dataset row j + seq_offset, while X_lr, gnn_embeddings and y
# are indexed by dataset row directly. Cutting every array to its first min_length rows
# would therefore pair each RNN score with the features and label of a different review.
# Aligned local copies are built instead of overwriting the shared arrays, so this cell
# can be re-run and later cells still see the untruncated inputs.
seq_offset = len(y) - len(rnn_input)  # dataset rows consumed before the first RNN output
row_stop = min(len(X_lr), len(gnn_embeddings), len(y), seq_offset + len(rnn_embeddings))
min_length = row_stop - seq_offset  # number of aligned rows
assert seq_offset >= 0 and min_length > 0, "RNN embeddings do not line up with the dataset rows"

X_lr_aligned = X_lr[seq_offset:row_stop]  # Logistic Regression features
gnn_aligned = gnn_embeddings[seq_offset:row_stop]
rnn_aligned = rnn_embeddings[:min_length]
labels = torch.tensor(y.values[seq_offset:row_stop], dtype=torch.float).view(-1)

print("\nInitializing LR Hybrid Model...")
print("LR Hybrid Model Initialized.")

# Normalize the concatenated base features using MinMax scaling
# NOTE(review): the scaler is fitted before the train/test split below, and the GNN/RNN
# embeddings come from models fitted in earlier cells — check that neither has seen the
# rows that end up in this test split.
scaler = MinMaxScaler()
X_combined = np.hstack([X_lr_aligned, gnn_aligned, rnn_aligned])
X_combined_normalized = scaler.fit_transform(X_combined)

# Pairwise interaction features: every column of one component times every column of another
X_lr_exp = np.expand_dims(X_lr_aligned, axis=2)
gnn_exp = np.expand_dims(gnn_aligned, axis=1)
rnn_exp = np.expand_dims(rnn_aligned, axis=1)

lr_gnn_interaction = (X_lr_exp * gnn_exp).reshape(min_length, -1)
lr_rnn_interaction = (X_lr_exp * rnn_exp).reshape(min_length, -1)
gnn_rnn_interaction = (gnn_exp * rnn_exp).reshape(min_length, -1)

# Append the (unscaled) interaction features to the normalized base features
interaction_features = np.hstack([lr_gnn_interaction, lr_rnn_interaction, gnn_rnn_interaction])
X_combined_normalized = np.hstack([X_combined_normalized, interaction_features])

# Train-Test Split (80/20)
train_features, test_features, train_labels, test_labels = train_test_split(
    torch.tensor(X_combined_normalized, dtype=torch.float),
    labels,
    test_size=0.2,
    random_state=42
)
# LR hybrid network: fully connected binary classifier over the combined feature matrix
class LRHybridModel(nn.Module):
    """MLP with layer widths input_dim -> 256 -> 128 -> 64 -> 1.

    The first two stages apply Linear, LeakyReLU(0.1), Dropout(0.3) and
    BatchNorm1d in that order; the third stage has no dropout. The final
    linear output goes through a sigmoid, so `forward` returns values in
    [0, 1] with shape (batch, 1).
    """

    def __init__(self, input_dim):
        super().__init__()
        # Stage 1
        self.fc1 = nn.Linear(input_dim, 256)
        self.bn1 = nn.BatchNorm1d(256)
        self.dropout1 = nn.Dropout(0.3)
        # Stage 2
        self.fc2 = nn.Linear(256, 128)
        self.bn2 = nn.BatchNorm1d(128)
        self.dropout2 = nn.Dropout(0.3)
        # Stage 3 and output head
        self.fc3 = nn.Linear(128, 64)
        self.bn3 = nn.BatchNorm1d(64)
        self.fc4 = nn.Linear(64, 1)
        self.leaky_relu = nn.LeakyReLU(0.1)

    def forward(self, x):
        """Return the sigmoid score for each row of `x`."""
        regularised_stages = (
            (self.fc1, self.dropout1, self.bn1),
            (self.fc2, self.dropout2, self.bn2),
        )
        for linear, dropout, batch_norm in regularised_stages:
            x = batch_norm(dropout(self.leaky_relu(linear(x))))
        x = self.bn3(self.leaky_relu(self.fc3(x)))
        return torch.sigmoid(self.fc4(x))  # Binary classification output

# Build the LR hybrid network; its input width is the column count of the combined feature matrix
hybrid_model = LRHybridModel(input_dim=X_combined_normalized.shape[1])

# Adam with weight decay over all network parameters
optimizer_hybrid = optim.Adam(
    params=hybrid_model.parameters(),
    lr=0.002,
    betas=(0.9, 0.99),
    eps=1e-8,
    weight_decay=1e-4,
)

# Halve the learning rate when the value passed to scheduler.step() stops decreasing
# (patience of 3 scheduler steps)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer_hybrid, mode='min', factor=0.5, patience=3
)

# Define Focal Loss function for handling class imbalance
class FocalLoss(nn.Module):
    """Binary focal loss on probabilities.

    Each element's binary cross-entropy is scaled by `alpha * (1 - p_t) ** gamma`,
    where `p_t = exp(-bce)` is the probability assigned to the true class, so
    well-classified elements contribute less. The result is the mean over all
    elements.

    Args:
        alpha: constant scale applied to every element.
        gamma: focusing exponent; 0 reduces the loss to `alpha * BCE`.
    """

    def __init__(self, alpha=0.25, gamma=2):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
        """Return the scalar focal loss.

        Args:
            inputs: probabilities in [0, 1] (e.g. sigmoid outputs).
            targets: float tensor of 0/1 labels with the same shape as `inputs`.
        """
        # reduction='none' keeps one loss value per element. With the default mean
        # reduction the focal factor would be computed from the batch-average loss and
        # would only rescale it, instead of down-weighting easy examples individually.
        bce_loss = nn.functional.binary_cross_entropy(inputs, targets, reduction='none')
        pt = torch.exp(-bce_loss)
        focal_loss = self.alpha * (1 - pt) ** self.gamma * bce_loss
        return focal_loss.mean()

loss_fn = FocalLoss()

# Carve a validation split out of the training data. Early stopping, checkpoint selection
# and the LR scheduler are driven by it, so the test split is touched only once, for the
# final report. Selecting the checkpoint on test accuracy would inflate that report.
fit_features, val_features, fit_labels, val_labels = train_test_split(
    train_features, train_labels, test_size=0.2, random_state=42
)

# Early stopping parameters
best_accuracy = float("-inf")  # guarantees the first epoch always writes a checkpoint
patience = 10
patience_counter = 0

# Train the LR Hybrid Model (full-batch gradient steps)
print("\nTraining LR Hybrid Model...")
for epoch in range(100):
    hybrid_model.train()
    optimizer_hybrid.zero_grad()

    # squeeze(-1) drops only the output dimension, so the batch dimension always survives
    predictions = hybrid_model(fit_features).squeeze(-1)
    loss_hybrid = loss_fn(predictions, fit_labels)

    loss_hybrid.backward()
    optimizer_hybrid.step()

    # Evaluate model on the validation split
    hybrid_model.eval()
    with torch.no_grad():
        val_predictions = hybrid_model(val_features).squeeze(-1)
        val_loss = loss_fn(val_predictions, val_labels)
        val_predictions_binary = (val_predictions > 0.5).int()
        val_accuracy = accuracy_score(val_labels.numpy(), val_predictions_binary.numpy())

    # Reduce the learning rate when the validation loss plateaus
    scheduler.step(val_loss.item())

    # Print training progress every 5 epochs
    if (epoch + 1) % 5 == 0 or epoch == 0:
        print(f"Epoch {epoch + 1}: Loss = {loss_hybrid.item():.4f}, Val Accuracy = {val_accuracy:.4f}")

    # Early stopping on validation accuracy; keep the best checkpoint on disk
    if val_accuracy > best_accuracy:
        best_accuracy = val_accuracy
        patience_counter = 0
        torch.save(hybrid_model.state_dict(), "best_lr_hybrid_model.pth")
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping triggered.")
            break

print("LR Hybrid Training Completed.")

# Load the best model checkpoint
hybrid_model.load_state_dict(torch.load("best_lr_hybrid_model.pth", weights_only=True))

# Final Model Evaluation on the untouched test split
hybrid_model.eval()
with torch.no_grad():
    final_predictions = hybrid_model(test_features).squeeze(-1)
    final_predictions_binary = (final_predictions > 0.5).int()

# Compute Confusion Matrix
conf_matrix = confusion_matrix(test_labels.numpy(), final_predictions_binary.numpy())
false_positives = conf_matrix[0, 1]  # Incorrectly classified as positive
false_negatives = conf_matrix[1, 0]  # Incorrectly classified as negative

# Evaluate final model performance
lr_hybrid_best_accuracy = accuracy_score(test_labels.numpy(), final_predictions_binary.numpy())

# Print final evaluation results
print("\nExtracting LR Hybrid Model Predictions...")
print("LR Hybrid Model Predictions Extracted.")

print("\nEvaluating LR Hybrid Model...")
print(f"LR Hybrid Model Accuracy: {lr_hybrid_best_accuracy:.4f}")

print("\nLR Hybrid Model Classification Report:")
print(classification_report(test_labels.numpy(), final_predictions_binary.numpy(), zero_division=0))

print(f"\nFalse Positives: {false_positives}, False Negatives: {false_negatives}")


In [None]:
# RANDOM FOREST HYBRID MODEL

# Row alignment.
# create_sequences() pairs window i with the label of dataset row i + seq_length, so
# rnn_embeddings[j] describes dataset row j + seq_offset, while X_rf, gnn_embeddings and y
# are indexed by dataset row directly. Cutting every array to its first min_length rows
# would therefore pair each RNN score with the features and label of a different review.
# Aligned local copies are built instead of overwriting the shared arrays, so this cell
# can be re-run and other cells still see their inputs unchanged.
seq_offset = len(y) - len(rnn_input)  # dataset rows consumed before the first RNN output
row_stop = min(len(X_rf), len(gnn_embeddings), len(y), seq_offset + len(rnn_embeddings))
min_length = row_stop - seq_offset  # number of aligned rows
assert seq_offset >= 0 and min_length > 0, "RNN embeddings do not line up with the dataset rows"

X_rf_aligned = X_rf[seq_offset:row_stop]
gnn_aligned = gnn_embeddings[seq_offset:row_stop]
rnn_aligned = rnn_embeddings[:min_length]
labels = torch.tensor(y.values[seq_offset:row_stop], dtype=torch.float).view(-1)

print("\nInitializing RF Hybrid Model...")
print("RF Hybrid Model Initialized.")

# Normalize the concatenated base features using MinMax scaling
# NOTE(review): the scaler is fitted before the train/test split below, and the GNN/RNN
# embeddings come from models fitted in earlier cells — check that neither has seen the
# rows that end up in this test split.
scaler = MinMaxScaler()
X_combined = np.hstack([X_rf_aligned, gnn_aligned, rnn_aligned])
X_combined_normalized = scaler.fit_transform(X_combined)

# Pairwise interaction features: every column of one component times every column of another
X_rf_exp = np.expand_dims(X_rf_aligned, axis=2)
gnn_exp = np.expand_dims(gnn_aligned, axis=1)
rnn_exp = np.expand_dims(rnn_aligned, axis=1)

rf_gnn_interaction = (X_rf_exp * gnn_exp).reshape(min_length, -1)
rf_rnn_interaction = (X_rf_exp * rnn_exp).reshape(min_length, -1)
gnn_rnn_interaction = (gnn_exp * rnn_exp).reshape(min_length, -1)

# Append the (unscaled) interaction features to the normalized base features
interaction_features = np.hstack([rf_gnn_interaction, rf_rnn_interaction, gnn_rnn_interaction])
X_combined_normalized = np.hstack([X_combined_normalized, interaction_features])

# Train-Test Split (80/20)
train_features, test_features, train_labels, test_labels = train_test_split(
    torch.tensor(X_combined_normalized, dtype=torch.float),
    labels,
    test_size=0.2,
    random_state=42
)

# RF hybrid network: fully connected binary classifier over the combined feature matrix
class RFHybridModel(nn.Module):
    """MLP with layer widths input_dim -> 256 -> 128 -> 64 -> 1.

    The first two stages apply Linear, LeakyReLU(0.1), Dropout(0.3) and
    BatchNorm1d in that order; the third stage has no dropout. The final
    linear output goes through a sigmoid, so `forward` returns values in
    [0, 1] with shape (batch, 1).
    """

    def __init__(self, input_dim):
        super().__init__()
        # Stage 1
        self.fc1 = nn.Linear(input_dim, 256)
        self.bn1 = nn.BatchNorm1d(256)
        self.dropout1 = nn.Dropout(0.3)
        # Stage 2
        self.fc2 = nn.Linear(256, 128)
        self.bn2 = nn.BatchNorm1d(128)
        self.dropout2 = nn.Dropout(0.3)
        # Stage 3 and output head
        self.fc3 = nn.Linear(128, 64)
        self.bn3 = nn.BatchNorm1d(64)
        self.fc4 = nn.Linear(64, 1)
        self.leaky_relu = nn.LeakyReLU(0.1)

    def forward(self, x):
        """Return the sigmoid score for each row of `x`."""
        activate = self.leaky_relu
        hidden = self.bn1(self.dropout1(activate(self.fc1(x))))
        hidden = self.bn2(self.dropout2(activate(self.fc2(hidden))))
        hidden = self.bn3(activate(self.fc3(hidden)))
        logits = self.fc4(hidden)
        return torch.sigmoid(logits)  # Binary classification output

# Build the RF hybrid network; its input width is the column count of the combined feature matrix
hybrid_model = RFHybridModel(input_dim=X_combined_normalized.shape[1])

# Adam with weight decay over all network parameters
optimizer_hybrid = optim.Adam(
    params=hybrid_model.parameters(),
    lr=0.002,
    betas=(0.9, 0.99),
    eps=1e-8,
    weight_decay=1e-4,
)

# Halve the learning rate when the value passed to scheduler.step() stops decreasing
# (patience of 3 scheduler steps)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer_hybrid, mode='min', factor=0.5, patience=3
)

# Define Focal Loss function for handling class imbalance
class FocalLoss(nn.Module):
    """Binary focal loss on probabilities.

    Each element's binary cross-entropy is scaled by `alpha * (1 - p_t) ** gamma`,
    where `p_t = exp(-bce)` is the probability assigned to the true class, so
    well-classified elements contribute less. The result is the mean over all
    elements.

    Args:
        alpha: constant scale applied to every element.
        gamma: focusing exponent; 0 reduces the loss to `alpha * BCE`.
    """

    def __init__(self, alpha=0.25, gamma=2):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
        """Return the scalar focal loss.

        Args:
            inputs: probabilities in [0, 1] (e.g. sigmoid outputs).
            targets: float tensor of 0/1 labels with the same shape as `inputs`.
        """
        # reduction='none' keeps one loss value per element. With the default mean
        # reduction the focal factor would be computed from the batch-average loss and
        # would only rescale it, instead of down-weighting easy examples individually.
        bce_loss = nn.functional.binary_cross_entropy(inputs, targets, reduction='none')
        pt = torch.exp(-bce_loss)
        focal_loss = self.alpha * (1 - pt) ** self.gamma * bce_loss
        return focal_loss.mean()

loss_fn = FocalLoss()

# Carve a validation split out of the training data. Early stopping, checkpoint selection
# and the LR scheduler are driven by it, so the test split is touched only once, for the
# final report. Selecting the checkpoint on test accuracy would inflate that report.
fit_features, val_features, fit_labels, val_labels = train_test_split(
    train_features, train_labels, test_size=0.2, random_state=42
)

# Early stopping parameters
best_accuracy = float("-inf")  # guarantees the first epoch always writes a checkpoint
patience = 10
patience_counter = 0

# Train the RF Hybrid Model (full-batch gradient steps)
print("\nTraining RF Hybrid Model...")
for epoch in range(100):
    hybrid_model.train()
    optimizer_hybrid.zero_grad()

    # squeeze(-1) drops only the output dimension, so the batch dimension always survives
    predictions = hybrid_model(fit_features).squeeze(-1)
    loss_hybrid = loss_fn(predictions, fit_labels)

    loss_hybrid.backward()
    optimizer_hybrid.step()

    # Evaluate model on the validation split
    hybrid_model.eval()
    with torch.no_grad():
        val_predictions = hybrid_model(val_features).squeeze(-1)
        val_loss = loss_fn(val_predictions, val_labels)
        val_predictions_binary = (val_predictions > 0.5).int()
        val_accuracy = accuracy_score(val_labels.numpy(), val_predictions_binary.numpy())

    # Reduce the learning rate when the validation loss plateaus
    scheduler.step(val_loss.item())

    # Print training progress every 5 epochs
    if (epoch + 1) % 5 == 0 or epoch == 0:
        print(f"Epoch {epoch + 1}: Loss = {loss_hybrid.item():.4f}, Val Accuracy = {val_accuracy:.4f}")

    # Early stopping on validation accuracy; keep the best checkpoint on disk
    if val_accuracy > best_accuracy:
        best_accuracy = val_accuracy
        patience_counter = 0
        torch.save(hybrid_model.state_dict(), "best_rf_hybrid_model.pth")
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping triggered.")
            break

print("RF Hybrid Training Completed.")

# Load the best model checkpoint
hybrid_model.load_state_dict(torch.load("best_rf_hybrid_model.pth", weights_only=True))

# Final Model Evaluation on the untouched test split
hybrid_model.eval()
with torch.no_grad():
    final_predictions = hybrid_model(test_features).squeeze(-1)
    final_predictions_binary = (final_predictions > 0.5).int()

# Compute Confusion Matrix
conf_matrix = confusion_matrix(test_labels.numpy(), final_predictions_binary.numpy())
false_positives = conf_matrix[0, 1]  # Incorrectly classified as positive
false_negatives = conf_matrix[1, 0]  # Incorrectly classified as negative

# Evaluate final model performance
rf_hybrid_best_accuracy = accuracy_score(test_labels.numpy(), final_predictions_binary.numpy())

# Print final evaluation results
print("\nExtracting RF Hybrid Model Predictions...")
print("RF Hybrid Model Predictions Extracted.")

print("\nEvaluating RF Hybrid Model...")
print(f"RF Hybrid Model Accuracy: {rf_hybrid_best_accuracy:.4f}")

print("\nRF Hybrid Model Classification Report:")
print(classification_report(test_labels.numpy(), final_predictions_binary.numpy(), zero_division=0))

print(f"\nFalse Positives: {false_positives}, False Negatives: {false_negatives}")


In [None]:
# SVM HYBRID MODEL

# Train SVM model and extract probability-based features.
# Missing values are replaced with 0, matching the Logistic Regression cell; SVC cannot be
# fitted on NaN values.
scaler_svm = MinMaxScaler()
X_svm = scaler_svm.fit_transform(reviews_dataset[svm_features].fillna(0))  # Normalize SVM-specific features
y = reviews_dataset['label']  # Target variable

# Split dataset into training and testing sets for SVM
X_train_svm, X_test_svm, y_train_svm, y_test_svm = train_test_split(
    X_svm, y, test_size=0.2, random_state=42, stratify=y
)

# Train SVM model with an RBF kernel
svm_model = SVC(kernel='rbf', probability=True, random_state=42)  # Enable probability predictions
svm_model.fit(X_train_svm, y_train_svm)

# Class-1 probability for every row, used as a feature of the hybrid model
# NOTE(review): this scores the rows the SVM was trained on as well as the unseen ones.
svm_probabilities = svm_model.predict_proba(X_svm)[:, 1].reshape(-1, 1)

# Row alignment.
# create_sequences() pairs window i with the label of dataset row i + seq_length, so
# rnn_embeddings[j] describes dataset row j + seq_offset, while svm_probabilities,
# gnn_embeddings and y are indexed by dataset row directly. Cutting every array to its
# first min_length rows would therefore pair each RNN score with the features and label of
# a different review. Aligned local copies are built instead of overwriting the shared
# embedding arrays, so this cell can be re-run safely.
seq_offset = len(y) - len(rnn_input)  # dataset rows consumed before the first RNN output
row_stop = min(len(svm_probabilities), len(gnn_embeddings), len(y), seq_offset + len(rnn_embeddings))
min_length = row_stop - seq_offset  # number of aligned rows
assert seq_offset >= 0 and min_length > 0, "RNN embeddings do not line up with the dataset rows"

svm_aligned = svm_probabilities[seq_offset:row_stop]
gnn_aligned = gnn_embeddings[seq_offset:row_stop]
rnn_aligned = rnn_embeddings[:min_length]
labels = torch.tensor(y.values[seq_offset:row_stop], dtype=torch.float).view(-1)

print("\nInitializing SVM Hybrid Model...")
print("SVM Hybrid Model Initialized.")

# Normalize the concatenated base features using MinMax scaling
# NOTE(review): the scaler is fitted before the train/test split below, and the GNN/RNN
# embeddings come from models fitted in earlier cells — check that neither has seen the
# rows that end up in this test split.
scaler = MinMaxScaler()
X_combined = np.hstack([svm_aligned, gnn_aligned, rnn_aligned])
X_combined_normalized = scaler.fit_transform(X_combined)

# Pairwise interaction features: every column of one component times every column of another
svm_exp = np.expand_dims(svm_aligned, axis=2)
gnn_exp = np.expand_dims(gnn_aligned, axis=1)
rnn_exp = np.expand_dims(rnn_aligned, axis=1)

svm_gnn_interaction = (svm_exp * gnn_exp).reshape(min_length, -1)
svm_rnn_interaction = (svm_exp * rnn_exp).reshape(min_length, -1)
gnn_rnn_interaction = (gnn_exp * rnn_exp).reshape(min_length, -1)

# Append the (unscaled) interaction features to the normalized base features
interaction_features = np.hstack([svm_gnn_interaction, svm_rnn_interaction, gnn_rnn_interaction])
X_combined_normalized = np.hstack([X_combined_normalized, interaction_features])

# Train-Test Split (80/20)
train_features, test_features, train_labels, test_labels = train_test_split(
    torch.tensor(X_combined_normalized, dtype=torch.float),
    labels,
    test_size=0.2,
    random_state=42
)

# SVM hybrid network: fully connected binary classifier over the combined feature matrix
class SVMHybridModel(nn.Module):
    """MLP with layer widths input_dim -> 256 -> 128 -> 64 -> 1.

    The first two stages apply Linear, LeakyReLU(0.1), Dropout(0.3) and
    BatchNorm1d in that order; the third stage has no dropout. The final
    linear output goes through a sigmoid, so `forward` returns values in
    [0, 1] with shape (batch, 1).
    """

    def __init__(self, input_dim):
        super().__init__()
        # Stage 1
        self.fc1 = nn.Linear(input_dim, 256)
        self.bn1 = nn.BatchNorm1d(256)
        self.dropout1 = nn.Dropout(0.3)
        # Stage 2
        self.fc2 = nn.Linear(256, 128)
        self.bn2 = nn.BatchNorm1d(128)
        self.dropout2 = nn.Dropout(0.3)
        # Stage 3 and output head
        self.fc3 = nn.Linear(128, 64)
        self.bn3 = nn.BatchNorm1d(64)
        self.fc4 = nn.Linear(64, 1)
        self.leaky_relu = nn.LeakyReLU(0.1)

    def forward(self, x):
        """Return the sigmoid score for each row of `x`."""
        regularised_stages = (
            (self.fc1, self.dropout1, self.bn1),
            (self.fc2, self.dropout2, self.bn2),
        )
        for linear, dropout, batch_norm in regularised_stages:
            x = batch_norm(dropout(self.leaky_relu(linear(x))))
        x = self.bn3(self.leaky_relu(self.fc3(x)))
        return torch.sigmoid(self.fc4(x))  # Binary classification output

# Build the SVM hybrid network; its input width is the column count of the combined feature matrix
hybrid_model = SVMHybridModel(input_dim=X_combined_normalized.shape[1])

# Adam with weight decay over all network parameters
optimizer_hybrid = optim.Adam(
    params=hybrid_model.parameters(),
    lr=0.002,
    betas=(0.9, 0.99),
    eps=1e-8,
    weight_decay=1e-4,
)

# Halve the learning rate when the value passed to scheduler.step() stops decreasing
# (patience of 3 scheduler steps)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer_hybrid, mode='min', factor=0.5, patience=3
)

# Define Focal Loss function for handling class imbalance
class FocalLoss(nn.Module):
    """Binary focal loss on probabilities.

    Each element's binary cross-entropy is scaled by `alpha * (1 - p_t) ** gamma`,
    where `p_t = exp(-bce)` is the probability assigned to the true class, so
    well-classified elements contribute less. The result is the mean over all
    elements.

    Args:
        alpha: constant scale applied to every element.
        gamma: focusing exponent; 0 reduces the loss to `alpha * BCE`.
    """

    def __init__(self, alpha=0.25, gamma=2):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
        """Return the scalar focal loss.

        Args:
            inputs: probabilities in [0, 1] (e.g. sigmoid outputs).
            targets: float tensor of 0/1 labels with the same shape as `inputs`.
        """
        # reduction='none' keeps one loss value per element. With the default mean
        # reduction the focal factor would be computed from the batch-average loss and
        # would only rescale it, instead of down-weighting easy examples individually.
        bce_loss = nn.functional.binary_cross_entropy(inputs, targets, reduction='none')
        pt = torch.exp(-bce_loss)
        focal_loss = self.alpha * (1 - pt) ** self.gamma * bce_loss
        return focal_loss.mean()

loss_fn = FocalLoss()

# Carve a validation split out of the training data. Early stopping, checkpoint selection
# and the LR scheduler are driven by it, so the test split is touched only once, for the
# final report. Selecting the checkpoint on test accuracy would inflate that report.
fit_features, val_features, fit_labels, val_labels = train_test_split(
    train_features, train_labels, test_size=0.2, random_state=42
)

# Early stopping parameters
best_accuracy = float("-inf")  # guarantees the first epoch always writes a checkpoint
patience = 10
patience_counter = 0

# Train the SVM Hybrid Model (full-batch gradient steps)
print("\nTraining SVM Hybrid Model...")
for epoch in range(100):
    hybrid_model.train()
    optimizer_hybrid.zero_grad()

    # squeeze(-1) drops only the output dimension, so the batch dimension always survives
    predictions = hybrid_model(fit_features).squeeze(-1)
    loss_hybrid = loss_fn(predictions, fit_labels)

    loss_hybrid.backward()
    optimizer_hybrid.step()

    # Evaluate model on the validation split
    hybrid_model.eval()
    with torch.no_grad():
        val_predictions = hybrid_model(val_features).squeeze(-1)
        val_loss = loss_fn(val_predictions, val_labels)
        val_predictions_binary = (val_predictions > 0.5).int()
        val_accuracy = accuracy_score(val_labels.numpy(), val_predictions_binary.numpy())

    # Reduce the learning rate when the validation loss plateaus
    scheduler.step(val_loss.item())

    # Print training progress every 5 epochs
    if (epoch + 1) % 5 == 0 or epoch == 0:
        print(f"Epoch {epoch + 1}: Loss = {loss_hybrid.item():.4f}, Val Accuracy = {val_accuracy:.4f}")

    # Early stopping on validation accuracy; keep the best checkpoint on disk
    if val_accuracy > best_accuracy:
        best_accuracy = val_accuracy
        patience_counter = 0
        torch.save(hybrid_model.state_dict(), "best_svm_hybrid_model.pth")
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping triggered.")
            break

print("SVM Hybrid Training Completed.")

# Load the best model checkpoint
hybrid_model.load_state_dict(torch.load("best_svm_hybrid_model.pth", weights_only=True))

# Final Model Evaluation on the untouched test split
hybrid_model.eval()
with torch.no_grad():
    final_predictions = hybrid_model(test_features).squeeze(-1)
    final_predictions_binary = (final_predictions > 0.5).int()

# Compute Confusion Matrix
conf_matrix = confusion_matrix(test_labels.numpy(), final_predictions_binary.numpy())
false_positives = conf_matrix[0, 1]  # Incorrectly classified as positive
false_negatives = conf_matrix[1, 0]  # Incorrectly classified as negative

# Evaluate final model performance
svm_hybrid_best_accuracy = accuracy_score(test_labels.numpy(), final_predictions_binary.numpy())

# Print final evaluation results
print("\nExtracting SVM Hybrid Model Predictions...")
print("SVM Hybrid Model Predictions Extracted.")

print("\nEvaluating SVM Hybrid Model...")
print(f"SVM Hybrid Model Accuracy: {svm_hybrid_best_accuracy:.4f}")

print("\nSVM Hybrid Model Classification Report:")
print(classification_report(test_labels.numpy(), final_predictions_binary.numpy(), zero_division=0))

print(f"\nFalse Positives: {false_positives}, False Negatives: {false_negatives}")


In [None]:
# Side-by-side accuracy summary of every model trained above.
# The three classical scores are recomputed from the stored test labels and predictions;
# the remaining ones reuse the accuracy variables set by their own cells.
summary_lines = [
    "\nModel Performance Comparison:",
    "="*40,
    f"Logistic Regression Accuracy: {accuracy_score(y_test_lr, y_pred_lr):.4f}",
    f"Random Forest Accuracy: {accuracy_score(y_test_rf, y_pred_rf):.4f}",
    f"SVM Accuracy: {accuracy_score(y_test_svm, y_pred_svm):.4f}",
    f"GNN Accuracy: {gnn_accuracy:.4f}",
    f"RNN Accuracy: {rnn_accuracy:.4f}",
    f"LR Hybrid Model Accuracy: {lr_hybrid_best_accuracy:.4f}",
    f"RF Hybrid Model Accuracy: {rf_hybrid_best_accuracy:.4f}",
    f"SVM Hybrid Model Accuracy: {svm_hybrid_best_accuracy:.4f}",
]
print("\n".join(summary_lines))

In [None]:
import time
import gc  # Garbage collection

# Logistic Regression (LR)
# Time a single call to best_lr_model.fit on the LR training split.
gc.collect()  # run a garbage collection before the clock starts
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() follows the wall clock and can jump if the system
# clock is adjusted mid-run.
start_time_lr = time.perf_counter()
best_lr_model.fit(X_train_lr, y_train_lr)
end_time_lr = time.perf_counter()
lr_training_time = end_time_lr - start_time_lr  # elapsed seconds

# Random Forest (RF)
# Time a single call to rf_model.fit on the RF training split.
gc.collect()  # run a garbage collection before the clock starts
# Use the monotonic perf_counter() clock for the interval rather than the
# adjustable wall clock returned by time.time().
start_time_rf = time.perf_counter()
rf_model.fit(X_train_rf, y_train_rf)
end_time_rf = time.perf_counter()
rf_training_time = end_time_rf - start_time_rf  # elapsed seconds

# Support Vector Machine (SVM)
# Time a single call to svm_model.fit on the SVM training split.
gc.collect()  # run a garbage collection before the clock starts
# Use the monotonic perf_counter() clock for the interval rather than the
# adjustable wall clock returned by time.time().
start_time_svm = time.perf_counter()
svm_model.fit(X_train_svm, y_train_svm)
end_time_svm = time.perf_counter()
svm_training_time = end_time_svm - start_time_svm  # elapsed seconds

# Graph Neural Network (GNN)
# Time 100 full-graph optimisation steps of gnn_model.
# NOTE(review): this steps the existing gnn_model / optimizer_gnn objects, so if
# they were already trained in an earlier cell the measurement covers 100
# additional epochs and changes the model's weights — confirm this is intended.
gc.collect()  # run a garbage collection before the clock starts
# Use the monotonic perf_counter() clock for the interval rather than the
# adjustable wall clock returned by time.time().
start_time_gnn = time.perf_counter()
for epoch in range(100):  # Training loop
    optimizer_gnn.zero_grad()                     # clear gradients from the previous step
    gnn_output = gnn_model(data)                  # forward pass over the graph
    loss_gnn = loss_function(gnn_output, data.y)  # loss against the labels in data.y
    loss_gnn.backward()
    optimizer_gnn.step()
end_time_gnn = time.perf_counter()
gnn_training_time = end_time_gnn - start_time_gnn  # elapsed seconds

# Recurrent Neural Network (RNN)
# Time a 50-epoch Keras fit of rnn_model with batch size 64.
# NOTE(review): Keras `fit` continues from the model's current weights, so if
# rnn_model was already trained earlier this times 50 further epochs rather
# than training from scratch — confirm this is intended.
gc.collect()  # run a garbage collection before the clock starts
# Use the monotonic perf_counter() clock for the interval rather than the
# adjustable wall clock returned by time.time().
start_time_rnn = time.perf_counter()
rnn_model.fit(rnn_input, rnn_labels, epochs=50, batch_size=64, verbose=1)  # Ensure full training
end_time_rnn = time.perf_counter()
rnn_training_time = end_time_rnn - start_time_rnn  # elapsed seconds

# Hybrid Models (LR, RF, SVM Hybrid)
# Time one train_hybrid_model run on a freshly constructed LRHybridModel.
# NOTE(review): optimizer_hybrid and scheduler are created elsewhere while the
# model is constructed here; confirm they are bound to this new model's
# parameters, otherwise the optimiser steps will not update it.
gc.collect()  # run a garbage collection before the clock starts
# Use the monotonic perf_counter() clock for the interval rather than the
# adjustable wall clock returned by time.time().
start_time_lr_hybrid = time.perf_counter()
# The return value is not bound: the original stored it in
# lr_hybrid_training_time and overwrote it two lines later (dead store).
train_hybrid_model(
    LRHybridModel(input_dim=X_combined_normalized.shape[1]),
    optimizer_hybrid, scheduler, train_features, train_labels,
    "LR Hybrid Model", "best_lr_hybrid_model.pth", loss_fn
)
end_time_lr_hybrid = time.perf_counter()
lr_hybrid_training_time = end_time_lr_hybrid - start_time_lr_hybrid  # elapsed seconds

# Time one train_hybrid_model run on a freshly constructed RFHybridModel.
# NOTE(review): optimizer_hybrid and scheduler are created elsewhere while the
# model is constructed here; confirm they are bound to this new model's
# parameters, otherwise the optimiser steps will not update it.
gc.collect()  # run a garbage collection before the clock starts
# Use the monotonic perf_counter() clock for the interval rather than the
# adjustable wall clock returned by time.time().
start_time_rf_hybrid = time.perf_counter()
# The return value is not bound: the original stored it in
# rf_hybrid_training_time and overwrote it two lines later (dead store).
train_hybrid_model(
    RFHybridModel(input_dim=X_combined_normalized.shape[1]),
    optimizer_hybrid, scheduler, train_features, train_labels,
    "RF Hybrid Model", "best_rf_hybrid_model.pth", loss_fn
)
end_time_rf_hybrid = time.perf_counter()
rf_hybrid_training_time = end_time_rf_hybrid - start_time_rf_hybrid  # elapsed seconds

# Time one train_hybrid_model run on a freshly constructed SVMHybridModel.
# NOTE(review): optimizer_hybrid and scheduler are created elsewhere while the
# model is constructed here; confirm they are bound to this new model's
# parameters, otherwise the optimiser steps will not update it.
# NOTE(review): "best_svm_hybrid_model.pth" is the same path the evaluation
# code loads its checkpoint from; if train_hybrid_model writes to this path,
# this timing run replaces that checkpoint — confirm.
gc.collect()  # run a garbage collection before the clock starts
# Use the monotonic perf_counter() clock for the interval rather than the
# adjustable wall clock returned by time.time().
start_time_svm_hybrid = time.perf_counter()
# The return value is not bound: the original stored it in
# svm_hybrid_training_time and overwrote it two lines later (dead store).
train_hybrid_model(
    SVMHybridModel(input_dim=X_combined_normalized.shape[1]),
    optimizer_hybrid, scheduler, train_features, train_labels,
    "SVM Hybrid Model", "best_svm_hybrid_model.pth", loss_fn
)
end_time_svm_hybrid = time.perf_counter()
svm_hybrid_training_time = end_time_svm_hybrid - start_time_svm_hybrid  # elapsed seconds

# Report the measured training time of every model, in seconds
summary_rule = "=" * 50
print("\n Summary of Training Times for All Models:")
print(summary_rule)

# (label, elapsed seconds) pairs in display order
training_time_rows = [
    ("Logistic Regression", lr_training_time),
    ("Random Forest", rf_training_time),
    ("SVM", svm_training_time),
    ("GNN", gnn_training_time),
    ("RNN", rnn_training_time),
    ("LR Hybrid Model", lr_hybrid_training_time),
    ("RF Hybrid Model", rf_hybrid_training_time),
    ("SVM Hybrid Model", svm_hybrid_training_time),
]
for timing_label, elapsed_seconds in training_time_rows:
    print(f"{timing_label}: {elapsed_seconds:.2f} seconds")

print(summary_rule)
