In [4]:
import os
import ast
import logging
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Logging setup: timestamped, INFO-level messages on the root logger
logging.basicConfig(
    level=logging.INFO,
    datefmt='%Y-%m-%d %H:%M:%S',
    format='%(asctime)s - %(levelname)s - %(message)s',
)

# Ensure the output folders exist (no error if they are already there)
os.makedirs("results", exist_ok=True)
os.makedirs("models", exist_ok=True)

# === 1. Helper Functions ===
def safe_parse(lst_str):
    """Parse a label cell into a list of labels.

    Args:
        lst_str: Either an actual list (returned unchanged) or a value,
            typically a string such as "['A', 'B']", that is parsed with
            ``ast.literal_eval``.

    Returns:
        A list. Parsed tuples, sets and dicts are converted with ``list()``;
        any other parsed literal (e.g. a quoted string or a number) and any
        value that is not a valid Python literal is wrapped in a
        single-element list.
    """
    if isinstance(lst_str, list):
        return lst_str

    try:
        parsed = ast.literal_eval(lst_str)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Not a Python literal (e.g. a bare label like "Product" or a
        # non-string cell): treat the raw value itself as the single label.
        return [lst_str]

    if isinstance(parsed, list):
        return parsed
    if isinstance(parsed, (tuple, set, dict)):
        return list(parsed)
    # A scalar literal such as "'Product'" must not be returned as a bare
    # string: downstream code iterates over the result and would split the
    # string into single characters.
    return [parsed]

# === 2. Load Embeddings ===
def load_embeddings(embedding_type):
    """Load document embeddings and their labels into a DataFrame.

    A PyTorch ``.pt`` file is tried first; it must hold a dict with the keys
    ``'embeddings'`` (a tensor), ``'labels'`` and ``'indices'``. If loading it
    fails for any reason, TF-IDF falls back to ``tfidf_amazon.csv``; the
    other embedding types have no CSV fallback.

    Args:
        embedding_type: 'cbow', 'skipgram', 'tfidf', or 'sbert'

    Returns:
        A DataFrame whose 'labels' column holds parsed label lists (frames
        built from a PT file also carry an 'original_index' column), or
        ``None`` when the PT file could not be loaded and no CSV fallback
        exists for ``embedding_type``.

    Raises:
        ValueError: If the loaded data contains no recognizable label column.
    """
    logging.info(f"Loading {embedding_type.upper()} embeddings...")

    if embedding_type == 'tfidf':
        pt_path = "tfidf_amazon.pt"
    elif embedding_type == 'sbert':
        pt_path = f"{embedding_type}_output_amazon.pt"
    else:
        pt_path = f"{embedding_type}_amazon.pt"

    try:
        # NOTE(review): torch.load unpickles the file - only load trusted files.
        # map_location keeps the load working for tensors saved on another device.
        data = torch.load(pt_path, map_location='cpu')
        # detach()/cpu() make the conversion safe for tensors that track
        # gradients or live on an accelerator; plain .numpy() raises for both.
        embeddings = data['embeddings'].detach().cpu().numpy()
        labels = data['labels']
        indices = data['indices']

        # Create DataFrame: index column first, features, then labels
        df = pd.DataFrame(embeddings)
        df.insert(0, 'original_index', indices)
        df['labels'] = labels
        logging.info(f"Successfully loaded {embedding_type} embeddings from PT file: {embeddings.shape}")

    except Exception as e:
        logging.warning(f"Failed to load from PT file: {e}")

        if embedding_type != 'tfidf':
            # Only TF-IDF has a CSV export; make the missing data explicit
            # instead of returning None without any trace in the log.
            logging.error(f"No CSV fallback available for {embedding_type} embeddings; returning None")
            return None

        logging.info("Trying to load from CSV file...")
        csv_path = "tfidf_amazon.csv"
        df = pd.read_csv(csv_path)

        # The TF-IDF export may name its label column 'CommentClass_en'
        if 'CommentClass_en' in df.columns:
            df.rename(columns={'CommentClass_en': 'labels'}, inplace=True)

        logging.info(f"Successfully loaded {embedding_type} embeddings from CSV: {csv_path}")

    # Ensure a 'labels' column exists, adopting a known alternative name if needed
    if 'labels' not in df.columns:
        label_candidates = ['CommentClass_en', 'label', 'classes', 'class']
        found = next((col for col in label_candidates if col in df.columns), None)
        if found is None:
            logging.error(f"No label column found in {embedding_type} embeddings")
            raise ValueError(f"No label column found in {embedding_type} embeddings")
        df.rename(columns={found: 'labels'}, inplace=True)

    # Labels may be stored as string representations of lists
    df['labels'] = df['labels'].apply(safe_parse)
    return df

# === 3. Neural Network Models ===

class SimpleNN(nn.Module):
    """Feed-forward classifier with a single hidden layer (multi-label).

    Layer stack: Linear -> ReLU -> Dropout -> Linear -> Sigmoid, so every
    output unit is squashed independently by a sigmoid.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, dropout=0.3):
        super().__init__()
        # Layers are created in forward order and kept under ``self.model``
        # so parameter names stay ``model.<idx>.*``.
        stack = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, output_dim),
            nn.Sigmoid(),
        ]
        self.model = nn.Sequential(*stack)

    def forward(self, x):
        """Return the sigmoid outputs of the layer stack for ``x``."""
        return self.model(x)


class DeepNN(nn.Module):
    """Multi-layer feed-forward classifier for multi-label targets.

    Each entry of ``hidden_dims`` contributes a Linear -> ReLU -> Dropout
    group; a final Linear -> Sigmoid pair produces the outputs.
    """

    def __init__(self, input_dim, hidden_dims, output_dim, dropout=0.4):
        super().__init__()

        # Consecutive (in, out) sizes of the hidden Linear layers
        widths = [input_dim, *hidden_dims]
        stack = []
        for in_features, out_features in zip(widths, widths[1:]):
            stack.extend([
                nn.Linear(in_features, out_features),
                nn.ReLU(),
                nn.Dropout(dropout),
            ])

        # Output head: sigmoid gives one independent score per label
        stack.extend([nn.Linear(widths[-1], output_dim), nn.Sigmoid()])

        self.model = nn.Sequential(*stack)

    def forward(self, x):
        """Return the sigmoid outputs of the layer stack for ``x``."""
        return self.model(x)


# === 4. Training and Evaluation ===

def prepare_data(df, test_size=0.2, random_state=42):
    """Build train/test feature matrices and binarized label matrices.

    Args:
        df: DataFrame with a 'labels' column of label collections; every
            column other than 'original_index', 'labels' and 'Unnamed: 0'
            is used as a feature.
        test_size: Passed to ``train_test_split``.
        random_state: Passed to ``train_test_split``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test, classes)`` where
        ``classes`` is the fitted binarizer's ``classes_`` array.
    """
    non_feature_cols = ('original_index', 'labels', 'Unnamed: 0')
    feature_cols = [name for name in df.columns if name not in non_feature_cols]
    features = df[feature_cols].values

    # One indicator column per distinct label
    binarizer = MultiLabelBinarizer()
    targets = binarizer.fit_transform(df['labels'].tolist())

    # Plain random split (no stratification)
    X_train, X_test, y_train, y_test = train_test_split(
        features, targets, test_size=test_size, random_state=random_state
    )

    logging.info(f"Data shapes: X_train={X_train.shape}, y_train={y_train.shape}")
    logging.info(f"Classes: {binarizer.classes_}")

    return X_train, X_test, y_train, y_test, binarizer.classes_


def _score_predictions(y_true, y_pred):
    """Return accuracy plus weighted precision/recall/F1 for binary label matrices."""
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, average='weighted', zero_division=0),
        'recall': recall_score(y_true, y_pred, average='weighted', zero_division=0),
        'f1': f1_score(y_true, y_pred, average='weighted', zero_division=0),
    }


def train_model(model, X_train, y_train, X_test, y_test, model_name, embedding_type,
                batch_size=32, lr=0.001, num_epochs=20, patience=3):
    """Train a PyTorch model with early stopping and evaluate it.

    The model is optimized with Adam on binary cross-entropy. After every
    epoch the loss on ``X_test``/``y_test`` is computed; whenever it improves,
    the weights are written to ``best_{embedding_type}_{model_name}_classifier.pt``.
    Training stops once the loss has not improved for ``patience`` consecutive
    epochs, and the best saved weights are restored before the final metrics
    are computed.

    NOTE(review): the same held-out split drives early stopping and the
    reported 'val' metrics, so those metrics are not from untouched data.

    Args:
        model: Module whose outputs are probabilities (BCELoss is applied
            directly to them).
        X_train, y_train: Training features and binary label matrix.
        X_test, y_test: Held-out features and binary label matrix.
        model_name: Used in the checkpoint file name.
        embedding_type: Used in the checkpoint file name.
        batch_size: Mini-batch size for the training loader.
        lr: Adam learning rate.
        num_epochs: Maximum number of epochs.
        patience: Epochs without validation-loss improvement before stopping.

    Returns:
        Tuple ``(model, train_metrics, val_metrics, train_losses, val_losses,
        train_accuracies, val_accuracies, class_report)``; the metric dicts
        have the keys 'accuracy', 'precision', 'recall' and 'f1', and
        ``class_report`` is the dict form of ``classification_report`` on the
        held-out split.
    """
    # Convert to tensors
    X_train_tensor = torch.FloatTensor(X_train)
    y_train_tensor = torch.FloatTensor(y_train)
    X_test_tensor = torch.FloatTensor(X_test)
    y_test_tensor = torch.FloatTensor(y_test)

    # Create dataset and dataloader
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Loss and optimizer
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Early-stopping state; the flag records whether THIS run wrote a checkpoint
    checkpoint_path = f"best_{embedding_type}_{model_name}_classifier.pt"
    best_loss = float('inf')
    patience_counter = 0
    checkpoint_saved = False

    # Per-epoch history
    train_losses = []
    val_losses = []
    train_accuracies = []
    val_accuracies = []

    for epoch in range(num_epochs):
        # Training pass
        model.train()
        epoch_loss = 0.0
        for inputs, targets in train_loader:
            outputs = model(inputs)
            loss = criterion(outputs, targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

        avg_train_loss = epoch_loss / len(train_loader)
        train_losses.append(avg_train_loss)

        # Evaluation pass (dropout disabled, no gradients)
        model.eval()
        with torch.no_grad():
            train_pred_binary = (model(X_train_tensor) > 0.5).float().numpy()
            val_outputs = model(X_test_tensor)
            # Keep a plain float so comparisons and logging do not hold tensors
            val_loss = criterion(val_outputs, y_test_tensor).item()
            val_pred_binary = (val_outputs > 0.5).float().numpy()

        train_accuracy = accuracy_score(y_train, train_pred_binary)
        train_accuracies.append(train_accuracy)
        val_losses.append(val_loss)
        val_accuracy = accuracy_score(y_test, val_pred_binary)
        val_accuracies.append(val_accuracy)

        # Early stopping check
        if val_loss < best_loss:
            best_loss = val_loss
            patience_counter = 0
            torch.save(model.state_dict(), checkpoint_path)
            checkpoint_saved = True
        else:
            patience_counter += 1

        if (epoch + 1) % 5 == 0 or epoch == 0:
            logging.info(f'Epoch [{epoch+1}/{num_epochs}], '
                       f'Train Loss: {avg_train_loss:.4f}, Train Acc: {train_accuracy:.4f}, '
                       f'Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.4f}')

        if patience_counter >= patience:
            logging.info(f"Early stopping at epoch {epoch+1}")
            break

    # Restore the best weights, but only if this run produced them: with
    # num_epochs == 0 or a NaN validation loss no checkpoint is written, and
    # loading would either fail or pick up a stale file from an earlier run.
    if checkpoint_saved:
        model.load_state_dict(torch.load(checkpoint_path, weights_only=True))
    else:
        logging.warning(
            f"No checkpoint was saved for {model_name} ({embedding_type}); "
            "evaluating the model with its current weights"
        )

    # Final evaluation
    model.eval()
    with torch.no_grad():
        train_pred_binary = (model(X_train_tensor) > 0.5).float().numpy()
        val_pred_binary = (model(X_test_tensor) > 0.5).float().numpy()

    train_metrics = _score_predictions(y_train, train_pred_binary)
    val_metrics = _score_predictions(y_test, val_pred_binary)

    # Per-class metrics for the held-out split
    class_report = classification_report(y_test, val_pred_binary,
                                         zero_division=0, output_dict=True)

    return model, train_metrics, val_metrics, train_losses, val_losses, train_accuracies, val_accuracies, class_report


def plot_training_curves(train_losses, val_losses, train_accuracies, val_accuracies, model_name, embedding_type):
    """Save a two-panel figure (loss left, accuracy right) of the training history.

    The image is written to ``{embedding_type}_{model_name}_training_curves.png``
    and the figure is closed afterwards.
    """
    # (quantity shown, training series, validation series) for each panel
    panels = (
        ('Loss', train_losses, val_losses),
        ('Accuracy', train_accuracies, val_accuracies),
    )

    plt.figure(figsize=(12, 5))
    for position, (quantity, train_series, val_series) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(train_series, label=f'Training {quantity}')
        plt.plot(val_series, label=f'Validation {quantity}')
        plt.title(f'{quantity} Curves for {model_name} ({embedding_type.upper()})')
        plt.xlabel('Epochs')
        plt.ylabel(quantity)
        plt.legend()
        plt.grid(True)

    plt.tight_layout()
    plt.savefig(f"{embedding_type}_{model_name}_training_curves.png")
    plt.close()


def plot_metrics_comparison(results_dict, set_type):
    """Save a grouped bar chart of all metrics for every model/embedding pair.

    Args:
        results_dict: Mapping ``embedding_type -> model_name -> set name ->
            metrics dict``.
        set_type: Which set's metrics to plot (key of the innermost mapping
            level above); also used in the title and output file name.
    """
    metric_names = ('accuracy', 'precision', 'recall', 'f1')

    # Long-format rows: one per (model, metric)
    rows = [
        {
            'Model': f"{model_name} ({embedding_type.upper()})",
            'Metric': metric.capitalize(),
            'Value': sets_data[set_type][metric],
        }
        for embedding_type, models_data in results_dict.items()
        for model_name, sets_data in models_data.items()
        for metric in metric_names
    ]
    df_plot = pd.DataFrame(rows)

    plt.figure(figsize=(16, 10))
    sns.barplot(x='Model', y='Value', hue='Metric', data=df_plot)
    plt.title(f'{set_type.capitalize()} Set Performance Metrics Comparison')
    plt.ylim(0, 1)
    plt.xticks(rotation=45, ha='right')
    plt.grid(True, axis='y')
    plt.tight_layout()
    plt.savefig(f"{set_type}_embedding_comparison.png")
    plt.close()


def plot_train_test_comparison(results_dict):
    """Save one train-vs-test bar chart per metric.

    For each of accuracy, precision, recall and f1, the 'train' and 'val'
    values of every model/embedding pair are plotted side by side and written
    to ``{metric}_train_test_comparison.png``.
    """
    # Legend label -> key inside each model's results
    set_keys = (('Train', 'train'), ('Test', 'val'))

    for metric in ('accuracy', 'precision', 'recall', 'f1'):
        plt.figure(figsize=(14, 8))

        rows = []
        for embedding_type, models_data in results_dict.items():
            for model_name, sets_data in models_data.items():
                model_label = f"{model_name} ({embedding_type.upper()})"
                rows.extend(
                    {'Model': model_label, 'Set': set_label, 'Value': sets_data[key][metric]}
                    for set_label, key in set_keys
                )
        df_plot = pd.DataFrame(rows)

        sns.barplot(x='Model', y='Value', hue='Set', data=df_plot)
        plt.title(f'{metric.capitalize()} Comparison: Training vs Test Performance')
        plt.ylim(0, 1)
        plt.xticks(rotation=45, ha='right')
        plt.grid(True, axis='y')
        plt.tight_layout()
        plt.savefig(f"{metric}_train_test_comparison.png")
        plt.close()


# === 5. Main Function ===

def process_embedding_type(embedding_type):
    """Train and evaluate both classifiers on one embedding type.

    Loads the embeddings, splits them, trains ``SimpleNN`` and ``DeepNN``,
    and for each model writes the training-curve plot, a per-class report CSV
    and a checkpoint (weights, class names, metrics, architecture string).

    Args:
        embedding_type: Embedding name accepted by ``load_embeddings``
            ('cbow', 'skipgram', 'tfidf', or 'sbert').

    Returns:
        Dict ``model name -> {'train': metrics, 'val': metrics}``.

    Raises:
        RuntimeError: If ``load_embeddings`` returns no data for
            ``embedding_type``.
    """
    logging.info(f"\n{'-'*70}\nProcessing {embedding_type.upper()} embeddings\n{'-'*70}")

    # Load embeddings
    df = load_embeddings(embedding_type)
    if df is None:
        # load_embeddings returns None when no source could be read; fail here
        # with a clear message instead of an AttributeError on ``df.shape``.
        raise RuntimeError(f"Could not load {embedding_type} embeddings")
    logging.info(f"Loaded {embedding_type} dataframe with shape: {df.shape}")

    # Prepare data
    X_train, X_test, y_train, y_test, classes = prepare_data(df)

    # Model dimensions follow the data
    input_dim = X_train.shape[1]
    output_dim = y_train.shape[1]

    # Both models are built up front, before any training starts
    models = [
        {
            'name': 'SimpleNN',
            'model': SimpleNN(input_dim, hidden_dim=128, output_dim=output_dim)
        },
        {
            'name': 'DeepNN',
            'model': DeepNN(
                input_dim,
                hidden_dims=[256, 128, 64],
                output_dim=output_dim
            )
        }
    ]

    # Train and evaluate each model
    results = {}

    for model_info in models:
        name = model_info['name']
        model = model_info['model']

        logging.info(f"\n{'='*50}\nTraining {name} with {embedding_type.upper()}\n{'='*50}")

        (trained_model, train_metrics, val_metrics, train_losses, val_losses,
         train_accuracies, val_accuracies, class_report) = train_model(
            model, X_train, y_train, X_test, y_test, name, embedding_type
        )

        # Log results
        logging.info(f"Results for {name} with {embedding_type.upper()}:")
        logging.info("Training Set Metrics:")
        for metric, value in train_metrics.items():
            logging.info(f"{metric.capitalize()}: {value:.4f}")

        logging.info("Test Set Metrics:")
        for metric, value in val_metrics.items():
            logging.info(f"{metric.capitalize()}: {value:.4f}")

        # Plot learning curves
        plot_training_curves(train_losses, val_losses, train_accuracies, val_accuracies, name, embedding_type)

        # Save detailed class report
        pd.DataFrame(class_report).transpose().to_csv(
            f"{embedding_type}_{name}_class_report.csv"
        )

        # Save model together with the metadata needed to interpret it
        torch.save({
            'model_state_dict': trained_model.state_dict(),
            'class_names': classes,
            'train_metrics': train_metrics,
            'val_metrics': val_metrics,
            'architecture': str(trained_model)
        }, f"{embedding_type}_{name}_classifier.pt")

        # Store results
        results[name] = {
            'train': train_metrics,
            'val': val_metrics
        }

    return results


def main():
    """Run the classification pipeline for CBOW, Skip-gram and SBERT embeddings.

    Trains and evaluates every model on each embedding type, then writes the
    comparison plots, one metrics CSV per set ('train' and 'val'), and a
    summary CSV, and prints the summary plus per-embedding averages. All
    output files use relative names, so they land in the current working
    directory.
    """
    logging.info("Starting embedding classification comparing CBOW, Skip-gram and SBERT")

    # embedding type -> model name -> {'train': metrics, 'val': metrics}
    all_results = {}
    for emb_type in ('cbow', 'skipgram', 'sbert'):
        all_results[emb_type] = process_embedding_type(emb_type)

    # Compare models and embeddings for both training and test sets
    plot_metrics_comparison(all_results, 'train')
    plot_metrics_comparison(all_results, 'val')

    # Compare training vs test performance
    plot_train_test_comparison(all_results)

    # Save overall results: one column per model/embedding, one row per metric
    for set_type in ['train', 'val']:
        results_df = pd.DataFrame({
            f"{model}_{emb_type}_{set_type}": sets_data[set_type]
            for emb_type, models in all_results.items()
            for model, sets_data in models.items()
        })
        results_df.to_csv(f"{set_type}_embedding_comparison.csv")

    # Create a summary table with one row per (embedding, model)
    summary_data = []
    for emb_type, models in all_results.items():
        for model_name, sets_data in models.items():
            row = {
                'Embedding': emb_type.upper(),
                'Model': model_name
            }

            # Add training metrics
            for k, v in sets_data['train'].items():
                row[f'Train_{k.capitalize()}'] = f"{v:.4f}"

            # Add test metrics
            for k, v in sets_data['val'].items():
                row[f'Test_{k.capitalize()}'] = f"{v:.4f}"

            summary_data.append(row)

    summary_df = pd.DataFrame(summary_data)
    summary_df.to_csv("embedding_classification_summary.csv", index=False)
    print("\nClassification results summary:")
    print(summary_df.to_string(index=False))

    # Calculate average performance per embedding type for both sets
    print("\nAverage performance by embedding type:")
    for set_type in ['train', 'val']:
        print(f"\n{set_type.upper()} SET:")
        for emb_type, models in all_results.items():
            avg_metrics = {
                metric: np.mean([sets_data[set_type][metric] for sets_data in models.values()])
                for metric in ['accuracy', 'precision', 'recall', 'f1']
            }
            print(f"{emb_type.upper()}: " + ", ".join([f"{k.capitalize()}: {v:.4f}" for k, v in avg_metrics.items()]))

    # Every file above is written with a bare relative name, i.e. into the
    # working directory - not into 'results' as this message used to claim.
    logging.info("Classification comparison completed! Results saved to the current working directory.")


if __name__ == "__main__":
    main()

2025-05-08 00:10:41 - INFO - Starting embedding classification comparing CBOW, Skip-gram and SBERT
2025-05-08 00:10:41 - INFO - 
----------------------------------------------------------------------
Processing CBOW embeddings
----------------------------------------------------------------------
2025-05-08 00:10:41 - INFO - Loading CBOW embeddings...
  data = torch.load(f"{file_name}")
2025-05-08 00:10:42 - INFO - Successfully loaded cbow embeddings from PT file: (50000, 100)
2025-05-08 00:10:42 - INFO - Loaded cbow dataframe with shape: (50000, 102)
2025-05-08 00:10:42 - INFO - Data shapes: X_train=(40000, 100), y_train=(40000, 11)
2025-05-08 00:10:42 - INFO - Classes: ['Customer rating' 'CustomerService' 'Damage' 'Priceperformance' 'Product'
 'ProductFactory' 'ProductFailure' 'ProductQuality' 'ProductSize'
 'Shipping' 'Size']
2025-05-08 00:10:42 - INFO - 
Training SimpleNN with CBOW
2025-05-08 00:10:43 - INFO - Epoch [1/20], Train Loss: 0.4118, Train Acc: 0.1606, Val Loss: 0.3733, V


Classification results summary:
Embedding    Model Train_Accuracy Train_Precision Train_Recall Train_F1 Test_Accuracy Test_Precision Test_Recall Test_F1
     CBOW SimpleNN         0.2476          0.8781       0.8941   0.8847        0.2378         0.8702      0.8873  0.8774
     CBOW   DeepNN         0.2822          0.9210       0.8564   0.8849        0.2589         0.9076      0.8476  0.8741
 SKIPGRAM SimpleNN         0.2064          0.8480       0.9003   0.8722        0.1927         0.8394      0.8928  0.8643
 SKIPGRAM   DeepNN         0.2339          0.8808       0.8733   0.8758        0.2102         0.8667      0.8616  0.8629
    SBERT SimpleNN         0.6576          0.9877       0.9290   0.9566        0.6606         0.9877      0.9291  0.9567
    SBERT   DeepNN         0.6492          0.9908       0.9226   0.9542        0.6512         0.9911      0.9223  0.9541

Average performance by embedding type:

TRAIN SET:
CBOW: Accuracy: 0.2649, Precision: 0.8996, Recall: 0.8753, F1: 0.884