In [1]:
import torch
#from torch_geometric.datasets import Reddit, Amazon
#from torch_geometric.utils import to_networkx
import matplotlib.pyplot as plt
#import networkx as nx
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
#from torch.functional import F
#from torch_geometric.nn import GCNConv, GATConv
from torch_geometric.loader import NeighborLoader
from torch_geometric.data import Data
#import torch.optim as optim
#import seaborn as sns
from baseline_models import *
#import tqdm
#from sklearn.manifold import TSNE
#from umap import UMAP
#import logging
#import concurrent

# Seed every RNG this notebook draws from so that "Restart Kernel -> Run All"
# reproduces the same numbers. NumPy alone is not enough: PyTorch has its own
# generator, which drives torch module weight initialisation and the batch order
# of DataLoader(shuffle=True).
np.random.seed(0)
torch.manual_seed(0)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the saved dataset object; later cells read its node features (`x`) and labels (`y`).
# NOTE(review): torch.load unpickles the file, so only point this at a trusted source.
DATA_PATH = 'data/amazon_product_data_no_categories.pt'
data = torch.load(DATA_PATH)

In [3]:
# Pull the feature matrix and the label vector off the loaded object
node_features, node_labels = data.x, data.y

# 80/10/10 split: hold out 20% of the nodes first, then cut that holdout in half
# to obtain the test and validation sets (fixed random_state for repeatability)
x_train, x_holdout, y_train, y_holdout = train_test_split(
    node_features, node_labels, test_size=0.2, random_state=42
)
x_test, x_val, y_test, y_val = train_test_split(
    x_holdout, y_holdout, test_size=0.5, random_state=42
)

# Pair features with labels so each split can be batched
train_dataset = TensorDataset(x_train, y_train)
val_dataset = TensorDataset(x_val, y_val)
test_dataset = TensorDataset(x_test, y_test)

# Only the training loader shuffles; validation and test keep a fixed order
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [4]:
from sklearn.metrics import f1_score, balanced_accuracy_score


def accuracy(predictions, labels):
    """Return the fraction of rows whose top-scoring class equals the label.

    Args:
        predictions: 2-D tensor of per-class scores; the predicted class of a
            row is its argmax along dim 1.
        labels: 1-D tensor of target class indices, one per row.

    Returns:
        Python float in [0, 1]: number of matches divided by ``labels.size(0)``.
    """
    predicted_classes = predictions.argmax(dim=1)
    num_correct = predicted_classes.eq(labels).sum().item()
    return num_correct / labels.size(0)


def f1(predictions, labels):
    """Weighted F1 score of the argmax predictions against the labels.

    Args:
        predictions: 2-D tensor of per-class scores; the class of a row is its
            argmax along dim 1.
        labels: tensor of target class indices.

    Returns:
        The value of ``f1_score(..., average='weighted')`` computed on NumPy
        copies of the predicted and true classes.
    """
    # sklearn works on host arrays, so bring both tensors to the CPU first
    y_pred = predictions.argmax(dim=1).cpu().numpy()
    y_true = labels.cpu().numpy()
    return f1_score(y_true, y_pred, average='weighted')
  
def balanced_accuracy(predictions, labels):
    """Balanced accuracy of the argmax predictions against the labels.

    Args:
        predictions: 2-D tensor of per-class scores; the class of a row is its
            argmax along dim 1.
        labels: tensor of target class indices.

    Returns:
        The value of ``balanced_accuracy_score`` computed on NumPy copies of
        the true and predicted classes.
    """
    # sklearn works on host arrays, so bring both tensors to the CPU first
    y_pred = predictions.argmax(dim=1).cpu().numpy()
    y_true = labels.cpu().numpy()
    return balanced_accuracy_score(y_true, y_pred)

In [5]:
# Define the training and validation loops (as before, without edge_index)

def train_epoch(model, optimizer, loss_fn, train_loader, device, metrics):
    """Run one optimisation pass over ``train_loader``.

    Args:
        model: callable model; put into training mode before the pass.
        optimizer: optimizer stepped once per batch.
        loss_fn: callable ``loss_fn(outputs, targets)`` returning a scalar loss.
        train_loader: iterable yielding ``(features, targets)`` batches.
        device: device each batch is moved to before the forward pass.
        metrics: mapping ``name -> fn(outputs, targets)``; each function is
            evaluated once on the concatenated outputs/targets of the epoch.

    Returns:
        ``(avg_loss, avg_metrics)``: the mean of the per-batch loss values and
        a dict with one entry per metric name.
    """
    model.train()
    batch_losses, epoch_outputs, epoch_targets = [], [], []

    for features, targets in train_loader:
        optimizer.zero_grad()
        features = features.to(device)
        targets = targets.to(device)

        outputs = model(features)
        batch_loss = loss_fn(outputs, targets)
        batch_loss.backward()
        optimizer.step()

        batch_losses.append(batch_loss.item())
        # Keep detached CPU copies so the metrics see the whole epoch at once
        epoch_outputs.append(outputs.detach().cpu())
        epoch_targets.append(targets.detach().cpu())

    stacked_outputs = torch.cat(epoch_outputs)
    stacked_targets = torch.cat(epoch_targets)

    epoch_metrics = {}
    for name, metric_fn in metrics.items():
        epoch_metrics[name] = metric_fn(stacked_outputs, stacked_targets)

    return np.mean(batch_losses), epoch_metrics

@torch.no_grad()
def validate(model, loss_fn, val_loader, device, metrics):
    """Evaluate ``model`` on ``val_loader`` without tracking gradients.

    Args:
        model: callable model; switched to eval mode before the pass.
        loss_fn: callable ``loss_fn(outputs, targets)`` returning a scalar loss.
        val_loader: iterable yielding ``(features, targets)`` batches.
        device: device each batch is moved to before the forward pass.
        metrics: mapping ``name -> fn(outputs, targets)``; each function is
            evaluated once on the concatenated outputs/targets of the pass.

    Returns:
        ``(avg_loss, avg_metrics)``: the mean of the per-batch loss values and
        a dict with one entry per metric name.
    """
    model.eval()
    batch_losses, epoch_outputs, epoch_targets = [], [], []

    for features, targets in val_loader:
        features = features.to(device)
        targets = targets.to(device)

        outputs = model(features)
        batch_losses.append(loss_fn(outputs, targets).item())

        # Keep detached CPU copies so the metrics see the whole pass at once
        epoch_outputs.append(outputs.detach().cpu())
        epoch_targets.append(targets.detach().cpu())

    stacked_outputs = torch.cat(epoch_outputs)
    stacked_targets = torch.cat(epoch_targets)

    epoch_metrics = {}
    for name, metric_fn in metrics.items():
        epoch_metrics[name] = metric_fn(stacked_outputs, stacked_targets)

    return np.mean(batch_losses), epoch_metrics

# Training loop remains unchanged (just ensure you are passing the right DataLoader now)

def training_loop(model, optimizer, loss_fn, train_loader, val_loader, num_epochs, device, metrics):
    """Alternate training and validation passes for ``num_epochs`` epochs.

    Each epoch calls ``train_epoch`` then ``validate``, records the losses and
    metric values, and prints a one-line summary.

    Args:
        model: model to optimise.
        optimizer: optimizer handed to ``train_epoch``.
        loss_fn: loss callable handed to both passes.
        train_loader: batches for the training pass.
        val_loader: batches for the validation pass.
        num_epochs: number of epochs to run (epochs are reported 1-based).
        device: device passed through to both passes.
        metrics: mapping ``name -> metric function`` passed through to both passes.

    Returns:
        ``(model, train_losses, val_losses, train_metrics_history,
        val_metrics_history)`` where the loss lists hold one value per epoch
        and each history maps a metric name to its per-epoch values.
    """
    print("Starting training")
    train_losses = []
    val_losses = []
    train_metrics_history = {name: [] for name in metrics}
    val_metrics_history = {name: [] for name in metrics}

    for epoch in range(1, num_epochs + 1):
        train_loss, train_metrics = train_epoch(model, optimizer, loss_fn, train_loader, device, metrics)
        val_loss, val_metrics = validate(model, loss_fn, val_loader, device, metrics)

        # Record this epoch's numbers
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        for name in metrics:
            train_metrics_history[name].append(train_metrics[name])
            val_metrics_history[name].append(val_metrics[name])

        # One "name: train, val" fragment per metric, in the order of `metrics`
        metrics_str = ', '.join(
            f'{name}: {train_metrics[name]:.3f} (train), {val_metrics[name]:.3f} (val)'
            for name in metrics
        )
        print(
            f"Epoch {epoch}/{num_epochs}: "
            f"Loss: {train_loss:.3f} (train), {val_loss:.3f} (val), "
            f"{metrics_str}"
        )

    return model, train_losses, val_losses, train_metrics_history, val_metrics_history



In [6]:
# Use the GPU when one is visible, otherwise stay on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Move the split tensors themselves to that device (used directly by later cells).
# The DataLoaders were built from the earlier tensors; their batches are moved
# inside the training/validation loops instead.
x_train, x_val, x_test = (split.to(device) for split in (x_train, x_val, x_test))
y_train, y_val, y_test = (split.to(device) for split in (y_train, y_val, y_test))

# Hyperparameters
learning_rate = 0.01
batch_size = 64

In [7]:
def experiment(model, train_loader, val_loader, num_epochs=10, lr=None):
    """Train ``model`` with Adam and cross-entropy loss and return the run history.

    Args:
        model: model to train; it is moved onto the notebook-level ``device``
            before the optimizer is created.
        train_loader: batches used for the training pass of each epoch.
        val_loader: batches used for the validation pass of each epoch.
        num_epochs: number of epochs to train for (default 10, the value that
            used to be hard-coded).
        lr: learning rate for Adam; ``None`` (default) falls back to the
            notebook-level ``learning_rate``.

    Returns:
        dict with keys ``'model'``, ``'train_losses'``, ``'val_losses'``,
        ``'train_metrics_history'`` and ``'val_metrics_history'`` as produced
        by ``training_loop``.
    """
    print(model.__class__.__name__)

    # Every batch is moved to `device` inside the loops, so the parameters must
    # live there too; otherwise a CUDA run fails with a device-mismatch error.
    # Move before building the optimizer so it references the moved parameters.
    model = model.to(device)

    if lr is None:
        lr = learning_rate
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    loss_fn = torch.nn.CrossEntropyLoss()

    metrics = {
        'accuracy': accuracy,
        'f1': f1,
        'balanced_accuracy': balanced_accuracy,
    }

    # Train the model
    model, train_losses, val_losses, train_metrics_history, val_metrics_history = training_loop(
        model, optimizer, loss_fn, train_loader, val_loader,
        num_epochs=num_epochs, device=device, metrics=metrics
    )

    return {
        'model': model,
        'train_losses': train_losses,
        'val_losses': val_losses,
        'train_metrics_history': train_metrics_history,
        'val_metrics_history': val_metrics_history
    }

In [8]:
# Size the MLP from the data: number of feature columns and number of distinct labels
# (presumably MLP takes (input size, output size) -- confirm against baseline_models)
num_features = node_features.shape[1]
num_classes = len(torch.unique(data.y))
mlp = MLP(num_features, num_classes)

# Train on the feature loaders and keep the returned model and history
results = experiment(mlp, train_loader, val_loader)


MLP
Starting training
Epoch 1/10: Loss: 1.565 (train), 1.424 (val), accuracy: 0.489 (train), 0.544 (val), f1: 0.481 (train), 0.535 (val), balanced_accuracy: 0.474 (train), 0.527 (val)
Epoch 2/10: Loss: 1.309 (train), 1.253 (val), accuracy: 0.586 (train), 0.606 (val), f1: 0.583 (train), 0.599 (val), balanced_accuracy: 0.578 (train), 0.592 (val)
Epoch 3/10: Loss: 1.221 (train), 1.203 (val), accuracy: 0.617 (train), 0.624 (val), f1: 0.615 (train), 0.624 (val), balanced_accuracy: 0.609 (train), 0.617 (val)
Epoch 4/10: Loss: 1.186 (train), 1.192 (val), accuracy: 0.630 (train), 0.628 (val), f1: 0.627 (train), 0.629 (val), balanced_accuracy: 0.621 (train), 0.622 (val)
Epoch 5/10: Loss: 1.167 (train), 1.167 (val), accuracy: 0.636 (train), 0.638 (val), f1: 0.634 (train), 0.637 (val), balanced_accuracy: 0.628 (train), 0.629 (val)
Epoch 6/10: Loss: 1.155 (train), 1.165 (val), accuracy: 0.641 (train), 0.641 (val), f1: 0.639 (train), 0.637 (val), balanced_accuracy: 0.633 (train), 0.634 (val)
Epoch 

In [9]:
# Save the model returned by the experiment.
# NOTE(review): this serialises the whole model object, not just a state_dict, so
# loading 'mlp.pt' later needs the model's class to be importable.
trained_mlp = results['model']
torch.save(trained_mlp, 'mlp.pt')

In [10]:
# svm = SVMClassifier(node_features.shape[1], len(torch.unique(data.y)))

# svm.fit(x_train, y_train)

# svm.predict(x_val)

In [11]:
# Logistic-regression baseline, constructed with the same two sizes as the MLP
# (feature count, number of distinct labels)
log_reg = LogisticRegression(node_features.shape[1], len(torch.unique(data.y)))

# Fit on the training split, then score the validation split
log_reg.fit(x_train, y_train)
y_pred = log_reg.predict(x_val)

val_accuracy = accuracy(y_pred, y_val)
print(f'Accuracy: {val_accuracy}')

Accuracy: 0.16968567592008987
