From Gemini

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import tensorflow as tf
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader, Subset
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
from scipy.spatial import ConvexHull
from collections import defaultdict

In [None]:
# Seed NumPy and PyTorch (CPU generator, then the CUDA generator) with one
# shared value so repeated runs draw the same random numbers.
SEED = 4
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

# Prefer the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

In [None]:
class MyNetworkDataset(Dataset):
    """Edge dataset that pairs each positive edge with sampled negative edges.

    Each row of ``edges_df`` yields two positive edges, (buyer, sponsor1) and
    (buyer, sponsor2), stored as integer node indices.  Indexing the dataset
    returns one positive edge plus ``num_negative_samples`` random node pairs
    that are not positive edges.  Tensors are created on the module-level
    ``device``.

    Args:
        nodes_df: DataFrame with a ``node_id`` column; the position of each
            distinct id (in order of first appearance) becomes its index.
        edges_df: DataFrame with ``buyer_id``, ``sponsor1_id`` and
            ``sponsor2_id`` columns holding node ids.
        num_negative_samples: Number of negative edges drawn per item.

    Raises:
        KeyError: If an edge references a node id that is absent from
            ``nodes_df``.
    """

    def __init__(self, nodes_df, edges_df, num_negative_samples=5):
        self.nodes = nodes_df
        self.edges = edges_df
        self.num_negative_samples = num_negative_samples

        # Map each distinct node id to a dense integer index.
        self.node_to_idx = {
            node_id: idx
            for idx, node_id in enumerate(self.nodes['node_id'].unique())
        }
        self.num_nodes = len(self.node_to_idx)

        # Build the positive edge list column-wise; this avoids the per-row
        # Series construction (and dtype upcasting) of DataFrame.iterrows().
        lookup = self.node_to_idx
        positive_edges = []
        for buyer, sponsor1, sponsor2 in zip(edges_df['buyer_id'],
                                             edges_df['sponsor1_id'],
                                             edges_df['sponsor2_id']):
            buyer_idx = lookup[buyer]
            positive_edges.append((buyer_idx, lookup[sponsor1]))
            positive_edges.append((buyer_idx, lookup[sponsor2]))

        # Set of (buyer_idx, sponsor_idx) tuples for O(1) membership tests.
        self.positive_edges_set = set(positive_edges)
        # Explicit dtype/shape keeps an empty edge list a (0, 2) long tensor.
        self.positive_edges = torch.tensor(
            positive_edges, dtype=torch.long, device=device
        ).reshape(-1, 2)

    def __len__(self):
        """Return the number of positive edges."""
        return len(self.positive_edges)

    def generate_negative_edge(self):
        """Draw a uniformly random node pair that is not a positive edge.

        The pair is rejected when either orientation is a positive edge:
        edges are scored by a symmetric distance, so (sponsor, buyer) is the
        same relationship as (buyer, sponsor).

        Returns:
            A ``(node1, node2)`` tuple of distinct node indices.

        Raises:
            ValueError: If the dataset has fewer than two nodes, in which
                case no pair of distinct nodes exists.
        """
        if self.num_nodes < 2:
            raise ValueError("need at least two nodes to sample a negative edge")

        known = self.positive_edges_set
        while True:
            node1 = np.random.randint(0, self.num_nodes)
            node2 = np.random.randint(0, self.num_nodes)
            if node1 == node2:
                continue
            if (node1, node2) in known or (node2, node1) in known:
                continue
            return (node1, node2)

    def __getitem__(self, idx):
        """Return ``(pos_edge, neg_edges)`` for the positive edge at ``idx``.

        ``pos_edge`` is one row of the positive edge tensor; ``neg_edges`` is
        a tensor built from ``num_negative_samples`` freshly sampled pairs.
        """
        pos_edge = self.positive_edges[idx]

        neg_edges = [self.generate_negative_edge()
                     for _ in range(self.num_negative_samples)]
        neg_edges = torch.tensor(neg_edges, device=device)

        return pos_edge, neg_edges

class MyNetworkDatasetII(MyNetworkDataset):
    """Variant that draws about half of its negative edges from random walks.

    With probability 0.5 a negative edge is sampled uniformly (parent
    behaviour).  Otherwise a short random walk (2 to 4 steps) is started from
    one endpoint of a random positive edge, and the (start, end) pair is used
    when it is not itself a positive edge.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # Undirected adjacency lists, built once so each walk step is a dict
        # lookup instead of a scan over every positive edge.
        adjacency = defaultdict(list)
        for src, dst in self.positive_edges_set:
            adjacency[src].append(dst)
            adjacency[dst].append(src)
        self._neighbors = dict(adjacency)

    def generate_negative_edge(self):
        """Return a ``(node1, node2)`` tuple of indices that is not a positive edge."""
        num_edges = len(self.positive_edges)

        # Sometimes sample completely at random (always, if there is nothing
        # to start a walk from).
        if num_edges == 0 or np.random.random() < 0.5:
            return super().generate_negative_edge()

        # Start the walk from one end of a random positive edge.  The value
        # is converted to a plain int: a 0-d tensor hashes by identity, so a
        # tuple containing one never matches an entry of positive_edges_set.
        edge = self.positive_edges[np.random.randint(num_edges)]
        start_node = int(edge[np.random.randint(2)])
        current_node = start_node

        # Take a few random steps along positive edges (either direction).
        for _ in range(np.random.randint(2, 5)):
            options = self._neighbors.get(current_node)
            if options:
                current_node = int(np.random.choice(options))

        # Accept the pair only if neither orientation is a positive edge.
        known = self.positive_edges_set
        is_positive = ((start_node, current_node) in known
                       or (current_node, start_node) in known)
        if start_node != current_node and not is_positive:
            return (start_node, current_node)

        # Fall back to uniform sampling if the walk produced nothing usable.
        return super().generate_negative_edge()

In [None]:
from collections import defaultdict
class NodeEmbeddingAutoencoder(nn.Module):
    """Learnable node-embedding table scored by Euclidean distance.

    Args:
        num_nodes: Number of rows in the embedding table.
        embedding_dim: Size of each embedding vector.

    Attributes:
        node_embeddings: ``(num_nodes, embedding_dim)`` parameter created on
            the module-level ``device``.
        gradient_counts: CPU tensor of length ``num_nodes`` counting how many
            times each node appeared in an edge passed to ``forward``.
    """

    def __init__(self, num_nodes, embedding_dim):
        super().__init__()
        self.num_nodes = num_nodes
        # Standard-normal draws scaled by 1/sqrt(embedding_dim).
        self.node_embeddings = nn.Parameter(
            torch.randn(num_nodes, embedding_dim, device=device) / np.sqrt(embedding_dim)
        )
        # Plain attribute (not a buffer), so it stays on the CPU after .to().
        self.gradient_counts = torch.zeros(num_nodes)

    def forward(self, pos_edge, neg_edges):
        """Compute embedding distances for one positive edge and its negatives.

        Args:
            pos_edge: Tensor holding the two node indices of a positive edge.
            neg_edges: Tensor whose rows are (node1, node2) index pairs.

        Returns:
            ``(pos_dist, neg_dist)``: the scalar distance for the positive
            edge and one distance per row of ``neg_edges``.
        """
        # Distance between the endpoints of the positive edge.
        pos_node1_embed = self.node_embeddings[pos_edge[0]]
        pos_node2_embed = self.node_embeddings[pos_edge[1]]
        pos_dist = torch.norm(pos_node1_embed - pos_node2_embed)

        # Row-wise distances for the negative edges.
        neg_node1_embed = self.node_embeddings[neg_edges[:, 0]]
        neg_node2_embed = self.node_embeddings[neg_edges[:, 1]]
        neg_dist = torch.norm(neg_node1_embed - neg_node2_embed, dim=1)

        # Count every appearance of every node in this sample.  index_add_
        # accumulates repeated indices (a fancy-index "+= 1" would count a
        # repeated node only once), and moving the indices to the counter's
        # device keeps this working when the edges live on the GPU.
        with torch.no_grad():
            counts = self.gradient_counts
            touched = torch.cat((
                pos_edge[0].reshape(1),
                pos_edge[1].reshape(1),
                neg_edges[:, 0],
                neg_edges[:, 1],
            )).to(device=counts.device, dtype=torch.long)
            counts.index_add_(
                0,
                touched,
                torch.ones(touched.numel(), dtype=counts.dtype, device=counts.device),
            )

        return pos_dist, neg_dist

    def get_embeddings(self):
        """Return a detached CPU copy of the embedding table."""
        return self.node_embeddings.detach().cpu()

    def get_gradient_stats(self):
        """Return the per-node appearance counter (the live tensor, not a copy)."""
        return self.gradient_counts


class NodeEmbeddingAutoencoderII(NodeEmbeddingAutoencoder):
    """Embedding model that pulls rarely updated nodes toward their ethnic group center.

    Args:
        embedding_dim: Size of each embedding vector.
        dataset: Object exposing ``num_nodes`` and a ``nodes`` DataFrame with
            an ``ethnicity`` column.
        spread_factor: Accepted for interface compatibility; currently unused.

    Note:
        Missing ethnicities in ``dataset.nodes`` are replaced by the string
        ``'Nan'`` in place, so the caller's DataFrame is modified.
        Row ``i`` of ``dataset.nodes`` is assumed to describe embedding row
        ``i`` (i.e. node ids are unique and in index order) -- TODO confirm.
    """

    def __init__(self, embedding_dim, dataset, spread_factor=0.2):
        super().__init__(dataset.num_nodes, embedding_dim)

        # Use the frame carried by the dataset rather than a notebook global,
        # and assign the filled column back explicitly: a chained
        # fillna(inplace=True) does not update the frame under pandas
        # Copy-on-Write.
        self.nodes = dataset.nodes
        self.nodes['ethnicity'] = self.nodes['ethnicity'].fillna('Nan')

        # Only the keys (the distinct ethnicity labels) are used later.
        unique_ethnicities = self.nodes['ethnicity'].unique()
        self.ethnicity_centers = defaultdict(list, {k: [] for k in unique_ethnicities})

    def _ethnicity_mask(self, ethnicity):
        """Boolean tensor marking the rows whose ethnicity equals ``ethnicity``."""
        mask = (self.nodes['ethnicity'] == ethnicity).to_numpy()
        return torch.as_tensor(mask, dtype=torch.bool,
                               device=self.node_embeddings.device)

    def get_ethnicity_centers(self):
        """Return ``{ethnicity: mean embedding}`` computed from the current embeddings."""
        return {
            ethnicity: self.node_embeddings[self._ethnicity_mask(ethnicity)].mean(dim=0)
            for ethnicity in self.ethnicity_centers
        }

    def get_regularization_loss(self, update_threshold):
        """Mean distance of rarely updated nodes to their ethnicity center.

        A node is "rarely updated" when its appearance count is below
        ``update_threshold`` times the mean count.

        Returns:
            ``0.0`` when no node qualifies, otherwise a scalar tensor.
        """
        rarely_updated = self.gradient_counts < self.gradient_counts.mean() * update_threshold

        if not rarely_updated.any():
            return 0.0

        rare = rarely_updated.to(self.node_embeddings.device)

        # Sum the distances one ethnicity at a time (vectorized within each
        # group) instead of looping over individual nodes.
        reg_loss = 0
        for ethnicity in self.ethnicity_centers:
            group = self._ethnicity_mask(ethnicity)
            rare_in_group = rare & group
            if not rare_in_group.any():
                continue
            center = self.node_embeddings[group].mean(dim=0)
            reg_loss = reg_loss + torch.norm(
                self.node_embeddings[rare_in_group] - center, dim=1
            ).sum()

        return reg_loss / rare.sum()  # Average over rare nodes

In [None]:
# Column names are supplied explicitly for both CSV files.
NODE_COLUMNS = ['name', 'committee', 'node_id', 'ethnicity', 'sponsor']
EDGE_COLUMNS = ['buyer_id', 'sponsor1_id', 'sponsor2_id', 'f1', 'f2', 'f3', 'f4',
                'blackballs', 'whiteballs', 'year']

# Read the node table and the buyer/sponsor edge table.
nodes_df = pd.read_csv('nyse_node_sp1.csv', names=NODE_COLUMNS)
edges_df = pd.read_csv('nyse_edge_buy_sp_sp1.csv', names=EDGE_COLUMNS)

# Wrap the tables in the edge dataset (25 negatives per positive edge) and
# iterate it in fixed order, 128 edges per batch.
dataset = MyNetworkDataset(nodes_df=nodes_df,
                           edges_df=edges_df,
                           num_negative_samples=25)
dataloader = DataLoader(dataset, batch_size=128, shuffle=False)

In [None]:
# Hyperparameters
EMBEDDING_DIM = 2        # size of each node embedding
REG_WEIGHT = 0.1         # weight of the regularization term in the loss
UPDATE_THRESHOLD = 0.1   # fraction of the mean update count below which a node is "rarely updated"

# Build the model, place it on the selected device, and attach Adam.
model = NodeEmbeddingAutoencoderII(embedding_dim=EMBEDDING_DIM, dataset=dataset).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

Note: we train with a contrastive loss, which pulls connected nodes closer together in the embedding space and pushes unconnected nodes apart (up to a margin).

See [Lilian Weng's post on contrastive loss](https://lilianweng.github.io/posts/2021-05-31-contrastive/#contrastive-loss).

In [None]:
# Number of passes over the data during training.
NUM_EPOCHS: int = 200
# Contrastive-loss margin: negative pairs farther apart than this add no loss.
MARGIN: float = 1.0


In [None]:
# Regularization is only applied once the epoch index exceeds this value.
REG_WARMUP_EPOCHS = 20

# Per-epoch averages of the total loss and of the (unweighted) regularization
# term, stored as plain Python floats.
losses, regularization = [], []
for epoch in tqdm(range(NUM_EPOCHS)):
    total_loss, total_reg = 0.0, 0.0
    for batch_pos_edges, batch_neg_edges in dataloader:
        # The model scores one edge at a time, so walk through the batch.
        batch_loss, batch_reg = 0, 0
        for pos_edge, neg_edges in zip(batch_pos_edges, batch_neg_edges):
            pos_dist, neg_dist = model(pos_edge, neg_edges)

            # Contrastive loss: minimize positive distances, maximize negative distances up to margin
            # Regularization loss pushes nodes that have not been updated towards their cluster mean
            contrastive_loss = pos_dist + torch.mean(torch.clamp(MARGIN - neg_dist, min=0))
            if epoch > REG_WARMUP_EPOCHS:
                reg_loss = model.get_regularization_loss(update_threshold=UPDATE_THRESHOLD)
            else:
                reg_loss = 0
            batch_loss += contrastive_loss + REG_WEIGHT * reg_loss
            # Track the regularization value without its autograd history.
            batch_reg += reg_loss.detach() if torch.is_tensor(reg_loss) else reg_loss

        batch_size = len(batch_pos_edges)
        batch_loss = batch_loss / batch_size

        # Backward pass
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        # Accumulate floats only: keeping tensors here would hold on to
        # autograd graphs and put tensors into the plotted history.
        total_loss += batch_loss.item()
        total_reg += float(batch_reg) / batch_size

    losses.append(total_loss/len(dataloader))
    regularization.append(total_reg/len(dataloader))
    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {total_loss/len(dataloader)}")


**Save Embeddings**

In [None]:
# Export the trained embedding table as a NumPy array, wrap it in a
# TensorFlow tensor, and write the array to disk.
trained_embeddings = model.get_embeddings()
node_embeddings = trained_embeddings.numpy()
tensorflow_tensor = tf.convert_to_tensor(value=node_embeddings)
np.save(file='node_embeddings.npy', arr=node_embeddings)

**Visualizations**

In [None]:
# Summarize how often each node appeared in a training sample.
gradient_counts = model.get_gradient_stats()
mean_updates = gradient_counts.mean()
min_updates = gradient_counts.min()
max_updates = gradient_counts.max()

print("Gradient update statistics:")
print(f"Mean updates per node: {mean_updates:.2f}")
print(f"Min updates per node: {min_updates:.2f}")
print(f"Max updates per node: {max_updates:.2f}")

# Histogram of the per-node counts.
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(gradient_counts, bins=50)
ax.set_title('Distribution of Gradient Updates Across Nodes')
ax.set_xlabel('Number of Updates')
ax.set_ylabel('Number of Nodes')
plt.show()

In [None]:
# Plot the per-epoch training curves.
# float() accepts both plain numbers and 0-d tensors, so the plot works even
# if the history lists contain tensors that still carry autograd state.
loss_curve = [float(value) for value in losses]
reg_curve = [float(value) for value in regularization]

# `losses` records the full objective (contrastive + weighted regularization),
# so it is labelled as the total loss.
sns.lineplot(x=list(range(len(loss_curve))), y=loss_curve, label='Total loss')
sns.lineplot(x=list(range(len(reg_curve))), y=reg_curve, label='regularization')
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.title("Training Loss")
plt.show()

In [None]:
# Get final embeddings
# Keep only the first SUBS nodes for the plots that follow.
SUBS= 2000
node_embeddings = model.get_embeddings().numpy()[:SUBS]
# Copy the full node table BEFORE `nodes_df` is rebound to the truncated view
# on the next line; the statement order matters.
full_node_df = nodes_df.copy()
# NOTE: this rebinds the notebook-level `nodes_df`; re-running earlier cells
# afterwards will see only the first SUBS rows.
nodes_df = nodes_df.iloc[:SUBS]
# full_edges_df = edges_df.copy()
# edges_df = edges_df.iloc[:SUBS]
# full_dataset = dataset
# dataset = Subset(full_dataset, range(SUBS)).dataset

In [None]:
plt.figure(figsize=(12, 8))

# Scatter the (truncated) embeddings, coloured by ethnicity category code.
colors = nodes_df['ethnicity'].astype('category').cat.codes
xs, ys = node_embeddings[:, 0], node_embeddings[:, 1]
scatter = plt.scatter(xs, ys, c=colors, alpha=0.6, cmap='tab10')

# Draw a faint line for every positive edge whose endpoints both fall inside
# the first SUBS nodes.
for src, dst in dataset.positive_edges.tolist():
    if src < SUBS and dst < SUBS:
        plt.plot([xs[src], xs[dst]], [ys[src], ys[dst]], 'gray', alpha=0.1)

# Legend built from the scatter's colour classes.
handles, labels = scatter.legend_elements()
legend1 = plt.legend(handles, labels,
                     title="Ethnicity Groups",
                     loc="upper right")
plt.gca().add_artist(legend1)

plt.title('NYSE Network Node Embeddings\nConnected nodes are closer, unconnected nodes are farther')
plt.show()

In [None]:
plt.figure(figsize=(12, 8))

# Scatter every node's embedding, coloured by ethnicity category code.
ne = model.get_embeddings().numpy()
colors = full_node_df['ethnicity'].astype('category').cat.codes
scatter = plt.scatter(ne[:, 0], ne[:, 1],
                      c=colors, alpha=0.6, cmap='tab10')

# Draw a faint line for every positive edge.
for src, dst in dataset.positive_edges.tolist():
    endpoints = ne[[src, dst]]
    plt.plot(endpoints[:, 0], endpoints[:, 1], 'gray', alpha=0.1)

# Legend built from the scatter's colour classes.
handles, labels = scatter.legend_elements()
legend1 = plt.legend(handles, labels,
                     title="Ethnicity Groups",
                     loc="upper right")
plt.gca().add_artist(legend1)

plt.title('NYSE Network Node Embeddings\nConnected nodes are closer, unconnected nodes are farther')
plt.show()

In [None]:


def plot_ethnic_clusters(node_embeddings, nodes_df, title, sample_edges=1000):
    """Draw one panel per ethnicity, highlighting that group's embeddings.

    Each panel shows all nodes in light gray, the group's nodes on top, and
    (when it can be computed) the outline of the group's convex hull in the
    first two embedding dimensions.

    Args:
        node_embeddings: Array of embeddings; rows are looked up with the
            index labels of ``nodes_df``.
        nodes_df: DataFrame with an ``ethnicity`` column.
        title: Figure-level title.
        sample_edges: Accepted for interface compatibility; currently unused.
    """
    plt.figure(figsize=(15, 10))

    ethnicity_col = nodes_df['ethnicity']
    unique_ethnicities = ethnicity_col.unique()
    num_ethnicities = len(unique_ethnicities)
    rows = (num_ethnicities + 2) // 3  # Ceiling division for number of rows

    for i, ethnicity in enumerate(unique_ethnicities, 1):
        plt.subplot(rows, 3, i)

        # NaN never compares equal to itself, so a missing-ethnicity group
        # has to be selected with isna() rather than ==.
        if pd.isna(ethnicity):
            eth_mask = ethnicity_col.isna()
        else:
            eth_mask = ethnicity_col == ethnicity
        eth_indices = nodes_df[eth_mask].index

        # Plot all nodes as light gray background
        plt.scatter(node_embeddings[:, 0], node_embeddings[:, 1],
                    c='lightgray', alpha=0.1, s=1)

        # Plot this ethnicity's nodes
        eth_points = node_embeddings[eth_indices]
        plt.scatter(eth_points[:, 0], eth_points[:, 1],
                    alpha=0.6, label=ethnicity)

        # Optional: outline the group's convex hull.  A 2-D hull needs at
        # least 3 points; only the two plotted dimensions are used.
        if len(eth_points) >= 3:
            hull_points = eth_points[:, :2]
            try:
                hull = ConvexHull(hull_points)
            except RuntimeError:
                # scipy's QhullError (a RuntimeError subclass) is raised for
                # degenerate input such as collinear or coincident points;
                # skip the outline instead of aborting the whole figure.
                hull = None
            if hull is not None:
                for simplex in hull.simplices:
                    plt.plot(hull_points[simplex, 0], hull_points[simplex, 1],
                             'k-', alpha=0.3)

        plt.title(f'{ethnicity} (n={len(eth_indices)})')
        plt.legend()

    plt.tight_layout()
    plt.suptitle(title, y=1.02)
    plt.show()

def plot_degree_distribution(node_embeddings, dataset, nodes_df):
    """Plot the degree histogram and the embeddings coloured by degree.

    Only edges whose two endpoints both index into ``nodes_df`` (index below
    ``len(nodes_df)``) are counted, so the function works on a truncated node
    table without relying on a notebook-level size constant.

    Args:
        node_embeddings: Array with one row per node in ``nodes_df``.
        dataset: Object exposing ``positive_edges``, a tensor of index pairs.
        nodes_df: Node table; only its length is used.
    """
    num_nodes = len(nodes_df)

    # Move the edge list to NumPy once instead of iterating tensor rows.
    edges = dataset.positive_edges.cpu().numpy().reshape(-1, 2).astype(np.int64)
    in_range = (edges < num_nodes).all(axis=1)

    # Each kept edge adds one to the degree of both endpoints.
    degrees = np.bincount(edges[in_range].ravel(), minlength=num_nodes).astype(float)

    plt.figure(figsize=(15, 5))

    # Degree distribution
    plt.subplot(1, 2, 1)
    plt.hist(degrees, bins=50)
    plt.title('Node Degree Distribution')
    plt.xlabel('Degree')
    plt.ylabel('Count')

    # Degree vs. position
    plt.subplot(1, 2, 2)
    scatter = plt.scatter(node_embeddings[:, 0], node_embeddings[:, 1],
                          c=degrees, cmap='viridis', alpha=0.6)
    plt.colorbar(scatter, label='Node Degree')
    plt.title('Node Positions Colored by Degree')

    plt.tight_layout()
    plt.show()

def plot_temporal_evolution(node_embeddings, edges_df, nodes_df, num_periods=4,
                            node_to_idx=None):
    """Show which nodes are active in each of ``num_periods`` year ranges.

    The distinct years in ``edges_df`` are split into ``num_periods``
    consecutive groups.  Each panel shows all nodes in light gray and, on
    top, the nodes that appear in an edge of that period, coloured by
    ethnicity.

    Args:
        node_embeddings: Array of embeddings indexed by node index.
        edges_df: DataFrame with ``year``, ``buyer_id``, ``sponsor1_id`` and
            ``sponsor2_id`` columns.
        nodes_df: DataFrame with ``node_id`` and ``ethnicity`` columns.
        num_periods: Number of panels / year groups.
        node_to_idx: Mapping from node id to embedding row.  Defaults to the
            notebook-level ``dataset.node_to_idx``.
    """
    if node_to_idx is None:
        node_to_idx = dataset.node_to_idx

    # Get year range
    years = edges_df['year'].sort_values().unique()
    year_splits = np.array_split(years, num_periods)

    # Encode ethnicities once over the whole table so a given group keeps the
    # same colour in every panel.
    node_ids = nodes_df['node_id'].to_numpy()
    ethnicity_codes = nodes_df['ethnicity'].astype('category').cat.codes.to_numpy()
    code_range = {}
    if len(ethnicity_codes):
        code_range = {'vmin': ethnicity_codes.min(), 'vmax': ethnicity_codes.max()}
    num_rows = len(node_embeddings)

    plt.figure(figsize=(20, 5))

    for i, year_group in enumerate(year_splits, 1):
        plt.subplot(1, num_periods, i)

        # Plot all nodes as background
        plt.scatter(node_embeddings[:, 0], node_embeddings[:, 1],
                    c='lightgray', alpha=0.1, s=1)

        # array_split yields empty groups when there are fewer distinct
        # years than periods; leave such a panel with the background only.
        if len(year_group) == 0:
            plt.title('No years\n(n=0)')
            continue

        # Get active nodes in this period
        period_edges = edges_df[edges_df['year'].isin(year_group)]
        active_nodes = set()
        for col in ['buyer_id', 'sponsor1_id', 'sponsor2_id']:
            active_nodes.update(period_edges[col].unique())

        # Walk the node table in row order so every plotted point is paired
        # with its own ethnicity code, and drop nodes whose embedding row is
        # outside the supplied array.
        active_mask = nodes_df['node_id'].isin(active_nodes).to_numpy()
        active_indices = np.array(
            [node_to_idx[node_id] for node_id in node_ids[active_mask]], dtype=int
        )
        drawable = active_indices < num_rows
        active_indices = active_indices[drawable]
        active_codes = ethnicity_codes[active_mask][drawable]

        # Plot active nodes colored by ethnicity
        plt.scatter(node_embeddings[active_indices, 0],
                    node_embeddings[active_indices, 1],
                    c=active_codes, alpha=0.6, **code_range)

        plt.title(f'Years {year_group[0]}-{year_group[-1]}\n(n={len(active_nodes)})')

    plt.tight_layout()
    plt.show()

def plot_high_degree_neighborhoods(node_embeddings, dataset, nodes_df, top_n=5):
    """Plot the neighbourhoods of the ``top_n`` highest-degree nodes.

    Only edges whose endpoints both fall inside the supplied node table and
    embedding array are considered, so truncated inputs do not cause
    out-of-range lookups.

    Args:
        node_embeddings: Array of embeddings indexed by node index.
        dataset: Object exposing ``positive_edges`` (tensor of index pairs)
            and ``node_to_idx`` (node id -> index).
        nodes_df: DataFrame with ``node_id``, ``name`` and ``ethnicity``.
        top_n: Number of panels to draw.
    """
    num_nodes = min(len(nodes_df), len(node_embeddings))

    # Move the edge list to NumPy once and keep only in-range edges.
    edges = dataset.positive_edges.cpu().numpy().reshape(-1, 2).astype(np.int64)
    edges = edges[(edges < num_nodes).all(axis=1)]
    src, dst = edges[:, 0], edges[:, 1]

    # Calculate node degrees
    degrees = np.bincount(edges.ravel(), minlength=num_nodes).astype(float)

    # Get top nodes by degree
    top_indices = np.argsort(degrees)[-top_n:]

    # Inverse of node_to_idx, built once instead of searching per panel.
    idx_to_node = {idx: node_id for node_id, idx in dataset.node_to_idx.items()}

    plt.figure(figsize=(20, 4))
    for i, idx in enumerate(top_indices, 1):
        plt.subplot(1, top_n, i)

        # Get node info
        node_id = idx_to_node[int(idx)]
        node_info = nodes_df[nodes_df['node_id'] == node_id].iloc[0]

        # Neighbours: the other endpoint of every edge touching idx (a
        # self-loop contributes the node once).
        neighbors = np.concatenate((dst[src == idx],
                                    src[(dst == idx) & (src != idx)]))

        # Plot all nodes as background
        plt.scatter(node_embeddings[:, 0], node_embeddings[:, 1],
                    c='lightgray', alpha=0.1, s=1)

        # Plot neighbors
        plt.scatter(node_embeddings[neighbors, 0], node_embeddings[neighbors, 1],
                    alpha=0.6, label='Neighbors')

        # Plot central node
        plt.scatter(node_embeddings[idx, 0], node_embeddings[idx, 1],
                    c='red', s=100, label='Center')

        plt.title(f"{node_info['name']}\n{node_info['ethnicity']}\nDegree: {int(degrees[idx])}")
        plt.legend()

    plt.tight_layout()
    plt.show()

# Render the per-ethnicity panels and the degree plots for the truncated
# embeddings; the temporal and neighbourhood plots are left disabled.
plot_ethnic_clusters(node_embeddings, nodes_df, title='Node Embeddings by Ethnicity')
plot_degree_distribution(node_embeddings=node_embeddings,
                         dataset=dataset,
                         nodes_df=nodes_df)
# plot_temporal_evolution(node_embeddings, edges_df, nodes_df)
# plot_high_degree_neighborhoods(node_embeddings, dataset, nodes_df)

# Header for the statistics section: a title line followed by a 50-dash rule.
print("\nNetwork Statistics:", "-" * 50, sep="\n")
