In [1]:
pip install ucimlrepo


Collecting ucimlrepo
  Downloading ucimlrepo-0.0.6-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.6


In [2]:
from ucimlrepo import fetch_ucirepo
import pandas as pd
# fetch dataset
census_income = fetch_ucirepo(id=20)

# data (as pandas dataframes)
X = census_income.data.features
y = census_income.data.targets
y['income'] = y['income'].replace('<=50K.', '<=50K', regex=True)
y['income'] = y['income'].replace('>50K.', '>50K', regex=True)
X = pd.concat([X, pd.DataFrame(y)], axis=1)
# metadata
print(census_income.metadata)

# variable information
print(census_income.variables)
print(X.columns)
print(len(X))

{'uci_id': 20, 'name': 'Census Income', 'repository_url': 'https://archive.ics.uci.edu/dataset/20/census+income', 'data_url': 'https://archive.ics.uci.edu/static/public/20/data.csv', 'abstract': 'Predict whether income exceeds $50K/yr based on census data.  Also known as Adult dataset.', 'area': 'Social Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 48842, 'num_features': 14, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Income', 'Education Level', 'Other', 'Race', 'Sex'], 'target_col': ['income'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1996, 'last_updated': 'Thu Aug 10 2023', 'dataset_doi': '10.24432/C5GP7S', 'creators': ['Ron Kohavi'], 'intro_paper': None, 'additional_info': {'summary': 'Extraction was done by Barry Becker from the 1994 Census database.  A set of reasonably clean records was extracted using the following conditions: ((AAGE>16) && 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y['income'] = y['income'].replace('<=50K.', '<=50K', regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y['income'] = y['income'].replace('>50K.', '>50K', regex=True)


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import numpy as np
def preprocess_data(data, categorical_columns, numerical_columns):
    # Separate categorical and numerical columns
    X_cat = data[categorical_columns]
    X_num = data[numerical_columns]

    # One-hot encode categorical columns
    cat_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
    X_cat = cat_encoder.fit_transform(X_cat)

    # Standardize numerical columns
    num_scaler = StandardScaler()
    X_num = num_scaler.fit_transform(X_num)

    return X_num, X_cat, cat_encoder, num_scaler


In [6]:
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

import torch
import torch.nn as nn
import torch.nn.functional as F
import rpy2.robjects as robjects
from rpy2.robjects.packages import importr
import pyarrow as pa
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
class Attention(nn.Module):
    def __init__(self, attention_hidden_size):
        super().__init__()
        self.linear = nn.Linear(attention_hidden_size, attention_hidden_size).to(device)

    def forward(self, encoder_outputs):
      # Transform x using a linear layer; output shape will be (sq, b, hidden_size)
      x_transformed = self.linear(encoder_outputs)
      # Step 2: Compute attention scores using softmax across the sequence dimension (sq)
      # Attention scores shape: (sq, b, hidden_size) -> (b, sq, hidden_size) for softmax
      x_transposed = x_transformed.transpose(0, 1)  # Transposing for softmax operation
      attention_scores = F.softmax(x_transposed, dim=1)  # Applying softmax; shape remains (b, sq, hidden_size)
      # Step 3: Apply attention scores to the original input tensor
      # For weighted sum, first transpose x back: (sq, b, hidden_size) -> (b, sq, hidden_size)
      x = encoder_outputs.transpose(0, 1)  # Transposing x to match attention_scores shape
      # Compute the context vector as the weighted sum of the input vectors
      # (b, sq, hidden_size) * (b, sq, hidden_size) -> (b, hidden_size) after summing over sq dimension
      context_vector = torch.sum(attention_scores * x, dim=1)
      return context_vector





class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size, latent_size,only_z=False):
        super().__init__()
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, hidden_size)
        self.linear3 = nn.Linear(hidden_size, hidden_size)
        self.linear4 = nn.Linear(hidden_size, hidden_size)
        self.linear5 = nn.Linear(hidden_size, hidden_size)
        self.linear6 = nn.Linear(hidden_size, latent_size)
        self.linear_mu = nn.Linear(latent_size, latent_size)
        self.linear_logvar = nn.Linear(latent_size, latent_size)
        self.only_z = only_z
        self.relu1 = nn.GELU()
        self.relu2 = nn.GELU()
        self.relu3 = nn.GELU()
        self.relu4 = nn.GELU()
        self.relu5 = nn.GELU()
        self.relu6 = nn.GELU()


    def forward(self, x):
        out = self.relu1(self.linear1(x))
        out = self.relu2(self.linear2(out))
        out = self.relu3(self.linear3(out))
        out = self.relu4(self.linear4(out))
        out = self.relu5(self.linear5(out))
        out = self.relu6(self.linear6(out))
        mu = self.linear_mu(out)
        logvar = self.linear_logvar(out)

        # Reparameterization trick (as before)
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        z = eps.mul(std).add_(mu)
        if self.only_z:
          return z
        return z, mu, logvar

class Decoder(nn.Module):
    def __init__(self, latent_size, hidden_size, output_size):
        super().__init__()
        self.linear1 = nn.Linear(latent_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, hidden_size)
        self.linear3 = nn.Linear(hidden_size, hidden_size)
        self.linear4 = nn.Linear(hidden_size, hidden_size)
        self.output_layer = nn.Linear(hidden_size, output_size)
        self.relu1 = nn.GELU()
        self.relu2 = nn.GELU()
        self.relu3 = nn.GELU()
        self.relu4 = nn.GELU()
        self.output=None

    def forward(self, z, sig=False):
        #print("decoder 1: ",z.shape)
        #print("decoder 1: ",z)
        out = self.relu1(self.linear1(z))
        out = self.relu2(self.linear2(out))
        out = self.relu3(self.linear3(out))
        out = self.relu4(self.linear4(out))
        out = self.output_layer(out)
        self.output=out
        return out


import torch
import torch.nn as nn
import torch.nn.functional as F


class VAE(nn.Module):
    def __init__(self, input_size, hidden_size, latent_size, cat=False):
        super().__init__()
        self.encoder = Encoder(input_size, hidden_size, latent_size).to(device)
        self.decoder = Decoder(latent_size, hidden_size, input_size).to(device)
        self.cat=cat

    def forward(self, x):
        if self.cat:
          z, logits = self.encoder(x,True)
          recon = self.decoder(z,True)
          return recon, logits
        else:
          z, mu, logvar = self.encoder(x)
          recon = self.decoder(z)
          return recon, mu, logvar








In [7]:
import torch
import numpy as np
import pickle
rbf_hsic_matrix = torch.load('rbf_hsic_matrix_updated.pt')
linear_hsic_matrix = torch.load('linear_hsic_matrix_updated.pt')
mutual_information_matrix = torch.load('mutual_information_matrix.pt')
distance_correlation_matrix = torch.load('distance_correlation_matrix.pt')
chi2_matrix = torch.load('chi2_matrix.pt')
theils_u_matrix = torch.load('theils_u_matrix.pt')
cramers_v_matrix = torch.load('cramers_v_matrix.pt')

def load_measure_matrix(filename):
    with open(filename, 'rb') as f:
        data = pickle.load(f)
    return data['matrix'], data['feature_names']

agreement_matrix, agreement_feature_names = load_measure_matrix('agreement_matrix.pkl')
binary_matrix, binary_feature_names = load_measure_matrix('binary_matrix.pkl')
categorical_matrix, categorical_feature_names = load_measure_matrix('categorical_matrix.pkl')
confusion_matrix, confusion_feature_names = load_measure_matrix('confusion_matrix.pkl')

num_features = rbf_hsic_matrix.shape[0]
from sklearn.preprocessing import OneHotEncoder

categorical_columns = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country','income']
encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
X_encoded = encoder.fit_transform(X[categorical_columns])
feature_names = encoder.get_feature_names_out(categorical_columns)

column_mapping = {}
start_index = 0
for col in categorical_columns:
    column_mapping[col] = start_index
    start_index += len(encoder.categories_[categorical_columns.index(col)])

print(X_encoded.shape)

index = []
attr = []

for i in range(num_features):
    for j in range(i + 1, num_features):
        index.append([i, j])

        # Find the categorical columns associated with features i and j
        col_i = next(col for col, start_idx in column_mapping.items() if start_idx <= i < start_idx + len(encoder.categories_[categorical_columns.index(col)]))
        col_j = next(col for col, start_idx in column_mapping.items() if start_idx <= j < start_idx + len(encoder.categories_[categorical_columns.index(col)]))

        # Create the categorical column vector (1 for the corresponding column, 0 otherwise)
        categorical_col_vec = np.zeros(len(categorical_columns))
        categorical_col_vec[categorical_columns.index(col_i)] = 1
        categorical_col_vec[categorical_columns.index(col_j)] = 1

        list1 = [linear_hsic_matrix[i, j],
            rbf_hsic_matrix[i, j],
            mutual_information_matrix[i, j],
            distance_correlation_matrix[i, j],
            chi2_matrix[i, j],
            theils_u_matrix[i, j],
            cramers_v_matrix[i, j]]
       # print(agreement_matrix[i][j])
        for measure in agreement_matrix[i][j].keys():
          list1.append(agreement_matrix[i][j][measure])
      #  print(binary_matrix[i][j])
        for measure in binary_matrix[i][j].keys():
          if measure == 'mcnemar_test':
            list1.append(binary_matrix[i][j][measure][0])
          else:
            list1.append(binary_matrix[i][j][measure])
       # print(categorical_matrix[i][j])
        for measure in categorical_matrix[i][j].keys():
          list1.append(categorical_matrix[i][j][measure])
       # print(confusion_matrix[i][j])
        for measure in confusion_matrix[i][j].keys():
          list1.append(confusion_matrix[i][j][measure])
        list1.extend(categorical_col_vec)
        #for ele in list1:
          #  print(ele, type(ele))
      #  print(list1)
        attr.append(list1)


index = torch.tensor(index, dtype=torch.long).t().contiguous()
attr = torch.tensor(attr, dtype=torch.float).to(device)
print(index.shape)
print(attr.shape)



(48842, 107)
torch.Size([2, 5671])
torch.Size([5671, 131])


In [None]:
import torch
import torch
index1 = index[0]
index2 = index[1]

X_encoded = torch.tensor(X_encoded)
# Optimization using torch.expand and torch.gather
index1_expanded = index1.expand(X_encoded.shape[0], -1)
index2_expanded = index2.expand(X_encoded.shape[0], -1)
print(index1_expanded.shape)
# Efficiently gather feature pairs using indexing
features1 = torch.gather(X_encoded, dim=1, index=index1_expanded)
features2 = torch.gather(X_encoded, dim=1, index=index2_expanded)

dataset = list(zip(torch.stack([features1, features2], dim=2),X_encoded))

In [None]:
from collections import defaultdict
neighborhoods = defaultdict(list)
for i in range(index.shape[1]):
    n1_index = index1[i].item()
    n2_index = index2[i].item()
    neighborhoods[n1_index].append(i)
    neighborhoods[n2_index].append(i)

In [None]:
import torch
from torch import nn
import numpy as np
import torch.nn.functional as F
from torch.distributions import Bernoulli
class ConditionalBatchNorm1d(nn.Module):
    def __init__(self, num_features, num_conditions):
        super().__init__()
        self.num_features = num_features

        self.gamma_layer = nn.Linear(num_conditions, num_features)
        self.beta_layer = nn.Linear(num_conditions, num_features)

    def forward(self, input, condition):

        out = F.batch_norm(input, None, None, training=True).to(device)  # Standard batch normalization
        gamma = self.gamma_layer(condition).to(device)
        beta = self.beta_layer(condition).to(device)

        out = gamma * out + beta

        return out

class DeepLinear(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.layers = torch.nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.GELU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.GELU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.GELU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.GELU(),
            nn.Linear(hidden_dim, output_dim)
        )

    def forward(self, x, cond=None):
        x = self.layers(x)
        return x


class DeepConv(nn.Module):
    def __init__(self, hidden_dim, n_output_shape, neighbourhoods):
        super().__init__()
        self.num_variables = 107
        self.e_features = attr.shape[1]
        self.e_scoring_network = DeepLinear(self.num_variables+self.e_features, hidden_dim, output_dim=1)
        self.neighborhood_agg_network = DeepLinear(len(neighbourhoods[0]),hidden_dim, n_output_shape)
        self.output_dim = n_output_shape * len(neighbourhoods)
        max_neighborhood_size = max(len(v) for v in neighborhoods.values())
        neighborhood_edge_indices = torch.zeros((len(neighborhoods), max_neighborhood_size), dtype=torch.long)
        for node_index, edge_list in neighborhoods.items():
          neighborhood_edge_indices[node_index] = torch.tensor(edge_list)
        self.neighbourhoods = neighborhood_edge_indices.to(device)

    def forward(self, x):
        x.requires_grad=True
        batch_size = x.shape[0]
        # Save original values
        original_x0 = x[:, :, 0] # .clone() necessary here
        original_x1 = x[:, :, 1]
        x_mod = torch.zeros(batch_size, x.shape[1], self.num_variables,requires_grad=True).to(device)
        index1_expanded = index1.unsqueeze(0).expand(x_mod.shape[0], -1).to(device)
        index2_expanded = index2.unsqueeze(0).expand(x_mod.shape[0], -1).to(device)

        x_mod = torch.scatter(x_mod, 2, index1_expanded.unsqueeze(2), original_x0.unsqueeze(2))
        x_mod = torch.scatter(x_mod, 2, index2_expanded.unsqueeze(2), original_x1.unsqueeze(2))
        broadcasted_attr = attr.unsqueeze(0).expand(x_mod.shape[0], -1, -1)
        final_tensor = torch.cat([x_mod, broadcasted_attr], dim=2)
        all_edge_scores = self.e_scoring_network(final_tensor).squeeze()
        neighborhood_edge_indices = self.neighbourhoods[None, :, :]
        neighborhood_edge_indices=neighborhood_edge_indices.expand(all_edge_scores.shape[0],-1,-1)
        batch_size, v, r = neighborhood_edge_indices.shape
        e = all_edge_scores.shape[1]
        vector1_expanded = all_edge_scores.unsqueeze(1).expand(-1, v, -1)
        neighborhood_scores = torch.gather(vector1_expanded, 2, neighborhood_edge_indices)
        neighborhood_outputs = self.neighborhood_agg_network(neighborhood_scores)
        output = neighborhood_outputs.flatten(start_dim=1)
        return output

In [None]:
import torch

h_i = torch.tensor([[1.0, 2.0, 3.0, 4.0], [10.0, 20.0, 30.0, 40.0], [9.0,8.0,7.0,6.0],[-7.0,-2.3,-5.3,-4.2]])
odd = torch.tensor([False, True,False,True])
new_values = torch.tensor([[5.0, 5.6,5.8,5.9],[0.6,0.8,0.4,0.2]])

index = odd.nonzero().repeat(1,h_i.shape[1])
print(index)
# 3. Perform the scattering
updated_h_i = torch.scatter(h_i, dim=0, index=odd.nonzero().repeat(1,h_i.shape[1]), src=new_values)

print(updated_h_i)



tensor([[1, 1, 1, 1],
        [3, 3, 3, 3]])
tensor([[1.0000, 2.0000, 3.0000, 4.0000],
        [5.0000, 5.6000, 5.8000, 5.9000],
        [9.0000, 8.0000, 7.0000, 6.0000],
        [0.6000, 0.8000, 0.4000, 0.2000]])


In [None]:
import torch
from torch import nn
import torch.nn.functional as F

import random
class BernoulliApproximator(nn.Module):
  def __init__(self, hidden_dim):
    super().__init__()
    self.linear1 = nn.Linear(2, hidden_dim)
    self.linear2 = nn.Linear(hidden_dim, hidden_dim)
    self.linear3 = nn.Linear(hidden_dim, hidden_dim)
    self.linear4 = nn.Linear(hidden_dim, 1)
    self.relu = nn.ReLU()
    self.x=None
    self.out1 = None
    self.out2 = None
    self.out3 = None
    self.out4 = None


  def forward(self, x):
    self.x=x
    out = self.relu(self.linear1(x))
    self.out1 = out
    out = self.relu(self.linear2(out))
    self.out2 = out
    out = self.relu(self.linear3(out))
    self.out3 = out
    out = torch.sigmoid(self.linear4(out))
    self.out4 = out
    return out

class ComparatorNetwork(nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()
        self.layers = torch.nn.Sequential(
            nn.Linear(2, hidden_dim),  # Input dimension changed to 2
            nn.GELU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.GELU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.GELU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.GELU(),
            nn.Linear(hidden_dim, 1)  # Output remains the same
        )

    def forward(self, x):
        return torch.sigmoid(self.layers(x))

model = torch.load('bernoullimodel4.pth')

for param in model.parameters():
    param.data = param.data.double()

class BernoulliSampleFunction(torch.autograd.Function):
    @staticmethod
    def forward(ctx, probabilities):
        result = torch.zeros_like(probabilities)
        out4 = []
        out3 = []
        out2 = []
        out1 = []
        for i in range(probabilities.shape[1]):
          randomNumber =  (torch.rand((probabilities.shape[0]))*16).double()
          result[:,i] = model(torch.concat((probabilities[:,i].unsqueeze(1),randomNumber.unsqueeze(1)),dim=1)).squeeze().double()
          out4.append(model.out4)
          out3.append(model.out3)
          out2.append(model.out2)
          out1.append(model.out1)
        out1 = torch.stack(out1, dim=0)
        out2 = torch.stack(out2, dim=0)
        out3 = torch.stack(out3, dim=0)
        out4 = torch.stack(out4, dim=0)
        ctx.save_for_backward(result,out4,out3,out2,out1) # Store for backward pass
        return result

    @staticmethod
    def backward(ctx, grad_output):
      result, out4, out3, out2, out1= ctx.saved_tensors
      print("result: ",result)
      toReturn = torch.zeros_like(result)
      for i in range(result.shape[1]):
        print("linear 4:" ,model.linear4.in_features, " : ",model.linear4.out_features)
        linear4_copy = nn.Linear(model.linear4.in_features, model.linear4.out_features).to(device)
        linear4_copy.weight.data.copy_(model.linear4.weight.data.clone().detach().double())
        linear4_copy.bias.data.copy_(model.linear4.bias.data.clone().detach().double())
        print("grad_output i : ",out4[i,:,:].shape)
        print(out4[i,:,:])
        delta = out4[i,:,:].double()  # Sigmoid derivative
        print("l4 w: ",linear4_copy.weight.shape)
        delta = torch.mm(delta, linear4_copy.weight.double())

        # Backpropagation through hidden layers (ReLU)
        linear3_copy = nn.Linear(model.linear3.in_features, model.linear3.out_features).to(device)
        linear3_copy.weight.data.copy_(model.linear3.weight.data.clone().detach().double())
        linear3_copy.bias.data.copy_(model.linear3.bias.data.clone().detach().double())
        delta = delta * torch.where(out3[i,:,:] > 0, 1, 0)  # Derivative of ReLU
        delta = torch.mm(delta, linear3_copy.weight.double())


        linear2_copy = nn.Linear(model.linear2.in_features, model.linear2.out_features).to(device)
        linear2_copy.weight.data.copy_(model.linear2.weight.data.clone().detach().double())
        linear2_copy.bias.data.copy_(model.linear2.bias.data.clone().detach().double())
        delta = delta * torch.where(out2[i,:,:] > 0, 1, 0)  # Derivative of ReLU
        delta = torch.mm(delta, linear2_copy.weight.double())


        # Backpropagate through first layer (ReLU)
        linear1_copy = nn.Linear(model.linear1.in_features, model.linear1.out_features).to(device)
        linear1_copy.weight.data.copy_(model.linear1.weight.data.clone().detach().double())
        linear1_copy.bias.data.copy_(model.linear1.bias.data.clone().detach().double())
        delta = delta * torch.where(out1[i,:,:] > 0, 1, 0)  # Derivative of ReLU w.r.t. input x
        delta = torch.mm(delta, linear1_copy.weight.double())
        print("delta: ",delta)
        print(delta.shape)
        toReturn[:,i] = delta[:,0]
      print("toReturn: ",toReturn)
      return toReturn




model2 = torch.load('greater_than_model.pth')

class DBM(nn.Module):
    def __init__(self, nv, hidden_layers, ComparatorNetwork):
        super().__init__()
        self.input_layer = DeepConv(128,1,neighborhoods)
        self.weight = nn.ParameterList([nn.Parameter(torch.Tensor(hidden_layers[0], nv))])
        for i in range(len(hidden_layers)-1):
          self.weight.append(nn.Parameter(torch.Tensor(hidden_layers[i+1], hidden_layers[i])))
        self.bias = nn.ParameterList([nn.Parameter(torch.Tensor(nv))])
        for i in range(len(hidden_layers)):
          self.bias.append(nn.Parameter(torch.Tensor(hidden_layers[i])))

        self.nv = nv
        self.hidden_layers = hidden_layers
        self.L = len(hidden_layers)

        self.output_layer = Decoder(hidden_layers[-1],128,nv)

        self.reset_parameters()
        self.greaterThanApprox = ComparatorNetwork

    def reset_parameters(self):
        for w in self.weight:
            nn.init.orthogonal_(w)

        for b in self.bias:
            nn.init.zeros_(b)

    def forward(self, x):
        v_prob = torch.sigmoid(self.input_layer(x).squeeze())
        N = v_prob.size(0)
        device = x.device
        input = v_prob.clone()
        assert self.L != 1
        print("Gradients of v_prob1:", v_prob)
        energy_pos_samples = self.positive_phase(1, N, v_prob)
        print("energy_pos: ",torch.mean(torch.stack(energy_pos_samples)))
        energy_neg_samples = self.negative_phase(1, N)
        print("energy_neg: ",torch.mean(torch.stack(energy_neg_samples)))
        energy_loss = torch.mean(torch.stack(energy_pos_samples)) - torch.mean(torch.stack(energy_neg_samples))
        for i in range(self.L):
          if i==0:
            input = F.linear(input+self.bias[0], self.weight[i], self.bias[i+1])
          else:
            input = F.linear(input, self.weight[i], self.bias[i+1])
        return energy_loss, self.output_layer(input,True)
    def positive_phase(self, num_samples, N, v_prob):
      energy_pos_samples = []  # Store energy samples
      for _ in range(num_samples):
        v = self.bernoulli_sample(v_prob)
        print("gradinets of v: ",v)
        h = []
        for i in range(self.L):
          h_i = torch.full((N, self.hidden_layers[i]), 0.5, device=device,requires_grad=True)
          print("Gradients of h_i:", h_i.grad)
          h_i = self.bernoulli_sample(h_i)
          print("Gradients of h_i 1:", h_i.grad)
          h.append(h_i)
        v, h = self.local_search(v, h, True)
        print("Gradients of v1:", v.grad)
        print("Gradients of h1:", [h_i.grad for h_i in h])
        v, h = self.gibbs_step(v, h, True)
        print("Gradients of v2:", v.grad)
        print("Gradients of h2:", [h_i.grad for h_i in h])
        energy_pos, v, h = self.coupling(v, h, True)
        print("Gradients of v3:", v.grad)
        print("Gradients of h3:", [h_i.grad for h_i in h])
        energy_pos_samples.append(energy_pos)
      return energy_pos_samples

    def negative_phase(self, num_samples, N):
      energy_neg_samples = []  # Store energy samples

      for _ in range(num_samples):
        v = self.bernoulli_sample(torch.full((N, self.nv), 0.5, device=device, requires_grad=True))
        h = []
        for i in range(self.L):
            probs = torch.full((N, self.hidden_layers[i]), 0.5, device=device, requires_grad=True)
            h_i = self.bernoulli_sample(probs)
            h.append(h_i)
        v, h = self.local_search(v, h)
        v, h = self.gibbs_step(v, h)
        energy_neg, v, h = self.coupling(v, h)
        energy_neg_samples.append(energy_neg)
      return energy_neg_samples


    def local_search(self, v, h, fix_v=False):
        N = v.size(0)
        device= v.device
        _v = v.clone()
        _h = []
        for r in h:
          _h.append(r.clone())
        rand_u = torch.rand(N, device=device)
        print("gradinets of v2: ",v)
        print("Gradients of h2:", [h_i.grad for h_i in h])
        v, h = self.gibbs_step(v, h, fix_v, rand_u=rand_u, T=0)
        print("gradinets of v21: ",v)
        print("Gradients of h21:", [h_i.grad for h_i in h])
        converged = torch.ones(N, dtype=torch.bool, device=device) if fix_v \
                    else self.equals(v, _v)
        for i in range(self.L):
            converged = converged.logical_and(self.equals(h[i], _h[i]))
        while not converged.all():
            not_converged = converged.logical_not()
            _v = v[not_converged]
            _h = [h[i][not_converged] for i in range(self.L)]
            M = _v.size(0)
            print("gradinets of v3: ",v)
            print("Gradients of h3:", [h_i.grad for h_i in h])
            v_, h_ = self.gibbs_step(_v, _h, fix_v,
                                     rand_u=rand_u[not_converged], T=0)
            print("gradinets of v31: ",v)
            print("Gradients of h31:", [h_i.grad for h_i in h])
            if fix_v:
                converged_ = torch.ones(M, dtype=torch.bool, device=device)
            else:
                converged_ = self.equals(v_, _v)
                v = torch.scatter(v,0,not_converged.nonzero().repeat(1,v.shape[1]), v_)
            for i in range(self.L):
                converged_ = converged_.logical_and(self.equals(h_[i], _h[i]))
                h[i] = torch.scatter(h[i], 0, not_converged.nonzero().repeat(1,h_[i].shape[1]), h_[i])
            converged[not_converged] = converged_

        return v, h

    def equals(self, a, b):
      similarity_scores = abs(a-b)
      return torch.all(similarity_scores < 0.4, dim=1)

    def coupling(self, v, h, fix_v=False):
        N = v.size(0)
        device = v.device
        _v = v.clone()
        _h = []
        for r in h:
          _h.append(r.clone())
        v, h = self.mh_step(v, h, fix_v)
        energy = self.energy(v, h)
        if fix_v:
          converged = torch.ones(N, dtype=torch.bool, device=device)
        else:
          converged = self.equals(v, _v)
        for i in range(self.L):
            converged = converged.logical_and(self.equals(h[i], _h[i]))
        while not converged.all():
            not_converged = converged.logical_not()
            _v = v[not_converged]
            _h = [h[i][not_converged] for i in range(self.L)]
            M = _v.size(0)
            rand_v = None if fix_v else torch.rand_like(_v)
            rand_h = [torch.rand_like(_h[i]) for i in range(self.L)]
            rand_u = torch.rand(M, device=device)
            v_, h_ = self.mh_step(_v, _h, fix_v, rand_v, rand_h, rand_u)
            aaa = self.energy(v_, h_)
            bbb = self.energy(_v, _h)
            energy[not_converged] = energy[not_converged] + (aaa - bbb)
            if fix_v:
                converged_ = torch.ones(M, dtype=torch.bool, device=device)
            else:
                converged_ = self.equals(v_, _v)
                v = torch.scatter(v,0,not_converged.nonzero().repeat(1,v.shape[1]), v_)
            for i in range(self.L):
                converged_ = converged_.logical_and(self.equals(h_[i], _h[i]))
                h[i] = torch.scatter(h[i], 0, not_converged.nonzero().repeat(1,h_[i].shape[1]), h_[i])
            converged[not_converged] = converged_
        return energy, v, h

    def energy(self, v, h):
        energy = - torch.sum(v * self.bias[0].unsqueeze(0), 1)
        for i in range(self.L):
            logits = F.linear(v if i==0 else h[i-1], self.weight[i], self.bias[i+1])

            energy = energy - torch.sum(h[i] * logits, 1)
        return energy

    def greaterThan(self,a,b):
      if len(a.shape)>1:
        result = torch.zeros_like(a)
        for i in range(a.shape[1]):
          result[:,i] = self.greaterThanApprox(torch.concat((a[:,i].unsqueeze(1),b[:,i].unsqueeze(1)),dim=1)).squeeze()
        return result
      else:
        return self.greaterThanApprox(torch.concat((a.unsqueeze(1),b.unsqueeze(1)),dim=1)).squeeze()

    def bernoulli_sample(self,probabilities):
      return BernoulliSampleFunction.apply(probabilities)

    def gibbs_step(self, v, h, fix_v=False,
                   rand_v=None, rand_h=None, rand_u=None, rand_z=None, T=1):
        N = v.size(0)
        device = v.device

        v_ = v
        h_ = h

        if rand_u is None:
            rand_u = torch.rand(N, device=device)

        even = rand_u < 0.5
        odd = even.logical_not()
        if even.sum() > 0:
            if not fix_v:
                logits = F.linear(h_[0][even],
                                  self.weight[0].t(), self.bias[0])

                if T == 0:
                    v_ = torch.scatter(v_,0,even.nonzero().repeat(1,v_.shape[1]),self.greaterThan(logits,torch.full_like(logits,0.0,requires_grad=True)))
                else:
                    logits = logits / T

                    if rand_v is None:
                        v_ = torch.scatter(v_,0,even.nonzero().repeat(1,v_.shape[1]),self.bernoulli_sample(torch.sigmoid(logits)))
                    else:
                        v_ = torch.scatter(v_,0,even.nonzero().repeat(1,v_.shape[1]),self.greaterThan(logits.sigmoid(),rand_v[even]))

            for i in range(1, len(h), 2):
                logits = F.linear(h_[i-1][even], self.weight[i], self.bias[i+1])
                if i+1 < len(h):
                    logits = logits + F.linear(h_[i+1][even], self.weight[i+1].t(), None)

                if T == 0:
                    h_[i] =  torch.scatter(h_[i], 0, even.nonzero().repeat(1,h_[i].shape[1]),self.greaterThan(logits,torch.full_like(logits,0.0,requires_grad=True)))
                else:
                    logits = logits / T

                    if rand_h is None:
                        h_[i] = torch.scatter(h_[i], 0, even.nonzero().repeat(1,h_[i].shape[1]),self.bernoulli_sample(torch.sigmoid(logits)))
                    else:
                        h_[i] = torch.scatter(h_[i], 0, even.nonzero().repeat(1,h_[i].shape[1]),self.greaterThan(logits.sigmoid(),rand_h[i][even]))

            for i in range(0, len(h), 2):
                logits = F.linear(v_[even] if i==0 else h_[i-1][even],
                                  self.weight[i], self.bias[i+1])
                if i+1 < len(h):
                    logits = logits + F.linear(h_[i+1][even], self.weight[i+1].t(), None)

                if T == 0:
                    h_[i] = torch.scatter(h_[i], 0, even.nonzero().repeat(1,h_[i].shape[1]),self.greaterThan(logits,torch.full_like(logits,0.0,requires_grad=True)))
                else:
                    logits = logits / T

                    if rand_h is None:
                        h_[i] = torch.scatter(h_[i], 0, even.nonzero().repeat(1,h_[i].shape[1]),self.bernoulli_sample(torch.sigmoid(logits)))
                    else:
                        h_[i] = torch.scatter(h_[i], 0, even.nonzero().repeat(1,h_[i].shape[1]),self.greaterThan(logits.sigmoid(),rand_h[i][even]))

        if odd.sum() > 0:
            for i in range(0, len(h), 2):
                logits = F.linear(v_[odd] if i==0 else h_[i-1][odd], self.weight[i], self.bias[i+1])
                if i+1 < len(h):
                    logits = logits + F.linear(h_[i+1][odd], self.weight[i+1].t(), None)

                if T == 0:
                    h_[i] =  torch.scatter(h_[i], 0, odd.nonzero().repeat(1,h_[i].shape[1]),self.greaterThan(logits,torch.full_like(logits,0.0,requires_grad=True)))
                else:
                    logits = logits / T

                    if rand_h is None:
                        h_[i] = torch.scatter(h_[i], 0, odd.nonzero().repeat(1,h_[i].shape[1]),self.bernoulli_sample(torch.sigmoid(logits)))
                    else:
                        h_[i] = torch.scatter(h_[i], 0, odd.nonzero().repeat(1,h_[i].shape[1]),self.greaterThan(logits.sigmoid(),rand_h[i][odd]))

            if not fix_v:
                logits = F.linear(h_[0][odd], self.weight[0].t(), self.bias[0])

                if T == 0:
                    v_ =  torch.scatter(v_,0,odd.nonzero().repeat(1,v_.shape[1]),self.greaterThan(logits,torch.full_like(logits,0.0,requires_grad=True)))
                else:
                    logits = logits / T

                    if rand_v is None:
                        v_ = torch.scatter(v_,0,odd.nonzero().repeat(1,v_.shape[1]),self.bernoulli_sample(torch.sigmoid(logits)))
                    else:
                        v_ = torch.scatter(v_,0,odd.nonzero().repeat(1,v_.shape[1]),self.greaterThan(logits.sigmoid(),rand_v[odd]))

            for i in range(1, len(h), 2):
                logits = F.linear(h_[i-1][odd], self.weight[i], self.bias[i+1])
                if i+1 < len(h):
                    logits = logits + F.linear(h_[i+1][odd], self.weight[i+1].t(), None)

                if T == 0:
                    h_[i] = torch.scatter(h_[i], 0, odd.nonzero().repeat(1,h_[i].shape[1]), self.greaterThan(logits,torch.full_like(logits,0.0,requires_grad=True)))
                else:
                    logits = logits / T

                    if rand_h is None:
                        h_[i] = torch.scatter(h_[i], 0, odd.nonzero().repeat(1,h_[i].shape[1]), self.bernoulli_sample(torch.sigmoid(logits)))
                    else:
                        h_[i] = torch.scatter(h_[i], 0, odd.nonzero().repeat(1,h_[i].shape[1]), self.greaterThan(logits.sigmoid(),rand_h[i][odd]))
        return v_, h_


    def mh_step(self, v, h, fix_v=False,
                rand_v=None, rand_h=None, rand_u=None):
        N = v.size(0)
        device = v.device
        print("Gradients of v before bernoulli_sample:", v.grad)
        if fix_v:
            v_ = v
        else:
            if rand_v is None:
                v_ = self.bernoulli_sample(torch.sigmoid(torch.full_like(v,0.5,requires_grad=True)))
            else:
                v_ = self.greaterThan(torch.full_like(rand_v,0.5, requires_grad=True),rand_v)
        print("Gradients of v_ after bernoulli_sample/greaterThan:", v_.grad)
        print("Gradients of h before bernoulli_sample/greaterThan:", [h_i.grad for h_i in h])
        if rand_h is None:

            h_ = [torch.full_like(h[i],0.5,requires_grad=True) for i in range(self.L)]
            for i in range(self.L):
              h_[i] = self.bernoulli_sample(torch.sigmoid(h_[i]))
        else:
            h_ = [torch.ones_like(h[i],requires_grad=True) for i in range(self.L)]
            for i in range(self.L):
              h_[i]=self.greaterThan(torch.full_like(rand_h[i],0.5,requires_grad=True),rand_h[i])
        print("Gradients of h_ after bernoulli_sample/greaterThan:", [h_i.grad for h_i in h_])
        log_ratio = self.energy(v, h) - self.energy(v_, h_)

        if rand_u is None:
            accepted = log_ratio.exp().clamp(0, 1).bernoulli().bool()
        else:
            accepted = rand_u < log_ratio.exp()
        if not fix_v:
            v = torch.where(accepted.unsqueeze(1), v_, v)
        h = [torch.where(accepted.unsqueeze(1), h_[i], h[i]) for i in range(self.L)]
        return v, h

In [None]:
from torch.optim import Adam
from torch.optim.lr_scheduler import LambdaLR
import math
import time
from torch.utils.data import DataLoader

torch.autograd.set_detect_anomaly(True)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

def train_dbn(dataloader, dbn_model, num_epochs, learning_rate, device):
    dbn_model.to(device)  # Ensure model is on the correct device
    dbn_model.train()
    optimizer = Adam(dbn_model.parameters(), lr=learning_rate)
    scheduler = LambdaLR(optimizer,
                         lr_lambda=lambda t: 1 / math.sqrt(1 + 0.001))
    for epoch in range(num_epochs):
        total_loss=0
        total_val=0
        i=0
        start_time = time.time()
        for step, (x, x_e) in enumerate(dataloader):
            i+=1
            #optimizer.zero_grad()

            loss, output = dbn_model(x.to(device).float())
            total_loss+=loss.mean().item()
            loss2=F.cross_entropy(output, x_e.to(device))
            loss = loss.mean() + 0.1*loss2

            total_val+=loss2.item()
            end_time = time.time()
            elapsed_time = end_time - start_time
            print("data ",step, " loss: ",loss," validation loss: ",loss2, f" time: {elapsed_time:.4f}")
            loss.backward()
            optimizer.step()
            scheduler.step()
        end_time = time.time()
        elapsed_time = end_time - start_time
        print(f"Epoch {epoch} avg loss: {total_loss/i:.4f} avg val loss: {total_val/i:.4f} time: {elapsed_time:.4f}")

dbn_model = DBM(num_features, [4,2,1],model2).to(device)
#state_dict = torch.load("dbn_modelv2_deepConv.pt")
#dbn_model.load_state_dict(state_dict)
train_dbn(dataloader, dbn_model, num_epochs=1000, learning_rate=0.001, device=device)

Gradients of v_prob1: tensor([[1.0000, 0.5666, 0.5662,  ..., 0.5598, 0.5510, 0.5311],
        [1.0000, 0.5666, 0.5662,  ..., 0.5598, 0.5510, 0.5311],
        [1.0000, 0.5666, 0.5662,  ..., 0.5598, 0.5510, 0.5311],
        ...,
        [1.0000, 0.5666, 0.5662,  ..., 0.5598, 0.5510, 0.5311],
        [1.0000, 0.5666, 0.5662,  ..., 0.5598, 0.5510, 0.5311],
        [1.0000, 0.5666, 0.5662,  ..., 0.5598, 0.5510, 0.5311]],
       grad_fn=<SigmoidBackward0>)
gradinets of v:  tensor([[0.0000e+00, 1.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 1.0000e+00,
         0.0000e+00],
        [3.7021e-23, 1.0000e+00, 1.0000e+00,  ..., 1.0000e+00, 1.0000e+00,
         2.0443e-39],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 1.0000e+00, 0.0000e+00,
         1.0000e+00],
        ...,
        [1.0000e+00, 1.0000e+00, 0.0000e+00,  ..., 1.0000e+00, 1.0000e+00,
         0.0000e+00],
        [0.0000e+00, 1.0000e+00, 9.9664e-01,  ..., 0.0000e+00, 0.0000e+00,
         6.7391e-22],
        [1.0000e+00, 1.0000e+0

  print("Gradients of h_i 1:", h_i.grad)
  print("Gradients of h2:", [h_i.grad for h_i in h])
  print("Gradients of h21:", [h_i.grad for h_i in h])
  print("Gradients of h3:", [h_i.grad for h_i in h])


gradinets of v21:  tensor([[0.0000e+00, 1.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 1.0000e+00,
         0.0000e+00],
        [3.7021e-23, 1.0000e+00, 1.0000e+00,  ..., 1.0000e+00, 1.0000e+00,
         2.0443e-39],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 1.0000e+00, 0.0000e+00,
         1.0000e+00],
        ...,
        [1.0000e+00, 1.0000e+00, 0.0000e+00,  ..., 1.0000e+00, 1.0000e+00,
         0.0000e+00],
        [0.0000e+00, 1.0000e+00, 9.9664e-01,  ..., 0.0000e+00, 0.0000e+00,
         6.7391e-22],
        [1.0000e+00, 1.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 1.0000e+00,
         0.0000e+00]], grad_fn=<BernoulliSampleFunctionBackward>)
Gradients of h21: [None, None, None]
gradinets of v3:  tensor([[0.0000e+00, 1.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 1.0000e+00,
         0.0000e+00],
        [3.7021e-23, 1.0000e+00, 1.0000e+00,  ..., 1.0000e+00, 1.0000e+00,
         2.0443e-39],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 1.0000e+00, 0.0000e+00,
         1.0000e+0

  print("Gradients of h31:", [h_i.grad for h_i in h])
  print("Gradients of v1:", v.grad)
  print("Gradients of h1:", [h_i.grad for h_i in h])
  print("Gradients of v2:", v.grad)
  print("Gradients of h2:", [h_i.grad for h_i in h])
  print("Gradients of v before bernoulli_sample:", v.grad)
  print("Gradients of v_ after bernoulli_sample/greaterThan:", v_.grad)
  print("Gradients of h before bernoulli_sample/greaterThan:", [h_i.grad for h_i in h])
  print("Gradients of h_ after bernoulli_sample/greaterThan:", [h_i.grad for h_i in h_])


Gradients of v before bernoulli_sample: None
Gradients of v_ after bernoulli_sample/greaterThan: None
Gradients of h before bernoulli_sample/greaterThan: [None, None, None]
Gradients of h_ after bernoulli_sample/greaterThan: [None, None, None]
Gradients of v before bernoulli_sample: None
Gradients of v_ after bernoulli_sample/greaterThan: None
Gradients of h before bernoulli_sample/greaterThan: [None, None, None]
Gradients of h_ after bernoulli_sample/greaterThan: [None, None, None]
Gradients of v before bernoulli_sample: None
Gradients of v_ after bernoulli_sample/greaterThan: None
Gradients of h before bernoulli_sample/greaterThan: [None, None, None]
Gradients of h_ after bernoulli_sample/greaterThan: [None, None, None]
Gradients of v before bernoulli_sample: None
Gradients of v_ after bernoulli_sample/greaterThan: None
Gradients of h before bernoulli_sample/greaterThan: [None, None, None]
Gradients of h_ after bernoulli_sample/greaterThan: [None, None, None]
Gradients of v before be

  print("Gradients of v3:", v.grad)
  print("Gradients of h3:", [h_i.grad for h_i in h])


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
        [ 7.5542e-238, -8.1585e-239],
        [ 7.6670e-165, -8.1767e-166],
        [  5.7836e+02,  -1.2097e+02],
        [ 7.4320e-211, -8.0265e-212],
        [  4.9249e+02,  -7.6992e+01],
        [  4.9249e+02,  -7.6992e+01],
        [ 4.8112e-205, -5.1960e-206],
        [ 1.0514e-174, -1.1212e-175],
        [  8.5311e-54,  -9.0163e-55],
        [ 2.2830e-236, -2.4656e-237],
        [  3.6456e+02,  -3.5477e+01]], dtype=torch.float64)
torch.Size([12, 2])
linear 4: 128  :  1
grad_output i :  torch.Size([12, 1])
tensor([[ 9.9627e-01],
        [3.5464e-178],
        [3.3041e-140],
        [1.0342e-258],
        [ 1.4958e-38],
        [2.2141e-115],
        [9.7644e-236],
        [8.7816e-173],
        [ 1.0000e+00],
        [7.1270e-205],
        [ 1.0000e+00],
        [2.6986e-191]], dtype=torch.float64)
l4 w:  torch.Size([1, 128])
delta:  tensor([[  1.1311e+03,  -1.1543e+02],
        [ 1.5732e-175, -1.6777e-176],
        

  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.10/dist-packages/colab_kernel_launcher.py", line 37, in <module>
    ColabKernelApp.launch_instance()
  File "/usr/local/lib/python3.10/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelapp.py", line 619, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.10/dist-packages/tornado/platform/asyncio.py", line 195, in start
    self.asyncio_loop.run_forever()
  File "/usr/lib/python3.10/asyncio/base_events.py", line 603, in run_forever
    self._run_once()
  File "/usr/lib/python3.10/asyncio/base_events.py", line 1909, in _run_once
    handle._run()
  File "/usr/lib/python3.10/asyncio/events.py", line 80, in _run
    self._context.run(s

KeyboardInterrupt: 

In [None]:

for name, param in dbn_model.named_parameters():
    if param.requires_grad:
        print(name, param.grad.data)

import matplotlib.pyplot as plt

def visualize_grad_distribution(model):
    all_grads = []
    for name, param in model.named_parameters():
        if param.requires_grad:
            all_grads.append(param.grad.data.cpu().flatten())  # Flatten for easier histogramming

    plt.hist(all_grads, bins=50)
    plt.xlabel("Gradient Value")
    plt.ylabel("Frequency")
    plt.title("Distribution of Gradients in Model")
    plt.show()

visualize_grad_distribution(dbn_model)  # Call after .backward()



In [None]:
pip install torchviz

Collecting torchviz
  Downloading torchviz-0.0.2.tar.gz (4.9 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->torchviz)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->torchviz)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->torchviz)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->torchviz)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->torchviz)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch->torchviz)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manyl

In [None]:
import torch
from torch import nn
import torch.nn.functional as F
from torch.distributions import Bernoulli, Independent

torch.manual_seed(1234)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

batch_size = 32

L = 3
nh = 10
nv = 5

v = torch.rand(batch_size, nv, requires_grad=True, dtype=torch.float64)
h = [torch.randn(batch_size, nh, requires_grad=True, dtype=torch.float64) for _ in range(L)]


weight = nn.ParameterList([nn.Parameter(torch.randn(nh, nv, requires_grad=True, dtype=torch.float64))])
weight.extend([nn.Parameter(torch.randn(nh, nh, requires_grad=True, dtype=torch.float64)) for _ in range(L-1)])
bias = nn.ParameterList([nn.Parameter(torch.randn(nv, requires_grad=True, dtype=torch.float64))])
bias.extend([nn.Parameter(torch.randn(nh, requires_grad=True, dtype=torch.float64)) for _ in range(L)])
def energy1(v, h):
        energy = - torch.sum(v * bias[0].unsqueeze(0), 1)

        for i in range(L):
            logits = F.linear(v if i==0 else h[i-1],
                              weight[i], bias[i+1])

            energy -= torch.sum(h[i] * logits, 1)

        return energy

def mh_step1(v, h, fix_v=False,
                rand_v=None, rand_h=None, rand_u=None):
        N = v.size(0)
        device = v.device

        if fix_v:
            v_ = v
        else:
            if rand_v is None:
                v_ = torch.empty_like(v).bernoulli_()
            else:
                v_ = (rand_v < 0.5).float()

        if rand_h is None:
            h_ = [torch.empty_like(h[i]).bernoulli_() for i in range(L)]
        else:
            h_ = [(rand_h[i] < 0.5).float() for i in range(L)]

        log_ratio = energy1(v, h) - energy1(v_, h_)

        if rand_u is None:
            accepted = log_ratio.exp().clamp(0, 1).bernoulli().bool()
        else:
            accepted = rand_u < log_ratio.exp()

        if not fix_v:
            v = torch.where(accepted.unsqueeze(1), v_, v)
        h = [torch.where(accepted.unsqueeze(1), h_[i], h[i]) for i in range(L)]

        return v, h


result1 = mh_step1(v,h)


class ComparatorNetwork(nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()
        self.layers = torch.nn.Sequential(
            nn.Linear(2, hidden_dim),  # Input dimension changed to 2
            nn.GELU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.GELU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.GELU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.GELU(),
            nn.Linear(hidden_dim, 1)  # Output remains the same
        )

    def forward(self, x):
        return torch.sigmoid(self.layers(x))

class BernoulliApproximator(nn.Module):
  def __init__(self, hidden_dim):
    super().__init__()
    self.linear1 = nn.Linear(2, hidden_dim)
    self.linear2 = nn.Linear(hidden_dim, hidden_dim)
    self.linear3 = nn.Linear(hidden_dim, hidden_dim)
    self.linear4 = nn.Linear(hidden_dim, 1)
    self.relu = nn.ReLU()
    self.x=None
    self.out1 = None
    self.out2 = None
    self.out3 = None
    self.out4 = None


  def forward(self, x):
    self.x=x
    out = self.relu(self.linear1(x))
    self.out1 = out
    out = self.relu(self.linear2(out))
    self.out2 = out
    out = self.relu(self.linear3(out))
    self.out3 = out
    out = torch.sigmoid(self.linear4(out))
    self.out4 = out
    return out

model = torch.load('bernoullimodel8.pth')

model2 = torch.load('greater_than2.pth')

for param in model.parameters():
    param.data = param.data.double()  # Convert to float64

for param in model2.parameters():
    param.data = param.data.double()



def energy(v, h):
        energy = - torch.sum(v * bias[0].unsqueeze(0), 1)
        for i in range(L):
            logits = F.linear(v if i==0 else h[i-1],
                              weight[i], bias[i+1])
            energy = energy - torch.sum(h[i] * logits, 1)
        return energy

def greaterThan(a,b):
      if len(a.shape)>1:
        result = torch.zeros_like(a)
        for i in range(a.shape[1]):
          result[:,i] = model2(torch.concat((a[:,i].unsqueeze(1),b[:,i].unsqueeze(1)),dim=1)).squeeze()
        return result
      else:
        return model2(torch.concat((a.unsqueeze(1),b.unsqueeze(1)),dim=1)).squeeze()




print("model test: ",model(torch.tensor([0.3,19]).double()))
print("model test: ",model(torch.tensor([0.3,4]).double()))
print("model test: ",model(torch.tensor([0.9,18]).double()))
print("model test: ",model(torch.tensor([0.9,10]).double()))
class BernoulliSampleFunction(torch.autograd.Function):
    @staticmethod
    def forward(ctx, probabilities, model):
        result = torch.zeros_like(probabilities)
        out4 = []
        out3 = []
        out2 = []
        out1 = []
        for i in range(probabilities.shape[1]):
          randomNumber =  (torch.randint(0,20,(probabilities.shape[0],1))).double()
         # print("rn: ",randomNumber)
        #  print("rn: ",randomNumber.shape)
       #   print("probabilities[:,i]: ",probabilities[:,i])
          input = torch.concat((probabilities[:,i].unsqueeze(1),randomNumber),dim=1)
          result[:,i] = model(input).squeeze().double()
       #   print("result i : ",result[:,i])
          out4.append(model.out4)
          out3.append(model.out3)
          out2.append(model.out2)
          out1.append(model.out1)
        out1 = torch.stack(out1, dim=0)
        out2 = torch.stack(out2, dim=0)
        out3 = torch.stack(out3, dim=0)
        out4 = torch.stack(out4, dim=0)
        ctx.save_for_backward(result,out4,out3,out2,out1) # Store for backward pass
      #  print("result: ", result)
        return result.squeeze()

    @staticmethod
    def backward(ctx, grad_output):
      result, out4, out3, out2, out1= ctx.saved_tensors
   #   print("result: ",result)
      toReturn = torch.zeros_like(result)
      for i in range(result.shape[1]):
    #    print("linear 4:" ,model.linear4.in_features, " : ",model.linear4.out_features)
        linear4_copy = nn.Linear(model.linear4.in_features, model.linear4.out_features).to(device)
        linear4_copy.weight.data.copy_(model.linear4.weight.data.double())
        linear4_copy.bias.data.copy_(model.linear4.bias.data.double())
       # print("grad_output i : ",grad_output[:,i])
   #     print(out4[i,:,:])
         #print("grad_output: ",grad_output[:,i].shape)
      #  print("out4: ",out4[i,:,:])
        delta = (grad_output[:,i].unsqueeze(1) * (out4[i,:,:])).double()  # Sigmoid derivative
     #   print("delta: ",delta.shape)
    #    print("delta initial: ",delta)
  #      print("l4 w: ",linear4_copy.weight.shape)
        delta = torch.mm(delta,linear4_copy.weight.double())
     #   print("delta 4: ",delta)
     #   print(delta.shape)

        linear3_copy = nn.Linear(model.linear3.in_features, model.linear3.out_features).to(device)
        linear3_copy.weight.data.copy_(model.linear3.weight.data.double())
        linear3_copy.bias.data.copy_(model.linear3.bias.data.double())
        delta = delta * torch.where(out3[i,:,:] > 0, 1, 0)  # Derivative of ReLU
        delta = torch.mm(delta, linear3_copy.weight.double())
    #    print("delta3: ",delta)
       # print(delta.shape)

        linear2_copy = nn.Linear(model.linear2.in_features, model.linear2.out_features).to(device)
        linear2_copy.weight.data.copy_(model.linear2.weight.data.double())
        linear2_copy.bias.data.copy_(model.linear2.bias.data.double())
        delta = delta * torch.where(out2[i,:,:] > 0, 1, 0)  # Derivative of ReLU
        delta = torch.mm(delta, linear2_copy.weight.double())
      #  print("delta2: ",delta)
     #   print(delta.shape)

        # Backpropagate through first layer (ReLU)
        linear1_copy = nn.Linear(model.linear1.in_features, model.linear1.out_features).to(device)
        linear1_copy.weight.data.copy_(model.linear1.weight.data.double())
        linear1_copy.bias.data.copy_(model.linear1.bias.data.double())
        delta = delta * torch.where(out1[i,:,:] > 0, 1, 0)  # Derivative of ReLU w.r.t. input x
        delta = torch.mm(delta, linear1_copy.weight.double())
      #  print("delta final: ",delta)
      #  print(delta.shape)
        toReturn[:,i] = delta[:,0]
   #   print("toReturn: ",toReturn)
      return toReturn*1000, None

def bernoulli_sample(probabilities):
      #print("probs: ",probabilities)
      result = BernoulliSampleFunction.apply(probabilities, model)
    #  print("result: ",result)
      return torch.tensor(result)

#test = torch.autograd.gradcheck(energy, (v,h[0],h[1],h[2]),create_graph=True)
#print("AUTOGRAD-1: ",test)
#test = torch.autograd.gradcheck(greaterThan, (h[0],h[1]),create_graph=True)
#print("AUTOGRAD-2: ",test)
#test = torch.autograd.gradcheck(bernoulli_sample, (v))
#print("AUTOGRAD-3: ",test)
from torchviz import make_dot



from torch.autograd import Function

class MHStepFunction(Function):
    @staticmethod
    def forward(ctx, v, fix_v, rand_v, rand_h, rand_u, *h):
        N = v.size(0)
        device = v.device
        L = len(h)

        if fix_v:
            v_ = v
        else:
            if rand_v is None:
                input = torch.full_like(v, 0.5, requires_grad=True)
                v_ = bernoulli_sample(input)  # Replace with your implementation
            else:
                v_ = greaterThan(torch.full_like(rand_v, 0.5, requires_grad=True), rand_v)

        if rand_h is None:
            h_ = [torch.full_like(h[i], 0.5, requires_grad=True) for i in range(L)]
            for i in range(L):
                h_[i] = bernoulli_sample(h_[i])
        else:
            h_ = [torch.ones_like(h[i], requires_grad=True) for i in range(L)]
            for i in range(L):
                h_[i] = greaterThan(torch.full_like(rand_h[i], 0.5, requires_grad=True), rand_h[i])

        log_ratio = energy(v, h) - energy(v_, h_)

        if rand_u is None:
            accepted = log_ratio.exp().clamp(0, 1).bernoulli().bool()
        else:
            accepted = rand_u < log_ratio.exp()

        if not fix_v:
            v = torch.where(accepted.unsqueeze(1), v_, v)
        h = [torch.where(accepted.unsqueeze(1), h_[i], h[i]) for i in range(L)]

        # For backward, we need accepted and a mask indicating non-updated values
        ctx.save_for_backward(accepted)
        ctx.mark_non_differentiable(accepted)
        return v, *h

    @staticmethod
    def backward(ctx, grad_v, *grad_h):
        # Load the mask indicating non-updated values
        accepted = ctx.saved_tensors[0]
        grad_h = list(grad_h)
        # Zero out gradients where accepted is True (or not_accepted is False)
        grad_v = torch.where(accepted.unsqueeze(1),0,grad_v)
        for i in range(len(grad_h)):
            grad_h[i] = torch.where(accepted.unsqueeze(1),0,grad_h[i])

        print("v: ",grad_v)
        [print(h) for h in grad_h]
        return (grad_v, None, None, None, None, *grad_h,)  # Gradients for other inputs are None


def mh_step2(v, *h, fix_v=False, rand_v=None, rand_h=None, rand_u=None):
      return MHStepFunction.apply(v, fix_v, rand_v, rand_h, rand_u,*h)

result2 = MHStepFunction.apply(v,None,None,None,None,h[0],h[1],h[2])
y_hat = result2[1]
# Generate the graph
graph = make_dot(y_hat,show_attrs=True, show_saved=True)

# Render and save the graph (adjust the filename if needed)
graph.render("h_mh_step2_graph2", format="png")

test = torch.autograd.gradcheck(mh_step2, (v,h[0],h[1],h[2]))


#test = torch.autograd.gradcheck(mh_step2, (v,h[0],h[1],h[2]))
#print("AUTOGRAD1: ",test)
import matplotlib.pyplot as plt
import torch

def plot_histograms_and_stats(results, title):
    if len(results)==3:
      _,v,h = results
    else:
      v, *h = results

    # Flatten all the tensors to get a single tensor for easier handling
    all_tensors = v
    for h1 in h:
      for h2 in h1:
        all_tensors = torch.cat((all_tensors.flatten(),h2.flatten()),dim=0)
    all_tensors = all_tensors.flatten().detach()

    # Calculate mean and standard deviation
    mean = torch.mean(all_tensors).item()
    std = torch.std(all_tensors).item()

    # Creating histogram
    plt.figure(figsize=(10, 6))
    plt.hist(all_tensors.numpy(), bins=50, alpha=0.75, color='blue')
    plt.title(f'Histogram of all values in {title}\nMean: {mean:.4f}, Std: {std:.4f}')
    plt.xlabel('Value')
    plt.ylabel('Frequency')
    plt.grid(True)
    plt.show()

    return mean, std

# Assuming result1 and result2 have been generated correctly by the provided code snippet
mean1, std1 = plot_histograms_and_stats(result1, 'result1')
mean2, std2 = plot_histograms_and_stats(result2, 'result2')

print(f"Result1 - Mean: {mean1:.4f}, Std: {std1:.4f}")
print(f"Result2 - Mean: {mean2:.4f}, Std: {std2:.4f}")





def mh_step2(v, *h, fix_v=False, rand_v=None, rand_h=None, rand_u=None):
      return MHStepFunction.apply(v, fix_v, rand_v, rand_h, rand_u,*h)


def gibbs_step1(v, h, fix_v=False,
                   rand_v=None, rand_h=None, rand_u=None, rand_z=None, T=1):
        N = v.size(0)
        device = v.device

        v_, h_ = (v, h)

        if rand_u is None:
            rand_u = torch.rand(N, device=device)

        even = rand_u < 0.5
        odd = even.logical_not()

        if even.sum() > 0:
            if not fix_v:
                logits = F.linear(h_[0][even],
                                  weight[0].t(), bias[0])

                if T == 0:
                    v_[even] = (logits >= 0).float()
                else:
                    logits = logits/ T

                    if rand_v is None:
                        v_[even] = Independent(Bernoulli(logits=logits), 1).sample()
                    else:
                        v_[even] = (rand_v[even] < logits.sigmoid()).float()

            for i in range(1, len(h), 2):
                logits = F.linear(h_[i-1][even],
                                  weight[i], bias[i+1])
                if i+1 < len(h):
                    logits = logits + F.linear(h_[i+1][even],
                                       weight[i+1].t(), None)

                if T == 0:
                    h_[i][even] = (logits >= 0).float()
                else:
                    logits = logits/T

                    if rand_h is None:
                        h_[i][even] = Independent(Bernoulli(logits=logits), 1).sample()
                    else:
                        h_[i][even] = (rand_h[i][even] < logits.sigmoid()).float()

            for i in range(0, len(h), 2):
                logits = F.linear(v_[even] if i==0 else h_[i-1][even],
                                  weight[i], bias[i+1])
                if i+1 < len(h):
                    logits += F.linear(h_[i+1][even],
                                       weight[i+1].t(), None)

                if T == 0:
                    h_[i][even] = (logits >= 0).float()
                else:
                    logits /= T

                    if rand_h is None:
                        h_[i][even] = Independent(Bernoulli(logits=logits), 1).sample()
                    else:
                        h_[i][even] = (rand_h[i][even] < logits.sigmoid()).float()

        if odd.sum() > 0:
            for i in range(0, len(h), 2):
                logits = F.linear(v_[odd] if i==0 else h_[i-1][odd],
                                  weight[i],bias[i+1])
                if i+1 < len(h):
                    logits += F.linear(h_[i+1][odd],
                                       weight[i+1].t(), None)

                if T == 0:
                    h_[i][odd] = (logits >= 0).float()
                else:
                    logits = logits / T

                    if rand_h is None:
                        h_[i][odd] = Independent(Bernoulli(logits=logits), 1).sample()
                    else:
                        h_[i][odd] = (rand_h[i][odd] < logits.sigmoid()).float()

            if not fix_v:
                logits = F.linear(h_[0][odd],
                                  weight[0].t(), bias[0])

                if T == 0:
                    v_[odd] = (logits >= 0).float()
                else:
                    logits = logits / T

                    if rand_v is None:
                        v_[odd] = Independent(Bernoulli(logits=logits), 1).sample()
                    else:
                        v_[odd] = (rand_v[odd] < logits.sigmoid()).float()

            for i in range(1, len(h), 2):
                logits = F.linear(h_[i-1][odd],
                                  weight[i], bias[i+1])
                if i+1 < len(h):
                    logits += F.linear(h_[i+1][odd],
                                       weight[i+1].t(), None)

                if T == 0:
                    h_[i][odd] = (logits >= 0).float()
                else:
                    logits = logits / T

                    if rand_h is None:
                        h_[i][odd] = Independent(Bernoulli(logits=logits), 1).sample()
                    else:
                        h_[i][odd] = (rand_h[i][odd] < logits.sigmoid()).float()

        return v_, h_


def bernoulli_sample(probabilities):
      result = BernoulliSampleFunction.apply(probabilities, model)
      return torch.tensor(result)



from torch.autograd import Function

class GibbsStepFunction(Function):
    @staticmethod
    def forward(ctx, v,fix_v, rand_v, rand_h, rand_u, rand_z, T=1, L=L, *all_params):
        N = v.size(0)
        device = v.device
        h = list(h)
        h = all_params[:L]  # h is a list of tensors
        weight = all_params[L:2*L]  # weight is a list of tensors (from ParameterList)
        bias = all_params[2*L:]
        if rand_u is None:
            rand_u = torch.rand(N, device=device)

        even = rand_u < 0.5
        odd = even.logical_not()
        if even.sum() > 0:
          #  print("TEST",)
            if not fix_v:
                logits = F.linear(h[0][even],
                                  weight[0].t(), bias[0])
            #    print("v_1: ",v_.grad)
                if T == 0:
                    v = torch.scatter(v,0,even.nonzero().repeat(1,v.shape[1]),greaterThan(logits,torch.full_like(logits,-0.05)))
                 #   print("v_1: ",v_.grad)
                else:
                    logits = logits / T

                    if rand_v is None:
                        v =  torch.scatter(v,0,even.nonzero().repeat(1,v.shape[1]),bernoulli_sample(torch.sigmoid(logits)))
                      #  print("v_2: ",v.grad)
                    else:
                        v = torch.scatter(v,0,even.nonzero().repeat(1,v.shape[1]),greaterThan(logits.sigmoid(),rand_v[even]))

            for i in range(1, len(h), 2):
              #  print("TEST2")
                logits = F.linear(h[i-1][even], weight[i], bias[i+1])
                if i+1 < len(h):
                    logits = logits + F.linear(h[i+1][even], weight[i+1].t(), None)

                if T == 0:
                    h[i] =  torch.scatter(h[i], 0, even.nonzero().repeat(1,h[i].shape[1]),greaterThan(logits,torch.full_like(logits,-0.05)))
                else:
                    logits = logits / T
                    if rand_h is None:
                        h[i] = torch.scatter(h[i], 0, even.nonzero().repeat(1,h[i].shape[1]),bernoulli_sample(torch.sigmoid(logits)))
                    else:
                        h[i] = torch.scatter(h[i], 0, even.nonzero().repeat(1,h[i].shape[1]),greaterThan(logits.sigmoid(),rand_h[i][even]))

            for i in range(0, len(h), 2):
               # print("TEST3")
                logits = F.linear(v[even] if i==0 else h[i-1][even],
                                  weight[i], bias[i+1])
                if i+1 < len(h):
                    logits = logits + F.linear(h[i+1][even], weight[i+1].t(), None)

                if T == 0:
                    h[i] = torch.scatter(h[i], 0, even.nonzero().repeat(1,h[i].shape[1]),greaterThan(logits,torch.full_like(logits,-0.05)))
                else:
                    logits = logits / T

                    if rand_h is None:
                        h[i] = torch.scatter(h[i], 0, even.nonzero().repeat(1,h[i].shape[1]),bernoulli_sample(torch.sigmoid(logits)))
                    else:
                        h[i] = torch.scatter(h[i], 0, even.nonzero().repeat(1,h[i].shape[1]),greaterThan(logits.sigmoid(),rand_h[i][even]))

        if odd.sum() > 0:
            for i in range(0, len(h), 2):
                logits = F.linear(v[odd] if i==0 else h[i-1][odd], weight[i], bias[i+1])
                if i+1 < len(h):
                    logits = logits + F.linear(h[i+1][odd], weight[i+1].t(), None)

                if T == 0:
                    h[i] =  torch.scatter(h[i], 0, odd.nonzero().repeat(1,h[i].shape[1]),greaterThan(logits,torch.full_like(logits,-0.05)))
                else:
                    logits = logits / T

                    if rand_h is None:
                        h[i] = torch.scatter(h[i], 0, odd.nonzero().repeat(1,h[i].shape[1]),bernoulli_sample(torch.sigmoid(logits)))
                    else:
                        h[i] = torch.scatter(h[i], 0, odd.nonzero().repeat(1,h[i].shape[1]),greaterThan(logits.sigmoid(),rand_h[i][odd]))

            if not fix_v:
                logits = F.linear(h[0][odd], weight[0].t(), bias[0])

                if T == 0:
                    v =  torch.scatter(v,0,odd.nonzero().repeat(1,v.shape[1]),greaterThan(logits,torch.full_like(logits,-0.05)))
                else:
                    logits = logits / T

                    if rand_v is None:
                        v = torch.scatter(v,0,odd.nonzero().repeat(1,v.shape[1]),bernoulli_sample(torch.sigmoid(logits)))
                    else:
                        v = torch.scatter(v,0,odd.nonzero().repeat(1,v.shape[1]),greaterThan(logits.sigmoid(),rand_v[odd]))

            for i in range(1, len(h), 2):
                logits = F.linear(h[i-1][odd], weight[i], bias[i+1])
                if i+1 < len(h):
                    logits = logits + F.linear(h[i+1][odd], weight[i+1].t(), None)

                if T == 0:
                    h[i] = torch.scatter(h[i], 0, odd.nonzero().repeat(1,h[i].shape[1]), greaterThan(logits,torch.full_like(logits,-0.05)))
                else:
                    logits = logits / T

                    if rand_h is None:
                        h[i] = torch.scatter(h[i], 0, odd.nonzero().repeat(1,h[i].shape[1]), bernoulli_sample(torch.sigmoid(logits)))
                    else:
                        h[i] = torch.scatter(h[i], 0, odd.nonzero().repeat(1,h[i].shape[1]), greaterThan(logits.sigmoid(),rand_h[i][odd]))
        ctx.save_for_backward(even, odd, L, *all_params)
        return v, *h


    @staticmethod
    def backward(ctx, grad_v, *grad_h):
        even, odd, L, *all_params = ctx.saved_tensors
        even_v



def mh_step2(v, *h, fix_v=False, rand_v=None, rand_h=None, rand_u=None, rand_z=None, T=1):
      return MHStepFunction.apply(v, fix_v, rand_v, rand_h, rand_u,rand_z,T,*h)


def gibbs_step2(v, *h, fix_v=False,
                   rand_v=None, rand_h=None, rand_u=None, rand_z=None, T=1):
        N = v.size(0)
        device = v.device
        h = list(h)
        if rand_u is None:
            rand_u = torch.rand(N, device=device)

        even = rand_u < 0.5
        odd = even.logical_not()
        if even.sum() > 0:
          #  print("TEST",)
            if not fix_v:
                logits = F.linear(h[0][even],
                                  weight[0].t(), bias[0])
            #    print("v_1: ",v_.grad)
                if T == 0:
                    v = torch.scatter(v,0,even.nonzero().repeat(1,v.shape[1]),greaterThan(logits,torch.full_like(logits,-0.05)))
                 #   print("v_1: ",v_.grad)
                else:
                    logits = logits / T

                    if rand_v is None:
                        v =  torch.scatter(v,0,even.nonzero().repeat(1,v.shape[1]),bernoulli_sample(torch.sigmoid(logits)))
                      #  print("v_2: ",v.grad)
                    else:
                        v = torch.scatter(v,0,even.nonzero().repeat(1,v.shape[1]),greaterThan(logits.sigmoid(),rand_v[even]))

            for i in range(1, len(h), 2):
              #  print("TEST2")
                logits = F.linear(h[i-1][even], weight[i], bias[i+1])
                if i+1 < len(h):
                    logits = logits + F.linear(h[i+1][even], weight[i+1].t(), None)

                if T == 0:
                    h[i] =  torch.scatter(h[i], 0, even.nonzero().repeat(1,h[i].shape[1]),greaterThan(logits,torch.full_like(logits,-0.05)))
                else:
                    logits = logits / T
                    if rand_h is None:
                        h[i] = torch.scatter(h[i], 0, even.nonzero().repeat(1,h[i].shape[1]),bernoulli_sample(torch.sigmoid(logits)))
                    else:
                        h[i] = torch.scatter(h[i], 0, even.nonzero().repeat(1,h[i].shape[1]),greaterThan(logits.sigmoid(),rand_h[i][even]))

            for i in range(0, len(h), 2):
               # print("TEST3")
                logits = F.linear(v[even] if i==0 else h[i-1][even],
                                  weight[i], bias[i+1])
                if i+1 < len(h):
                    logits = logits + F.linear(h[i+1][even], weight[i+1].t(), None)

                if T == 0:
                    h[i] = torch.scatter(h[i], 0, even.nonzero().repeat(1,h[i].shape[1]),greaterThan(logits,torch.full_like(logits,-0.05)))
                else:
                    logits = logits / T

                    if rand_h is None:
                        h[i] = torch.scatter(h[i], 0, even.nonzero().repeat(1,h[i].shape[1]),bernoulli_sample(torch.sigmoid(logits)))
                    else:
                        h[i] = torch.scatter(h[i], 0, even.nonzero().repeat(1,h[i].shape[1]),greaterThan(logits.sigmoid(),rand_h[i][even]))

        if odd.sum() > 0:
            for i in range(0, len(h), 2):
                logits = F.linear(v[odd] if i==0 else h[i-1][odd], weight[i], bias[i+1])
                if i+1 < len(h):
                    logits = logits + F.linear(h[i+1][odd], weight[i+1].t(), None)

                if T == 0:
                    h[i] =  torch.scatter(h[i], 0, odd.nonzero().repeat(1,h[i].shape[1]),greaterThan(logits,torch.full_like(logits,-0.05)))
                else:
                    logits = logits / T

                    if rand_h is None:
                        h[i] = torch.scatter(h[i], 0, odd.nonzero().repeat(1,h[i].shape[1]),bernoulli_sample(torch.sigmoid(logits)))
                    else:
                        h[i] = torch.scatter(h[i], 0, odd.nonzero().repeat(1,h[i].shape[1]),greaterThan(logits.sigmoid(),rand_h[i][odd]))

            if not fix_v:
                logits = F.linear(h[0][odd], weight[0].t(), bias[0])

                if T == 0:
                    v =  torch.scatter(v,0,odd.nonzero().repeat(1,v.shape[1]),greaterThan(logits,torch.full_like(logits,-0.05)))
                else:
                    logits = logits / T

                    if rand_v is None:
                        v = torch.scatter(v,0,odd.nonzero().repeat(1,v.shape[1]),bernoulli_sample(torch.sigmoid(logits)))
                    else:
                        v = torch.scatter(v,0,odd.nonzero().repeat(1,v.shape[1]),greaterThan(logits.sigmoid(),rand_v[odd]))

            for i in range(1, len(h), 2):
                logits = F.linear(h[i-1][odd], weight[i], bias[i+1])
                if i+1 < len(h):
                    logits = logits + F.linear(h[i+1][odd], weight[i+1].t(), None)

                if T == 0:
                    h[i] = torch.scatter(h[i], 0, odd.nonzero().repeat(1,h[i].shape[1]), greaterThan(logits,torch.full_like(logits,-0.05)))
                else:
                    logits = logits / T

                    if rand_h is None:
                        h[i] = torch.scatter(h[i], 0, odd.nonzero().repeat(1,h[i].shape[1]), bernoulli_sample(torch.sigmoid(logits)))
                    else:
                        h[i] = torch.scatter(h[i], 0, odd.nonzero().repeat(1,h[i].shape[1]), greaterThan(logits.sigmoid(),rand_h[i][odd]))
        return v, *h

result4 = gibbs_step2(v,h[0],h[1],h[2])

y_hat = result4[0]
# Generate the graph
params_dict = {}
for i, param in enumerate(weight.parameters()):
    params_dict['weight_' + str(i)] = param
for i, param in enumerate(bias.parameters()):
    params_dict['bias_' + str(i)] = param

graph = make_dot(y_hat, params=params_dict, show_attrs=True, show_saved=True)
# Render and save the graph (adjust the filename if needed)
graph.render("gibbs_step2_graph", format="png")


test = torch.autograd.gradcheck(gibbs_step2, (v,h[0],h[1],h[2]))
print("AUTOGRAD2: ",test)

#mean3, std3 = plot_histograms_and_stats(result3, 'result3')
mean4, std4 = plot_histograms_and_stats(result4, 'result4')

#print(f"Result3 - Mean: {mean3:.4f}, Std: {std3:.4f}")
print(f"Result4 - Mean: {mean4:.4f}, Std: {std4:.4f}")

def coupling1(v, h, fix_v=False):
        N = v.size(0)
        device = v.device
        _v, _h = (v, h)

        v, h = mh_step1(v, h, fix_v)
        energy = energy1(v, h)

        converged = torch.ones(N, dtype=torch.bool, device=device) if fix_v \
                    else torch.all(v == _v, 1)
        for i in range(L):
            converged = converged.logical_and(torch.all(h[i] == _h[i], 1))

        while not converged.all():
            not_converged = converged.logical_not()
            _v = v[not_converged]
            _h = [h[i][not_converged] for i in range(L)]
            M = _v.size(0)

            rand_v = None if fix_v else torch.rand_like(_v)
            rand_h = [torch.rand_like(_h[i]) for i in range(L)]
            rand_u = torch.rand(M, device=device)

            v_, h_ = mh_step1(_v, _h, fix_v, rand_v, rand_h, rand_u)
            energy[not_converged] += energy1(v_, h_) - energy1(_v, _h)

            if fix_v:
                converged_ = torch.ones(M, dtype=torch.bool, device=device)
            else:
                converged_ = torch.all(v_ == _v, 1)
                v[not_converged] = v_

            for i in range(L):
                converged_ = converged_.logical_and(torch.all(h_[i] == _h[i], 1))
                h[i][not_converged] = h_[i]

            converged[not_converged] = converged_

        return energy, v, h

result5 = coupling1(v,h)

def equals(a, b):
      similarity_scores = abs(a-b)
      return torch.all(similarity_scores < 0.4, dim=1)

def coupling2(v, h, fix_v=False):
        N = v.size(0)
        device = v.device
        _v = v.clone()
        _h = []
        for r in h:
          _h.append(r.clone())
        v, h = mh_step2(v, h, fix_v)
        energy = energy1(v, h)
        if fix_v:
          converged = torch.ones(N, dtype=torch.bool, device=device)
        else:
          converged = equals(v, _v)
        for i in range(L):
            converged = converged.logical_and(equals(h[i], _h[i]))
        while not converged.all():
            not_converged = converged.logical_not()
            _v = v[not_converged]
            _h = [h[i][not_converged] for i in range(L)]
            M = _v.size(0)
            rand_v = None if fix_v else torch.rand_like(_v)
            rand_h = [torch.rand_like(_h[i]) for i in range(L)]
            rand_u = torch.rand(M, device=device)
            v_, h_ = mh_step2(_v, _h, fix_v, rand_v, rand_h, rand_u)
            aaa = energy(v_, h_)
            bbb = energy(_v, _h)
            energy[not_converged] = energy[not_converged] + (aaa - bbb)
            if fix_v:
                converged_ = torch.ones(M, dtype=torch.bool, device=device)
            else:
                converged_ = equals(v_, _v)
                v = torch.scatter(v,0,not_converged.nonzero().repeat(1,v.shape[1]), v_)
            for i in range(L):
                converged_ = converged_.logical_and(equals(h_[i], _h[i]))
                h[i] = torch.scatter(h[i], 0, not_converged.nonzero().repeat(1,h_[i].shape[1]), h_[i])
            converged[not_converged] = converged_
        return energy, v, *h

result6 = coupling2(v,h[0],h[1],h[2])

test = torch.autograd.gradcheck(coupling2, (v,h[0],h[1],h[2]))
print("AUTOGRAD3: ",test)

mean5, std5 = plot_histograms_and_stats(result5, 'result5')
mean6, std6 = plot_histograms_and_stats(result6, 'result6')

print(f"Result5 - Mean: {mean5:.4f}, Std: {std5:.4f}")
print(f"Result6 - Mean: {mean6:.4f}, Std: {std6:.4f}")

def local_search1(v, h, fix_v=False):
        N = v.size(0)
        device= v.device

        rand_u = torch.rand(N, device=device)
        _v, _h = (v, h)
        v, h = gibbs_step1(v, h, fix_v, rand_u=rand_u, T=0)

        converged = torch.ones(N, dtype=torch.bool, device=device) if fix_v \
                    else torch.all(v == _v, 1)
        for i in range(L):
            converged = converged.logical_and(torch.all(h[i] == _h[i], 1))

        while not converged.all():
            not_converged = converged.logical_not()
            _v = v[not_converged]
            _h = [h[i][not_converged] for i in range(L)]
            M = _v.size(0)

            v_, h_ = gibbs_step1(_v, _h, fix_v,
                                     rand_u=rand_u[not_converged], T=0)

            if fix_v:
                converged_ = torch.ones(M, dtype=torch.bool, device=device)
            else:
                converged_ = torch.all(v_ == _v, 1)
                v[not_converged] = v_

            for i in range(L):
                converged_ = converged_.logical_and(torch.all(h_[i] == _h[i], 1))
                h[i][not_converged] = h_[i]

            converged[not_converged] = converged_

        return v, h

result7 = local_search1(v,h)

def local_search2(v, h, fix_v=False):
        N = v.size(0)
        device= v.device
        _v = v.clone()
        _h = []
        for r in h:
          _h.append(r.clone())
        rand_u = torch.rand(N, device=device)
        v, h = gibbs_step2(v, h, fix_v, rand_u=rand_u, T=0)
        converged = torch.ones(N, dtype=torch.bool, device=device) if fix_v \
                    else equals(v, _v)
        for i in range(L):
            converged = converged.logical_and(equals(h[i], _h[i]))
        while not converged.all():
            not_converged = converged.logical_not()
            _v = v[not_converged]
            _h = [h[i][not_converged] for i in range(L)]
            M = _v.size(0)
            v_, h_ = gibbs_step2(_v, _h, fix_v,
                                     rand_u=rand_u[not_converged], T=0)
            if fix_v:
                converged_ = torch.ones(M, dtype=torch.bool, device=device)
            else:
                converged_ = equals(v_, _v)
                v = torch.scatter(v,0,not_converged.nonzero().repeat(1,v.shape[1]), v_)
            for i in range(L):
                converged_ = converged_.logical_and(equals(h_[i], _h[i]))
                h[i] = torch.scatter(h[i], 0, not_converged.nonzero().repeat(1,h_[i].shape[1]), h_[i])
            converged[not_converged] = converged_

        return v, *h

result8 = local_search2(v,h)

test = torch.autograd.gradcheck(local_search2, (v,h[0],h[1],h[2]))
print("AUTOGRAD4: ",test)
mean7, std7 = plot_histograms_and_stats(result7, 'result7')
mean8, std8 = plot_histograms_and_stats(result8, 'result8')

print(f"Result7 - Mean: {mean7:.4f}, Std: {std7:.4f}")
print(f"Result8 - Mean: {mean8:.4f}, Std: {std8:.4f}")

model test:  tensor([1.2562e-142], dtype=torch.float64, grad_fn=<SigmoidBackward0>)
model test:  tensor([1.], dtype=torch.float64, grad_fn=<SigmoidBackward0>)
model test:  tensor([6.3669e-35], dtype=torch.float64, grad_fn=<SigmoidBackward0>)
model test:  tensor([1.], dtype=torch.float64, grad_fn=<SigmoidBackward0>)


  return torch.tensor(result)
  return torch.tensor(result)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]], dtype=torch.float64)
v:  tensor([[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],

GradcheckError: Jacobian mismatch for output 0 with respect to input 0,
numerical:tensor([[ 0.0000e+00,  0.0000e+00, -5.0000e+05,  ..., -3.4879e+05,
         -4.2973e+05, -8.4041e+03],
        [-5.0000e+05,  8.1936e-39, -5.0000e+05,  ..., -1.5121e+05,
          4.2973e+05,  8.4041e+03],
        [-5.0000e+05, -2.6432e+05,  5.0000e+05,  ...,  3.4879e+05,
         -7.0270e+04, -4.9160e+05],
        ...,
        [ 0.0000e+00,  5.0000e+05,  5.0000e+05,  ...,  1.5121e+05,
         -4.2973e+05, -8.4041e+03],
        [ 0.0000e+00,  5.0000e+05,  7.5957e-63,  ...,  3.4879e+05,
          4.2973e+05, -4.9160e+05],
        [ 5.0000e+05,  5.0000e+05, -5.0000e+05,  ...,  1.5121e+05,
         -4.2973e+05,  4.9160e+05]], dtype=torch.float64)
analytical:tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], dtype=torch.float64)


In [None]:
import numpy as np
import itertools
def generate_data(x):
    num_configurations = 20

    # Generate the continuous number sequence
    configurations = np.arange(num_configurations)

    # Threshold based on x
    threshold = int(x * num_configurations)
    y_values = (configurations < threshold).astype(int)

    # Combine input x, configurations, and output y
    data = np.concatenate((np.tile(x, (num_configurations, 1)),
                           configurations[:, np.newaxis],  # Reshape for concatenation
                           y_values[:, np.newaxis]), axis=1)

    return data

# Example usage
x = 0.08
data = generate_data(x)
print(data)

import torch
from torch import nn
class BernoulliApproximator(nn.Module):
  def __init__(self, hidden_dim):
    super().__init__()
    self.linear1 = nn.Linear(2, hidden_dim)
    self.linear2 = nn.Linear(hidden_dim, hidden_dim)
    self.linear3 = nn.Linear(hidden_dim, hidden_dim)
    self.linear4 = nn.Linear(hidden_dim, 1)
    self.relu = nn.ReLU()
    self.x=None
    self.out1 = None
    self.out2 = None
    self.out3 = None
    self.out4 = None


  def forward(self, x):
    self.x=x
    out = self.relu(self.linear1(x))
    self.out1 = out
    out = self.relu(self.linear2(out))
    self.out2 = out
    out = self.relu(self.linear3(out))
    self.out3 = out
    out = torch.sigmoid(self.linear4(out))
    self.out4 = out
    return out

probs = torch.rand(10000, 1)
data_list = []
for p in probs.flatten():  # Iterate over probabilities
    data_list.append(generate_data(p.item()))  # Convert to float for compatibility

probs = torch.rand(5000, 1)*0.1
for p in probs.flatten():  # Iterate over probabilities
    data_list.append(generate_data(p.item()))  # Convert to float for compatibility

probs = 0.9 * torch.rand(5000, 1)*0.1
for p in probs.flatten():  # Iterate over probabilities
    data_list.append(generate_data(p.item()))  # Convert to float for compatibility

data = np.vstack(data_list)  # Combine the data from all probabilities
np.random.shuffle(data)
# Create X and Y
X = torch.tensor(data[:, :2], dtype=torch.float32)  # Input (x and binary variables)
Y = torch.tensor(data[:, 2], dtype=torch.float32)  # Output (y)

print(X)
print(Y)
print(X.shape)
print(Y.shape)

model = BernoulliApproximator(hidden_dim=64)
optimizer = torch.optim.Adam(model.parameters(),lr=0.001)

for epoch in range(500000):
  total_loss = 0
  optimizer.zero_grad()
  #  print(X[i])
  result = model(X)
   # print(result)
   # print(Y[i])
  loss = nn.BCELoss()(result.squeeze(),Y)
  total_loss+=loss
  loss.backward()
  optimizer.step()
  print('avg_loss: ',total_loss/len(X))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
avg_loss:  tensor(9.7755e-10, grad_fn=<DivBackward0>)
avg_loss:  tensor(9.7747e-10, grad_fn=<DivBackward0>)
avg_loss:  tensor(9.7740e-10, grad_fn=<DivBackward0>)
avg_loss:  tensor(9.7733e-10, grad_fn=<DivBackward0>)
avg_loss:  tensor(9.7726e-10, grad_fn=<DivBackward0>)
avg_loss:  tensor(9.7719e-10, grad_fn=<DivBackward0>)
avg_loss:  tensor(9.7711e-10, grad_fn=<DivBackward0>)
avg_loss:  tensor(9.7704e-10, grad_fn=<DivBackward0>)
avg_loss:  tensor(9.7697e-10, grad_fn=<DivBackward0>)
avg_loss:  tensor(9.7690e-10, grad_fn=<DivBackward0>)
avg_loss:  tensor(9.7683e-10, grad_fn=<DivBackward0>)
avg_loss:  tensor(9.7676e-10, grad_fn=<DivBackward0>)
avg_loss:  tensor(9.7668e-10, grad_fn=<DivBackward0>)
avg_loss:  tensor(9.7661e-10, grad_fn=<DivBackward0>)
avg_loss:  tensor(9.7654e-10, grad_fn=<DivBackward0>)
avg_loss:  tensor(9.7647e-10, grad_fn=<DivBackward0>)
avg_loss:  tensor(9.7640e-10, grad_fn=<DivBackward0>)
avg_loss:  tensor

KeyboardInterrupt: 

In [None]:
print(model(torch.tensor([0.3,])))

tensor([0.], grad_fn=<SigmoidBackward0>)


In [None]:
torch.save(model,'bernoullimodel8.pth')

In [None]:
import torch
import torch.nn as nn
import numpy as np
def generate_data(num_samples=100,range=100):
    a_values = np.random.uniform(low=-range, high=range, size=num_samples)
    b_values = np.random.uniform(low=-range, high=range, size=num_samples)
    y_values = (a_values > b_values).astype(int)
    data = np.concatenate((a_values[:, np.newaxis], b_values[:, np.newaxis], y_values[:, np.newaxis]), axis=1)
    return data

class ComparatorNetwork(nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()
        self.linear1 = nn.Linear(2, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, hidden_dim)
        self.linear3 = nn.Linear(hidden_dim, 1)
        self.relu = nn.ReLU()
        self.x=None
        self.out1 = None
        self.out2 = None
        self.out3 = None
        self.out4 = None

    def forward(self, x):
        self.x=x
        out = self.relu(self.linear1(x))
        self.out1 = out
        out = self.relu(self.linear2(out))
        self.out2 = out
        out = torch.sigmoid(self.linear3(out))
        self.out3 = out
        return out

# Generate training data
data = generate_data()

data_list = []
data_list.append(generate_data(num_samples=50000))  # Convert to float for compatibility

data_list.append(generate_data(num_samples=100000,range=0.5))  # Convert to float for compatibility

data_list.append(generate_data(num_samples=50000, range=1))  # Convert to float for compatibility

data = np.vstack(data_list)
np.random.shuffle(data)

X = torch.tensor(data[:, :2], dtype=torch.float32)  # Input (a and b)
Y = torch.tensor(data[:, 2], dtype=torch.float32)  # Output

print(X)
print(Y)
print(X.shape)
print(Y.shape)

model2 = ComparatorNetwork(hidden_dim=2)
optimizer = torch.optim.Adam(model2.parameters(), lr=0.001)

for epoch in range(100000):
  total_loss = 0
  optimizer.zero_grad()
  #  print(X[i])
  result = model2(X)
   # print(result)
   # print(Y[i])
  loss = nn.BCELoss()(result.squeeze(),Y)
  total_loss+=loss
  loss.backward()
  optimizer.step()
  print(epoch, ': avg_loss: ',total_loss/len(X))

torch.save(model,'greater_than2.pth')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
95000 : avg_loss:  tensor(1.2065e-11, grad_fn=<DivBackward0>)
95001 : avg_loss:  tensor(1.2065e-11, grad_fn=<DivBackward0>)
95002 : avg_loss:  tensor(1.2065e-11, grad_fn=<DivBackward0>)
95003 : avg_loss:  tensor(1.2065e-11, grad_fn=<DivBackward0>)
95004 : avg_loss:  tensor(1.2065e-11, grad_fn=<DivBackward0>)
95005 : avg_loss:  tensor(1.2065e-11, grad_fn=<DivBackward0>)
95006 : avg_loss:  tensor(1.2064e-11, grad_fn=<DivBackward0>)
95007 : avg_loss:  tensor(1.2064e-11, grad_fn=<DivBackward0>)
95008 : avg_loss:  tensor(1.2064e-11, grad_fn=<DivBackward0>)
95009 : avg_loss:  tensor(1.2064e-11, grad_fn=<DivBackward0>)
95010 : avg_loss:  tensor(1.2064e-11, grad_fn=<DivBackward0>)
95011 : avg_loss:  tensor(1.2064e-11, grad_fn=<DivBackward0>)
95012 : avg_loss:  tensor(1.2064e-11, grad_fn=<DivBackward0>)
95013 : avg_loss:  tensor(1.2064e-11, grad_fn=<DivBackward0>)
95014 : avg_loss:  tensor(1.2064e-11, grad_fn=<DivBackward0>)
95015

NameError: name 'model' is not defined

In [None]:
for epoch in range(1000000):
  total_loss = 0
  optimizer.zero_grad()
  #  print(X[i])
  result = model2(X)
   # print(result)
   # print(Y[i])
  loss = nn.BCELoss()(result.squeeze(),Y)
  total_loss+=loss
  loss.backward()
  optimizer.step()
  if epoch % 10000==0:
    print(epoch, ': avg_loss: ',total_loss/len(X))


torch.save(model2,'greater_than2.pth')

0 : avg_loss:  tensor(8.0563e-12, grad_fn=<DivBackward0>)
10000 : avg_loss:  tensor(5.5909e-12, grad_fn=<DivBackward0>)
20000 : avg_loss:  tensor(4.7526e-12, grad_fn=<DivBackward0>)
30000 : avg_loss:  tensor(3.6456e-12, grad_fn=<DivBackward0>)
40000 : avg_loss:  tensor(2.7233e-12, grad_fn=<DivBackward0>)
50000 : avg_loss:  tensor(2.0468e-12, grad_fn=<DivBackward0>)
60000 : avg_loss:  tensor(1.8403e-12, grad_fn=<DivBackward0>)
70000 : avg_loss:  tensor(1.5241e-12, grad_fn=<DivBackward0>)
80000 : avg_loss:  tensor(1.1878e-12, grad_fn=<DivBackward0>)
90000 : avg_loss:  tensor(9.8127e-13, grad_fn=<DivBackward0>)
100000 : avg_loss:  tensor(9.0604e-13, grad_fn=<DivBackward0>)
110000 : avg_loss:  tensor(7.5578e-13, grad_fn=<DivBackward0>)
120000 : avg_loss:  tensor(6.7345e-13, grad_fn=<DivBackward0>)
130000 : avg_loss:  tensor(6.1872e-13, grad_fn=<DivBackward0>)
140000 : avg_loss:  tensor(5.9059e-13, grad_fn=<DivBackward0>)
150000 : avg_loss:  tensor(5.0965e-13, grad_fn=<DivBackward0>)
160000

KeyboardInterrupt: 

In [None]:
torch.save(model2,'greater_than2.pth')

In [None]:
from google.colab import drive

drive.mount('/content/drive')
!cp 'dbn_modelv2_deepConv.pt' /content/drive/MyDrive

Combined VAE representation - VAE encoder for continous variables, other type of encoder for categorical variables, combined into decoder.

Working with this data I see why tabular data is much harder. Training a VAE on this is much more difficult than other times I've done it. Right now I've split the data into 2, categorical and continous but I'm going to have to split them into groupings of dependencies between variables.

OPTICS is clustering between datapoints but we want clustering of dependencies between variables. Even then there will be large groups of disparate variables.


CLUSTER ACCORDING TO COMBINED ENCODED LATENT SPACE.

In [None]:
import torch.optim as optim
import torch
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn.functional as F
import torch

import torch
import torch.nn as nn
import torch.nn.functional as F  # Provides additional layers and functions

import torch
import torch.distributions as dist


def train_vae(num_vae, dataloader, num_epochs, optimizer_num, device):
    num_vae.train()
    # Train Numerical VAE
    for epoch in range(num_epochs):
      num_vae.train()
      total_loss_num = 0
      for batch_num,_ in dataloader:  # Only iterate over numerical data
        batch_num = batch_num.to(device)
        optimizer_num.zero_grad()
        recon_num, mu_num, logvar_num = num_vae(batch_num)
        loss_num = F.mse_loss(recon_num, batch_num) + \
                       -0.5 * torch.sum(1 + logvar_num - mu_num.pow(2) - logvar_num.exp())
        loss_num.backward()
        optimizer_num.step()
        total_loss_num += loss_num.item()
      avg_loss_num = total_loss_num / len(dataloader)
      print(f'Epoch {epoch + 1}: Num. VAE Loss - {avg_loss_num:.4f}')


#--- Using the loop ---

# Instantiate your VAE model
num_vae = VAE(input_size=X_num.shape[1], hidden_size=512, latent_size=16).to(device)

# Optimizers for each VAE
optimizer_num = optim.Adam(num_vae.parameters(), lr=1e-3)

# Train!
train_vae(num_vae, dataloader, num_epochs=200, optimizer_num=optimizer_num, optimizer_cat=optimizer_cat, device=device)


In [None]:
from sklearn.cluster import OPTICS
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

def run_optics_and_visualize(data):
    # Perform OPTICS clustering using the provided metric
    optics = OPTICS(min_samples=20)
    optics.fit(data)
    # Get cluster labels
    cluster_labels = optics.labels_

    # Get reachability distances (useful for understanding cluster structure)
    reachability_distances = optics.reachability_
    # Create a dictionary for mapping cluster labels to colors
    color_map = plt.cm.get_cmap('tab10', max(cluster_labels) + 1)  # Adjust colormap as needed
    colors = [color_map(label) for label in cluster_labels]

    # Reduce dimensionality to 2D using PCA
    pca = PCA(n_components=2)
    data_reduced = pca.fit_transform(data)


    # Plot the reduced data with cluster labels as colors
    plt.scatter(data_reduced[:, 0], data_reduced[:, 1], c=cluster_labels)
    plt.title('OPTICS Clusters (PCA Visualization)')
    plt.xlabel('Component 1')
    plt.ylabel('Component 2')
    plt.show()

    tsne = TSNE(n_components=2)
    data_reduced = tsne.fit_transform(data)

    # Plot the reduced data with cluster labels as colors
    plt.scatter(data_reduced[:, 0], data_reduced[:, 1], c=cluster_labels)
    plt.title('OPTICS Clusters (t-SNE Visualization)')
    plt.xlabel('Component 1')
    plt.ylabel('Component 2')
    plt.show()

In [None]:
run_optics_and_visualize(np.concatenate((X_num, X_cat), axis=1))

In [None]:
run_optics_and_visualize(X_num)

In [None]:
run_optics_and_visualize(X_cat)