In [1]:
import os
import pandas as pd
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader, random_split

In [2]:
class EmbeddingDataset(Dataset):
    """Labelled dataset backed by a pandas DataFrame.

    The slicing below assumes this column layout: the first column is an
    identifier (id of graph and vertex), the last column is the label of
    the vertex, and every column in between belongs to the feature vector.
    """

    def __init__(self, dataframe):
        # The frame itself is kept because sample ids are read from it lazily.
        self.data = dataframe
        # Everything between the id column and the label column.
        self.features = dataframe.iloc[:, 1:-1].values
        # Final column only.
        self.labels = dataframe.iloc[:, -1].values

    @staticmethod
    def _as_float_tensor(value):
        """Convert ``value`` into a new ``float32`` tensor."""
        return torch.tensor(value, dtype=torch.float32)

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(features, label, sample_id)`` for the row at position ``idx``."""
        feature_tensor = self._as_float_tensor(self.features[idx])
        label_tensor = self._as_float_tensor(self.labels[idx])
        sample_id = self.data.iloc[idx, 0]
        return feature_tensor, label_tensor, sample_id

In [3]:
class TestEmbeddingDataset(Dataset):
    """Unlabelled dataset backed by a pandas DataFrame.

    The first column is read as the sample identifier (id of graph and
    vertex); all remaining columns form the feature vector. No label
    column is expected.
    """

    def __init__(self, dataframe):
        # Kept so that ids can be looked up per item.
        self.data = dataframe
        # Every column after the id column.
        self.features = dataframe.iloc[:, 1:].values

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(features, sample_id)`` for the row at position ``idx``."""
        row_values = self.features[idx]
        feature_tensor = torch.tensor(row_values, dtype=torch.float32)
        sample_id = self.data.iloc[idx, 0]
        return feature_tensor, sample_id

In [4]:
class MLP(nn.Module):
    """Plain multi-layer perceptron: ``input_dim -> 128 -> 64 -> 1``.

    Both hidden layers use ReLU; the single output unit goes through a
    sigmoid, so the forward pass yields values in the open interval (0, 1).
    """

    def __init__(self, input_dim):
        super().__init__()
        # Layers are created in network order and stored under ``self.layers``.
        hidden_stack = [
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
        ]
        output_head = [nn.Linear(64, 1), nn.Sigmoid()]
        self.layers = nn.Sequential(*hidden_stack, *output_head)

    def forward(self, x):
        """Apply the layer stack to ``x`` and return the sigmoid output."""
        return self.layers(x)
    
"""class MLP(nn.Module):
    def __init__(self, input_dim):
        super(MLP, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.BatchNorm1d(128),  # Batch normalization
            nn.ReLU(),
            nn.Dropout(0.5),      # Dropout to prevent overfitting
            nn.Linear(128, 64),
            nn.BatchNorm1d(64),   # Another batch normalization layer
            nn.ReLU(),
            nn.Dropout(0.5),      # Dropout after second hidden layer
            nn.Linear(64, 1),  
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.layers(x)  # Sigmoid for binary classification"""


'class MLP(nn.Module):\n    def __init__(self, input_dim):\n        super(MLP, self).__init__()\n        self.layers = nn.Sequential(\n            nn.Linear(input_dim, 128),\n            nn.BatchNorm1d(128),  # Batch normalization\n            nn.ReLU(),\n            nn.Dropout(0.5),      # Dropout to prevent overfitting\n            nn.Linear(128, 64),\n            nn.BatchNorm1d(64),   # Another batch normalization layer\n            nn.ReLU(),\n            nn.Dropout(0.5),      # Dropout after second hidden layer\n            nn.Linear(64, 1),  \n            nn.Sigmoid()\n        )\n\n    def forward(self, x):\n        return self.layers(x)  # Sigmoid for binary classification'

In [5]:
def save_model(model, output_path):
    """Write the model's ``state_dict`` to ``output_path`` and print a confirmation.

    Only the state dict is serialised, not the module object itself.
    """
    weights = model.state_dict()
    torch.save(weights, output_path)
    print(f"Model saved to {output_path}")

In [6]:
def train_model(model, train_loader, criterion, optimizer, num_epochs=20):
    """Train ``model`` for ``num_epochs`` passes over ``train_loader``.

    Args:
        model: Module to optimise. Batches are moved to the device of its
            first parameter (CPU if it has no parameters).
        train_loader: Iterable yielding ``(features, labels, ids)`` batches;
            the ids are ignored here.
        criterion: Loss called as ``criterion(predictions, labels)``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        num_epochs: Number of passes over ``train_loader``.

    Prints the sample-weighted average loss after every epoch.
    """
    # Follow the model's device rather than hard-coding "cuda", so the same
    # function works on CPU-only machines and on models placed elsewhere.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    model.train()  # set training mode
    for epoch in range(num_epochs):
        running_loss = 0.0
        samples_seen = 0
        for features, labels, _ in train_loader:
            features, labels = features.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(features)
            # Drop only the trailing unit dimension. A bare squeeze() would
            # also drop the batch dimension of a final batch holding a single
            # sample, giving a 0-d prediction that no longer matches labels.
            loss = criterion(outputs.squeeze(-1), labels)
            loss.backward()
            optimizer.step()
            batch_size = features.size(0)
            # Weight by batch size so a smaller final batch is not over-counted
            # (assumes the criterion returns a per-batch mean).
            running_loss += loss.item() * batch_size
            samples_seen += batch_size
        # Average over the samples actually visited; this also stays correct
        # when the loader skips samples (e.g. drop_last) and avoids dividing
        # by zero on an empty loader.
        epoch_loss = running_loss / max(samples_seen, 1)
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}")


In [7]:
"""def evaluate_model(model, test_loader, criterion):
    model.eval()
    test_loss = 0.0
    with torch.no_grad():
        for features, labels, id in test_loader:
            features, labels = features.to("cuda"), labels.to("cuda")
            print(id)
            outputs = model(features)
            loss = criterion(outputs.squeeze(), labels)
            test_loss += loss.item() * features.size(0)
    avg_loss = test_loss / len(test_loader.dataset)
    print(f"Test Loss: {avg_loss:.4f}")"""

"""def evaluate_model(model, test_loader, criterion, output_file):
    model.eval()
    test_loss = 0.0
    results = []  # List to store results

    with torch.no_grad():
        for features, labels, id in test_loader:
            features, labels = features.to("cuda"), labels.to("cuda")
            outputs = model(features)
            output_values = outputs.flatten().tolist()

            for i in range(len(output_values)):
                results.append((id[i], output_values[i]))

            loss = criterion(outputs.squeeze(), labels)
            test_loss += loss.item() * features.size(0)

    avg_loss = test_loss / len(test_loader.dataset)
    print(f"Test Loss: {avg_loss:.4f}")

    # Convert the results to a DataFrame and save to CSV
    output_columns = ['ID'] + [f'Prediction_{j+1}' for j in range(len(results[0]) - 1)]
    df_results = pd.DataFrame(results, columns=output_columns)
    df_results.to_csv(output_file, index=False)"""

def evaluate_model(model, test_loader, output_file):
    """Run inference over ``test_loader`` and write the predictions to a CSV file.

    Args:
        model: Module used for prediction; it is switched to eval mode and
            batches are moved to the device of its first parameter (CPU if
            it has no parameters).
        test_loader: Iterable yielding ``(features, ids)`` batches.
        output_file: Destination passed to ``DataFrame.to_csv``.

    The CSV has an ``ID`` column followed by one ``Prediction_j`` column per
    model output value (a single ``Prediction_1`` column for a one-unit
    output). An empty loader produces a header-only file.
    """
    model.eval()
    # Follow the model's device rather than hard-coding "cuda".
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    results = []  # one (id, pred_1, ..., pred_k) tuple per sample

    with torch.no_grad():
        for features, sample_ids in test_loader:
            outputs = model(features.to(device))

            # Numeric ids arrive from the default collate function as a tensor;
            # convert them to plain Python values so the CSV holds the ids
            # themselves rather than "tensor(...)" strings.
            if isinstance(sample_ids, torch.Tensor):
                sample_ids = sample_ids.tolist()
            else:
                sample_ids = list(sample_ids)

            # One row of predictions per sample keeps ids and predictions
            # aligned even if the model emits more than one value per sample.
            per_sample = outputs.reshape(len(sample_ids), -1).tolist()
            results.extend(
                (sample_id, *preds) for sample_id, preds in zip(sample_ids, per_sample)
            )

    # Convert the results to a DataFrame and save to CSV
    n_outputs = len(results[0]) - 1 if results else 1
    output_columns = ['ID'] + [f'Prediction_{j+1}' for j in range(n_outputs)]
    df_results = pd.DataFrame(results, columns=output_columns)
    df_results.to_csv(output_file, index=False)


In [9]:
# Seed the RNG so weight initialisation and batch shuffling repeat across runs
torch.manual_seed(0)

# Use the GPU when one is available, otherwise the CPU; the model is placed on this device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

df = pd.read_csv('./dataframe/new_vertexcover.csv')
#df_tests = pd.read_csv('./dataframe/vertexcover_tests.csv')

# Wrap the whole labelled frame in a dataset (no train/test split is performed here)
dataset = EmbeddingDataset(df)
#dataset_test = TestEmbeddingDataset(df_tests)

# Create data loaders
train_loader = DataLoader(dataset, batch_size=128, shuffle=True)
#test_loader = DataLoader(dataset_test, batch_size=32, shuffle=False)

#print(test_loader.dataset.data.iloc[:, 0].values)

# Initialize the model, criterion, and optimizer
input_dim = df.shape[1] - 2  # Excluding the ID and label columns
model = MLP(input_dim).to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model and save its weights (evaluation on the test file is currently disabled)
train_model(model, train_loader, criterion, optimizer, num_epochs=200)
save_model(model, "model")
#evaluate_model(model, test_loader, "./outputs_test.csv")

Epoch 1/200, Loss: 0.6923
Epoch 2/200, Loss: 0.6914
Epoch 3/200, Loss: 0.6907
Epoch 4/200, Loss: 0.6900
Epoch 5/200, Loss: 0.6894
Epoch 6/200, Loss: 0.6887
Epoch 7/200, Loss: 0.6881
Epoch 8/200, Loss: 0.6875
Epoch 9/200, Loss: 0.6870
Epoch 10/200, Loss: 0.6863
Epoch 11/200, Loss: 0.6857
Epoch 12/200, Loss: 0.6851
Epoch 13/200, Loss: 0.6844
Epoch 14/200, Loss: 0.6836
Epoch 15/200, Loss: 0.6828
Epoch 16/200, Loss: 0.6818
Epoch 17/200, Loss: 0.6809
Epoch 18/200, Loss: 0.6799
Epoch 19/200, Loss: 0.6789
Epoch 20/200, Loss: 0.6778
Epoch 21/200, Loss: 0.6766
Epoch 22/200, Loss: 0.6754
Epoch 23/200, Loss: 0.6742
Epoch 24/200, Loss: 0.6730
Epoch 25/200, Loss: 0.6716
Epoch 26/200, Loss: 0.6703
Epoch 27/200, Loss: 0.6691
Epoch 28/200, Loss: 0.6678
Epoch 29/200, Loss: 0.6665
Epoch 30/200, Loss: 0.6652
Epoch 31/200, Loss: 0.6638
Epoch 32/200, Loss: 0.6624
Epoch 33/200, Loss: 0.6611
Epoch 34/200, Loss: 0.6601
Epoch 35/200, Loss: 0.6589
Epoch 36/200, Loss: 0.6576
Epoch 37/200, Loss: 0.6564
Epoch 38/2