In [1]:
import numpy as np, numpy.random
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim

from tqdm import tqdm

ModuleNotFoundError: No module named 'pandas'

In [None]:
def set_seed(random_seed):
    """Seed NumPy and PyTorch (CPU and CUDA) and request deterministic cuDNN.

    Parameters
    ----------
    random_seed : int
        Seed passed to ``np.random.seed``, ``torch.manual_seed`` and the
        CUDA seeding functions.

    Notes
    -----
    Python's built-in ``random`` module is *not* seeded here.
    The default tensor dtype is also set to ``torch.float32``.
    """
    torch.manual_seed(random_seed)
    torch.cuda.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)
    np.random.seed(random_seed)
    # The flag is spelled `benchmark`; a misspelled name is accepted silently
    # and leaves cuDNN auto-tuning (a source of non-determinism) enabled.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.set_default_dtype(torch.float32)
    
set_seed(2024)

In [None]:
# Check CUDA

cuda_available = torch.cuda.is_available()
print(f"Is CUDA supported by this system? {cuda_available}")
print(f"CUDA version: {torch.version.cuda}")

# Storing ID of current CUDA device.
# Querying the current device raises on a machine without CUDA, so only do it
# when a device is actually available; otherwise record `None`.
if cuda_available:
    cuda_id = torch.cuda.current_device()
    print(f"ID of current CUDA device: {cuda_id}")
    print(f"Name of current CUDA device: {torch.cuda.get_device_name(cuda_id)}")
else:
    cuda_id = None
    print("No CUDA device available; running on CPU.")

In [None]:
# Read the raw table from disk.
csvFile = pd.read_csv('Dataset/structure4.csv')

# Columns 0-9 hold the design parameters.
df_parameter = csvFile.iloc[:, 0:10]

# Column 10 is taken as SE and column 12 as Tvis; column 11 is not used.
# Tvis is multiplied by 100 (presumably fraction -> percent, given the
# "Tvis(%)" axis labels used in the plots -- confirm against the CSV).
df_SE = csvFile.iloc[:, 10:11]
df_Tvis = csvFile.iloc[:, 12:13].mul(100)

# Metric table with column order [Tvis, SE].
df_metric = pd.concat([df_Tvis, df_SE], axis=1)

df_parameter.head()

In [None]:
# Column-wise maximum of every design parameter (used as the normalisation scale).
max_values_parameter = df_parameter.max(axis=0)
max_values_parameter

In [None]:
# Normalization Parameters: divide each column by its own maximum.
df_parameter_norm = df_parameter.div(max_values_parameter, axis="columns")
df_parameter_norm.head()

In [None]:
# Column-wise maximum of the metric table, displayed for inspection.
max_values_metric = df_metric.max(axis=0)
max_values_metric

In [None]:
# Normalization Metrics (Tested: No Difference)

# Fixed normalisation ceilings (hand-picked constants, not the observed maxima).
# Tvis was scaled by 100 when loaded, so 100 maps it back to [0, 1];
# NOTE(review): the origin of the SE ceiling of 120 is not visible here -- confirm.
SE_max = 120
Tvis_max = 100

df_SE_norm = df_SE / SE_max
df_Tvis_norm = df_Tvis / Tvis_max
df_metric_norm = pd.concat([df_Tvis_norm, df_SE_norm], axis=1)

plt.scatter(df_Tvis_norm, df_SE_norm, s=0.1)
plt.xlabel("Tvis(%)_norm")
plt.ylabel("SE_norm")
# `plt.show` without parentheses only references the function; call it.
plt.show()

In [None]:
# Scatter of the un-normalised metrics: Tvis on x, SE on y.
plt.scatter(df_Tvis, df_SE, s=0.1)
plt.xlabel("Tvis(%)")
plt.ylabel("SE")
# `plt.show` without parentheses only references the function; call it.
plt.show()

Dataset for the reward model: for each $x$, a function $r_\theta : \mathcal{Y} \mapsto z$

$x$: a 2-D weight vector whose entries sum to 1

$y_i, y_j$: candidate completions, indicated by their index

$z_{y_i, y_j | x}$: preference output

Data for Pretrained Base Pareto Model: function $f_\theta : \mathcal{X} \mapsto \mathcal{Y}$

This can be trained naively: for each $x$, choose the best completion $y_i$ under the most heavily weighted metric(s).

In [None]:
# Generate Dataset for Reward Model and Pretrained Base Pareto Model
# Note: the raw (un-normalised) parameter and metric tables are used here.
parameter_mat = df_parameter.to_numpy()
metric_mat = df_metric.to_numpy()

dim_x = 2                            # length of the weight vector x
dim_y = df_parameter_norm.shape[1]   # number of parameter columns
x_n = 100                            # number of weight vectors generated
y_n = df_parameter_norm.shape[0]     # number of rows (candidates) in the table

Xset, Yset, Zset, Mset, Rset = [], [], [], [], []

for i in tqdm(range(x_n)):
    # Weight vector [w, 1 - w] with w = i / x_n, i.e. w runs over 0 .. (x_n-1)/x_n.
    w = i / x_n
    x = np.array([w, 1 - w])
    Xset.append(x.flatten())

    # Weighted sum of the two metric columns for every candidate.
    value_x = metric_mat @ x

    # Ascending stable ordering of candidates by score; the last entry is the
    # best candidate (ties resolve to the highest row index).
    index_sorted = np.argsort(value_x, kind="stable")
    best_row = index_sorted[-1]
    Yset.append(parameter_mat[best_row])
    Mset.append(metric_mat[best_row])

    # Reward Dataset Design: the scores as a column vector, one row per candidate.
    Rset.append(value_x.reshape(-1, 1))

    # Disabled pairwise-preference construction (Zset therefore stays empty):
#     z_mat = [[0 for i in range(y_n)] for j in range(y_n)]
    
#     for j in range(y_n):
#         z_mat[index_sorted[j]][index_sorted[j]] = np.random.binomial(1, 0.5, 1)
#         for k in range(1, y_n - j):
#             z_mat[index_sorted[j]][index_sorted[k]] = 1
#             z_mat[index_sorted[k]][index_sorted[j]] = 0
            
#     Zset.append(z_mat)


In [None]:
# Pareto from Data for Pretrain base model
# One point per weight vector: the metric row of the best-scoring candidate.

pareto_points = np.array(Mset[:x_n])
plt.scatter(pareto_points[:, 0], pareto_points[:, 1], s=0.1)
plt.xlabel("Tvis(%)")
plt.ylabel("SE")
plt.show()

In [None]:
# Pretrain base model of Pareto
class TwoToTenNet(nn.Module):
    """Single-hidden-layer MLP: Linear -> ReLU -> Linear.

    Parameters
    ----------
    state_dim : int
        Number of input features.
    action_dim : int
        Number of output features.
    hidden_size : int, optional
        Width of the hidden layer (default 64).
    """

    def __init__(self, state_dim, action_dim, hidden_size=64):
        super().__init__()
        self.fc1 = nn.Linear(state_dim, hidden_size)
        self.fc2 = nn.Linear(hidden_size, action_dim)

    def forward(self, state):
        """Return the un-activated output of the second linear layer."""
        hidden = nn.functional.relu(self.fc1(state))
        return self.fc2(hidden)

# Weight vectors (inputs) and the best parameter row for each (targets), as float32.
Xset_tensor = torch.from_numpy(np.array(Xset)).to(torch.float32)
Yset_tensor = torch.from_numpy(np.array(Yset)).to(torch.float32)

# Training data
input_data = Xset_tensor
target_data = Yset_tensor

# Create the model
model = TwoToTenNet(dim_x, dim_y)

# Define the loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Training loop: full-batch gradient descent, recording the loss every step.
ll = []
num_epochs = 10000
for epoch in tqdm(range(num_epochs)):
    # Forward pass
    outputs = model(input_data)

    # Compute the loss
    loss = criterion(outputs, target_data)

    # Backward pass and optimization
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    ll.append(loss.item())

plt.plot(ll)
plt.xlabel("iterations")
plt.ylabel("loss")
# `plt.show` without parentheses only references the function; call it so the
# loss curve is rendered before the prediction is printed below.
plt.show()

# Testing the model
test_input = torch.tensor([[0.5, 0.5]])  # Example input
with torch.no_grad():
    predicted_output = model(test_input)
    print("Predicted Output:", predicted_output)

# Keep a dedicated handle to the pretrained network; later cells rebind `model`.
PretrainModel = model

In [None]:
# Predictions of the pretrained base model on the same weight grid [i/x_n, 1 - i/x_n].
Pset = []

for i in tqdm(range(x_n)):
    with torch.no_grad():
        predicted_output = PretrainModel(torch.tensor([[i/x_n, 1-i/x_n]])).numpy()
    # Each entry keeps its (1, n_parameters) shape.
    Pset.append(predicted_output)

# Build the table once from all rows instead of growing it with pd.concat on
# every iteration (which re-copies the whole frame each time).
df_Pset = pd.DataFrame(np.concatenate(Pset, axis=0), columns=df_parameter.columns)
df_Pset

In [None]:
# Reward Model
class RewardNet(nn.Module):
    """Single-hidden-layer MLP mapping a parameter vector to one scalar.

    Parameters
    ----------
    input_dim : int, optional
        Number of input features (default 10, the value previously hard-coded).
    hidden_size : int, optional
        Width of the hidden layer (default 64, the value previously hard-coded).

    Calling ``RewardNet()`` with no arguments builds the same 10 -> 64 -> 1
    network as before.
    """

    def __init__(self, input_dim=10, hidden_size=64):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_size)  # input features -> hidden units
        self.fc2 = nn.Linear(hidden_size, 1)  # hidden units -> single output

    def forward(self, x):
        """Return the un-activated scalar output for each input row."""
        x = torch.relu(self.fc1(x))  # ReLU activation for the first layer
        return self.fc2(x)  # Output layer without activation function

    
RewardModels = []

# Loop-invariant pieces, built once instead of once per weight vector.
parameter_tensor = torch.from_numpy(parameter_mat).to(torch.float32)
criterion = nn.MSELoss()
num_epochs = 1000
# Upper bound on re-initialisations per reward model, so a training run that
# always diverges raises an error instead of spinning forever.
max_restarts = 100

for i in tqdm(range(x_n)):
    Rset_tensor = torch.from_numpy(Rset[i]).to(torch.float32)

    # Training data: every parameter row and its score under weight vector i.
    input_data = parameter_tensor
    target_data = Rset_tensor

    for attempt in range(max_restarts):
        # Create a freshly initialised model and its optimizer
        model = RewardNet()
        optimizer = optim.SGD(model.parameters(), lr=0.01)

        # Training loop (full batch), recording the loss every step
        ll = []
        for epoch in range(num_epochs):
            # Forward pass
            outputs = model(input_data)

            # Compute the loss
            loss = criterion(outputs, target_data)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            ll.append(loss.item())

        # Accept the model only if its prediction for the base model's
        # output at this weight vector is not NaN; otherwise retrain.
        with torch.no_grad():
            predicted_output = model(torch.tensor(Pset[i]))
        flag = bool(torch.isnan(predicted_output).any())
        if not flag:
            break
    else:
        raise RuntimeError(
            f"Reward model {i} still predicts NaN after {max_restarts} restarts"
        )

    # All loss curves are drawn onto the same figure (as before, since the
    # figure was never shown inside the loop).
    plt.plot(ll)

    RewardModels.append(model)

plt.xlabel("iterations")
plt.ylabel("loss")
# `plt.show` without parentheses only references the function; call it once
# after all curves have been drawn.
plt.show()


In [None]:
# RLHF by PG

# Policy Gradient Agent
class PolicyGradientAgent:
    """Thin wrapper pairing a policy network with an Adam optimizer.

    Parameters
    ----------
    model : nn.Module
        Policy network; called directly on a state to produce an action.
    input_data : torch.Tensor
        Stored on the instance as ``input_data``; not used by the methods here.
    lr : float, optional
        Learning rate of the Adam optimizer (default 0.01).
    """

    def __init__(self, model, input_data, lr=0.01):
        self.policy_net = model
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=lr)
        self.input_data = input_data

    def select_action(self, state):
        """Return the policy network's output for ``state`` (with gradients)."""
        return self.policy_net(state)

    def update_policy(self, diff):
        """Take one optimizer step that increases the mean of ``diff``.

        Parameters
        ----------
        diff : torch.Tensor
            Reward improvement computed from the policy's actions; it must be
            attached to the autograd graph of ``self.policy_net``, otherwise
            ``backward()`` raises.

        Returns
        -------
        float
            The loss value that was minimised (``-diff.mean()``).

        Notes
        -----
        Earlier code here referenced the module-level names ``model`` and
        ``loss`` (left over from other cells) and rebuilt the optimizer on
        every call; the update now uses only ``diff`` and the agent's own
        optimizer.
        NOTE(review): maximising the mean reward difference is the assumed
        objective -- confirm it matches the intended policy-gradient design.
        """
        loss = -diff.mean()

        # Backward pass and optimization
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss.item()

# Example usage
agent = PolicyGradientAgent(PretrainModel, Xset_tensor)

# Training loop
total_rewards = []

num_epochs = 100

rewards_old = torch.zeros(x_n)

for i in tqdm(range(num_epochs)):
    # Collect one scalar reward per weight vector. Building the tensor from a
    # list avoids writing into `rewards_new` before it exists (the previous
    # element-wise assignment raised NameError on the first iteration).
    per_state_rewards = []
    for j in range(x_n):
        state = Xset_tensor[j]
        action = agent.select_action(state)
        per_state_rewards.append(RewardModels[j](action).reshape(()))
    rewards_new = torch.stack(per_state_rewards)

    # Track the summed reward of each epoch for later inspection.
    total_rewards.append(rewards_new.sum().item())

    # Policy update is still disabled. NOTE(review): when enabling it, store
    # `rewards_new.detach()` in `rewards_old` so old graphs are not kept alive.
#     diff = rewards_new - rewards_old
#     agent.update_policy(diff)
#     rewards_old = rewards_new


### NEED REDESIGN ACTION SPACE