In [3]:
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch
import json

# Load the dataset
df = pd.read_csv('./logs/500run2.csv')

# Initialize BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to compute BERT embeddings
def get_bert_embedding(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).detach().numpy()

# List to store each row of the new dataset
new_dataset_rows = []

for index, row in df.iterrows():
    unique_dag_id = row['Unique DAG ID']
    func_name = row['Function Name']
    input_file_path = row['Input File']
    
    # Load the JSON content from the input file
    with open(input_file_path, 'r') as file:
        data = json.load(file)
    
    # Extract the specific part of the JSON relevant to the current function
    if func_name in data['data']:
        func_data = json.dumps(data['data'][func_name])
        embedding = get_bert_embedding(func_data)
        new_row = {
            'Unique DAG ID': unique_dag_id,
            'Function Name': func_name,
            'BERT Embedding': embedding,
            'Max CPU Usage': row['Max CPU Usage'],
            'Max Memory Usage': row['Max Memory Usage']
        }
        new_dataset_rows.append(new_row)

# Convert the list of new rows into a DataFrame
final_df = pd.DataFrame(new_dataset_rows)

# Display or save the final dataset
# print(final_df.head())  # For display
final_df.to_csv('functions_profiled_dataset_with_embeddings.csv', index=False)


In [5]:
# show all exisiting values for the function name in final_df
print(final_df['Function Name'].unique())

['AES1' 'AES2' 'AES3']


In [6]:
final_df

Unnamed: 0,Unique DAG ID,Function Name,BERT Embedding,Max CPU Usage,Max Memory Usage
0,AS-0eea2507-ed93-4f74-998f-f515e97d493a,AES1,"[[-0.36717856, -0.3797353, 0.14636903, -0.0644...",1.344703,26.845184
1,AS-0eea2507-ed93-4f74-998f-f515e97d493a,AES2,"[[-0.3921602, -0.3868453, 0.109582104, -0.0756...",0.673485,21.921792
2,AS-0eea2507-ed93-4f74-998f-f515e97d493a,AES3,"[[-0.37413302, -0.37243375, 0.15295929, -0.056...",1.990102,32.206848
3,AS-f06538f6-72db-4f30-b580-46eb0ff6c5f5,AES1,"[[-0.3881844, -0.40802562, 0.124924146, -0.069...",0.657686,21.516288
4,AS-f06538f6-72db-4f30-b580-46eb0ff6c5f5,AES2,"[[-0.40800732, -0.415584, 0.123709925, -0.0662...",0.669833,21.827584
...,...,...,...,...,...
1495,AS-24aca0bf-9b4c-4ccc-b74b-4f813993027a,AES2,"[[-0.4174613, -0.36204627, 0.15943323, -0.0755...",1.926353,29.966336
1496,AS-24aca0bf-9b4c-4ccc-b74b-4f813993027a,AES3,"[[-0.396648, -0.36316627, 0.16283928, -0.08371...",2.346458,40.497152
1497,AS-c31d19ad-4769-47b3-afc7-5d101cbb50da,AES1,"[[-0.3648185, -0.34650147, 0.1660411, -0.06320...",1.138118,24.358912
1498,AS-c31d19ad-4769-47b3-afc7-5d101cbb50da,AES2,"[[-0.38517582, -0.36410168, 0.135017, -0.06388...",3.271595,24.162304


In [2]:
import pandas as pd
from transformers import DistilBertTokenizer, DistilBertModel
import torch
import json

# Load the dataset
df = pd.read_csv('./logs/500run2.csv')

# Initialize DistilBERT tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

# Function to compute DistilBERT embeddings
def get_distilbert_embedding(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    outputs = model(**inputs)
    # For DistilBERT, use `.last_hidden_state` to get the sequence of hidden-states at the output of the last layer
    # Here we're taking the mean of these outputs to get a single vector representation of the text
    return outputs.last_hidden_state.mean(dim=1).detach().numpy()

# List to store each row of the new dataset
new_dataset_rows = []

for index, row in df.iterrows():
    unique_dag_id = row['Unique DAG ID']
    func_name = row['Function Name']
    input_file_path = row['Input File']
    
    # Load the JSON content from the input file
    with open(input_file_path, 'r') as file:
        data = json.load(file)
    
    # Extract the specific part of the JSON relevant to the current function
    if func_name in data['data']:
        func_data = json.dumps(data['data'][func_name])
        embedding = get_distilbert_embedding(func_data)
        new_row = {
            'Unique DAG ID': unique_dag_id,
            'Function Name': func_name,
            'BERT Embedding': embedding.tolist(),  # Convert numpy array to list for easier handling
            'Max CPU Usage': row['Max CPU Usage'],
            'Max Memory Usage': row['Max Memory Usage']
        }
        new_dataset_rows.append(new_row)

# Convert the list of new rows into a DataFrame
final_df = pd.DataFrame(new_dataset_rows)

# Save the final dataset
final_df.to_csv('./logs/functions_profiled_dataset_with_distilbert_embeddings.csv', index=False)


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
len(final_df['BERT Embedding'][0][0])

768

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error
import numpy as np

# Assuming `df` is your DataFrame containing the embeddings, function names, and Max CPU Usage
# Let's simulate loading the DataFrame here
# df = pd.read_csv('functions_profiled_dataset_with_distilbert_embeddings.csv')

# Prepare the features and target variable
X = final_df.drop('Max CPU Usage', axis=1)
y = final_df['Max CPU Usage']

# One-hot encode the function name
encoder = OneHotEncoder(sparse_output=False)
function_names_encoded = encoder.fit_transform(X[['Function Name']])
function_names_encoded_df = pd.DataFrame(function_names_encoded, columns=encoder.get_feature_names_out())

# Concatenate the one-hot encoded DataFrame with the embeddings
# Assuming each embedding is stored as a list in the 'BERT Embedding' column
embeddings_df = pd.DataFrame(X['BERT Embedding'][0], columns=[f'embedding_{i}' for i in range(len(X['BERT Embedding'][0][0]))])
X_encoded = pd.concat([function_names_encoded_df, embeddings_df], axis=1)
X_encoded.columns = X_encoded.columns.astype(str)
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.3, random_state=42)

# Initialize and train the Random Forest model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# Your model is now trained and evaluated. You can adjust model parameters and preprocessing steps as needed.


Mean Squared Error: 2.514937125294399


In [26]:
def calculate_cpu_usage_error_rate(y_actual, y_predicted):
        """
        Calculate the error rate for CPU usage predictions based on the criteria:
        If the predicted value falls in the integer range of the actual value, then it is not an error.
        
        Args:
            y_actual (array-like): The actual CPU usage values.
            y_predicted (array-like): The predicted CPU usage values.
            
        Returns:
            float: The error rate.
            list: The list of errors.
        """
        errors = 0
        error_list = []
        for actual, predicted in zip(y_actual, y_predicted):
            actualError = abs(actual - predicted)/actual
            error_list.append(actualError)
            # Check if predicted falls outside the integer range of actual
            if not (int(actual) - 1 <= predicted < int(actual) + 1):
                errors += 1
        
        error_rate = errors / len(y_actual)
        return error_rate,error_list

In [27]:
errRate, errList = calculate_cpu_usage_error_rate(y_test, y_pred)
print(f'Error Rate: {errRate}')

Error Rate: 0.27111111111111114


In [28]:
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np

class CPUDataset(Dataset):
    def __init__(self, features, labels):
        """
        features: Array of input features including BERT embeddings and encoded function names.
        labels: Array of target labels (CPU utilization).
        """
        scaler = StandardScaler()
        self.features = scaler.fit_transform(features)
        self.labels = labels
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return torch.tensor(self.features[idx], dtype=torch.float), torch.tensor(self.labels[idx], dtype=torch.float)

# Assuming X_encoded and y are your features and labels respectively
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

train_dataset = CPUDataset(X_train, y_train)
test_dataset = CPUDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


In [29]:
import torch.nn as nn
import torch.nn.functional as F

class CPUUtilizationModel(nn.Module):
    def __init__(self, input_size, hidden_size1=64, hidden_size2=32):
        super(CPUUtilizationModel, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size1)
        self.fc2 = nn.Linear(hidden_size1, hidden_size2)
        self.fc3 = nn.Linear(hidden_size2, 1)
    
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

# Initialize the model
model = CPUUtilizationModel(input_size=X_train.shape[1])


In [30]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()

# Training loop
def train_model(model, train_loader, criterion, optimizer, epochs=10):
    model.train()
    for epoch in range(epochs):
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs.squeeze(), labels)
            loss.backward()
            optimizer.step()
        print(f'Epoch {epoch+1}, Loss: {loss.item()}')

train_model(model, train_loader, criterion, optimizer)


KeyError: 745

In [None]:
def evaluate_model(model, test_loader):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model(inputs)
            loss = criterion(outputs.squeeze(), labels)
            total_loss += loss.item()
    print(f'Test Loss: {total_loss / len(test_loader)}')

evaluate_model(model, test_loader)
