In [None]:
%reset -f

import sys
sys.path.append("../data/exadata/parquet_dataset/query_tool")

import os
import pandas as pd
import numpy as np
import datetime
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.linear_model import LinearRegression
from query_tool import M100DataClient
from matplotlib import pyplot as plt
from models import *

# Root directory of the M100 dataset; handed to the query client below.
dataset_path = "../data/m100"
# Directory where the per-plugin CSV exports are written by later cells.
processed_path = "../data"
# Shared query client; the feature-extraction cell calls client.query(...) on it.
client = M100DataClient(dataset_path)



In [None]:
"""Cell purpose: pull desired features from plugins"""


# TODO: write a merging method that also works for columns other than timestamp, node, value.
# For now I think it's necessary to do it this way because of memory issues.
# def extract_features(feature_list, n_rows, columns, year_month):
#     df = pd.DataFrame()
#     for f in feature_list: 
#         query_df = client.query(f, 
#                       columns=columns,
#                       year_month=year_month)
#         
#         samples = np.random.randint(low = 0, high = query_df.shape[0], size = n_rows)
#         sampled_df = query_df if query_df.shape[0] < n_rows else query_df.iloc[samples] 
#         multi_idx= pd.MultiIndex.from_arrays([sampled_df["timestamp"].values, sampled_df["node"].values], names = ("timestamp", "node"))
#         data = {f: sampled_df["value"].values}
#         processed_df = pd.DataFrame(index = multi_idx, data=data)
#         if df.empty:
#             df = processed_df
#         else:
#             df = pd.concat([df, processed_df], join="outer")
#     return df

def extract_features(feature_list, n_rows, nodes_of_interest, columns, year_month):
    """Build a timestamp-indexed frame with one column per (node, feature) pair.

    Each metric in ``feature_list`` is fetched once through the module-level
    ``client``. Rows containing NaN are discarded, then for every node in
    ``nodes_of_interest`` the first ``n_rows`` remaining rows of that node are
    kept. Every (node, feature) pair becomes a column named
    ``"<node>_<feature>"`` and all columns are outer-joined on their
    timestamps.

    Args:
        feature_list: Metric names, passed one at a time to ``client.query``.
        n_rows: Maximum number of rows kept per (node, feature) pair.
        nodes_of_interest: Node ids to keep; compared against the ``node``
            column after it has been cast to ``int32``.
        columns: Columns requested from the client. The returned frame must
            expose ``"timestamp"``, ``"node"`` and ``"value"``.
        year_month: Forwarded unchanged to ``client.query``.

    Returns:
        A DataFrame indexed by timestamp. It is empty when ``feature_list`` or
        ``nodes_of_interest`` is empty.
    """
    node_dfs = []
    for f in feature_list:
        # Non-mutating dropna: leave whatever object the client returned untouched.
        query_df = client.query(f,
                                columns=columns,
                                year_month=year_month).dropna()
        node_ids = query_df["node"].to_numpy().astype(np.int32)
        for noi in nodes_of_interest:
            # The comparison already yields a boolean mask usable with iloc.
            node_mask = node_ids == noi
            node_rows = query_df.iloc[node_mask].iloc[:n_rows]

            node_df = pd.DataFrame(index=node_rows["timestamp"])
            node_df[str(noi) + "_" + str(f)] = node_rows["value"].values
            node_dfs.append(node_df)

    # pd.concat raises on an empty list, so return an empty frame instead.
    if not node_dfs:
        return pd.DataFrame()
    return pd.concat(node_dfs, axis=1, join="outer")

# def extract_features_simple (feature_list, n_rows, nodes_of_interest, columns, year_month):
#     plugin_df = pd.DataFrame()
#     node_dfs = []
#     for f in feature_list: 
#         query_df = client.query(f, 
#                       columns=columns,
#                       year_month=year_month)   
#         for noi in nodes_of_interest:
#         df = quer_df[]
#     

# Extraction settings shared by every plugin queried in this cell.
n_rows = 10000  # maximum rows kept per (node, feature) pair
# nodes_of_interest = [i for i in range(10)]
nodes_of_interest = [0]
year_month = ["22-02"]  # forwarded unchanged to client.query

# ipmi collects physical/hardware properties of nodes
ipmi_features = ["ambient", "gpu0_core_temp"]
ipmi_columns = ["timestamp", "node", "value"]
ipmi_df = extract_features(ipmi_features, n_rows, nodes_of_interest, ipmi_columns, year_month)
# Persist the extracted frame so it can be inspected without re-querying.
ipmi_df.to_csv(os.path.join(processed_path, "ipmi.csv"))

# vertiv plugin collects data for AC cooling (disabled for now)
# vertiv_features = ["Supply_Air_Temperature"]
# vertiv_labels = ["Return_Air_Temperature"]
# vertiv_df = client.query(["Supply_Air_Temperature", "Return_Air_Temperature"],
#                   columns=['timestamp','value'],
#                   year_month=["22-02"])



# ganglia collects metrics on utilization of hardware
# NOTE(review): "Tot" is also the metric named in the disabled logics and
# schneider queries below -- confirm it is the intended ganglia metric.
ganglia_features = ["Tot"]
ganglia_columns = ["timestamp", "node", "value"]
ganglia_df = extract_features(ganglia_features, n_rows, nodes_of_interest, ganglia_columns, year_month)
ganglia_df.to_csv(os.path.join(processed_path, "ganglia.csv"))

# logics collects power consumption data from equipment and devices (disabled for now)
# logics_df = client.query(["Tot"],
#                   columns=['timestamp', "device", "panel", 'value'],
#                   year_month=["22-02"])

# schneider collects data on liquid cooling system (disabled for now)
# schneider_df = client.query(["Tot"],
#                   columns=['timestamp', "node", 'value'],
#                   year_month=["22-02"])

# weather data (disabled for now)
# weather_df = client.query(["temp"],
#                   columns=['timestamp', 'value'],
#                   year_month=["22-02"])




In [None]:
plugin_dfs = [ipmi_df] #start w/ small one for now
# Each plugin frame is indexed by timestamp and carries its own
# "<node>_<feature>" columns, so the frames are joined column-wise on the
# index. The default row-wise concat would stack them and leave NaN blocks as
# soon as a second plugin is added.
df = pd.concat(plugin_dfs, axis=1, join="outer")
# Preview only the first rows; a bare name is displayed only as the cell's last expression.
df.head()


In [None]:
"""Cell purpose: preprocess and/or merge data as desired"""
def df_to_X_y(df, feature_list, label_list):
    """Split ``df`` column-wise into a feature frame and a label frame.

    Args:
        df: Source DataFrame.
        feature_list: Column names that make up the model inputs.
        label_list: Column names that make up the targets.

    Returns:
        A ``(features, labels)`` tuple of DataFrames that keep the index of ``df``.

    Raises:
        KeyError: If any requested column is missing from ``df``.
    """
    # `.loc[:, cols]` selects columns; a bare `.loc[cols]` would look the
    # names up among the row labels instead.
    return df.loc[:, feature_list], df.loc[:, label_list]
    

# Columns to use as model inputs and as targets.
label_list = ["0_gpu0_core_temp"] #not what we actually want, but for testing
feature_list = ["0_ambient"]

# Align t and t+1: drop the last feature row and the first label row so that,
# position by position, each feature row is paired with the following label row.
feature_df = df.loc[:, feature_list].iloc[:-1]
label_df = df.loc[:, label_list].iloc[1:]

# Placeholders; nothing is assigned to them yet.
trainX = trainY = None



In [None]:
# TODO: make this work with timestamp-based indexing later
class M100Data(Dataset):
    """Map-style dataset pairing feature rows with label rows by position.

    Item ``idx`` is built from row ``idx`` (positional, via ``iloc``) of both
    frames; the frames' index labels are ignored.

    Args:
        feature_df: DataFrame whose rows are the model inputs.
        label_df: DataFrame whose rows are the targets. It must have the same
            number of rows as ``feature_df``.
        transform: Optional callable applied to every feature tensor.
        label_transform: Optional callable applied to every label tensor.

    Raises:
        ValueError: If the two frames have a different number of rows.

    NOTE(review): tensors inherit the dtype of the DataFrame columns. If the
    model expects a different dtype (e.g. float32), cast the frames first or
    pass a casting ``transform`` -- confirm against the model.
    """
    def __init__(self, feature_df, label_df, transform = None, label_transform = None):
        if feature_df.shape[0] != label_df.shape[0]:
            raise ValueError(
                "feature_df and label_df must have the same number of rows, got "
                f"{feature_df.shape[0]} and {label_df.shape[0]}"
            )
        self.feature_df = feature_df
        self.label_df = label_df
        self.transform = transform
        # Previously accepted but never stored.
        self.label_transform = label_transform
        self.feature_dim = self.feature_df.shape[1]
        self.label_dim = self.label_df.shape[1]

    def __len__(self):
        return self.feature_df.shape[0]

    def __getitem__(self, idx):
        feature = torch.as_tensor(self.feature_df.iloc[idx].values)
        label = torch.as_tensor(self.label_df.iloc[idx].values)
        # Apply the optional per-sample transforms; None leaves the tensors as-is.
        if self.transform is not None:
            feature = self.transform(feature)
        if self.label_transform is not None:
            label = self.label_transform(label)
        return feature, label
    
# Wrap the aligned feature/label frames in the dataset and serve them in
# shuffled mini-batches of 8 samples.
train_ds = M100Data(feature_df, label_df)
trainloader = DataLoader(train_ds, batch_size = 8, shuffle = True)


In [None]:

epochs = 1
model = LinearNN(train_ds.feature_dim, train_ds.label_dim)
# Pass the learning rate explicitly rather than relying on the optimizer's
# default, which is not available in every torch release.
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
# The target is a continuous temperature, so this is a regression problem.
# CrossEntropyLoss expects class logits; with a single output the softmax is
# constant and the loss carries no gradient. Mean squared error is used instead.
loss_fn = torch.nn.MSELoss()
train(model, trainloader, optimizer, loss_fn, epochs)

In [None]:
"""Cell purpose: train"""
# model = LinearRegression().fit(merged_df["air_temp"].to_numpy().reshape(-1,1), merged_df["ambient_temp"].to_numpy().reshape(-1,1))

In [None]:
"""Cell purpose: inference"""
# preds = model.predict(test_df["air_temp"].to_numpy().reshape(-1,1))


In [None]:
"""Cell purpose: benchmarking"""

# labels = test_df["ambient_temp"].to_numpy().reshape(-1,1)
# 
# 
# timesteps = [t for t in range(preds.shape[0])]
# colors = iter(plt.cm.rainbow(np.linspace(0, 0.5, 2)))
# plt.scatter(preds, labels, c=["blue", "red"])
# 
# plt.legend(loc="upper left")
# plt.xlabel("Time")
# plt.ylabel("Ambient temperature")
# plt.grid()
# plt.show()



In [None]:
""
