In [1]:
import json
import os
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow.feather as feather
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [2]:
# Root directory of the simulation output; get_csvs() looks for
# <SIMULATION_DIR>/<workload>/<simpoint>/ml_data.csv underneath it.
SIMULATION_DIR = "/soe/narehman/docker_home/simulations/ml_data/datacenter/datacenter/"

# Simpoint ids to load for each workload.
WORKLOAD_SIMPOINTS = {
    'clang': [812],
    'gcc': [939],
    'mongodb': [4118],
    'mysql': [1172],
    'postgres': [2807],
    'verilator': [31568],
    'xgboost': [3311],
}

# Experiment / suite labels (not referenced by the cells visible in this notebook).
EXP_NAME = "data"
CONFIG_NAME = "data"
SUITE_NAME = "datacenter"
SUBSUITE_NAME = "datacenter"

In [None]:
# Output directory for the per-workload feather files written by compress_raw_data().
# The spelling "COMPRESSSED" is kept as-is because other cells reference this exact name.
RAW_COMPRESSSED_DATA_DIR = "raw_compressed_data/"


## Notes
- Data is stored un-normalized in per-workload feather files (`<workload>.feather`)

# COMPRESSION
- convert hex to int
- store to feather files for quick loading later

In [4]:
def hex_to_int(value):
    """Convert a hexadecimal address string to an int.

    Non-string values are returned unchanged. Strings that cannot be parsed
    as base-16 yield NaN.
    """
    if not isinstance(value, str):
        return value
    try:
        parsed = int(value, 16)
    except ValueError:
        parsed = float('nan')
    return parsed

In [5]:
def get_csvs():
    """Return ``{workload: [csv_path, ...]}`` with one ml_data.csv path per simpoint.

    Workloads and simpoints come from WORKLOAD_SIMPOINTS; paths are rooted at
    SIMULATION_DIR.
    """
    return {
        workload: [
            f"{SIMULATION_DIR}/{workload}/{simpoint}/ml_data.csv"
            for simpoint in simpoints
        ]
        for workload, simpoints in WORKLOAD_SIMPOINTS.items()
    }

In [6]:
# should I compress it by simpoint?
def compress_raw_data():
    """Convert every workload's raw simulation CSVs into one feather file.

    For each workload returned by get_csvs(), all of its CSVs are read and
    concatenated (with a fresh 0..n-1 index), the hex strings in
    ``ft_start_addr`` are converted to ints via hex_to_int, and the result is
    written to ``RAW_COMPRESSSED_DATA_DIR/<workload>.feather``.

    The output directory is created if it does not exist yet.
    """
    # Writing a feather file fails if the target directory is missing,
    # so make sure it exists before the (slow) CSV parsing starts.
    os.makedirs(RAW_COMPRESSSED_DATA_DIR, exist_ok=True)

    for workload, csv_paths in get_csvs().items():
        data = pd.concat([pd.read_csv(csv_path) for csv_path in csv_paths], ignore_index=True)
        #FIXME: handle this in scarab
        data['ft_start_addr'] = data['ft_start_addr'].apply(hex_to_int)
        data.to_feather(f'{RAW_COMPRESSSED_DATA_DIR}/{workload}.feather')

In [None]:
# Run the CSV -> feather conversion for every workload in WORKLOAD_SIMPOINTS.
compress_raw_data()

# GENERATE METADATA COLUMNS

In [None]:
# Output directory for the annotated per-workload feather files and their metadata.json.
ANNOTATED_DATA_DIR = "icache_consumed_data/" # stores data with generated metadata columns

In [None]:
def annotate_data(df: pd.DataFrame) -> pd.DataFrame:
    """Add resteer-window metadata columns to ``df`` and return it.

    The frame is modified in place (columns are assigned on ``df`` itself) and
    the same object is returned.

    Columns read: ``cycles_since_btb_rec``, ``cycles_since_ibtb_rec``,
    ``cycles_since_misfetch_rec``, ``cycles_since_mispred_rec``,
    ``off_path_reason``, ``off_path``, ``icache_cycle``, ``consumed_icache``.

    Columns written: ``cycles_since_rec_agg``, ``first_after_resteer``,
    ``window_id``, ``length_window``, ``length_off_path``, ``penalty``,
    ``icache_cycle`` (sentinel replaced by NaN), ``icache_cycle_mispred``,
    ``cycle_penalty``, ``icache_cycle_last_offpath``, ``off_path_cycles``,
    ``icache_cycle_first_onpath``, ``on_path_cycles``.

    NOTE(review): several steps assign a Series computed on a row subset back
    onto ``df`` by index label, so this presumably requires a unique index --
    confirm against callers.
    """
    # Calc first ft after resteer
    # Product of the four cycles_since_*_rec columns.
    # NOTE(review): this column is not used again inside this function.
    df['cycles_since_rec_agg'] = df['cycles_since_btb_rec'] * df['cycles_since_ibtb_rec'] * df['cycles_since_misfetch_rec'] * df['cycles_since_mispred_rec']
    # A row is flagged (1) when off_path_reason > 0, otherwise 0.
    mask = df['off_path_reason'] > 0
    df['first_after_resteer'] = mask
    df['first_after_resteer'] = df['first_after_resteer'].astype(int)

    # calc window_id (window is defined as the time between two resteers)
    # Running count of flagged rows: every flagged row starts a new window.
    df['window_id'] = df['first_after_resteer'].cumsum()
    # get window length (in fts): number of non-null off_path values per window
    df['length_window'] = df.groupby('window_id')['off_path'].transform('count')
    # get len off-path: number of off_path == 1 rows per window.
    # Only off_path == 1 rows receive a value here; all other rows are NaN
    # until the bfill/ffill on the next line.
    df['length_off_path'] = df[df['off_path'] == 1].groupby('window_id')['off_path'].transform('count')
    # NOTE(review): bfill/ffill run over the whole column rather than per
    # window, so a row may pick up the count of a neighbouring window --
    # confirm this is intended. astype(int) fails if NaNs remain (i.e. the
    # frame has no off_path == 1 rows at all).
    df['length_off_path'] = df['length_off_path'].bfill().ffill().astype(int)
    # Compute the distance from the first 1 in each group
    # Default for rows whose off_path is neither 0 nor 1.
    df['penalty'] = 1
    # ft penalty
    # off_path == 0 rows: count down within the window, ending at 1 on the
    # window's last off_path == 0 row.
    df.loc[df['off_path'] == 0, 'penalty'] = df[df['off_path'] == 0].groupby(['window_id', 'off_path']).cumcount(ascending=False) + 1
    # off_path == 1 rows: 0-based position among the window's off_path == 1 rows.
    df.loc[df['off_path'] == 1, 'penalty'] = df[df['off_path'] == 1].groupby(['window_id', 'off_path']).cumcount()
    # cycle penalty
    # TODO: ???
    # 18446744073709551615 == 2**64 - 1; presumably the simulator's "no value"
    # sentinel -- confirm. It is replaced by NaN so it does not skew the
    # cycle arithmetic below.
    df['icache_cycle'] = df['icache_cycle'].replace(18446744073709551615, np.nan)
    # Per window: first icache_cycle among rows with consumed_icache == 1 and
    # off_path_reason > 0. The second .loc uses a mask built on the full frame
    # and relies on index alignment with the filtered frame.
    mispred_cycle_per_window = df.loc[df['consumed_icache'] == 1].loc[df['off_path_reason'] > 0].groupby('window_id')['icache_cycle'].first()
    df['icache_cycle_mispred'] = df['window_id'].map(mispred_cycle_per_window)
    # Absolute cycle distance between each row and its window's mispred cycle.
    df['cycle_penalty'] = (df['icache_cycle_mispred'] - df['icache_cycle']).abs()

    # Per window: last icache_cycle among consumed_icache == 1 rows.
    # NOTE(review): the name says "off path" but no off_path filter is applied
    # here -- confirm.
    last_off_path_cycle = df.loc[df['consumed_icache'] == 1].groupby('window_id').last()['icache_cycle']
    df['icache_cycle_last_offpath'] = df['window_id'].map(last_off_path_cycle)
    df['off_path_cycles'] = df['icache_cycle_last_offpath'] - df['icache_cycle_mispred']

    # Per window: first icache_cycle among consumed_icache == 1 rows.
    # NOTE(review): the name says "on path" but no off_path filter is applied
    # here either -- confirm.
    first_on_path_cycle = df.loc[df['consumed_icache'] == 1].groupby('window_id').first()['icache_cycle']
    df['icache_cycle_first_onpath'] = df['window_id'].map(first_on_path_cycle)
    df['on_path_cycles'] = df['icache_cycle_mispred'] - df['icache_cycle_first_onpath']
    # The triple-quoted string below is an unused expression statement that
    # holds a disabled, merge-based variant of the off_path_cycles computation.
    '''
    # off_path len in cycles 
    df = pd.merge(df, df[df['window_id'] != df['window_id'].shift][['window_id', 'icache_cycle']], on='window_id', how='left', suffixes=('', '_last_offpath'))
    df['off_path_cycles'] = df['icache_cycle_last_offpath'] - df['icache_cycle_mispred']
    '''
    return df

In [7]:
def get_metadata(df, workload):
    """Summarise ``df`` as ``{workload: {'length', 'pos', 'neg'}}``.

    'length' is the total row count; 'pos' and 'neg' are the numbers of rows
    whose ``off_path`` equals 1 and 0 respectively.
    """
    off_path = df['off_path']
    summary = {
        'length': len(df),
        'pos': int((off_path == 1).sum()),
        'neg': int((off_path == 0).sum()),
    }
    return {workload: summary}


In [None]:
# Inputs are the feathers written by compress_raw_data(): one
# '<workload>.feather' per WORKLOAD_SIMPOINTS entry under
# RAW_COMPRESSSED_DATA_DIR. Deriving the paths from those constants keeps this
# cell in sync with the compression step instead of repeating a hand-written
# directory name.
workload_feathers = {
    workload: [f'{RAW_COMPRESSSED_DATA_DIR}/{workload}.feather']
    for workload in WORKLOAD_SIMPOINTS
}

# Writing a feather file fails if the target directory is missing.
os.makedirs(ANNOTATED_DATA_DIR, exist_ok=True)

metadata = {}
for workload, feathers in workload_feathers.items():
    print(workload)
    # ignore_index gives the combined frame a unique 0..n-1 index; the
    # annotation step aligns intermediate results by index label, so duplicate
    # labels from several input files would corrupt it.
    df = pd.concat([pd.read_feather(path) for path in feathers], ignore_index=True)
    df = annotate_data(df)
    metadata |= get_metadata(df, workload)
    df.to_feather(f'{ANNOTATED_DATA_DIR}/{workload}.feather')

# Per-workload row counts (total / off-path / on-path) for later reference.
with open(f'{ANNOTATED_DATA_DIR}/metadata.json', 'w') as metadata_file:
    json.dump(metadata, metadata_file, indent=4)

clang
gcc
mysql
mongodb
postgres
verilator
xgboost


# CHUNK DATA
Split the data into fixed-size chunks; the train/test split is assigned at chunk granularity.

In [None]:
# Rows per chunk: chunk_data() assigns a row to chunk number (index // CHUNK_SIZE).
CHUNK_SIZE = 10000
# Output directory for the un-normalized chunk feathers and group_metadata.json.
UNNORMALIZED_CHUNKED_DATA_DIR = "icache_consumed_chunked_data_raw/"

In [None]:
def make_metadata_dict(df, workload, id):
    """Build the metadata entry for one chunk.

    Parameters
    ----------
    df : pandas.DataFrame
        The chunk. Must contain an ``off_path`` column. Its index is assumed
        to be a contiguous, increasing run of integer row labels (as produced
        by grouping a default-indexed frame by ``index // CHUNK_SIZE``).
    workload : str
        Name of the workload the chunk belongs to.
    id : hashable
        Chunk id, used as the key of the returned dict.

    Returns
    -------
    dict
        ``{id: {'workload', 'length', 'num_pos', 'num_neg', 'start_idx',
        'end_idx'}}``. ``start_idx`` / ``end_idx`` are the first and last row
        labels of the chunk (inclusive); get_idxs() needs them to expand a
        chunk into row indices. An empty chunk gets ``start_idx=0,
        end_idx=-1`` so that ``range(start_idx, end_idx + 1)`` is empty.
    """
    length = len(df)
    num_pos = len(df[df['off_path'] == 1])
    num_neg = len(df[df['off_path'] == 0])
    if length:
        # Cast to plain int: numpy integers are not JSON serialisable.
        start_idx = int(df.index[0])
        end_idx = int(df.index[-1])
    else:
        start_idx, end_idx = 0, -1
    return {id: {'workload' : workload,
                 'length' : length,
                 'num_pos' : num_pos,
                 'num_neg' : num_neg,
                 'start_idx' : start_idx,
                 'end_idx' : end_idx}}

In [None]:
def chunk_data(workload_feathers):
    """Split each workload's feather files into CHUNK_SIZE-row chunk files.

    Parameters
    ----------
    workload_feathers : dict
        ``{workload: [feather_path, ...]}``.

    Side effects
    ------------
    Writes ``UNNORMALIZED_CHUNKED_DATA_DIR/<chunk_id>.feather`` for every
    chunk (chunk ids increase across all workloads and files) and
    ``UNNORMALIZED_CHUNKED_DATA_DIR/group_metadata.json``, which maps
    ``workload -> {chunk_id: make_metadata_dict entry}``. The output directory
    is created if needed. Each chunk gets a ``workload`` column and keeps the
    row labels of the frame it was cut from.
    """
    # Writing a feather file fails if the target directory is missing.
    os.makedirs(UNNORMALIZED_CHUNKED_DATA_DIR, exist_ok=True)

    chunk_id = 0
    metadata_dict = {}
    for workload, feathers in workload_feathers.items():
        print(f'chunking {workload}')
        workload_metadata = {}
        for feather_path in feathers:
            df = pd.read_feather(feather_path)
            df['workload'] = workload
            # Group by an external key array instead of a temporary
            # 'chunk_id' column: nothing has to be dropped from the group
            # slices afterwards (no in-place mutation of a slice), and chunks
            # are written one at a time instead of all being held in memory.
            chunk_keys = np.asarray(df.index) // CHUNK_SIZE
            for _, chunk in df.groupby(chunk_keys):
                workload_metadata |= make_metadata_dict(chunk, workload, chunk_id)
                # NOTE(review): chunks after the first carry a non-default
                # index; some older pandas releases refuse to write that to
                # feather -- confirm against the pandas version in use.
                chunk.to_feather(f'{UNNORMALIZED_CHUNKED_DATA_DIR}/{chunk_id}.feather')
                chunk_id += 1
            del df
        metadata_dict[workload] = workload_metadata

    with open(f'{UNNORMALIZED_CHUNKED_DATA_DIR}/group_metadata.json', 'w') as metadata_file:
        json.dump(metadata_dict, metadata_file, indent=4)

# CREATE TRAIN / TEST SPLIT
The split is made before scaling so that the scaler can be fit on the training set only and then applied to both the train and test sets.

In [None]:
def get_train_test_files(files, test_frac=0.2, seed=None):
    """Randomly split ``files`` into train and test subsets.

    Parameters
    ----------
    files : array-like
        Chunk identifiers to split.
    test_frac : float, default 0.2
        Fraction of ``files`` assigned to the test set; the test size is
        ``int(len(files) * test_frac)`` (rounded down).
    seed : int or None, default None
        If given, the split is drawn from a dedicated generator seeded with
        this value and is reproducible. If None, NumPy's global random state
        is used, exactly as before.

    Returns
    -------
    (train_files, test_files) : tuple of numpy.ndarray
        ``test_files`` is a sample without replacement; ``train_files`` is the
        remainder as returned by ``np.setdiff1d`` (sorted, unique).

    Raises
    ------
    ValueError
        If ``test_frac`` is outside ``[0, 1]``.
    """
    if not 0 <= test_frac <= 1:
        raise ValueError(f'test_frac must be within [0, 1], got {test_frac}')
    num_test = int(len(files) * test_frac)
    # Both the global np.random module and a Generator expose the same
    # choice(a, size=..., replace=...) call.
    rng = np.random if seed is None else np.random.default_rng(seed)
    test_files = rng.choice(files, size=num_test, replace=False)
    train_files = np.setdiff1d(files, test_files)
    return train_files, test_files

In [None]:
def get_idxs(files, metadata, workload):
    """Expand chunk ids into the flat list of row indices they cover.

    Every ``metadata[file]`` entry must provide ``'start_idx'`` and
    ``'end_idx'``; both ends are inclusive. ``workload`` is only used for the
    progress message.
    """
    print(f'getting idxs for {workload}')
    idxs = []
    for file in files:
        entry = metadata[file]
        idxs.extend(range(entry['start_idx'], entry['end_idx'] + 1))
    return idxs

In [None]:
# Fixed seed so that Restart & Run All reproduces the same train/test split.
SPLIT_SEED = 0
np.random.seed(SPLIT_SEED)

# Directory the split description files are written to.
TRAIN_TEST_SPLIT_DIR = 'icache_consumed_chunked_data/'
os.makedirs(TRAIN_TEST_SPLIT_DIR, exist_ok=True)

train_test = {}
train_test_files = {}

# Chunk metadata written by chunk_data(); `with` closes the handle after loading.
with open(f'{UNNORMALIZED_CHUNKED_DATA_DIR}/group_metadata.json') as metadata_file:
    metadata_dict = json.load(metadata_file)

for workload, metadata in metadata_dict.items():
    print(workload)
    # Chunk ids are the JSON keys, i.e. strings.
    files = np.array(list(metadata.keys()))
    train_files, test_files = get_train_test_files(files)

    train_idxs = get_idxs(train_files, metadata, workload)
    test_idxs = get_idxs(test_files, metadata, workload)
    train_test[workload] = {'train' : train_idxs, 'test' : test_idxs}
    # .tolist() turns the numpy arrays into JSON-serialisable lists.
    train_test_files[workload] = {'train': train_files.tolist(), 'test' : test_files.tolist()}

with open(f'{TRAIN_TEST_SPLIT_DIR}train_test_idxs.json', 'w') as metadata_file:
    json.dump(train_test, metadata_file, indent=4)

with open(f'{TRAIN_TEST_SPLIT_DIR}train_test_files.json', 'w') as metadata_file:
    json.dump(train_test_files, metadata_file, indent=4)


# SCALE DATA
Critical step for MLP!! Otherwise the network won't learn anything!!

In [None]:
# Output directory for the scaled chunk feathers.
# NOTE(review): this is the same path as ANNOTATED_DATA_DIR
# ("icache_consumed_data/"); the file names do not collide
# (<chunk_id>.feather vs <workload>.feather), but confirm that sharing the
# directory is intended.
SCALED_DATA_DIR = "icache_consumed_data/"

# Feature columns passed to the scaler. Each name must appear exactly once:
# a repeated name makes df[columns_to_scale] contain the column twice.
columns_to_scale = ['ft_start_addr',
                    'ft_length',
                    'cycles_since_btb_rec',
                    'cycles_since_ibtb_rec',
                    'cycles_since_misfetch_rec',
                    'cycles_since_mispred_rec',
                    'btb_miss_rate',
                    'ibtb_miss_rate',
                    'misfetch_rate',
                    'mispred_rate'
                    ]

In [None]:
# Rescale data with the pre-defined chunking / train-test split.
# The scaler is fit on the train chunks only, which prevents data leakage.

# Writing a feather file fails if the target directory is missing.
os.makedirs(SCALED_DATA_DIR, exist_ok=True)

# Load each JSON from the location the earlier cells write it to:
# group_metadata.json is written by chunk_data() into
# UNNORMALIZED_CHUNKED_DATA_DIR, the train/test files by the split cell into
# 'icache_consumed_chunked_data/'.
with open(f'{UNNORMALIZED_CHUNKED_DATA_DIR}/group_metadata.json') as f:
    metadata_dict = json.load(f)
with open('icache_consumed_chunked_data/train_test_idxs.json') as f:
    train_test_idxs = json.load(f)  # not used in this cell; kept for later cells
with open('icache_consumed_chunked_data/train_test_files.json') as f:
    train_test_files = json.load(f)
all_idxs = []  # not used in this cell; kept for later cells

# Drop repeated names (order preserved) so the scaler sees each feature once.
scale_cols = list(dict.fromkeys(columns_to_scale))

for workload in metadata_dict:
    print(workload)
    # The chunk ids of the split come from train_test_files, not from the
    # file handle left over from the split cell.
    train_files = train_test_files[workload]['train']
    test_files = train_test_files[workload]['test']

    # One scaler per workload, fit on that workload's train chunks only.
    scaler = MinMaxScaler()
    train_df = pd.concat([pd.read_feather(f'{UNNORMALIZED_CHUNKED_DATA_DIR}/{file_no}.feather') for file_no in train_files])
    scaler.fit(train_df[scale_cols])
    del train_df

    # Transform chunk by chunk (saves memory). Always read the UN-normalized
    # chunk and write the scaled copy to SCALED_DATA_DIR, so the input matches
    # what the scaler was fit on and re-running the cell never rescales
    # already-scaled data.
    for file_no in train_files + test_files:
        df = pd.read_feather(f'{UNNORMALIZED_CHUNKED_DATA_DIR}/{file_no}.feather')
        df[scale_cols] = scaler.transform(df[scale_cols])
        df.to_feather(f'{SCALED_DATA_DIR}/{file_no}.feather')
   