# Random forest model
Use timing data produced by the GPU executable to evaluate the quality of the machine learning model. This script does the following:

1. Reads the timing information generated by the main executable, which records the execution time of each parameter configuration.
2. Trains a random forest model on a subset of the experiments so that it can predict the best-performing execution parameters.
3. Uses a second subset of the read experiment data to evaluate the quality of the machine learning model, i.e., how much slower the machine learning model predictions are compared to the best possible timings.

## Utility functions

In [None]:
import os
from json import loads
import pandas as pd

def read_json_data_folder(input_folder):
    """Load every file found directly inside ``input_folder`` as JSON.

    Entries are visited in sorted name order so that the result (and every
    DataFrame later built from it) is reproducible between runs; the order
    returned by ``os.listdir`` is arbitrary. Entries that are not regular
    files (e.g. sub-directories) are skipped instead of crashing ``open``.

    Args:
        input_folder: Path of the folder holding the JSON files.

    Returns:
        A dict mapping each file name to
        ``{'path': <joined path>, 'data': <parsed JSON>}``.

    Raises:
        OSError: If the folder or one of its files cannot be read.
        json.JSONDecodeError: If a file does not contain valid JSON.
    """
    all_json_objects = {}
    for json_file_name in sorted(os.listdir(input_folder)):
        json_file_path = os.path.join(input_folder, json_file_name)
        if not os.path.isfile(json_file_path):
            # Only regular files can be opened and parsed.
            continue
        with open(json_file_path) as fin:
            data = loads(fin.read())
        all_json_objects[json_file_name] = {
            'path': json_file_path,
            'data': data,
        }
    return all_json_objects

def extract_df_from_json_objects(json_objects, graph_name_no_path=True):
    """Flatten the 'binning' experiments of the JSON objects into a DataFrame.

    One row is produced per bin of every kernel listed under
    ``experiments.binning``; bins whose ``information`` has no ``b_max`` are
    skipped, as are objects without a 'binning' experiment. Each row holds the
    bin bounds (``b_min``/``b_max``), the kernel parameters ``g`` and ``a``,
    an ``sm`` flag (1 when the kernel name contains 'sm'), the kernel name,
    the bin size, the measured time, and every key of the object's metadata.

    Rows are gathered in a list and converted once: ``DataFrame.append`` was
    removed in pandas 2.0 and copied the whole frame on every call before
    that.

    Args:
        json_objects: Mapping as returned by ``read_json_data_folder``; only
            the ``'data'`` entry of each value is read.
        graph_name_no_path: When true, the 'graph name' metadata is reduced
            to its last path component.

    Returns:
        A DataFrame with one row per extracted bin (empty when nothing was
        extracted).
    """
    rows = []
    for json_object in json_objects.values():
        data = json_object['data']

        # Work on a copy so the caller's metadata dict is left untouched.
        metadata = dict(data['metadata'])
        if graph_name_no_path:
            metadata['graph name'] = os.path.split(metadata['graph name'])[1]

        experiments = data['experiments']
        if 'binning' not in experiments:
            continue
        binning_data = experiments['binning']
        ranges = binning_data['ranges']

        for kernel, kernel_data in binning_data.items():
            if kernel == 'ranges':
                # 'ranges' sits next to the kernels but is not a kernel itself.
                continue
            for a_bin in kernel_data['bins']:
                kernel_information = a_bin['information']
                if 'b_max' not in kernel_information:
                    continue
                b_max = kernel_information['b_max']
                range_index = ranges.index(b_max)
                sample = {
                    'b_max': b_max,
                    # The lower bound is the previous range entry; the first
                    # bin starts at 0.
                    'b_min': ranges[range_index - 1] if range_index != 0 else 0,
                    'g': kernel_information['g'],
                    'a': kernel_information['a'],
                    'sm': 1 if 'sm' in kernel_information['name'] else 0,
                    'kernel name': kernel_information['name'],
                    'size': a_bin['size']['first'],
                    'time': a_bin['time'],
                }
                sample.update(metadata)
                rows.append(sample)
    return pd.DataFrame(rows)
def get_best_timings(df:pd.DataFrame, rows = ['graph name', 'b_max']):
    """Return, for every group defined by ``rows``, the row of ``df`` with the smallest 'time'."""
    # idxmin yields one index label per group; .loc pulls those rows back out
    fastest_row_labels = df.groupby(rows)['time'].idxmin()
    return df.loc[fastest_row_labels]

def remove_empty_bins(df):
    """Return the rows of ``df`` whose 'size' column is not zero."""
    has_elements = df['size'].ne(0)
    return df[has_elements]

            

## Reading timing information

In [None]:
# Both folders are expected to hold the experiment output JSON files written by the main executable
train_folder = 'build/json_files/train'
test_folder = 'build/json_files/test'

train_json_objects = read_json_data_folder(train_folder)
test_json_objects = read_json_data_folder(test_folder)

# Flatten the experiments into one row per bin and drop the bins of size 0
train_df = remove_empty_bins(extract_df_from_json_objects(train_json_objects))
test_df = remove_empty_bins(extract_df_from_json_objects(test_json_objects))

# Ground truth for the evaluation: the fastest kernel for each bin of each test graph
test_oracle_df = get_best_timings(test_df)

# The model is trained only on the parameters that gave the best timing per bin
train_df = get_best_timings(train_df)


## Training random forest model and evaluation

In [None]:
from sklearn.ensemble import RandomForestClassifier

feature_columns = ['b_max', 'b_min', 'size', 'avg_deg','max_deg']
label_columns = ['g', 'a', 'sm']

# Train the model on the best configuration of every training bin
X_train, Y_train = train_df[feature_columns], train_df[label_columns]
dt = RandomForestClassifier(criterion = 'entropy', random_state = 42)
dt.fit(X_train, Y_train)

# get the features of each bin in each test graph
features_to_test = test_oracle_df[feature_columns+["graph name"]]
# predict the values for each bin on each test graph
predictions = dt.predict(features_to_test[feature_columns])

# One row per (graph, bin) carrying the predicted value of every label column
predictions_and_graphs = features_to_test[['graph name', 'b_max']].assign(
    **{label: predictions[:, i] for i, label in enumerate(label_columns)}
)

# Get the timings of the predicted labels for each bin for each graph.
# A single left merge replaces a full scan of test_df per predicted row.
# Duplicated configurations keep their first measurement, which is what the
# previous `.iloc[0]` lookup selected.
lookup_keys = ['graph name', 'b_max'] + label_columns
measured_times = test_df.drop_duplicates(subset=lookup_keys)[lookup_keys + ['time']]
predictions_and_graphs = predictions_and_graphs.merge(
    measured_times, on=lookup_keys, how='left', indicator=True
)

# The labels are predicted as separate outputs, so the predicted combination
# may be one that was never timed for that bin. Report which ones explicitly
# instead of failing with an out-of-bounds IndexError.
unmeasured = predictions_and_graphs[predictions_and_graphs['_merge'] == 'left_only']
if not unmeasured.empty:
    raise LookupError(
        "No measured timing in the test data for the predicted configuration(s):\n"
        + unmeasured[lookup_keys].to_string(index=False)
    )
predictions_and_graphs = predictions_and_graphs.drop(columns='_merge')

# compare to ground truth
for graph in predictions_and_graphs['graph name'].unique():
    graph_best_timings = test_oracle_df[test_oracle_df['graph name'] == graph][['b_max', 'time']]
    graph_predicted_timings = predictions_and_graphs[predictions_and_graphs['graph name'] == graph][['b_max', 'time']]
    # The oracle holds exactly one row per (graph, b_max), and the predictions
    # were built from those same rows, so summing the columns adds up one
    # timing per bin.
    total_best_time = graph_best_timings['time'].sum()
    total_predicted_time = graph_predicted_timings['time'].sum()
    # Extra time spent by following the model instead of the oracle
    loss = total_predicted_time - total_best_time
    slowdown_percent = loss / total_best_time * 100
    print(f"Graph {graph}")
    print(f"Best time = {total_best_time:.3f} Predicted time = {total_predicted_time:.3f} Slowdown % = {slowdown_percent:.2f}%")