In [56]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
import json
import numpy as np

## Loading the Dataset

In [57]:
with open("dataset.json", "r") as json_file:
    dataset = json.load(json_file)

print(dataset)

{'select.c': [[{'aluf': 1.0, 'alui': 1.0, 'array_accesses': 1.0, 'array_reads': 1.0, 'array_writes': 0.0, 'branch': 1.0, 'has_inter_loop_carried_dependency': False, 'instruction_sum': 9.0, 'inter_loop_dependencies': 1.0, 'inter_loop_dependencies_anti': 0.0, 'inter_loop_dependencies_flow': 0.0, 'inter_loop_dependencies_output': 0.0, 'intra_loop_dependencies': 0.0, 'intra_loop_dependencies_anti': 0.0, 'intra_loop_dependencies_flow': 0.0, 'intra_loop_dependencies_output': 0.0, 'load': 1.0, 'loop_line_number': 0, 'loop_nest_level': 3, 'max_inter_loop_dependency_distance': 0.0, 'max_inter_loop_dependency_distance_anti': 0.0, 'max_inter_loop_dependency_distance_flow': 0.0, 'max_inter_loop_dependency_distance_output': 0.0, 'max_intra_loop_dependency_distance': 0.0, 'max_intra_loop_dependency_distance_anti': 0.0, 'max_intra_loop_dependency_distance_flow': 0.0, 'max_intra_loop_dependency_distance_output': 0.0, 'mem': 2.0, 'operand_sum': 21.0, 'other': 3.0, 'resmii_111': 2, 'resmii_121': 2, 'res

In [58]:
X = list()
Y = list()

for program_name in dataset:
    feature_info = dataset[program_name][0]
    timing_info = dataset[program_name][1]
    timing_info = [timing_info[key] for key in timing_info]

    num_loops_feature_info = len(feature_info)
    num_loops_timing_info = len(timing_info)

    if not num_loops_feature_info == num_loops_timing_info:
        print(f"================")
        print(f"Number of loops found in feature info: {num_loops_feature_info}")
        print(f"Number of loops found in feature info: {num_loops_timing_info}")
        print(f"Mismatch! Continuing...")

    feature_info = sorted(feature_info, key=lambda x: x["loop_line_number"])
    
    for i in range(len(feature_info)):
        loop_features = feature_info[i]
        loop_timing = timing_info[i]

        X_loop = [loop_features[feature_name] for feature_name in loop_features]
        Y_loop = min(loop_timing, key=lambda x: loop_timing[x])

        X.append(X_loop)
        Y.append(Y_loop)

Number of loops found in feature info: 5
Number of loops found in feature info: 7
Mismatch! Continuing...


In [59]:
X = np.array(X)
Y = np.array(Y)

print(X.shape)
print(Y.shape)

(64, 35)
(64,)


## Create splits

In [60]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, stratify=Y, random_state=42)

In [61]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(48, 35) (48,)
(16, 35) (16,)


## Train Random Forest

In [62]:
from sklearn.ensemble import RandomForestClassifier
import pickle

In [63]:
feature_names = [f"feature {i}" for i in range(X.shape[1])]
forest = RandomForestClassifier(random_state=0)
forest.fit(X_train, y_train)

In [64]:
with open('models/forest/forest.pkl', 'wb') as model_file:
    pickle.dump(forest, model_file)

## Evaluate Model

In [65]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [66]:
with open('models/forest/forest.pkl', 'rb') as model_file:
    forest = pickle.load(model_file)

In [67]:
y_pred = forest.predict(X_test)

In [68]:
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Generate a classification report
print('Classification Report:')
print(classification_report(y_test, y_pred))

# Generate a confusion matrix
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.31
Classification Report:
              precision    recall  f1-score   support

           1       0.33      0.50      0.40         6
           2       0.00      0.00      0.00         2
          32       1.00      0.50      0.67         2
           4       0.25      0.33      0.29         3
          64       0.00      0.00      0.00         2
           8       0.00      0.00      0.00         1

    accuracy                           0.31        16
   macro avg       0.26      0.22      0.23        16
weighted avg       0.30      0.31      0.29        16

Confusion Matrix:
[[3 1 0 2 0 0]
 [2 0 0 0 0 0]
 [0 0 1 1 0 0]
 [2 0 0 1 0 0]
 [1 1 0 0 0 0]
 [1 0 0 0 0 0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Feature Importance

In [69]:
import time
# import pandas as pd
import matplotlib.pyplot as plt

In [70]:
start_time = time.time()
importances = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)
elapsed_time = time.time() - start_time

print(f"Elapsed time to compute the importances: {elapsed_time:.3f} seconds")

Elapsed time to compute the importances: 0.013 seconds


In [72]:
# forest_importances = pd.Series(importances, index=feature_names)

# fig, ax = plt.subplots()
# forest_importances.plot.bar(yerr=std, ax=ax)
# ax.set_title("Feature importances using MDI")
# ax.set_ylabel("Mean decrease in impurity")
# fig.tight_layout()