In [1]:
import os
import json
import pandas as pd
import numpy as np

from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn import svm

from pmlb import fetch_data

## Read Data
You can change the code below to read your own data.

In [2]:
# Label for this run; output_data() uses it as the sub-directory name under
# ./output/. It is independent of the PMLB dataset id passed to fetch_data.
data_name = 'penn_benchmark_cpu'

# fetch_data returns a pandas DataFrame; the prediction target is in the
# 'target' column.
df = fetch_data('197_cpu_act')
# Summary statistics of every column as a quick sanity check of the load.
print(df.describe())

             lread       lwrite         scall        sread       swrite  \
count  8192.000000  8192.000000   8192.000000  8192.000000  8192.000000   
mean     19.559692    13.106201   2306.318237   210.479980   150.058228   
std      53.353799    29.891726   1633.617322   198.980146   160.478980   
min       0.000000     0.000000    109.000000     6.000000     7.000000   
25%       2.000000     0.000000   1012.000000    86.000000    63.000000   
50%       7.000000     1.000000   2051.500000   166.000000   117.000000   
75%      20.000000    10.000000   3317.250000   279.000000   185.000000   
max    1845.000000   575.000000  12493.000000  5318.000000  5456.000000   

              fork         exec         rchar         wchar        pgout  ...  \
count  8192.000000  8192.000000  8.192000e+03  8.192000e+03  8192.000000  ...   
mean      1.884554     2.791998  1.970137e+05  9.589829e+04     2.285317  ...   
std       2.479493     5.212456  2.394808e+05  1.407569e+05     5.307038  ...   


In [3]:
# Preview the first rows of the loaded frame (features plus 'target').
df.head()

Unnamed: 0,lread,lwrite,scall,sread,swrite,fork,exec,rchar,wchar,pgout,...,pgscan,atch,pgin,ppgin,pflt,vflt,runqsz,freemem,freeswap,target
0,6.0,2.0,1036.0,103.0,114.0,1.0,1.0,172076.0,355965.0,0.0,...,0.0,0.0,2.0,4.0,73.599998,89.0,2.0,6527.0,1851864.0,90.0
1,1.0,0.0,2165.0,205.0,101.0,0.4,1.2,43107.0,44139.0,4.8,...,181.399994,0.2,85.400002,88.199997,19.4,161.800003,3.0,130.0,1131931.0,88.0
2,62.0,77.0,3806.0,258.0,166.0,1.4,1.4,492142.0,268706.0,4.8,...,79.199997,2.2,7.6,12.2,68.0,218.800003,5.2,256.0,1314590.0,85.0
3,5.0,0.0,4721.0,256.0,177.0,0.99,2.58,524787.0,174964.0,14.51,...,189.860001,1.99,4.17,24.85,95.629997,248.910004,1.0,233.0,972606.0,81.0
4,42.0,55.0,3949.0,249.0,244.0,2.6,4.6,197289.0,529200.0,4.2,...,0.0,1.4,1.8,2.2,219.600006,297.200012,3.4,331.0,1013805.0,79.0


In [4]:
# (median, min, max) of the raw target; the median is the threshold used
# below to binarize the target. The trailing comma keeps this a tuple.
df['target'].median(), df['target'].min(), df['target'].max(),

(89.0, 0.0, 99.0)

In [5]:
# Frequency of each distinct target value, most frequent first.
df['target'].value_counts()

90.0    459
91.0    448
92.0    426
94.0    421
93.0    411
97.0    410
96.0    410
95.0    405
88.0    384
98.0    378
89.0    376
87.0    338
86.0    283
0.0     283
85.0    254
84.0    252
83.0    230
81.0    201
82.0    187
80.0    166
79.0    150
77.0    144
78.0    126
76.0    119
75.0    104
74.0     96
72.0     77
73.0     73
99.0     60
69.0     51
71.0     49
68.0     46
70.0     42
67.0     39
66.0     36
63.0     32
62.0     27
64.0     27
65.0     25
59.0     23
60.0     20
58.0     17
61.0     16
57.0     14
56.0     11
1.0      10
55.0     10
54.0      7
53.0      5
50.0      4
51.0      4
52.0      2
48.0      1
2.0       1
46.0      1
49.0      1
Name: target, dtype: int64

In [6]:
# Binarize the target: 1 when strictly above the median, 0 otherwise
# (rows equal to the median therefore fall into class 0).
gt = (df['target'] > df['target'].median()).astype(int)
# Stored as a new 'gt' column on df; later cells rely on 'target' and 'gt'
# being the last two columns when they select the feature columns.
df['gt'] = gt

In [7]:
# Class balance of the binarized label.
df['gt'].value_counts()

0    4364
1    3828
Name: gt, dtype: int64

## Train a classification model
In this step, you can either train a model in the cell below, or skip training and load your own model's predictions for the training data.

In [8]:
# --- prepare data ---
target_col = 'gt'

# Every column except the raw target and the derived binary label is a
# feature. Selecting by name (instead of the positional df.columns[:-2])
# keeps both label columns out of X even if the column order changes.
to_keep = df.columns.drop(['target', target_col])
X = df[to_keep].to_numpy()
y = df[target_col].to_numpy()  # a Series converts to a 1-D array; no reshape needed

# Fixed seed so the split -- and therefore the reported accuracy and the
# exported training set -- is identical on every Restart & Run All.
train, test, train_labels, test_labels = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# DataFrame views of the two splits with the original feature names.
train_df = pd.DataFrame(train, columns=to_keep)
test_df = pd.DataFrame(test, columns=to_keep)

# --- train the model ---
clf = MLPClassifier(random_state=1, max_iter=350)
clf.fit(train, train_labels)

# --- report accuracy ---
# Mean accuracy on the held-out 20% split.
print(clf.score(test, test_labels))

0.8194020744356315


In [9]:
# Predictions of the trained classifier on the training split; these are
# exported below as 'y_pred' alongside the training data.
y_pred = clf.predict(train)

## Output Training Data

In [10]:
def output_data(cols, data, target_names, real_min, real_max, y_pred, y_gt,
                dataset_name=None, output_root="./output"):
    """Write the inputs, predictions and ground truth to a JSON file.

    The file is written to ``<output_root>/<dataset_name>/test.json``;
    missing directories are created. Every value must be JSON-serializable
    (e.g. call ``.tolist()`` on NumPy arrays before passing them in).

    Args:
        cols: the column names of the input data.
        data: the training input.
        target_names: the names for the target classes (ground truth).
        real_min: minimal values for columns of the input data.
        real_max: maximal values for columns of the input data.
        y_pred: the prediction of the input data from the model you want
            to explain.
        y_gt: the ground truth of the input data.
        dataset_name: name of the sub-directory to write into. Defaults to
            the notebook-level ``data_name`` variable.
        output_root: directory under which the dataset folder is created.

    Raises:
        TypeError: if any value cannot be serialized to JSON. An existing
            output file is left untouched in that case.
    """
    if dataset_name is None:
        # Backward-compatible default: fall back to the notebook global.
        dataset_name = data_name

    filename = os.path.join(output_root, dataset_name, "test.json")
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    to_output = {
        'columns': cols,
        'data': data,
        'target_names': target_names,
        'real_min': real_min,
        'real_max': real_max,
        'y_pred': y_pred,
        'y_gt': y_gt,
    }
    # Serialize before opening the file: opening in 'w' mode truncates it, so
    # a serialization error after open() would destroy the previous output.
    payload = json.dumps(to_output)
    with open(filename, 'w', encoding='utf-8') as output:
        output.write(payload)

In [11]:
# Display names for the two classes
# (presumably ordered to match labels 0 and 1 -- confirm with the consumer).
target_names = ["low", "high"]

# Per-column minimum and maximum, taken over the full feature matrix X.
min_val = X.min(axis=0)
max_val = X.max(axis=0)

# Export the training split together with the model's predictions on it.
# NumPy arrays are converted to plain lists so they can be serialized to JSON.
output_data(
    cols=to_keep.tolist(),
    data=train.tolist(),
    target_names=target_names,
    real_min=min_val.tolist(),
    real_max=max_val.tolist(),
    y_pred=y_pred.tolist(),
    y_gt=train_labels.tolist(),
)