In [173]:
import numpy as np
import polars as pl
import pandas as pd
from pathlib import Path
from lisa.trace import Trace

from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [170]:
# Root directory holding one sub-directory per recorded geekbench run.
PATH = "../training/geekbench_idle_trace_3_0411/"
trace1path = Path(PATH, 'wk1-geekbench-1', 'trace.dat')
trace2path = Path(PATH, 'wk1-geekbench-2', 'trace.dat')
trace3path = Path(PATH, 'wk1-geekbench-3', 'trace.dat')


def cpu_to_cluster(cpu):
    """Map a CPU number to its cluster name: 0-3 'little', 4-5 'mid', 6+ 'big'."""
    if cpu < 4:
        return 'little'
    if cpu > 5:
        return 'big'
    return 'mid'


# Per-cluster threshold that `time_span` is compared against when labelling
# rows. NOTE(review): presumably nanoseconds (2 ms / 2.5 ms / 3.5 ms) -- confirm.
cluster_target_res = dict(
    little=2_000_000,
    mid=2_500_000,
    big=3_500_000,
)


def flip_state(x):
    """Return 0 for a truthy `x` and 1 for a falsy one."""
    return 0 if x else 1

In [241]:
def trace_teo_accuracy(trace):
    """Print the idle-state selection accuracy recorded in `trace`.

    Reads the 'cpu_idle' and 'cpu_idle_miss' events through
    `trace.df_event` and prints four ratios, each relative to the number
    of wakeups: accuracy (1 - miss rate), miss rate, misses with
    state == 1 ('too deep') and misses with state == 0 ('too shallow').

    A trace without any wakeup rows prints `nan` for every ratio instead
    of raising ZeroDivisionError.

    Parameters
    ----------
    trace : object exposing `df_event(name)` that returns a DataFrame
        with a `state` column for both events.
    """
    cpu_idle = trace.df_event('cpu_idle')
    cpu_idle_miss = trace.df_event('cpu_idle_miss')

    # NOTE(review): `state > 10` presumably selects the idle-exit rows, whose
    # state is stored as a large sentinel value -- confirm against the trace.
    n_wakeups = len(cpu_idle.query("state > 10"))

    def rate(count):
        # Guard the denominator: an empty/short trace has no wakeups at all.
        return count / n_wakeups if n_wakeups else float('nan')

    miss_rate = rate(len(cpu_idle_miss))
    too_deep = rate(len(cpu_idle_miss.query("state == 1")))
    too_shallow = rate(len(cpu_idle_miss.query("state == 0")))

    # Labels (including the 'misess' spelling) are kept as-is so the output
    # stays comparable with previously recorded runs.
    print('accuracy', 1 - miss_rate)
    print('misess', miss_rate)
    print('too deep', too_deep)
    print('too shallow', too_shallow)

# Report the accuracy for the first two runs, in order.
# NOTE(review): trace1/trace2 are only assigned in a later cell, so this cell
# fails on a fresh kernel unless that cell has been run first.
for loaded_trace in (trace1, trace2):
    trace_teo_accuracy(loaded_trace)

accuracy 0.8009649226875469
misess 0.19903507731245304
too deep 0.06326113064129062
too shallow 0.13577394667116244



KeyboardInterrupt



In [160]:
def join_callback_dfs(cpu, select, reflect, update):
    """Join one CPU's select/reflect/update rows and label each sleep.

    The three frames are inner-joined on `sleep_id`, so only sleeps present
    in all three survive. Columns that clash during the joins get the
    `_re` (reflect) and `_up` (update) suffixes. The result is narrowed to
    the training columns and gains a `goal_state` column: 1 when
    `time_span` exceeds the target residency of `cpu`'s cluster, else 0.

    Parameters
    ----------
    cpu : CPU number; selects the threshold via
        `cluster_target_res[cpu_to_cluster(cpu)]`.
    select, reflect, update : DataFrames that each carry a `sleep_id` column.

    Returns
    -------
    DataFrame with the selected columns plus `goal_state` (int64).
    """
    select = select.set_index('sleep_id')
    reflect = reflect.set_index('sleep_id')
    update = update.set_index('sleep_id')

    joined = (
        select
        .join(reflect, rsuffix='_re', how='inner')
        .join(update, rsuffix='_up', how='inner')
        .reset_index()
    )

    columns = [
        'sleep_id', 'cpu', 'sleep_length', 'time_span', 'measured',
        'util', 'max_cap',
        's0hit', 's0int', 's0rec', 's1hit', 's1int', 's1rec',
        'timer_state', 'duration_state', 'state', 'hit',
    ]

    # The threshold only depends on the CPU, so look it up once rather than
    # once per row.
    target_res = cluster_target_res[cpu_to_cluster(cpu)]

    # `.assign` builds a new frame, which avoids writing a column into a
    # column-sliced frame (the SettingWithCopyWarning pattern); the
    # comparison is vectorized instead of a per-row Python lambda.
    return joined[columns].assign(
        goal_state=lambda frame: (frame['time_span'] > target_res).astype('int64')
    )

def trace_to_idle_training(trace):
    """Build the training table from a trace's teo_select/reflect/update events.

    Each of the three `trace_printk@func@teo_*` events is split by `__cpu`;
    for every CPU the three frames are combined with `join_callback_dfs`
    and the per-CPU results are concatenated in the CPU order of the
    select event.

    A CPU that is missing from any of the three events is skipped: the
    per-CPU join is an inner join, so such a CPU could not contribute rows
    anyway. Previously this case raised KeyError/IndexError.

    Parameters
    ----------
    trace : object exposing `df_event(name)` returning DataFrames with a
        `__cpu` column.

    Raises
    ------
    ValueError
        From `pd.concat` when no CPU has all three events.
    """
    def by_cpu(event):
        # One DataFrame per CPU for the given event.
        return {cpu: rows for cpu, rows in trace.df_event(event).groupby('__cpu')}

    select_by_cpu = by_cpu('trace_printk@func@teo_select')
    reflect_by_cpu = by_cpu('trace_printk@func@teo_reflect')
    update_by_cpu = by_cpu('trace_printk@func@teo_update')

    per_cpu_frames = [
        join_callback_dfs(cpu, select_rows, reflect_by_cpu[cpu], update_by_cpu[cpu])
        for cpu, select_rows in select_by_cpu.items()
        if cpu in reflect_by_cpu and cpu in update_by_cpu
    ]

    return pd.concat(per_cpu_frames)

In [171]:
# Open the three recorded runs; the paths come from the configuration cell.
trace1 = Trace(trace1path)
trace2 = Trace(trace2path)
trace3 = Trace(trace3path)

In [172]:
# Training table for the second run: one row per sleep, labelled with goal_state.
trace2data = trace_to_idle_training(trace2)
trace2data

Unnamed: 0,sleep_id,cpu,sleep_length,time_span,measured,util,max_cap,s0hit,s0int,s0rec,s1hit,s1int,s1rec,timer_state,duration_state,state,hit,goal_state
0,162276,0,43945049,342936,198029,1,160,7,1887,0,6312,0,0,1,0,1,0,0
1,162277,0,43401674,527140,374706,1,160,7,2676,1,5523,0,0,1,0,1,0,0
2,162280,0,42174663,339925,191071,1,160,7,3366,2,4833,0,0,1,0,1,0,0
3,162282,0,41477316,914429,767243,2,160,7,3970,3,4229,0,0,1,0,1,0,0
4,162283,0,40493267,8559408,8534372,3,160,7,4498,4,3701,0,0,1,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36230,111598,7,37089435,140421,127837,815,1024,50,6791,9,1360,0,0,1,0,0,0,0
36231,111599,7,36870929,298625,292306,803,1024,44,6967,9,1190,0,0,1,0,0,0,0
36232,111600,7,36559121,518514,512521,803,1024,39,7121,9,1042,0,0,1,0,0,0,0
36233,111601,7,22890867,504273,491850,844,1024,35,7255,9,912,0,0,1,0,0,0,0


In [216]:
# NOTE(review): `cpu_dfs_joint` is not assigned in any cell shown here, so this
# cell depends on kernel state from elsewhere -- presumably it is the training
# table of another run (e.g. trace_to_idle_training(trace1)); confirm and
# define it explicitly.
# pd.concat keeps each input's own index here (no ignore_index), so index
# labels repeat across the two inputs.
idle_data = pd.concat([cpu_dfs_joint, trace2data])
idle_data

Unnamed: 0,sleep_id,cpu,sleep_length,time_span,measured,util,max_cap,s0hit,s0int,s0rec,s1hit,s1int,s1rec,timer_state,duration_state,state,hit,goal_state
0,40444,0,51894280,1366781,1338368,6,160,1557,2989,2,3657,0,0,1,0,0,0,0
1,40445,0,50368767,147257,118478,6,160,1363,3640,3,3200,0,0,1,0,0,0,0
2,40446,0,50170566,199259,175485,6,160,1193,4209,4,2800,0,0,1,0,0,0,0
3,40447,0,49914381,433594,407988,6,160,1044,4707,5,2450,0,0,1,0,0,0,0
4,40448,0,49339633,1378214,1348905,6,160,914,5143,6,2144,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36230,111598,7,37089435,140421,127837,815,1024,50,6791,9,1360,0,0,1,0,0,0,0
36231,111599,7,36870929,298625,292306,803,1024,44,6967,9,1190,0,0,1,0,0,0,0
36232,111600,7,36559121,518514,512521,803,1024,39,7121,9,1042,0,0,1,0,0,0,0
36233,111601,7,22890867,504273,491850,844,1024,35,7255,9,912,0,0,1,0,0,0,0


In [199]:
def training_strip(df):
    """Split a training table into a feature frame and the `goal_state` target.

    The identifier and outcome columns (`sleep_id`, `time_span`, `measured`,
    `state`, `hit`, `timer_state`, `duration_state`, `cpu`) are removed from
    the features. The input frame is left untouched.

    Returns
    -------
    (X, y) : the remaining feature columns and the `goal_state` Series.
    """
    excluded = ['sleep_id', 'time_span', 'measured', 'state', 'hit', 'timer_state', 'duration_state', 'cpu']
    # `drop` returns a new frame, so popping the target below does not
    # modify the caller's DataFrame.
    features = df.drop(columns=excluded)
    target = features.pop('goal_state')
    return (features, target)

In [249]:
X, y = training_strip(idle_data)
# Baseline to beat: fraction of rows where the recorded `state` already equals
# the `goal_state` label.
1 - len(idle_data.query("state != goal_state")) / len(idle_data)

0.7759782053646151

## Train-test split

In [254]:
# Fixed seeds: both the split and the forest are randomized, so without
# `random_state` the scores below change on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

forest = RandomForestClassifier(n_estimators=10, n_jobs=8, max_depth=10, random_state=42)
forest.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(forest.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(forest.score(X_test, y_test)))

Accuracy on training set: 0.815
Accuracy on test set: 0.814


## Grid Search CV

In [255]:
parameters = {'n_estimators': [10, 20, 50], 'max_depth': [10, 20, 50]}
# Seed the forest so the cross-validated scores are repeatable between runs.
clf = GridSearchCV(RandomForestClassifier(n_jobs=8, random_state=42), parameters)
clf.fit(X, y)
# First value: mean over all parameter combinations; then one mean CV score
# per combination, in cv_results_ order.
print(clf.cv_results_['mean_test_score'].mean(), clf.cv_results_['mean_test_score'])
# NOTE(review): with the default refit=True the best estimator is refit on the
# whole of X, y, so this is a score on data it was trained on, not a held-out
# estimate -- `clf.best_score_` is the cross-validated figure.
print(clf.best_estimator_.score(X, y))

0.8046761533693348 [0.81254703 0.81295661 0.81288125 0.80653365 0.80944237 0.81076775
 0.78408032 0.79363808 0.79923833]
0.8152624505312132


In [251]:
# NOTE(review): this cell repeats the two print statements of the grid-search
# cell above. Its execution count (251) is lower than that cell's (255), so
# the numbers recorded under it come from an earlier fit -- consider removing.
print(clf.cv_results_['mean_test_score'].mean(), clf.cv_results_['mean_test_score'])
print(clf.best_estimator_.score(X, y))

0.8012946832309962 [0.81443572 0.81628445 0.81604455 0.79808082 0.80090628 0.80363654
 0.78025608 0.78879909 0.79320862]
0.8145984340203407
