# Use Phoronix data

In [1]:
# for arrays
import numpy as np

# for dataframes
import pandas as pd

# plots
import matplotlib.pyplot as plt
# high-level plots
import seaborn as sns

# statistics
import scipy.stats as sc
# hierarchical clustering, clusters
from scipy.cluster.hierarchy import linkage, cut_tree, leaves_list
from scipy import stats
# statistical tests
from scipy.stats import mannwhitneyu

# machine learning library
# Principal Component Analysis - determine new axis for representing data
from sklearn.decomposition import PCA
# Random Forests -> vote between decision trees
# Gradient boosting -> instead of a vote, upgrade the same tree
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingClassifier
# To add interactions in linear regressions models
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OrdinalEncoder
# ElasticNet is a hybrid method between Ridge and Lasso
from sklearn.linear_model import LinearRegression, ElasticNet
# To separate the data into training and test
from sklearn.model_selection import train_test_split
# Simple clustering (iterative steps)
from sklearn.cluster import KMeans
# metrics
from sklearn.metrics import mean_absolute_error

# we use it to interact with the file system
import os
# compute time
from time import time

## Get data

### Test profiles

In [2]:
system_dir = './data/systems/'
system_names = os.listdir(system_dir)

# One DataFrame per entry of the systems directory, in the order returned by
# os.listdir; the first CSV column is used as the index.
s_data = [
    pd.read_csv(system_dir + file_name, index_col=0)
    for file_name in system_names
]

def separate_perf(perf):
    """Split "value +/- deviation" entries into two parallel lists of strings.

    Each entry of ``perf`` is converted with ``str()``, stripped of its space
    characters and split on the ``"+/-"`` marker.

    Parameters
    ----------
    perf : iterable
        Entries such as ``"40 +/- 2"`` or ``"39"``.

    Returns
    -------
    tuple of (list of str, list of str)
        ``raw_perf`` holds the text before the first ``"+/-"``; ``std_perf``
        holds the text between the first and second ``"+/-"``, or ``''`` when
        the entry has no ``"+/-"`` marker.
    """
    pieces = [str(entry).replace(" ", "").split("+/-") for entry in perf]

    raw_perf = [parts[0] for parts in pieces]
    std_perf = [parts[1] if len(parts) > 1 else '' for parts in pieces]

    return (raw_perf, std_perf)

# Replace the combined "perf" column of every system table by two columns:
# the value itself ("raw_perf") and its reported deviation ("std_perf").
for system_table in s_data:
    values, deviations = separate_perf(system_table["perf"])

    system_table["raw_perf"] = values
    system_table["std_perf"] = deviations
    del system_table["perf"]

s_data[0]

Unnamed: 0,idproc,descproc,percentile,nbproc,raw_perf,std_perf
0,/s/AMD+Ryzen+5+5600X+6-Core,AMD Ryzen 5 5600X 6-Core,98th,6,39,
1,/s/Intel+Core+i9-11900K,Intel Core i9-11900K,91st,10,40,2
2,/s/AMD+Ryzen+9+5950X+16-Core,AMD Ryzen 9 5950X 16-Core,91st,34,40,4
3,/s/AMD+Ryzen+7+5800X+8-Core,AMD Ryzen 7 5800X 8-Core,90th,15,40,4
4,/s/AMD+Ryzen+9+5900X+12-Core,AMD Ryzen 9 5900X 12-Core,90th,27,41,5
...,...,...,...,...,...,...
100,/s/Intel+Celeron+J3060,Intel Celeron J3060,4th,15,211,2
101,/s/ARMv8+Cortex-A72+4-Core,ARMv8 Cortex-A72 4-Core,4th,3,219,13
102,/s/ARMv7+Cortex-A15+8-Core,ARMv7 Cortex-A15 8-Core,3rd,4,243,5
103,/s/Mobile+AMD+Sempron+3500,Mobile AMD Sempron 3500,2nd,23,265,26


### Hardware

In [3]:
hard_dir = './data/procs/'
hard_names = os.listdir(hard_dir)

# Parsed "key:value" tables, one per processor directory, aligned with hard_names.
lscpu = []
catproc = []

for hn in hard_names:
    # ndmin=2 keeps a one-line file as a single (key, value) row. Without it,
    # np.loadtxt squeezes the result to a 1-D array, and the cells that iterate
    # over rows and read row[0] / row[1] would index characters of a string.
    lscpu.append(np.loadtxt(hard_dir+hn+"/lscpu.txt", dtype=str, delimiter=":", ndmin=2))
    catproc.append(np.loadtxt(hard_dir+hn+"/catproc.txt", dtype=str, delimiter=":", ndmin=2))

### lscpu files

In [4]:
# keyword -> number of occurrences, filled by the counting loops below
keywords = {}

# substrings stripped from the raw keys before they are counted
delete_char_cp = [" ", "\t"]
delete_char_lscpu = [" ", "\xa0", "(s)", "é"]

# Translation of non-English lscpu keys into their English equivalent.
# Only keys with 10 occurrences or more were translated; the rarer ones are
# neglected. The final keyword list keeps only the keys counted more than
# 300 times (threshold applied when kw_list is built).
#
# Notes:
# - the left-hand sides are written as they look AFTER lower-casing and after
#   the substrings of delete_char_lscpu are removed (e.g. "vulnrabilit");
# - the pairs are applied one after the other as substring replacements, so
#   their order matters (e.g. "modèle" -> "model" runs before
#   "nomdemodel" -> "modelname").
translate_kw = [
    # architecture
    ("architektura", "architecture"),
    ("arquitetura", "architecture"),
    ("arquitectura", "architecture"),
    ("architektur", "architecture"),
    ("arkitektur", "architecture"),
    # cpu op-mode
    ("cpuoperationsmodus", "cpuop-mode"),
    ("modeopratoiredesprocesseurs", "cpuop-mode"),
    ("tryb(y)pracycpu", "cpuop-mode"),
    # byte order
    ("kolejnośćbajtów", "byteorder"),
    ("ordemdosbytes", "byteorder"),
    ("byte-reihenfolge", "byteorder"),
    ("ordendebytes", "byteorder"),
    # thread per core
    ("wątkównardzeń", "threadpercore"),
    ("threadparcœur", "threadpercore"),
    ("threadprokern", "threadpercore"),
    ("hilopornúcleo", "threadpercore"),
    # core per socket
    ("kern(e)prosocket", "corepersocket"),
    ("kernenpervoet", "corepersocket"),
    ("cœurparsocket", "corepersocket"),
    ("núcleoporzócalo", "corepersocket"),
    ("rdzeninagniazdo", "corepersocket"),
    # socket
    ("sockel", "socket"),
    # model / model name
    ("modelo", "model"),
    ("modell", "model"),
    ("modèle", "model"),
    ("nomdemodèle", "modelname"),
    ("nomdemodel", "modelname"),
    # numa node
    ("węzłównuma", "numanode"),
    ("nœudnuma", "numanode"),
    ("nodonuma", "numanode"),
    ("numa-knoten", "numanode"),
    # caches
    ("cachel1d", "l1dcache"),
    ("cachel1i", "l1icache"),
    ("cachel2", "l2cache"),
    ("cachel3", "l3cache"),
    ("cachedel1d", "l1dcache"),
    ("cachedel1i", "l1icache"),
    ("cachedel2", "l2cache"),
    ("cachedel3", "l3cache"),
    ("cachl1d", "l1dcache"),
    ("cachl1i", "l1icache"),
    ("cachl2", "l2cache"),
    ("cachl3", "l3cache"),
    # cpu family
    ("familledeprocesseur", "cpufamily"),
    ("familiadecpu", "cpufamily"),
    ("cpu-familie", "cpufamily"),
    ("famíliadacpu", "cpufamily"),
    ("prozessorfamilie", "cpufamily"),
    ("rodinacpu", "cpufamily"),
    ("rodzinacpu", "cpufamily"),
    # cpu frequencies
    ("vitesseminimaleduprocesseurenmhz", "cpuminmhz"),
    ("minimaletaktfrequenzdercpu", "cpuminmhz"),
    ("cpumhzmín.", "cpuminmhz"),
    ("vitessemaximaleduprocesseurenmhz", "cpumaxmhz"),
    ("maximaletaktfrequenzdercpu", "cpumaxmhz"),
    ("cpumhzmáx.", "cpumaxmhz"),
    ("vitesseduprocesseurenmhz", "cpumhz"),
    # virtualization
    ("virtualisation", "virtualization"),
    ("wirtualizacja", "virtualization"),
    ("virtualização", "virtualization"),
    ("virtualisierung", "virtualization"),
    ("virtualización", "virtualization"),
    # vulnerability
    ("vulnerabilidade", "vulnerability"),
    ("vulnrabilit", "vulnerability"),
    # vendor id
    ("identifiantconstructeur", "vendorid"),
    ("iddelvendedor", "vendorid"),
    ("anbieterkennung", "vendorid"),
    ("idproducenta", "vendorid"),
    # cpu
    ("processeur", "cpu"),
    # on-line cpu list
    ("listederonline-cpu", "on-linecpulist"),
    ("listedecpuenligne", "on-linecpulist"),
    # numa node0 cpu
    ("numanode0decpu", "numanode0cpu"),
    ("numa-knoten0cpu", "numanode0cpu"),
    ("cpudenó0numa", "numanode0cpu"),
]

# Count how often each normalised lscpu key occurs over all the lscpu tables.
for ls_table in lscpu:
    for ls_row in ls_table:
        # prefix with "ls_" and lower-case the raw key
        name = 'ls_' + ls_row[0].lower()
        # strip the unwanted substrings
        for unwanted in delete_char_lscpu:
            name = name.replace(unwanted, "")
        # apply the translations in order, as substring replacements
        for foreign, english in translate_kw:
            name = name.replace(foreign, english)
        keywords[name] = keywords.get(name, 0) + 1

# Count how often each normalised catproc key occurs over all the catproc tables.
for cp_table in catproc:
    for cp_row in cp_table:
        # prefix with "cp_" and lower-case the raw key
        name = ('cp_' + cp_row[0]).lower()
        # strip spaces and tabs
        for unwanted in delete_char_cp:
            name = name.replace(unwanted, "")
        keywords[name] = keywords.get(name, 0) + 1

# Keep only the keywords counted more than 300 times, in first-seen order,
# and print each of them with its count.
frequent_keywords = {name: count for name, count in keywords.items() if count > 300}
kw_list = list(frequent_keywords)

for name, count in frequent_keywords.items():
    print(name, count)


ls_architecture 910
ls_cpuop-mode 845
ls_byteorder 860
ls_cpu 916
ls_on-linecpulist 872
ls_threadpercore 886
ls_corepersocket 886
ls_socket 828
ls_numanode 796
ls_vendorid 848
ls_cpufamily 847
ls_model 882
ls_modelname 501
ls_stepping 826
ls_cpumhz 843
ls_cpumaxmhz 444
ls_cpuminmhz 443
ls_bogomips 857
ls_virtualization 776
ls_l1dcache 863
ls_l1icache 858
ls_l2cache 862
ls_l3cache 670
ls_numanode0cpu 789
ls_flags 384
cp_processor 936
cp_vendor_id 860
cp_cpufamily 860
cp_model 860
cp_modelname 882
cp_stepping 860
cp_microcode 741
cp_cpumhz 860
cp_cachesize 860
cp_physicalid 835
cp_siblings 835
cp_coreid 835
cp_cpucores 835
cp_apicid 834
cp_initialapicid 830
cp_fpu 860
cp_fpu_exception 860
cp_cpuidlevel 860
cp_wp 860
cp_flags 860
cp_bugs 396
cp_bogomips 908
cp_clflushsize 860
cp_cache_alignment 860
cp_addresssizes 860
cp_powermanagement 860


In [5]:
# One list of values per processor, ordered like kw_list ("" when the keyword
# is absent from that processor's files).
charac = []

for cp_rows, ls_rows in zip(catproc, lscpu):
    values_by_kw = {}

    # catproc entries: same key normalisation as the one used for counting
    for entry in cp_rows:
        key = ('cp_' + entry[0]).lower()
        for unwanted in delete_char_cp:
            key = key.replace(unwanted, "")
        if key in kw_list:
            values_by_kw[key] = entry[1].replace(" ", "")

    # lscpu entries: normalisation followed by the translation to English
    for entry in ls_rows:
        key = ('ls_' + entry[0]).lower()
        for unwanted in delete_char_lscpu:
            key = key.replace(unwanted, "")
        for foreign, english in translate_kw:
            key = key.replace(foreign, english)
        if key in kw_list:
            values_by_kw[key] = entry[1].replace(" ", "")

    charac.append([values_by_kw.get(wanted, "") for wanted in kw_list])

charac[0]

['x86_64',
 '32-bit,64-bit',
 'LittleEndian',
 '48',
 '0-47',
 '2',
 '24',
 '1',
 '1',
 'AuthenticAMD',
 '23',
 '49',
 'AMDRyzenThreadripper3960X24-CoreProcessor',
 '0',
 '2200.000',
 '6635.1558',
 '2200.0000',
 '7600.29',
 'AMD-V',
 '768KiB',
 '768KiB',
 '12MiB',
 '128MiB',
 '0-47',
 'fpuvmedepsetscmsrpaemcecx8apicsepmtrrpgemcacmovpatpse36clflushmmxfxsrssesse2htsyscallnxmmxextfxsr_optpdpe1gbrdtscplmconstant_tscrep_goodnoplnonstop_tsccpuidextd_apicidaperfmperfpnipclmulqdqmonitorssse3fmacx16sse4_1sse4_2movbepopcntaesxsaveavxf16crdrandlahf_lmcmp_legacysvmextapiccr8_legacyabmsse4amisalignsse3dnowprefetchosvwibsskinitwdttcetopoextperfctr_coreperfctr_nbbpextperfctr_llcmwaitxcpbcat_l3cdp_l3hw_pstatessbdmbaibpbstibpvmmcallfsgsbasebmi1avx2smepbmi2cqmrdt_ardseedadxsmapclflushoptclwbsha_nixsaveoptxsavecxgetbv1xsavescqm_llccqm_occup_llccqm_mbm_totalcqm_mbm_localclzeroirperfxsaveerptrrdpruwbnoinvdaratnptlbrvsvm_locknrip_savetsc_scalevmcb_cleanflushbyasiddecodeassistspausefilterpfthresholdavicv_vms

In [6]:
# One row per processor (indexed by its directory name under "id"),
# one column per keyword of kw_list.
df = (
    pd.DataFrame(charac, columns=kw_list)
    .assign(id=hard_names)
    .set_index('id')
)
df

Unnamed: 0_level_0,ls_architecture,ls_cpuop-mode,ls_byteorder,ls_cpu,ls_on-linecpulist,ls_threadpercore,ls_corepersocket,ls_socket,ls_numanode,ls_vendorid,...,cp_fpu_exception,cp_cpuidlevel,cp_wp,cp_flags,cp_bugs,cp_bogomips,cp_clflushsize,cp_cache_alignment,cp_addresssizes,cp_powermanagement
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AMDRyzenThreadripper3960X24-Core,x86_64,"32-bit,64-bit",LittleEndian,48,0-47,2,24,1,1,AuthenticAMD,...,yes,16,yes,fpuvmedepsetscmsrpaemcecx8apicsepmtrrpgemcacmo...,sysret_ss_attrsspectre_v1spectre_v2spec_store_...,7600.96,64,64,"43bitsphysical,48bitsvirtual",tsttptmhwpstatecpbeff_freq_ro[13][14]
IntelCorei7870,x86_64,"32-bit,64-bit",LittleEndian,8,0-7,2,4,1,1,GenuineIntel,...,yes,11,yes,fpuvmedepsetscmsrpaemcecx8apicsepmtrrpgemcacmo...,,5851.77,64,64,"36bitsphysical,48bitsvirtual",
2xAMDOpteron23xx,x86_64,"32-bit,64-bit",LittleEndian,2,01,1,1,2,1,AuthenticAMD,...,yes,5,yes,fpuvmedepsetscmsrpaemcecx8apicsepmtrrpgemcacmo...,,4600.16,64,64,"48bitsphysical,48bitsvirtual",
IntelXeonD-2191,x86_64,"32-bit,64-bit",LittleEndian,36,0-35,2,18,1,1,GenuineIntel,...,yes,22,yes,fpuvmedepsetscmsrpaemcecx8apicsepmtrrpgemcacmo...,,3192.47,64,64,"46bitsphysical,48bitsvirtual",
AMDRyzen5PRO4650G,x86_64,"32-bit,64-bit",LittleEndian,12,,2,6,,1,AuthenticAMD,...,yes,16,yes,fpuvmedepsetscmsrpaemcecx8apicsepmtrrpgemcacmo...,sysret_ss_attrsspectre_v1spectre_v2spec_store_...,7385.96,64,64,"48bitsphysical,48bitsvirtual",tsttptmhwpstatecpbeff_freq_ro[13][14]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AMDRyzen7PRO4750G,,,,16,,,,,,,...,yes,16,yes,fpuvmedepsetscmsrpaemcecx8apicsepmtrrpgemcacmo...,sysret_ss_attrsspectre_v1spectre_v2spec_store_...,7186.58,64,64,"48bitsphysical,48bitsvirtual",tsttptmhwpstatecpbeff_freq_ro[13][14]
IntelXeonE5420,x86_64,"32-bit,64-bit",LittleEndian,4,0-3,1,4,1,1,GenuineIntel,...,yes,13,yes,fpuvmedepsetscmsrpaemcecx8apicsepmtrrpgemcacmo...,,4987.57,64,64,"38bitsphysical,48bitsvirtual",
AMDEPYC72528-Core,x86_64,"32-bit,64-bit",LittleEndian,16,0-15,2,8,1,1,AuthenticAMD,...,yes,16,yes,fpuvmedepsetscmsrpaemcecx8apicsepmtrrpgemcacmo...,sysret_ss_attrsspectre_v1spectre_v2spec_store_...,6188.39,64,64,"43bitsphysical,48bitsvirtual",tsttptmhwpstatecpbeff_freq_ro[13][14]
AMDRyzen51500X,x86_64,"32-bit,64-bit",LittleEndian,8,0-7,2,4,1,1,AuthenticAMD,...,yes,13,yes,fpuvmedepsetscmsrpaemcecx8apicsepmtrrpgemcacmo...,fxsave_leak,7685.54,64,64,"48bitsphysical,48bitsvirtual",tsttptmhwpstateeff_freq_ro[13][14]


### Export

In [7]:
# Export of the assembled table is disabled; uncomment the line below to write
# it to data/ls_cp_info.csv.
#df.to_csv('data/ls_cp_info.csv')

### Not shown in the code: manual cleaning of the dataset

- remove the false or meaningless values
- homogenize the values (e.g. strip the text from fields that should be numeric)
- replace 'MiB' with '000KiB' for the caches

In [8]:
# Load the modified version of the table, using its "id" column as the index.
envs = pd.read_csv("data/ls_cp_info_modif.csv", index_col="id")
envs.head()

Unnamed: 0_level_0,ls_architecture,ls_cpuop-mode,ls_byteorder,ls_cpu,ls_on-linecpulist,ls_threadpercore,ls_corepersocket,ls_socket,ls_numanode,ls_vendorid,...,cp_fpu_exception,cp_cpuidlevel,cp_wp,cp_flags,cp_bugs,cp_bogomips,cp_clflushsize,cp_cache_alignment,cp_addresssizes,cp_powermanagement
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AMDRyzenThreadripper3960X24-Core,x86_64,32.64,LittleEndian,48.0,0-47,2.0,24.0,1.0,1.0,AuthenticAMD,...,1.0,16.0,1.0,fpuvmedepsetscmsrpaemcecx8apicsepmtrrpgemcacmo...,sysret_ss_attrsspectre_v1spectre_v2spec_store_...,7600.96,64.0,64.0,43.48,tsttptmhwpstatecpbeff_freq_ro[13][14]
IntelCorei7870,x86_64,32.64,LittleEndian,8.0,0-7,2.0,4.0,1.0,1.0,GenuineIntel,...,1.0,11.0,1.0,fpuvmedepsetscmsrpaemcecx8apicsepmtrrpgemcacmo...,,5851.77,64.0,64.0,36.48,
2xAMDOpteron23xx,x86_64,32.64,LittleEndian,2.0,0.1,1.0,1.0,2.0,1.0,AuthenticAMD,...,1.0,5.0,1.0,fpuvmedepsetscmsrpaemcecx8apicsepmtrrpgemcacmo...,,4600.16,64.0,64.0,48.48,
IntelXeonD-2191,x86_64,32.64,LittleEndian,36.0,0-35,2.0,18.0,1.0,1.0,GenuineIntel,...,1.0,22.0,1.0,fpuvmedepsetscmsrpaemcecx8apicsepmtrrpgemcacmo...,,3192.47,64.0,64.0,46.48,
AMDRyzen5PRO4650G,x86_64,32.64,LittleEndian,12.0,,2.0,6.0,,1.0,AuthenticAMD,...,1.0,16.0,1.0,fpuvmedepsetscmsrpaemcecx8apicsepmtrrpgemcacmo...,sysret_ss_attrsspectre_v1spectre_v2spec_store_...,7385.96,64.0,64.0,48.48,tsttptmhwpstatecpbeff_freq_ro[13][14]
