In [36]:
import pandas as pd
import numpy as np
from sklearn.externals import joblib
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier

In [16]:
labels = pd.read_csv('./labels.csv', index_col=0)
pts = pd.read_csv('./pts.csv', index_col=0)

In [17]:
pts.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 871535 entries, 0 to 871534
Data columns (total 83 columns):
system_id                               871535 non-null int64
HPL_Tflops                              871535 non-null float64
StarDGEMM_Gflops                        871535 non-null float64
SingleDGEMM_Gflops                      871535 non-null float64
PTRANS_GBs                              871535 non-null float64
MPIRandomAccess_LCG_GUPs                871535 non-null float64
MPIRandomAccess_GUPs                    871535 non-null float64
StarRandomAccess_LCG_GUPs               871535 non-null float64
SingleRandomAccess_LCG_GUPs             871535 non-null float64
StarRandomAccess_GUPs                   871535 non-null float64
SingleRandomAccess_GUPs                 871535 non-null float64
StarSTREAM_Copy                         871535 non-null float64
StarSTREAM_Scale                        871535 non-null float64
StarSTREAM_Add                          871535 non-null flo

In [18]:
all_data = pts.drop([
'matrix_id',
'system_id',
'time',
'matrix_id',
'status_id'], axis=1)

In [4]:
# plain_data = pts.drop([
# 'matrix_id', 
# 'status_id', 
# 'time', 
# 'HPL_Tflops',        
# 'StarDGEMM_Gflops',                        
# 'SingleDGEMM_Gflops',                      
# 'PTRANS_GBs',                              
# 'MPIRandomAccess_LCG_GUPs',                
# 'MPIRandomAccess_GUPs',                    
# 'StarRandomAccess_LCG_GUPs',               
# 'SingleRandomAccess_LCG_GUPs',             
# 'StarRandomAccess_GUPs',                   
# 'SingleRandomAccess_GUPs',                 
# 'StarSTREAM_Copy',                         
# 'StarSTREAM_Scale',                        
# 'StarSTREAM_Add',                          
# 'StarSTREAM_Triad',                        
# 'SingleSTREAM_Copy',                       
# 'SingleSTREAM_Scale',                      
# 'SingleSTREAM_Add',                        
# 'SingleSTREAM_Triad',                      
# 'StarFFT_Gflops',                          
# 'SingleFFT_Gflops',                        
# 'MPIFFT_Gflops',                           
# 'MaxPingPongLatency_usec',                 
# 'RandomlyOrderedRingLatency_usec',         
# 'MinPingPongBandwidth_GBytes',             
# 'NaturallyOrderedRingBandwidth_GBytes',    
# 'RandomlyOrderedRingBandwidth_GBytes',     
# 'MinPingPongLatency_usec',                 
# 'AvgPingPongLatency_usec',                 
# 'MaxPingPongBandwidth_GBytes',             
# 'AvgPingPongBandwidth_GBytes',             
# 'NaturallyOrderedRingLatency_usec',        
# 'MemProc',                                
# 'core_count',                              
# 'cpu_freq',                                
# 'bogo_mips',                              
# 'l1_cache',                                
# 'l2_cache',                                
# 'l3_cache',                                
# 'memory_size',                             
# 'memory_freq',                             
# 'memory_type'], axis=1)

In [19]:
all_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 871535 entries, 0 to 871534
Data columns (total 79 columns):
HPL_Tflops                              871535 non-null float64
StarDGEMM_Gflops                        871535 non-null float64
SingleDGEMM_Gflops                      871535 non-null float64
PTRANS_GBs                              871535 non-null float64
MPIRandomAccess_LCG_GUPs                871535 non-null float64
MPIRandomAccess_GUPs                    871535 non-null float64
StarRandomAccess_LCG_GUPs               871535 non-null float64
SingleRandomAccess_LCG_GUPs             871535 non-null float64
StarRandomAccess_GUPs                   871535 non-null float64
SingleRandomAccess_GUPs                 871535 non-null float64
StarSTREAM_Copy                         871535 non-null float64
StarSTREAM_Scale                        871535 non-null float64
StarSTREAM_Add                          871535 non-null float64
StarSTREAM_Triad                        871535 non-null f

In [6]:
summit_only_data = plain_data[plain_data['system_id'] == 3]
summit_only_labels = labels[plain_data['system_id'] == 3]

In [7]:
summit_only_data.info()
summit_only_labels.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 203878 entries, 622584 to 826461
Data columns (total 39 columns):
system_id             203878 non-null int64
rows                  203878 non-null int64
cols                  203878 non-null int64
min_nnz_row           203878 non-null int64
row_var               203878 non-null float64
col_var               203878 non-null float64
diag_var              203878 non-null float64
nnz                   203878 non-null int64
frob_norm             203878 non-null float64
symm_frob_norm        203878 non-null float64
antisymm_frob_norm    203878 non-null float64
one_norm              203878 non-null float64
inf_norm              203878 non-null float64
symm_inf_norm         203878 non-null float64
antisymm_inf_norm     203878 non-null float64
max_nnz_row           203878 non-null int64
trace                 203878 non-null float64
abs_trace             203878 non-null float64
min_nnz_row.1         203878 non-null int64
avg_nnz_row           20

In [38]:
X = all_data.as_matrix()
y = labels.as_matrix()
y

array([[0, 1, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ..., 
       [0, 1, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0]])

In [39]:
from sklearn.externals import joblib
classifier = RandomForestClassifier()
classifier = classifier.fit(X,y)
joblib.dump(classifier, 'all_data_classifier.pkl')

['all_data_classifier.pkl']

In [40]:
results_y = classifier.predict(X)
results_y

array([[ 0.,  1.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       ..., 
       [ 0.,  1.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.]])

In [41]:
print("Predicted number of instances:\t",
sum(results_y[:,0]),
sum(results_y[:,1]),
sum(results_y[:,2]),
sum(results_y[:,3]),
sum(results_y[:,4]),
sum(results_y[:,5]))

print("Actual number of instances:\t",
sum(y[:,0]),
sum(y[:,1]),
sum(y[:,2]),
sum(y[:,3]),
sum(y[:,4]),
sum(y[:,5]))

Predicted number of instances:	 348605.0 168147.0 18511.0 5553.0 2367.0 580.0
Actual number of instances:	 357726 178625 21098 6904 3616 883


In [42]:
classifier.score(X,y)

0.89591812147532801

In [43]:
from sklearn.metrics import coverage_error, label_ranking_average_precision_score, label_ranking_loss
print(coverage_error(y, results_y))
print(label_ranking_average_precision_score(y, results_y))
print(label_ranking_loss(y, results_y))

1.00632332609
0.941945405393
0.0704532998292
