In [2]:
import pandas as pd
import numpy as np
from sklearn.externals import joblib
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import StratifiedShuffleSplit

In [3]:
labels = pd.read_csv('./labels.csv', index_col=0)
pts = pd.read_csv('./pts.csv', index_col=0)

In [4]:
pts.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 871535 entries, 0 to 871534
Data columns (total 83 columns):
system_id                               871535 non-null int64
HPL_Tflops                              871535 non-null float64
StarDGEMM_Gflops                        871535 non-null float64
SingleDGEMM_Gflops                      871535 non-null float64
PTRANS_GBs                              871535 non-null float64
MPIRandomAccess_LCG_GUPs                871535 non-null float64
MPIRandomAccess_GUPs                    871535 non-null float64
StarRandomAccess_LCG_GUPs               871535 non-null float64
SingleRandomAccess_LCG_GUPs             871535 non-null float64
StarRandomAccess_GUPs                   871535 non-null float64
SingleRandomAccess_GUPs                 871535 non-null float64
StarSTREAM_Copy                         871535 non-null float64
StarSTREAM_Scale                        871535 non-null float64
StarSTREAM_Add                          871535 non-null flo

In [5]:
plain_data = pts.drop([
'matrix_id', 
'status_id', 
'time', 
'HPL_Tflops',        
'StarDGEMM_Gflops',                        
'SingleDGEMM_Gflops',                      
'PTRANS_GBs',                              
'MPIRandomAccess_LCG_GUPs',                
'MPIRandomAccess_GUPs',                    
'StarRandomAccess_LCG_GUPs',               
'SingleRandomAccess_LCG_GUPs',             
'StarRandomAccess_GUPs',                   
'SingleRandomAccess_GUPs',                 
'StarSTREAM_Copy',                         
'StarSTREAM_Scale',                        
'StarSTREAM_Add',                          
'StarSTREAM_Triad',                        
'SingleSTREAM_Copy',                       
'SingleSTREAM_Scale',                      
'SingleSTREAM_Add',                        
'SingleSTREAM_Triad',                      
'StarFFT_Gflops',                          
'SingleFFT_Gflops',                        
'MPIFFT_Gflops',                           
'MaxPingPongLatency_usec',                 
'RandomlyOrderedRingLatency_usec',         
'MinPingPongBandwidth_GBytes',             
'NaturallyOrderedRingBandwidth_GBytes',    
'RandomlyOrderedRingBandwidth_GBytes',     
'MinPingPongLatency_usec',                 
'AvgPingPongLatency_usec',                 
'MaxPingPongBandwidth_GBytes',             
'AvgPingPongBandwidth_GBytes',             
'NaturallyOrderedRingLatency_usec',        
'MemProc',                                
'core_count',                              
'cpu_freq',                                
'bogo_mips',                              
'l1_cache',                                
'l2_cache',                                
'l3_cache',                                
'memory_size',                             
'memory_freq',                             
'memory_type'], axis=1)

In [6]:
plain_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 871535 entries, 0 to 871534
Data columns (total 39 columns):
system_id             871535 non-null int64
rows                  871535 non-null int64
cols                  871535 non-null int64
min_nnz_row           871535 non-null int64
row_var               871535 non-null float64
col_var               871535 non-null float64
diag_var              871535 non-null float64
nnz                   871535 non-null int64
frob_norm             871535 non-null float64
symm_frob_norm        871535 non-null float64
antisymm_frob_norm    871535 non-null float64
one_norm              871535 non-null float64
inf_norm              871535 non-null float64
symm_inf_norm         871535 non-null float64
antisymm_inf_norm     871535 non-null float64
max_nnz_row           871535 non-null int64
trace                 871535 non-null float64
abs_trace             871535 non-null float64
min_nnz_row.1         871535 non-null int64
avg_nnz_row           871535 

In [7]:
summit_only_data = plain_data[plain_data['system_id'] == 3]
summit_only_labels = labels[plain_data['system_id'] == 3]

In [8]:
summit_only_data.info()
summit_only_labels.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 203878 entries, 622584 to 826461
Data columns (total 39 columns):
system_id             203878 non-null int64
rows                  203878 non-null int64
cols                  203878 non-null int64
min_nnz_row           203878 non-null int64
row_var               203878 non-null float64
col_var               203878 non-null float64
diag_var              203878 non-null float64
nnz                   203878 non-null int64
frob_norm             203878 non-null float64
symm_frob_norm        203878 non-null float64
antisymm_frob_norm    203878 non-null float64
one_norm              203878 non-null float64
inf_norm              203878 non-null float64
symm_inf_norm         203878 non-null float64
antisymm_inf_norm     203878 non-null float64
max_nnz_row           203878 non-null int64
trace                 203878 non-null float64
abs_trace             203878 non-null float64
min_nnz_row.1         203878 non-null int64
avg_nnz_row           20

In [14]:
X = summit_only_data.as_matrix()
y = summit_only_labels.as_matrix()

In [15]:
from sklearn.externals import joblib
classifier = OneVsRestClassifier(LinearSVC(), n_jobs=6)
classifier = classifier.fit(X,y)
joblib.dump(classifier, 'classifier.pkl')

['classifier.pkl']

In [18]:
results_y = classifier.predict(X)
results_y

array([[1, 1, 0, 0, 0, 0],
       [1, 1, 0, 0, 0, 0],
       [1, 1, 0, 0, 0, 0],
       ..., 
       [0, 1, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0]])

In [41]:
print("Predicted number of instances:\t",
sum(results_y[:,0]), '\t',
sum(results_y[:,1]), 
sum(results_y[:,2]), '\t',
sum(results_y[:,3]), '\t',
sum(results_y[:,4]), '\t',
sum(results_y[:,5]))

print("Actual number of instances:\t",
sum(y[:,0]), '\t',
sum(y[:,1]), '\t',
sum(y[:,2]), '\t',
sum(y[:,3]), '\t',
sum(y[:,4]), '\t',
sum(y[:,5]))

Predicted number of instances:	 37276 	 114914 41739 	 19439 	 31327 	 15235
Actual number of instances:	 74871 	 60248 	 5483 	 2033 	 842 	 192


In [42]:
classifier.score(X,y)

0.20758002334729594

In [46]:
from sklearn.metrics import coverage_error, label_ranking_average_precision_score, label_ranking_loss
print(coverage_error(y, results_y))
print(label_ranking_average_precision_score(y, results_y))
print(label_ranking_loss(y, results_y))

3.14660237985
0.560519158517
0.49176471005
