In [29]:
import pandas as pd
import numpy as np
from sklearn.externals import joblib
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
import imblearn.pipeline as pl
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import roc_auc_score, roc_curve

In [2]:
labels = pd.read_csv('./labels.csv', index_col=0)
pts = pd.read_csv('./pts.csv', index_col=0)

In [8]:
summit_data = pts[pts['system_id'] == 3]
summit_labels = labels[pts['system_id'] == 3]
summit_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 203878 entries, 622584 to 826461
Data columns (total 83 columns):
system_id                               203878 non-null int64
HPL_Tflops                              203878 non-null float64
StarDGEMM_Gflops                        203878 non-null float64
SingleDGEMM_Gflops                      203878 non-null float64
PTRANS_GBs                              203878 non-null float64
MPIRandomAccess_LCG_GUPs                203878 non-null float64
MPIRandomAccess_GUPs                    203878 non-null float64
StarRandomAccess_LCG_GUPs               203878 non-null float64
SingleRandomAccess_LCG_GUPs             203878 non-null float64
StarRandomAccess_GUPs                   203878 non-null float64
SingleRandomAccess_GUPs                 203878 non-null float64
StarSTREAM_Copy                         203878 non-null float64
StarSTREAM_Scale                        203878 non-null float64
StarSTREAM_Add                          203878 non-nul

In [9]:
summit_data = summit_data.drop([
'matrix_id', 
'status_id', 
'time', 
'HPL_Tflops',        
'StarDGEMM_Gflops',                        
'SingleDGEMM_Gflops',                      
'PTRANS_GBs',                              
'MPIRandomAccess_LCG_GUPs',                
'MPIRandomAccess_GUPs',                    
'StarRandomAccess_LCG_GUPs',               
'SingleRandomAccess_LCG_GUPs',             
'StarRandomAccess_GUPs',                   
'SingleRandomAccess_GUPs',                 
'StarSTREAM_Copy',                         
'StarSTREAM_Scale',                        
'StarSTREAM_Add',                          
'StarSTREAM_Triad',                        
'SingleSTREAM_Copy',                       
'SingleSTREAM_Scale',                      
'SingleSTREAM_Add',                        
'SingleSTREAM_Triad',                      
'StarFFT_Gflops',                          
'SingleFFT_Gflops',                        
'MPIFFT_Gflops',                           
'MaxPingPongLatency_usec',                 
'RandomlyOrderedRingLatency_usec',         
'MinPingPongBandwidth_GBytes',             
'NaturallyOrderedRingBandwidth_GBytes',    
'RandomlyOrderedRingBandwidth_GBytes',     
'MinPingPongLatency_usec',                 
'AvgPingPongLatency_usec',                 
'MaxPingPongBandwidth_GBytes',             
'AvgPingPongBandwidth_GBytes',             
'NaturallyOrderedRingLatency_usec',        
'MemProc',                                
'core_count',                              
'cpu_freq',                                
'bogo_mips',                              
'l1_cache',                                
'l2_cache',                                
'l3_cache',                                
'memory_size',                             
'memory_freq',                             
'memory_type'], axis=1)
summit_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 203878 entries, 622584 to 826461
Data columns (total 39 columns):
system_id             203878 non-null int64
rows                  203878 non-null int64
cols                  203878 non-null int64
min_nnz_row           203878 non-null int64
row_var               203878 non-null float64
col_var               203878 non-null float64
diag_var              203878 non-null float64
nnz                   203878 non-null int64
frob_norm             203878 non-null float64
symm_frob_norm        203878 non-null float64
antisymm_frob_norm    203878 non-null float64
one_norm              203878 non-null float64
inf_norm              203878 non-null float64
symm_inf_norm         203878 non-null float64
antisymm_inf_norm     203878 non-null float64
max_nnz_row           203878 non-null int64
trace                 203878 non-null float64
abs_trace             203878 non-null float64
min_nnz_row.1         203878 non-null int64
avg_nnz_row           20

In [54]:
X = summit_data.as_matrix()
y = summit_labels.as_matrix()
y = y[:,0]
num_labels = summit_labels.shape[1]
pipeline = pl.make_pipeline(StandardScaler(),
                            RandomOverSampler(), 
                            RandomForestClassifier())
sss = StratifiedShuffleSplit()
for label in range(0,num_labels):
    split = 0
    print("Working on label ", label)
    y = summit_labels.as_matrix()[:,label]
    for train_index, test_index in sss.split(X, y):
        print("Working on split ", split)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test).astype(int)
        print("AUROC: ", roc_auc_score(y_test, y_pred))
        split+=1

Working on label  0
Working on split  0
AUROC:  0.7993133684
Working on split  1
AUROC:  0.800536287548
Working on split  2
AUROC:  0.796354106258
Working on split  3
AUROC:  0.802500377188
Working on split  4
AUROC:  0.795259332128
Working on split  5
AUROC:  0.798412377698
Working on split  6


KeyboardInterrupt: 