In [1]:
import pandas as pd
import numpy as np
from sklearn.externals import joblib
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier

In [2]:
labels = pd.read_csv('./labels.csv', index_col=0)
pts = pd.read_csv('./pts.csv', index_col=0)

In [3]:
pts.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 871535 entries, 0 to 871534
Data columns (total 83 columns):
system_id                               871535 non-null int64
HPL_Tflops                              871535 non-null float64
StarDGEMM_Gflops                        871535 non-null float64
SingleDGEMM_Gflops                      871535 non-null float64
PTRANS_GBs                              871535 non-null float64
MPIRandomAccess_LCG_GUPs                871535 non-null float64
MPIRandomAccess_GUPs                    871535 non-null float64
StarRandomAccess_LCG_GUPs               871535 non-null float64
SingleRandomAccess_LCG_GUPs             871535 non-null float64
StarRandomAccess_GUPs                   871535 non-null float64
SingleRandomAccess_GUPs                 871535 non-null float64
StarSTREAM_Copy                         871535 non-null float64
StarSTREAM_Scale                        871535 non-null float64
StarSTREAM_Add                          871535 non-null flo

In [4]:
all_data = pts.drop([
'matrix_id',
'system_id',
'time',
'matrix_id',
'status_id'], axis=1)

In [4]:
# plain_data = pts.drop([
# 'matrix_id', 
# 'status_id', 
# 'time', 
# 'HPL_Tflops',        
# 'StarDGEMM_Gflops',                        
# 'SingleDGEMM_Gflops',                      
# 'PTRANS_GBs',                              
# 'MPIRandomAccess_LCG_GUPs',                
# 'MPIRandomAccess_GUPs',                    
# 'StarRandomAccess_LCG_GUPs',               
# 'SingleRandomAccess_LCG_GUPs',             
# 'StarRandomAccess_GUPs',                   
# 'SingleRandomAccess_GUPs',                 
# 'StarSTREAM_Copy',                         
# 'StarSTREAM_Scale',                        
# 'StarSTREAM_Add',                          
# 'StarSTREAM_Triad',                        
# 'SingleSTREAM_Copy',                       
# 'SingleSTREAM_Scale',                      
# 'SingleSTREAM_Add',                        
# 'SingleSTREAM_Triad',                      
# 'StarFFT_Gflops',                          
# 'SingleFFT_Gflops',                        
# 'MPIFFT_Gflops',                           
# 'MaxPingPongLatency_usec',                 
# 'RandomlyOrderedRingLatency_usec',         
# 'MinPingPongBandwidth_GBytes',             
# 'NaturallyOrderedRingBandwidth_GBytes',    
# 'RandomlyOrderedRingBandwidth_GBytes',     
# 'MinPingPongLatency_usec',                 
# 'AvgPingPongLatency_usec',                 
# 'MaxPingPongBandwidth_GBytes',             
# 'AvgPingPongBandwidth_GBytes',             
# 'NaturallyOrderedRingLatency_usec',        
# 'MemProc',                                
# 'core_count',                              
# 'cpu_freq',                                
# 'bogo_mips',                              
# 'l1_cache',                                
# 'l2_cache',                                
# 'l3_cache',                                
# 'memory_size',                             
# 'memory_freq',                             
# 'memory_type'], axis=1)

In [5]:
all_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 871535 entries, 0 to 871534
Data columns (total 79 columns):
HPL_Tflops                              871535 non-null float64
StarDGEMM_Gflops                        871535 non-null float64
SingleDGEMM_Gflops                      871535 non-null float64
PTRANS_GBs                              871535 non-null float64
MPIRandomAccess_LCG_GUPs                871535 non-null float64
MPIRandomAccess_GUPs                    871535 non-null float64
StarRandomAccess_LCG_GUPs               871535 non-null float64
SingleRandomAccess_LCG_GUPs             871535 non-null float64
StarRandomAccess_GUPs                   871535 non-null float64
SingleRandomAccess_GUPs                 871535 non-null float64
StarSTREAM_Copy                         871535 non-null float64
StarSTREAM_Scale                        871535 non-null float64
StarSTREAM_Add                          871535 non-null float64
StarSTREAM_Triad                        871535 non-null f

In [6]:
summit_only_data = plain_data[plain_data['system_id'] == 3]
summit_only_labels = labels[plain_data['system_id'] == 3]

In [7]:
summit_only_data.info()
summit_only_labels.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 203878 entries, 622584 to 826461
Data columns (total 39 columns):
system_id             203878 non-null int64
rows                  203878 non-null int64
cols                  203878 non-null int64
min_nnz_row           203878 non-null int64
row_var               203878 non-null float64
col_var               203878 non-null float64
diag_var              203878 non-null float64
nnz                   203878 non-null int64
frob_norm             203878 non-null float64
symm_frob_norm        203878 non-null float64
antisymm_frob_norm    203878 non-null float64
one_norm              203878 non-null float64
inf_norm              203878 non-null float64
symm_inf_norm         203878 non-null float64
antisymm_inf_norm     203878 non-null float64
max_nnz_row           203878 non-null int64
trace                 203878 non-null float64
abs_trace             203878 non-null float64
min_nnz_row.1         203878 non-null int64
avg_nnz_row           20

In [6]:
X = all_data.as_matrix()
y = labels.as_matrix()
y

array([[0, 1, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ..., 
       [0, 1, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0]], dtype=int64)

In [7]:
from sklearn.externals import joblib
classifier = RandomForestClassifier()
classifier = classifier.fit(X,y)
joblib.dump(classifier, 'all_data_classifier.pkl')

['all_data_classifier.pkl']

In [17]:
classifier = RandomForestClassifier()
sss = StratifiedShuffleSplit()
y_preds = []
y_tests = []
i = 0
for train_index, test_index in sss.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    classifier.fit(X_train, y_train)
    y_preds.append(classifier.predict(X_test))
    y_tests.append(y_test)
    i+=1

In [36]:
from sklearn.model_selection import LeaveOneOut
classifier = RandomForestClassifier()
sss = LeaveOneOut()
loo_y_preds = []
loo_y_tests = []
i = 0
for train_index, test_index in sss.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    classifier.fit(X_train, y_train)
    loo_y_preds.append(classifier.predict(X_test))
    loo_y_tests.append(y_test)
    i+=1

KeyboardInterrupt: 

In [31]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier()
sss = StratifiedShuffleSplit()
y_preds = []
y_tests = []
i = 0
for train_index, test_index in sss.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    classifier.fit(X_train, y_train)
    y_preds.append(classifier.predict(X_test))
    y_tests.append(y_test)
    i+=1

10

In [32]:
j = i
for i in range(0,j):
    y_preds[i] = y_preds[i].astype(int)
y_preds

[array([[0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        ..., 
        [1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0]]), array([[1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0],
        ..., 
        [1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0]]), array([[0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0],
        ..., 
        [0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0]]), array([[1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        ..., 
        [1, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0]]), array([[0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        ..., 
        [0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0]]), array([[0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0],
 

In [33]:
from sklearn.metrics import classification_report, confusion_matrix
for i in range(0,10):
    print(classification_report(y_tests[i],y_preds[i]))

             precision    recall  f1-score   support

          0       0.66      0.63      0.65     35778
          1       0.69      0.57      0.62     17875
          2       0.45      0.12      0.19      2156
          3       0.43      0.12      0.18       694
          4       0.21      0.03      0.05       386
          5       0.17      0.02      0.04        88

avg / total       0.66      0.58      0.61     56977

             precision    recall  f1-score   support

          0       0.66      0.63      0.65     35723
          1       0.69      0.56      0.62     17868
          2       0.43      0.11      0.18      2188
          3       0.37      0.09      0.15       729
          4       0.10      0.01      0.02       407
          5       0.11      0.01      0.02        94

avg / total       0.65      0.58      0.61     57009

             precision    recall  f1-score   support

          0       0.67      0.64      0.66     35994
          1       0.68      0.57      0

In [26]:
from sklearn.metrics import classification_report, confusion_matrix
for i in range(0,10):
    print(classification_report(y_tests[i],y_preds[i]))

             precision    recall  f1-score   support

          0       0.84      0.79      0.82     35735
          1       0.85      0.76      0.80     18148
          2       0.85      0.59      0.70      2111
          3       0.79      0.48      0.60       671
          4       0.72      0.42      0.53       357
          5       0.80      0.43      0.56        81

avg / total       0.84      0.77      0.80     57103

             precision    recall  f1-score   support

          0       0.84      0.79      0.82     35785
          1       0.85      0.76      0.80     17781
          2       0.85      0.59      0.70      2053
          3       0.81      0.50      0.61       650
          4       0.70      0.36      0.48       373
          5       0.59      0.34      0.43        79

avg / total       0.84      0.77      0.80     56721

             precision    recall  f1-score   support

          0       0.84      0.80      0.82     35592
          1       0.86      0.77      0

In [27]:
from sklearn.model_selection import KFold
classifier = RandomForestClassifier()
kf = KFold(n_splits=10)
y_preds = []
y_tests = []
i = 0
for train_index, test_index in kf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    classifier.fit(X_train, y_train)
    y_preds.append(classifier.predict(X_test))
    y_tests.append(y_test)
    i+=1

In [28]:
from sklearn.metrics import classification_report, confusion_matrix
for i in range(0,10):
    print(classification_report(y_tests[i],y_preds[i]))

  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.40      0.28      0.33     33880
          1       0.40      0.03      0.06     18415
          2       0.46      0.05      0.09      2090
          3       0.69      0.01      0.02       864
          4       0.00      0.00      0.00       370
          5       0.00      0.00      0.00        86

avg / total       0.40      0.18      0.22     55705

             precision    recall  f1-score   support

          0       0.49      0.24      0.32     40959
          1       0.15      0.20      0.17     10239
          2       0.47      0.15      0.23      1680
          3       0.33      0.04      0.07       702
          4       0.00      0.00      0.00       378
          5       0.00      0.00      0.00        97

avg / total       0.42      0.22      0.28     54055

             precision    recall  f1-score   support

          0       0.37      0.24      0.29     32437
          1       0.24      0.04      0

In [16]:
from sklearn.metrics import coverage_error, label_ranking_average_precision_score, label_ranking_loss
print(coverage_error(y_tests[0], y_preds[0]))
print(coverage_error(y_tests[1], y_preds[0]))
print(coverage_error(y_tests[0], y_preds[0]))

1.36241595337
1.36241595337


In [8]:
y_pred = classifier.predict(X)
y_pred = y_pred.astype(int)
y_pred

array([[0, 1, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ..., 
       [0, 1, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0]])

In [75]:
y_pred_prob = classifier.predict_proba(X)
y_pred_prob

[array([[ 1. ,  0. ],
        [ 1. ,  0. ],
        [ 0.9,  0.1],
        ..., 
        [ 1. ,  0. ],
        [ 1. ,  0. ],
        [ 0.9,  0.1]]), array([[ 0. ,  1. ],
        [ 0.1,  0.9],
        [ 0.7,  0.3],
        ..., 
        [ 0. ,  1. ],
        [ 0. ,  1. ],
        [ 0.1,  0.9]]), array([[ 0.8,  0.2],
        [ 0.9,  0.1],
        [ 1. ,  0. ],
        ..., 
        [ 1. ,  0. ],
        [ 1. ,  0. ],
        [ 1. ,  0. ]]), array([[ 1.,  0.],
        [ 1.,  0.],
        [ 1.,  0.],
        ..., 
        [ 1.,  0.],
        [ 1.,  0.],
        [ 1.,  0.]]), array([[ 1.,  0.],
        [ 1.,  0.],
        [ 1.,  0.],
        ..., 
        [ 1.,  0.],
        [ 1.,  0.],
        [ 1.,  0.]]), array([[ 1.,  0.],
        [ 1.,  0.],
        [ 1.,  0.],
        ..., 
        [ 1.,  0.],
        [ 1.,  0.],
        [ 1.,  0.]])]

In [80]:
print("Predicted number of instances:\t",
sum(y_pred[:,0]),
sum(y_pred[:,1]),
sum(y_pred[:,2]),
sum(y_pred[:,3]),
sum(y_pred[:,4]),
sum(y_pred[:,5]))

print("Predicted probabilities of instances:\n",
sum(y_pred_prob[0]), '\n',
sum(y_pred_prob[1]),'\n',
sum(y_pred_prob[2]),'\n',
sum(y_pred_prob[3]),'\n',
sum(y_pred_prob[4]),'\n',
sum(y_pred_prob[5]))

print("Actual number of instances:\t",
sum(y[:,0]),
sum(y[:,1]),
sum(y[:,2]),
sum(y[:,3]),
sum(y[:,4]),
sum(y[:,5]))

Predicted number of instances:	 348605 168147 18511 5553 2367 580
Predicted probabilities of instances:
 [ 513699.88492589  357835.1150742 ] 
 [ 692681.60130509  178853.39869567] 
 [ 850268.20762262   21266.79237786] 
 [ 864584.20092999    6950.79907022] 
 [ 867846.23927891    3688.76072121] 
 [ 870614.08617525     920.91382476]
Actual number of instances:	 357726 178625 21098 6904 3616 883


In [56]:
classifier.score(X,y)

0.89591812147532801

In [43]:
from sklearn.metrics import coverage_error, label_ranking_average_precision_score, label_ranking_loss
print(coverage_error(y, results_y))
print(label_ranking_average_precision_score(y, results_y))
print(label_ranking_loss(y, results_y))

1.00632332609
0.941945405393
0.0704532998292


In [86]:
from sklearn.metrics import confusion_matrix, f1_score, fbeta_score
print(f1_score(y, y_pred, average=None))
print(f1_score(y, y_pred, average='micro'))
print(f1_score(y, y_pred, average='macro'))
print(f1_score(y, y_pred, average='samples'))
print(f1_score(y, y_pred, average='weighted'))
scores = fbeta_score(y, y_pred, beta=0.5, average=None)

[ 0.90917431  0.90349855  0.88227423  0.84017019  0.70967742  0.70813397]
0.904337978546
0.825488111613


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


0.554095122809
0.903976694961


In [102]:
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y,y_pred))
print(confusion_matrix(y[:,0],y_pred[:,0]))

             precision    recall  f1-score   support

          0       0.92      0.90      0.91    357726
          1       0.93      0.88      0.90    178625
          2       0.94      0.83      0.88     21098
          3       0.94      0.76      0.84      6904
          4       0.90      0.59      0.71      3616
          5       0.89      0.59      0.71       883

avg / total       0.93      0.88      0.90    568852

[[486293  27516]
 [ 36637 321089]]


In [103]:
from sklearn.metrics import roc_auc_score, roc_curve
print(roc_auc_score(y, y_pred))

0.871866570886


In [112]:
from sklearn.metrics import zero_one_loss
print(zero_one_loss(y,y_pred)) # Fraction of misclassifications
print(zero_one_loss(y,y_pred,normalize=False)) # num of misclassifications

0.104081878525
90711
