In [None]:
#@title
# Upload: 
# electrical_grid_stability_simulated_data.csv

# 10.2. Combinatory pattern recognition, classification

In [None]:
#@title 10.2.1. Import some necessary packages
import numpy as np
import pandas as pd
import matplotlib
import itertools # new

from matplotlib import pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error as mse

# ml models
from sklearn.svm import SVC as svc
from sklearn.neighbors import KNeighborsClassifier as knc
from sklearn.tree import DecisionTreeClassifier as dtc
from sklearn.ensemble import RandomForestClassifier as rfc

import warnings
warnings.filterwarnings("ignore")

from multiprocessing import Pool
import time


In [None]:
#@title 10.2.2. Data processing
data = pd.read_csv('/content/electrical_grid_stability_simulated_data.csv')

# Inputs are the first 12 columns; the output is the last column, which this
# cell expects to contain the strings 'stable' / 'unstable'.
datain = data.iloc[:, 0:12].values

# Encode the labels in one vectorized pass (stable -> 1, unstable -> 0).
# Values that are not in the table pass through unchanged, so anything that is
# not numeric still fails loudly in astype(float), as it did before.
# Working on a fresh Series avoids assigning into a slice of `data` and avoids
# the element-by-element object -> float copy.
label_codes = {'stable': 1.0, 'unstable': 0.0}
labels = data.iloc[:, -1].map(lambda value: label_codes.get(value, value))

# keep the (num_data, 1) column layout: fun_prep indexes dataou[index, :]
dataou = labels.astype(float).to_numpy().reshape(-1, 1)

# define some simple keys: 'feature 01', 'feature 02', ...
data_keys = list(data.keys())
keys = ['feature {:02d}'.format(i0 + 1) for i0 in range(len(data_keys))]

# keep at most the first 13 keys (the plotting cell later uses keys[:-1])
keys = keys[:13]

# necessary information for combinatory model
num_data      = datain.shape[0]
num_variables = datain.shape[1]


In [None]:
#@title 10.2.3. Some necessary functions
def fun_combinations(num_data, num_variables):
  """Enumerate every non-empty subset of the input variables.

  Parameters
  ----------
  num_data : int
      Not used by the computation; kept so existing callers keep working.
  num_variables : int
      Number of input variables (columns) to combine.

  Returns
  -------
  comb_bin : ndarray of bool, shape (2**num_variables - 1, num_variables)
      One row per combination; True marks the variables that are selected.
      Rows are ordered by subset size (1, 2, ...) and, within one size, in
      the order produced by itertools.combinations.
  comb_ind : list of list of tuple
      comb_ind[k] holds the index tuples of all subsets of size k + 1.
  num_comb : int
      Total number of combinations, 2**num_variables - 1.
  """
  num_comb = 2**num_variables - 1
  # builtin bool instead of the np.bool alias, which NumPy 1.24 removed
  comb_bin = np.zeros((num_comb, num_variables), dtype=bool)
  comb_ind = []

  row = 0
  for size in range(1, num_variables + 1):
    subsets = list(itertools.combinations(range(num_variables), size))
    comb_ind.append(subsets)

    for subset in subsets:
      comb_bin[row, list(subset)] = True
      row += 1

  return comb_bin, comb_ind, num_comb

def fun_split(RTT, RRS, num_data):
  """Draw repeated random train/test index splits for every test ratio.

  The global NumPy seed is reset to 42 on every call, so repeated calls with
  the same arguments return the same splits.

  Parameters
  ----------
  RTT : iterable of float
      Test ratios; each split puts floor(rtt * num_data) rows into testing.
  RRS : int
      Number of random repetitions drawn per test ratio.
  num_data : int
      Number of rows to split.

  Returns
  -------
  index_tr, index_te : list of list of list of int
      index_tr[i][j] / index_te[i][j] are the training / testing row indices
      for test ratio i and repetition j.
  """
  np.random.seed(42)

  index_tr, index_te = [], []

  for ratio in RTT:
    tr_runs, te_runs = [], []

    for _ in range(RRS):
      shuffled = np.random.permutation(list(range(num_data))).tolist()
      cut      = int(np.floor(ratio*num_data))

      # the head of the permutation is the test set, the tail is training
      te_runs.append(shuffled[:cut])
      tr_runs.append(shuffled[cut:])

    index_te.append(te_runs)
    index_tr.append(tr_runs)

  return index_tr, index_te

def fun_prep(datain, dataou, comb_bin, index_tr, index_te, lb, ub):
  """Build the scaled train/test inputs and raw outputs for one split.

  Parameters
  ----------
  datain, dataou : ndarray
      Full input and output arrays, indexed by row.
  comb_bin : column selector
      Applied to the second axis of datain (callers pass one boolean row of
      the combination matrix).
  index_tr, index_te : list of int
      Row indices of the training and testing samples.
  lb, ub : float
      Lower / upper bound of the min-max feature range.

  Returns
  -------
  (inputs_tr_scaled, outputs_tr, inputs_te_scaled, outputs_te)
      Inputs are scaled with a MinMaxScaler fitted on the training inputs
      only; outputs are returned unscaled.
  """
  # rows first, then columns: two separate indexing steps avoid the
  # "mismatch" error raised when both index arrays are combined in one step
  x_tr = datain[index_tr, :][:, comb_bin]
  x_te = datain[index_te, :][:, comb_bin]
  y_tr = dataou[index_tr, :]
  y_te = dataou[index_te, :]

  # promote 1-D selections to column vectors
  if x_tr.ndim == 1:
    x_tr = x_tr[:, np.newaxis]
    x_te = x_te[:, np.newaxis]

  if y_tr.ndim == 1:
    y_tr = y_tr[:, np.newaxis]
    y_te = y_te[:, np.newaxis]

  in_scaler = MinMaxScaler(feature_range=(lb,ub))
  in_scaler.fit(x_tr)

  # an output scaler is fitted as well, but it is not applied:
  # the outputs are handed back as they are
  out_scaler = MinMaxScaler(feature_range=(lb,ub))
  out_scaler.fit(y_tr)

  x_tr_scaled = in_scaler.transform(x_tr)
  x_te_scaled = in_scaler.transform(x_te)

  return x_tr_scaled, y_tr, x_te_scaled, y_te
 

def fun_ml(datain_tr, dataou_tr, datain_te, dataou_te, model_type):
  """Fit one classifier and report its training / testing accuracy.

  Parameters
  ----------
  datain_tr, dataou_tr : ndarray
      Training inputs and targets.
  datain_te, dataou_te : ndarray
      Testing inputs and targets.
  model_type : str
      One of 'SVC', 'KNC', 'DTC', 'RFC'; each is built with its default
      constructor arguments.

  Returns
  -------
  dict
      {'model_type': model_type, 'acc_tr': ..., 'acc_te': ...} where the
      accuracies come from fun_accuracy.

  Raises
  ------
  ValueError
      If model_type is not one of the supported names. (Previously a message
      was printed and the function then crashed on the undefined model.)
  """
  if model_type == 'SVC':
    mdl = svc()
  elif model_type == 'KNC':
    mdl = knc()
  elif model_type == 'DTC':
    mdl = dtc()
  elif model_type == 'RFC':
    mdl = rfc()
  else:
    raise ValueError(
        "Model type has not been defined! Got {!r}; expected one of "
        "'SVC', 'KNC', 'DTC', 'RFC'.".format(model_type))

  mdl.fit(datain_tr, dataou_tr)
  dataes_tr = mdl.predict(datain_tr)
  dataes_te = mdl.predict(datain_te)

  # turn 1-D predictions into columns before they are compared with the
  # targets in fun_accuracy
  if len(dataes_tr.shape) == 1:
    dataes_tr = np.expand_dims(dataes_tr, axis = 1)

  if len(dataes_te.shape) == 1:
    dataes_te = np.expand_dims(dataes_te, axis = 1)

  result = {}
  result['model_type'] = model_type
  result['acc_tr']     = fun_accuracy(dataes_tr, dataou_tr)
  result['acc_te']     = fun_accuracy(dataes_te, dataou_te)

  return result

def fun_accuracy(dataes, dataou):
  """Return the fraction of predictions that equal their targets.

  Parameters
  ----------
  dataes : array-like
      Predicted values.
  dataou : array-like
      Target values; its first dimension is the number of samples.

  Returns
  -------
  float
      1 - (number of non-zero differences) / dataou.shape[0].
  """
  dataes = np.asarray(dataes)
  dataou = np.asarray(dataou)

  # A 1-D vector against an (n, 1) column would broadcast to an (n, n) matrix
  # and produce a meaningless error count; align the shapes when both hold the
  # same number of elements.
  if dataes.shape != dataou.shape and dataes.size == dataou.size:
    dataes = dataes.reshape(dataou.shape)

  num_err  = np.count_nonzero(dataes - dataou)
  accuracy = 1 - num_err/dataou.shape[0]
  return accuracy

In [None]:
#@title 10.2.4. Combinatory main run (ordinary)
model_type = ['KNC', 'DTC']
RTT        = [0.1, 0.2]
RRS        = 2

comb_bin, comb_ind, num_comb = fun_combinations(num_data, num_variables)
index_tr, index_te           = fun_split(RTT, RRS, num_data)

# Only the first 50 combinations (or all of them, if there are fewer) are
# evaluated. The cap is applied BEFORE `result` is allocated so that every
# slot of `result` gets filled; fun_result loops over result.shape and would
# otherwise hit unfilled (None) entries.
num_comb = min(50, num_comb)

result = np.empty((len(RTT), RRS, num_comb, len(model_type)), dtype = dict)

time_start = time.time()

num_runs = num_comb * len(RTT) * RRS * len(model_type)
counter  = 1
for i0 in range(num_comb):
  for i1 in range(len(RTT)):
    for i2 in range(RRS):
      # the prepared split is shared by all models of this (combination, ratio, repetition)
      datain_tr, dataou_tr, datain_te, dataou_te = fun_prep(datain, dataou, comb_bin[i0,:], index_tr[i1][i2], index_te[i1][i2], 0, 1)
      for i3 in range(len(model_type)):
        result[i1,i2,i0,i3] = fun_ml(datain_tr, dataou_tr, datain_te, dataou_te, model_type[i3])
        print("Comb #: {:05d}/{} | RTT #: {:05d}/{} | RRS #: {:05d}/{} | Model: {} | Completed {:07.3F}%".format(i0+1, num_comb, i1+1, len(RTT), i2+1, RRS, model_type[i3], counter/num_runs*100))
        counter = counter + 1

run_time = time.time() - time_start
print(run_time)

In [None]:
#@title 10.2.5.1 Combinatory main run (parallel) - part 01
# define some helper functions
def fun_rttrrs(i0):
  """Run every (test ratio, repetition, model) setting for combination i0.

  Reads the notebook-level names RTT, RRS, model_type, datain, dataou,
  comb_bin, index_tr, index_te and num_comb.

  Returns an object array of shape (len(RTT), RRS, 1, len(model_type)) whose
  entries are the dictionaries returned by fun_ml.
  """
  out = np.empty((len(RTT), RRS, 1, len(model_type)), dtype = dict)

  for rtt_idx in range(len(RTT)):
    for rrs_idx in range(RRS):
      # one prepared split is reused by all models
      x_tr, y_tr, x_te, y_te = fun_prep(datain, dataou, comb_bin[i0,:], index_tr[rtt_idx][rrs_idx], index_te[rtt_idx][rrs_idx], 0, 1)

      for mdl_idx in range(len(model_type)):
        mdl_name = model_type[mdl_idx]
        out[rtt_idx,rrs_idx,0,mdl_idx] = fun_ml(x_tr, y_tr, x_te, y_te, mdl_name)
        print("Comb #: {:05d}/{} | RTT #: {:05d}/{} | RRS #: {:05d}/{} | Model: {}".format(i0+1, num_comb, rtt_idx+1, len(RTT), rrs_idx+1, RRS, mdl_name))

  return out

def fun_parallel(pool_size):
  """Evaluate all combinations in parallel and gather the results.

  Reads the notebook-level names RTT, RRS, num_comb and model_type, and
  dispatches fun_rttrrs once per combination index.

  Parameters
  ----------
  pool_size : int
      Number of worker processes.

  Returns
  -------
  ndarray of object, shape (len(RTT), RRS, num_comb, len(model_type))
      Slice [:, :, i0, :] holds the block returned by fun_rttrrs(i0).
  """
  result = np.empty((len(RTT), RRS, num_comb, len(model_type)), dtype = dict)

  # define the loop counters (one argument tuple per combination)
  par_loop = [(i0,) for i0 in range(num_comb)]

  # The context manager shuts the worker processes down when the block exits,
  # including when a task raises; starmap blocks until all results are in.
  with Pool(pool_size) as my_pool:
    result_list = my_pool.starmap(fun_rttrrs, par_loop)

  # gather results
  for i0, partial in enumerate(result_list):
    result[:,:,i0:i0+1,:] = partial

  return result

In [None]:
#@title 10.2.5.2 Combinatory main run (parallel) - part 02

model_type = ['KNC', 'DTC']
RTT        = [0.1, 0.2]
RRS        = 2
pool_size  = 2

comb_bin, comb_ind, num_comb = fun_combinations(num_data, num_variables)
index_tr, index_te           = fun_split(RTT, RRS, num_data)

# Evaluate only the first 50 combinations, but never more than exist:
# fun_rttrrs indexes comb_bin[i0, :] for every i0 < num_comb.
num_comb = min(50, num_comb)

time_start = time.time()

# fun_parallel reads model_type, RTT, RRS and num_comb from this cell
result = fun_parallel(pool_size)

run_time = time.time() - time_start
print(run_time)

In [None]:
#@title 10.2.6. Result function
def fun_result(result, model_type, comb_bin):
  """Average the accuracies over all splits and rank the combinations.

  Parameters
  ----------
  result : ndarray of dict, shape (num_rtt, num_rrs, num_comb, num_model)
      Each entry provides the keys 'acc_tr' and 'acc_te'.
  model_type : sequence of str
      Model names; model_type[k] labels the last axis of result.
  comb_bin : ndarray
      Combination matrix; row i describes combination i.

  Returns
  -------
  dict
      One entry per model name with
      'acc_tr&te'   : [training accuracies sorted ascending,
                       testing accuracies in that same order],
      'comb_bin_tr' : rows of comb_bin in ascending training-accuracy order,
      'acc_te&tr'   : [testing accuracies sorted ascending,
                       training accuracies in that same order],
      'comb_bin_te' : rows of comb_bin in ascending testing-accuracy order.
  """
  n_rtt, n_rrs, n_comb, n_model = result.shape
  result_summary = {}

  for model_idx in range(n_model):
    avg_tr = np.empty((n_comb, 1))
    avg_te = np.empty((n_comb, 1))

    for comb_idx in range(n_comb):
      # accuracy of this combination for every (test ratio, repetition)
      grid_tr = np.empty((n_rtt, n_rrs))
      grid_te = np.empty((n_rtt, n_rrs))

      for rtt_idx in range(n_rtt):
        for rrs_idx in range(n_rrs):
          entry = result[rtt_idx, rrs_idx, comb_idx, model_idx]
          grid_tr[rtt_idx, rrs_idx] = entry['acc_tr']
          grid_te[rtt_idx, rrs_idx] = entry['acc_te']

      avg_tr[comb_idx, 0] = grid_tr.mean()
      avg_te[comb_idx, 0] = grid_te.mean()

    # ascending order: the highest accuracy ends up last
    order_tr  = np.argsort(avg_tr, axis = 0)[:, 0]
    order_te  = np.argsort(avg_te, axis = 0)[:, 0]
    sorted_tr = np.sort(avg_tr, axis = 0)[:, 0]
    sorted_te = np.sort(avg_te, axis = 0)[:, 0]

    result_summary[model_type[model_idx]] = {
        'acc_tr&te'  : [sorted_tr, avg_te[order_tr, 0]],
        'comb_bin_tr': comb_bin[order_tr, :],
        'acc_te&tr'  : [sorted_te, avg_tr[order_te, 0]],
        'comb_bin_te': comb_bin[order_te, :],
    }

  return result_summary


In [None]:
#@title 10.2.7. Results and summaries
result_summary = fun_result(result, model_type, comb_bin)

#########################################
# Create a report about best combinations
#########################################
# Two rows per model: "<model> Training" is the combination with the highest
# training accuracy, "<model> Testing" the one with the highest testing
# accuracy. fun_result sorts in ascending order, so the best entry is [-1].
df = {}

# add combinations: one column per input feature, 1 if the feature is selected
for i0 in range(datain.shape[1]):
  comb_best = []
  for i1 in range(len(model_type)):
    model_summary = result_summary[model_type[i1]]
    comb_best.append(model_summary['comb_bin_tr'][-1,i0]*1)
    comb_best.append(model_summary['comb_bin_te'][-1,i0]*1)

  df[keys[i0]] = comb_best

# add accuracies and data frame index (i.e., row names)
index       = []
acc_best_tr = []
acc_best_te = []
for i1 in range(len(model_type)):
  model_summary = result_summary[model_type[i1]]

  # 'acc_tr&te' is [training, testing], ordered by training accuracy
  acc_best_tr.append(model_summary['acc_tr&te'][0][-1])
  acc_best_te.append(model_summary['acc_tr&te'][1][-1])

  # 'acc_te&tr' is [testing, training], ordered by testing accuracy:
  # element 1 is the training value and element 0 the testing value
  acc_best_tr.append(model_summary['acc_te&tr'][1][-1])
  acc_best_te.append(model_summary['acc_te&tr'][0][-1])

  index.append(model_type[i1] + ' Training')
  index.append(model_type[i1] + ' Testing')

df['Accuracy Training'] = acc_best_tr
df['Accuracy Testing']  = acc_best_te

df = pd.DataFrame(df, index=index)
df.to_csv('report.csv')
print("Best Combinations and Their Corresponding Accuracies")
print("----------------------------------------------")
print(df)

df_summary = df

#######################################
# Create a report about selection rates
#######################################
# Selection rate of a feature at rank k = percentage of the k best
# combinations (by training accuracy) that include the feature.
col_val    = ['bo-','bs-','bd-','b*-','bo--','bs--','bd--','b*--','bo:','bs:','bd:','b*:', 'bo-.','bs-.','bd-.','b*-.']
for i1 in range(len(model_type)):
  # fun_result orders the combinations by ASCENDING training accuracy, so the
  # best combination is the last row. Reverse the order so that row 0 is the
  # best one and the running rate in row k covers the k+1 best combinations.
  comb_ranked = result_summary[model_type[i1]]['comb_bin_tr'][::-1]*1
  comb_cumsum = np.cumsum(comb_ranked, axis = 0)
  comb_denom  = np.arange(1, comb_cumsum.shape[0]+1)[:, np.newaxis]
  comb_rate   = comb_cumsum / comb_denom  * 100

  filename = 'Selection_Rates_' +  model_type[i1] + '.csv'
  comb_dict = {}

  # top 10% num
  num_top      = int(comb_rate.shape[0] * 0.1)
  # one key per column of comb_rate
  feature_keys = keys[:comb_rate.shape[1]]

  plt.figure(figsize = [7,7])
  for i2 in range(len(feature_keys)):
    comb_dict[feature_keys[i2]] = comb_rate[:,i2]

    # plot top 10% selection rates; the x axis runs from "top num_top" down to
    # "top 1", matching the tick labels set below
    plt.plot(comb_rate[:num_top,i2][::-1],
             col_val[i2 % len(col_val)],
             linewidth = 1,
             markersize = 5,
             markerfacecolor = 'w')

  plt.xlabel('Combination #', fontsize = 20)
  plt.ylabel('Selection Rate (%)', fontsize = 20)
  plt.title(model_type[i1], fontsize = 20)
  plt.xticks(np.arange(num_top), np.arange(num_top,0,-1))
  plt.grid(color=[.75,.75,.75], linestyle='-', linewidth=1, which='both')
  plt.ylim([0,105])
  plt.legend(feature_keys, fontsize = 14)

  # full selection-rate table (row 0 = best combination only)
  df = pd.DataFrame(comb_dict)
  df.to_csv(filename)
