In [None]:
#@title
# Upload: 
# concrete_compressive_strength.csv

# 10.1. Combinatory pattern recognition, regression

In [None]:
#@title 10.1.1. Import some necessary packages
import numpy as np
import pandas as pd
import matplotlib
import itertools # new

from matplotlib import pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error as mse

# ml models
from sklearn.svm import SVR as svr
from sklearn.neighbors import KNeighborsRegressor as knr
from sklearn.tree import DecisionTreeRegressor as dtr 
from sklearn.ensemble import RandomForestRegressor as rfr

# svr knr dtr rfr

In [None]:
#@title 10.1.2. Data processing
data      = pd.read_csv('/content/concrete_compressive_strength.csv')

# keep only the 28-day concrete strength samples, then drop the age column
# (it is constant after the filter)
ind  = data['Age (day)'].values == 28
data = data.iloc[ind,:]
data = data.drop(['Age (day)'], axis = 1 )

# short column names: everything before the first '('.
# split() returns a name without '(' unchanged, whereas slicing with
# find() == -1 would silently chop off its last character.
data_keys = list(data.keys())
keys      = [name.split('(')[0] for name in data_keys]

# every column but the last is used as an input, the last column is the output
datain = data.iloc[:,:-1].values
dataou = data.iloc[:,-1:].values

# necessary information for combinatory model
num_data      = datain.shape[0]
num_variables = datain.shape[1]


In [None]:
#@title 10.1.3. Some necessary functions
def fun_combinations(num_data, num_variables):
  """Enumerate every non-empty subset of the input variables.

  Parameters
  ----------
  num_data : int
    Not used by the enumeration; kept so existing callers keep working.
  num_variables : int
    Number of input variables to combine.

  Returns
  -------
  comb_bin : ndarray of bool, shape (num_comb, num_variables)
    One row per subset; True marks the variables that belong to it. Rows are
    ordered by subset size first, then in itertools.combinations order.
  comb_ind : list
    comb_ind[k] is the list of index tuples of all subsets of size k + 1.
  num_comb : int
    Number of subsets, 2**num_variables - 1.
  """
  num_comb = 2**num_variables - 1
  # build the mask as bool right away: the np.bool alias used for the former
  # astype() conversion no longer exists in current NumPy releases
  comb_bin = np.zeros((num_comb, num_variables), dtype = bool)
  comb_ind = []

  counter  = 0
  for size in range(1, num_variables + 1):
    ind = list(itertools.combinations(range(num_variables), size))
    comb_ind.append(ind)

    for subset in ind:
      # a list (not a tuple) makes the column selection unambiguous
      comb_bin[counter, list(subset)] = True
      counter += 1

  return comb_bin, comb_ind, num_comb

def fun_split(RTT, RRS, num_data):
  """Draw repeated random train/test index splits for several test ratios.

  The global NumPy random generator is seeded with 42 on every call, so the
  same arguments always give the same splits.

  Parameters
  ----------
  RTT : iterable of float
    Test ratios; floor(ratio * num_data) samples go to the test set.
  RRS : int
    Number of random repetitions drawn per ratio.
  num_data : int
    Total number of samples; indices run from 0 to num_data - 1.

  Returns
  -------
  index_tr, index_te : list of list of list of int
    index_tr[i][j] / index_te[i][j] hold the training / testing indices of
    repetition j for ratio RTT[i].
  """
  np.random.seed(42)

  index_tr, index_te = [], []

  for rtt in RTT:
    # number of samples that end up in the test set for this ratio
    cut = int(np.floor(rtt*num_data))

    tr_runs, te_runs = [], []
    for _ in range(RRS):
      shuffled = np.random.permutation(list(range(num_data))).tolist()

      # head of the shuffled indices -> test, remainder -> train
      te_runs.append(shuffled[:cut])
      tr_runs.append(shuffled[cut:])

    index_te.append(te_runs)
    index_tr.append(tr_runs)

  return index_tr, index_te

def fun_prep(datain, dataou, comb_bin, index_tr, index_te, lb, ub):
  """Select one variable combination and one split, then min-max scale it.

  Both scalers are fitted on the training part only and then applied to the
  training and the testing part.

  Parameters
  ----------
  datain, dataou : ndarray
    Input and output arrays, indexed as [sample, column].
  comb_bin : column selector
    Columns of datain to keep (the main run passes one boolean mask row).
  index_tr, index_te : list of int
    Row indices of the training and testing samples.
  lb, ub : float
    Lower and upper bound of the scaled feature range.

  Returns
  -------
  Scaled training inputs, training outputs, testing inputs, testing outputs
  (in that order).
  """
  # rows and columns are picked in two steps; a single subscript would try to
  # pair the row index list with the column selector element by element
  x_train = datain[index_tr, :][:, comb_bin]
  x_test  = datain[index_te, :][:, comb_bin]
  y_train = dataou[index_tr,:]
  y_test  = dataou[index_te,:]

  # the scaler needs 2-D arrays: turn a 1-D selection into a single column
  if x_train.ndim == 1:
    x_train = x_train[:, np.newaxis]
    x_test  = x_test[:, np.newaxis]

  if y_train.ndim == 1:
    y_train = y_train[:, np.newaxis]
    y_test  = y_test[:, np.newaxis]

  x_scaler = MinMaxScaler(feature_range=(lb,ub)).fit(x_train)
  y_scaler = MinMaxScaler(feature_range=(lb,ub)).fit(y_train)

  return (x_scaler.transform(x_train),
          y_scaler.transform(y_train),
          x_scaler.transform(x_test),
          y_scaler.transform(y_test))
 

def fun_ml(datain_tr, dataou_tr, datain_te, dataou_te, model_type):
  """Fit one regressor with its default settings and report train/test MSE.

  Parameters
  ----------
  datain_tr, dataou_tr : array-like
    Training inputs and outputs.
  datain_te, dataou_te : array-like
    Testing inputs and outputs.
  model_type : str
    One of 'SVR', 'KNR', 'DTR' or 'RFR'.

  Returns
  -------
  dict
    Keys 'model_type', 'mse_tr' and 'mse_te'.

  Raises
  ------
  ValueError
    If model_type is not one of the supported names. (A print alone would let
    execution continue with no model defined and fail later with an unrelated
    error.)
  """
  # validate first so an unknown name fails with a clear message
  if model_type not in ('SVR', 'KNR', 'DTR', 'RFR'):
    raise ValueError(
      "Model type has not been defined! Got {!r}; expected one of "
      "'SVR', 'KNR', 'DTR', 'RFR'.".format(model_type))

  def _single_column_to_1d(y):
    # a (n, 1) column becomes (n,), the 1-D target shape the regressors
    # expect; any other shape is passed through untouched
    y = np.asarray(y)
    if y.ndim == 2 and y.shape[1] == 1:
      return y.ravel()
    return y

  if model_type == 'SVR':
    mdl = svr()
  elif model_type == 'KNR':
    mdl = knr()
  elif model_type == 'DTR':
    mdl = dtr()
  else:
    mdl = rfr()

  target_tr = _single_column_to_1d(dataou_tr)
  target_te = _single_column_to_1d(dataou_te)

  mdl.fit(datain_tr, target_tr)

  result = {}
  result['model_type'] = model_type
  result['mse_tr']     = mse(target_tr, mdl.predict(datain_tr))
  result['mse_te']     = mse(target_te, mdl.predict(datain_te))

  return result


In [None]:
#@title 10.1.4. Combinatory main run
import warnings
warnings.filterwarnings("ignore")

# models, test ratios and number of random repetitions per ratio
# (for a quick trial run, shorten model_type and RTT, e.g. to their first two entries)
model_type = ['SVR', 'KNR', 'DTR', 'RFR']
RTT        = [0.1, 0.2, 0.3]
RRS        = 10

comb_bin, _, num_comb = fun_combinations(num_data, num_variables)
index_tr, index_te    = fun_split(RTT, RRS, num_data)

# one result dict per (test ratio, repetition, combination, model)
result = np.empty((len(RTT), RRS, num_comb, len(model_type)), dtype = dict)

num_runs = num_comb * len(RTT) * RRS * len(model_type)
progress = "Comb #: {:05d}/{} | RTT #: {:05d}/{} | RRS #: {:05d}/{} | Model: {} | Completed {:07.3F}%"

counter = 1
# combination is the slowest-varying index, repetition the fastest
for i0, i1, i2 in itertools.product(range(num_comb), range(len(RTT)), range(RRS)):
  # the prepared data is shared by all models of this (combination, ratio, repetition)
  datain_tr, dataou_tr, datain_te, dataou_te = fun_prep(
    datain, dataou, comb_bin[i0,:], index_tr[i1][i2], index_te[i1][i2], 0, 1)

  for i3, name in enumerate(model_type):
    result[i1,i2,i0,i3] = fun_ml(datain_tr, dataou_tr, datain_te, dataou_te, name)
    print(progress.format(i0+1, num_comb, i1+1, len(RTT), i2+1, RRS, name, counter/num_runs*100))
    counter += 1


In [None]:
#@title 10.1.5. Result function
def fun_result(result, model_type, comb_bin):
  """Average the MSEs of every combination and rank the combinations per model.

  Parameters
  ----------
  result : ndarray of dict, shape (num_rtt, num_rrs, num_comb, num_model)
    Each entry provides the keys 'mse_tr' and 'mse_te'.
  model_type : sequence of str
    Model names; model_type[k] labels the last axis of result.
  comb_bin : ndarray, shape (num_comb, ...)
    One row per combination; rows are reordered to follow the rankings.

  Returns
  -------
  dict
    For every model name a dict with
    'mse_tr&te'   : [training MSE ascending, testing MSE in that same order],
    'comb_bin_tr' : comb_bin rows ordered by training MSE,
    'mse_te&tr'   : [testing MSE ascending, training MSE in that same order],
    'comb_bin_te' : comb_bin rows ordered by testing MSE.
  """
  num_rtt, num_rrs, num_comb, num_model = result.shape

  def _mean_over_runs(i_comb, i_model, key):
    # mean of one metric over all (ratio, repetition) runs of a combination
    grid = np.array([[result[r, s, i_comb, i_model][key] for s in range(num_rrs)]
                     for r in range(num_rtt)], dtype = float)
    return grid.mean()

  result_summary = {}

  for i_model in range(num_model):
    # column vectors with one averaged MSE per combination
    res_tr = np.array([_mean_over_runs(c, i_model, 'mse_tr') for c in range(num_comb)],
                      dtype = float).reshape(num_comb, 1)
    res_te = np.array([_mean_over_runs(c, i_model, 'mse_te') for c in range(num_comb)],
                      dtype = float).reshape(num_comb, 1)

    order_tr = np.argsort(res_tr, axis = 0)[:,0]
    order_te = np.argsort(res_te, axis = 0)[:,0]

    summary = {}
    summary['mse_tr&te']   = [np.sort(res_tr, axis = 0)[:,0], res_te[order_tr,0]]
    summary['comb_bin_tr'] = comb_bin[order_tr,:]
    summary['mse_te&tr']   = [np.sort(res_te, axis = 0)[:,0], res_tr[order_te,0]]
    summary['comb_bin_te'] = comb_bin[order_te,:]

    result_summary[model_type[i_model]] = summary

  return result_summary


In [None]:
#@title 10.1.6. Results and summaries
result_summary = fun_result(result, model_type, comb_bin)

#########################################
# Create a report about best combinations
#########################################
# Two rows per model:
#   "<model> Training" -> combination with the lowest mean training MSE
#   "<model> Testing"  -> combination with the lowest mean testing MSE
df = {}
# add combinations (1 = variable is part of the best combination, 0 = it is not)
for i0 in range(datain.shape[1]):
  comb_best = []
  for i1 in range(len(model_type)):
    comb_best_tr = result_summary[model_type[i1]]['comb_bin_tr'][0,i0]*1
    comb_best_te = result_summary[model_type[i1]]['comb_bin_te'][0,i0]*1

    comb_best.append(comb_best_tr)
    comb_best.append(comb_best_te)

  df[keys[i0]] = comb_best

# add mse and data frame index (i.e., row names)
index       = []
mse_best_tr = []
mse_best_te = []
for i1 in range(len(model_type)):
  summary = result_summary[model_type[i1]]

  # 'mse_tr&te' is [training MSE, testing MSE], ranked by training MSE
  mse_best_tr.append(summary['mse_tr&te'][0][0])
  mse_best_te.append(summary['mse_tr&te'][1][0])

  # 'mse_te&tr' is [testing MSE, training MSE], ranked by testing MSE:
  # element 0 is the testing value, element 1 the training value, so the
  # two must be routed to the opposite lists compared with 'mse_tr&te'
  mse_best_tr.append(summary['mse_te&tr'][1][0])
  mse_best_te.append(summary['mse_te&tr'][0][0])

  index.append(model_type[i1] + ' Training')
  index.append(model_type[i1] + ' Testing')

df['MSE Training'] = mse_best_tr
df['MSE Testing']  = mse_best_te

df = pd.DataFrame(df, index=index)
df.to_csv('report.csv')
print("Best Combinations and Their Corresponding MSEs")
print("----------------------------------------------")
print(df)

# keep the report under its own name; df is reused further down
df_summary = df

#######################################
# Create a report about selection rates
#######################################
# Selection rate of a variable at rank k: share (in %) of the k best
# combinations (by mean training MSE) that contain the variable.
col_val    = ['bo-','bs-','bd-','b*-','bo--','bs--','bd--','b*--','bo:','bs:','bd:','b*:']
for i1 in range(len(model_type)):
  comb_cumsum = np.cumsum(result_summary[model_type[i1]]['comb_bin_tr']*1, axis = 0)
  # row r holds the counts over the best r + 1 combinations, so divide by r + 1
  comb_denom  = np.arange(1, comb_cumsum.shape[0]+1)[:, np.newaxis]
  comb_rate   = comb_cumsum / comb_denom  * 100

  filename = 'Selection_Rates_' +  model_type[i1] + '.csv'
  comb_dict = {}

  # top 10% num
  num_top  = int(comb_rate.shape[0] * 0.1)
  # rank k lives in row k - 1: plot rows num_top-1 ... 0 so that the points
  # line up with the tick labels num_top ... 1 and the best combination
  # (row 0) is included
  top_rows = np.arange(num_top-1, -1, -1)

  plt.figure(figsize = [7,7])
  for i2 in range(len(keys[:-1])):
    comb_dict[keys[i2]] = comb_rate[:,i2]

    # plot top 10% selection rates
    plt.plot(comb_rate[top_rows,i2],
             col_val[i2 % len(col_val)], # wrap around if there are more variables than styles
             linewidth = 1,
             markersize = 5,
             markerfacecolor = 'w')

  plt.xlabel('Combination #', fontsize = 20)
  plt.ylabel('Selection Rate (%)', fontsize = 20)
  plt.title(model_type[i1], fontsize = 20)
  plt.xticks(np.arange(num_top), np.arange(num_top,0,-1))
  plt.grid(color=[.75,.75,.75], linestyle='-', linewidth=1, which='both')
  plt.ylim([0,105])
  plt.legend(keys[:-1], fontsize = 14)

  # full selection-rate table (all ranks) of this model
  df = pd.DataFrame(comb_dict)
  df.to_csv(filename)
