In [18]:
import pandas as pd
import numpy as np
import glob
import matplotlib.pyplot as plt
import datetime

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler

# regressors
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import BayesianRidge
from xgboost import XGBRegressor

# classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
import lightgbm as lgb

# for results
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import max_error
from sklearn.metrics import r2_score

pd.set_option("display.max_rows", 10)

In [2]:
# load data
division = 'm'  # s or m (s = small-sized meeting room, m = medium-sized seminar room)

# session 1
if division == 's':
    l = glob.glob(r'..\..\datasets\small-room\sess1\S=*.csv')
elif division == 'm':
    l = glob.glob(r'..\..\datasets\medium-room\sess1\S=*.csv')
l.sort()

df_fea_sess1 = []
for i in l:
    df_fea_sess1.append(pd.read_csv(i, header=None))

# session 2
if division == 's':
    l = glob.glob(r'..\..\datasets\small-room\sess2\S=*.csv')
elif division == 'm':
    l = glob.glob(r'..\..\datasets\medium-room\sess2\S=*.csv')
l.sort()

df_fea_sess2 = []
for i in l:
    df_fea_sess2.append(pd.read_csv(i, header=None))
    
# session 3
if division == 's':
    l = glob.glob(r'..\..\datasets\small-room\sess3\S=*.csv')
elif division == 'm':
    l = glob.glob(r'..\..\datasets\medium-room\sess3\S=*.csv')
l.sort()

df_fea_sess3 = []
for i in l:
    df_fea_sess3.append(pd.read_csv(i, header=None))

In [3]:
l

['..\\..\\datasets\\medium-room\\sess3\\S=0.csv',
 '..\\..\\datasets\\medium-room\\sess3\\S=1.csv',
 '..\\..\\datasets\\medium-room\\sess3\\S=2.csv',
 '..\\..\\datasets\\medium-room\\sess3\\S=3.csv',
 '..\\..\\datasets\\medium-room\\sess3\\S=4.csv']

In [4]:
# select session (sess1, sess2, sess3)

df_fea = df_fea_sess1

In [20]:
df_fea[0].shape

(399, 357)

In [6]:
# create column label (feature name)
# l(N1)_xxx(N2) >> N1: link number, N2: subcarrier number.

nof_link = 4
nof_usedsubc = 13

col_label = []

for i in range(nof_link):
    
    for j in range(nof_usedsubc):
        col_label.append('l%d_std%d' %(i+1,j+1))
        col_label.append('l%d_min%d' %(i+1,j+1))
        col_label.append('l%d_max%d' %(i+1,j+1))
        col_label.append('l%d_qtl%d' %(i+1,j+1))
        col_label.append('l%d_qtu%d' %(i+1,j+1))
        col_label.append('l%d_avg%d' %(i+1,j+1))
        
    for j in range(6):
        col_label.append('l%d_cur%d' %(i+1,j+1))
    for j in range(5):
        col_label.append('l%d_der%d' %(i+1,j+1))    

len(col_label)

356

In [7]:
# change column label

for i in range(len(df_fea)):
    df_fea[i].columns = col_label

In [8]:
df_fea[0]

Unnamed: 0,l1_std1,l1_min1,l1_max1,l1_qtl1,l1_qtu1,l1_avg1,l1_std2,l1_min2,l1_max2,l1_qtl2,...,l4_cur2,l4_cur3,l4_cur4,l4_cur5,l4_cur6,l4_der1,l4_der2,l4_der3,l4_der4,l4_der5
0,1.461229,1.403337,1.329958,1.183326,1.083925,1.030910,1.060135,0.948225,0.944712,0.878323,...,0.000003,-0.001783,0.071555,-0.451787,9.110630,8.476972e-07,0.000011,-0.005350,0.143110,-0.451787
1,1.206355,1.028218,0.993922,0.874931,0.872215,0.911993,0.961354,0.854425,0.843202,0.776964,...,0.000027,-0.002552,0.079066,-0.526390,10.690206,-2.650281e-07,0.000107,-0.007657,0.158133,-0.526390
2,0.747694,0.706687,0.675697,0.677832,0.596699,0.656809,0.696858,0.723022,0.759360,0.710899,...,0.000040,-0.003094,0.087967,-0.587742,10.941266,-7.766324e-07,0.000158,-0.009282,0.175934,-0.587742
3,0.680562,0.652428,0.617645,0.655980,0.595589,0.618400,0.705216,0.705249,0.719335,0.599678,...,0.000018,-0.002280,0.079904,-0.622525,10.893866,1.265775e-07,0.000071,-0.006841,0.159808,-0.622525
4,1.418847,1.444397,1.328114,1.280205,1.189202,1.207989,1.236722,1.037668,1.032366,0.900402,...,-0.000001,-0.001523,0.069126,-0.568937,10.664818,9.074015e-07,-0.000005,-0.004569,0.138252,-0.568937
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,2.234998,1.952983,1.642235,1.552838,1.686878,2.188347,2.466221,2.025162,1.933353,1.760320,...,0.000023,-0.001875,0.049112,-0.179824,10.048577,-3.035061e-07,0.000090,-0.005625,0.098223,-0.179824
395,2.136482,2.112427,1.756895,1.640038,1.605874,2.040877,2.245979,1.822832,1.856636,1.732509,...,0.000012,-0.001043,0.020199,0.219921,8.553810,-3.865501e-08,0.000048,-0.003129,0.040398,0.219921
396,2.320379,2.269228,1.839009,1.702551,1.535080,1.516574,1.428790,1.220235,1.294640,1.203857,...,0.000050,-0.003108,0.070145,-0.218774,8.743192,-1.336387e-06,0.000199,-0.009325,0.140289,-0.218774
397,2.107777,1.920989,1.682704,1.690026,1.659764,1.700245,1.649740,1.328510,1.216458,1.217324,...,0.000069,-0.004302,0.102801,-0.535439,8.660696,-1.910258e-06,0.000275,-0.012905,0.205601,-0.535439


In [9]:
# Ground truth

for i in range(len(df_fea)):
    df_fea[i]['S'] = i

In [10]:
df_fea[0]

Unnamed: 0,l1_std1,l1_min1,l1_max1,l1_qtl1,l1_qtu1,l1_avg1,l1_std2,l1_min2,l1_max2,l1_qtl2,...,l4_cur3,l4_cur4,l4_cur5,l4_cur6,l4_der1,l4_der2,l4_der3,l4_der4,l4_der5,S
0,1.461229,1.403337,1.329958,1.183326,1.083925,1.030910,1.060135,0.948225,0.944712,0.878323,...,-0.001783,0.071555,-0.451787,9.110630,8.476972e-07,0.000011,-0.005350,0.143110,-0.451787,0
1,1.206355,1.028218,0.993922,0.874931,0.872215,0.911993,0.961354,0.854425,0.843202,0.776964,...,-0.002552,0.079066,-0.526390,10.690206,-2.650281e-07,0.000107,-0.007657,0.158133,-0.526390,0
2,0.747694,0.706687,0.675697,0.677832,0.596699,0.656809,0.696858,0.723022,0.759360,0.710899,...,-0.003094,0.087967,-0.587742,10.941266,-7.766324e-07,0.000158,-0.009282,0.175934,-0.587742,0
3,0.680562,0.652428,0.617645,0.655980,0.595589,0.618400,0.705216,0.705249,0.719335,0.599678,...,-0.002280,0.079904,-0.622525,10.893866,1.265775e-07,0.000071,-0.006841,0.159808,-0.622525,0
4,1.418847,1.444397,1.328114,1.280205,1.189202,1.207989,1.236722,1.037668,1.032366,0.900402,...,-0.001523,0.069126,-0.568937,10.664818,9.074015e-07,-0.000005,-0.004569,0.138252,-0.568937,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,2.234998,1.952983,1.642235,1.552838,1.686878,2.188347,2.466221,2.025162,1.933353,1.760320,...,-0.001875,0.049112,-0.179824,10.048577,-3.035061e-07,0.000090,-0.005625,0.098223,-0.179824,0
395,2.136482,2.112427,1.756895,1.640038,1.605874,2.040877,2.245979,1.822832,1.856636,1.732509,...,-0.001043,0.020199,0.219921,8.553810,-3.865501e-08,0.000048,-0.003129,0.040398,0.219921,0
396,2.320379,2.269228,1.839009,1.702551,1.535080,1.516574,1.428790,1.220235,1.294640,1.203857,...,-0.003108,0.070145,-0.218774,8.743192,-1.336387e-06,0.000199,-0.009325,0.140289,-0.218774,0
397,2.107777,1.920989,1.682704,1.690026,1.659764,1.700245,1.649740,1.328510,1.216458,1.217324,...,-0.004302,0.102801,-0.535439,8.660696,-1.910258e-06,0.000275,-0.012905,0.205601,-0.535439,0


In [11]:
df_fea_all = pd.DataFrame()

for i in range(len(df_fea)):
    df_fea_all = pd.concat([df_fea_all, df_fea[i]], axis=0)

df_fea_all.reset_index(drop=True)

Unnamed: 0,l1_std1,l1_min1,l1_max1,l1_qtl1,l1_qtu1,l1_avg1,l1_std2,l1_min2,l1_max2,l1_qtl2,...,l4_cur3,l4_cur4,l4_cur5,l4_cur6,l4_der1,l4_der2,l4_der3,l4_der4,l4_der5,S
0,1.461229,1.403337,1.329958,1.183326,1.083925,1.030910,1.060135,0.948225,0.944712,0.878323,...,-0.001783,0.071555,-0.451787,9.110630,8.476972e-07,0.000011,-0.005350,0.143110,-0.451787,0
1,1.206355,1.028218,0.993922,0.874931,0.872215,0.911993,0.961354,0.854425,0.843202,0.776964,...,-0.002552,0.079066,-0.526390,10.690206,-2.650281e-07,0.000107,-0.007657,0.158133,-0.526390,0
2,0.747694,0.706687,0.675697,0.677832,0.596699,0.656809,0.696858,0.723022,0.759360,0.710899,...,-0.003094,0.087967,-0.587742,10.941266,-7.766324e-07,0.000158,-0.009282,0.175934,-0.587742,0
3,0.680562,0.652428,0.617645,0.655980,0.595589,0.618400,0.705216,0.705249,0.719335,0.599678,...,-0.002280,0.079904,-0.622525,10.893866,1.265775e-07,0.000071,-0.006841,0.159808,-0.622525,0
4,1.418847,1.444397,1.328114,1.280205,1.189202,1.207989,1.236722,1.037668,1.032366,0.900402,...,-0.001523,0.069126,-0.568937,10.664818,9.074015e-07,-0.000005,-0.004569,0.138252,-0.568937,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1990,0.917898,0.919850,0.856698,0.804004,0.727550,0.679315,0.650399,0.645671,0.707827,0.656676,...,0.000444,0.015425,-0.062696,10.424669,1.781041e-06,-0.000127,0.001333,0.030851,-0.062696,4
1991,0.993171,1.055292,0.918394,0.886796,0.840524,0.783448,0.783273,0.697142,0.768202,0.650290,...,0.001289,-0.000815,-0.047433,10.910980,2.140359e-06,-0.000183,0.003866,-0.001630,-0.047433,4
1992,0.913375,1.006806,1.072936,1.082991,1.013199,0.927840,0.883479,0.827988,0.818503,0.758421,...,0.000557,0.027969,-0.485448,11.351925,1.960868e-06,-0.000151,0.001671,0.055939,-0.485448,4
1993,0.987489,1.093021,1.092624,1.122730,1.011742,0.943294,0.923651,0.845975,0.821050,0.787015,...,0.000407,0.029154,-0.436101,10.818459,2.029731e-06,-0.000146,0.001221,0.058308,-0.436101,4


In [13]:
# set k for k-hold cross-validation
k = 7
skf = StratifiedKFold(n_splits=k)

X = df_fea_all.drop('S', axis=1)
y = df_fea_all['S']

X = X.reset_index()
X = X.drop('index', axis = 1)
y = y.reset_index()
y = y.drop('index', axis = 1)

train_index = [[] for _ in range(k)]
test_index = [[] for _ in range(k)]

n = 0
for tra_idx, tes_idx in skf.split(X,y):
    
    print('##### Stratified k-fold',n+1,'回目 #####')
    print("TRAIN: ",len(tra_idx))
    print("TEST: ",len(tes_idx))
    
    train_index[n] = tra_idx
    test_index[n] = tes_idx
    
    n += 1

##### Stratified k-fold 1 回目 #####
TRAIN:  1710
TEST:  285
##### Stratified k-fold 2 回目 #####
TRAIN:  1710
TEST:  285
##### Stratified k-fold 3 回目 #####
TRAIN:  1710
TEST:  285
##### Stratified k-fold 4 回目 #####
TRAIN:  1710
TEST:  285
##### Stratified k-fold 5 回目 #####
TRAIN:  1710
TEST:  285
##### Stratified k-fold 6 回目 #####
TRAIN:  1710
TEST:  285
##### Stratified k-fold 7 回目 #####
TRAIN:  1710
TEST:  285


In [14]:
# learning

clf_1 = RandomForestClassifier(n_estimators=749,max_depth=15.295632458713467,criterion='entropy', min_samples_split=14)
clf_2 = LogisticRegression(random_state=0,C=1.0)
clf_3 = svm.SVC(gamma=0.001, C=1.)
clf_4 = lgb.LGBMClassifier()

y_test_list = [[] for _ in range(k)]

y_pred_1_list = [[] for _ in range(k)]
y_pred_2_list = [[] for _ in range(k)]
y_pred_3_list = [[] for _ in range(k)]
y_pred_4_list = [[] for _ in range(k)]

for i in range(k):
    
    X_train = X.iloc[train_index[i]]
    X_test = X.iloc[test_index[i]]
    y_train = y.iloc[train_index[i]]
    y_test = y.iloc[test_index[i]]
    
    # Feature standardization
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train=scaler.transform(X_train)
    X_test=scaler.transform(X_test)

    X_train = pd.DataFrame(X_train)
    X_test = pd.DataFrame(X_test)
    
    # fit
    clf_1.fit(X_train, y_train)
    clf_2.fit(X_train, y_train)
    clf_3.fit(X_train, y_train)
    clf_4.fit(X_train, y_train)
    
    # prediction
    y_pred_1 = clf_1.predict(X_test)
    y_pred_2 = clf_2.predict(X_test)
    y_pred_3 = clf_3.predict(X_test)
    y_pred_4 = clf_4.predict(X_test)
    
    y_test_list[i] = y_test
    y_pred_1_list[i] = y_pred_1
    y_pred_2_list[i] = y_pred_2
    y_pred_3_list[i] = y_pred_3
    y_pred_4_list[i] = y_pred_4
    
    print(i+1)

  clf_1.fit(X_train, y_train)
  return f(**kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  return f(**kwargs)


1


  clf_1.fit(X_train, y_train)
  return f(**kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  return f(**kwargs)


2


  clf_1.fit(X_train, y_train)
  return f(**kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  return f(**kwargs)


3


  clf_1.fit(X_train, y_train)
  return f(**kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  return f(**kwargs)


4


  clf_1.fit(X_train, y_train)
  return f(**kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  return f(**kwargs)


5


  clf_1.fit(X_train, y_train)
  return f(**kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  return f(**kwargs)


6


  clf_1.fit(X_train, y_train)
  return f(**kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  return f(**kwargs)


7


In [15]:
for i in range(len(y_test_list)):
    y_test_list[i] = y_test_list[i].values

In [16]:
y_test_tot = []

y_pred_1_tot = []
y_pred_2_tot = []
y_pred_3_tot = []
y_pred_4_tot = []

for i in range(k):
    y_test_tot.extend(y_test_list[i])
    
    y_pred_1_tot.extend(y_pred_1_list[i])
    y_pred_2_tot.extend(y_pred_2_list[i])
    y_pred_3_tot.extend(y_pred_3_list[i])
    y_pred_4_tot.extend(y_pred_4_list[i])

In [19]:
# result scores

print("################### overall results ##################")
print("  ")

print("############## Random Forest #############")
print("accuracy: ",round(accuracy_score(y_test_tot, y_pred_1_tot),3))
print(classification_report(y_test_tot, y_pred_1_tot))
print(confusion_matrix(y_test_tot, y_pred_1_tot))
print("  ")

print("############## LogisticRegression #############")
print("accuracy: ",round(accuracy_score(y_test_tot, y_pred_2_tot),3))
print(classification_report(y_test_tot, y_pred_2_tot))
print(confusion_matrix(y_test_tot, y_pred_2_tot))
print("  ")

print("############## svm #############")
print("accuracy: ",round(accuracy_score(y_test_tot, y_pred_3_tot),3))
print(classification_report(y_test_tot, y_pred_3_tot))
print(confusion_matrix(y_test_tot, y_pred_3_tot))
print("  ")

print("############## LightGBM #############")
print("accuracy: ",round(accuracy_score(y_test_tot, y_pred_4_tot),3))
print(classification_report(y_test_tot, y_pred_4_tot))
print(confusion_matrix(y_test_tot, y_pred_4_tot))

################### overall results ##################
  
############## Random Forest #############
accuracy:  0.962
              precision    recall  f1-score   support

           0       0.96      0.90      0.93       399
           1       0.98      0.96      0.97       399
           2       0.98      0.98      0.98       399
           3       0.96      0.99      0.98       399
           4       0.94      0.98      0.96       399

    accuracy                           0.96      1995
   macro avg       0.96      0.96      0.96      1995
weighted avg       0.96      0.96      0.96      1995

[[358   7   6  13  15]
 [  0 383   3   2  11]
 [  6   0 390   3   0]
 [  2   0   0 397   0]
 [  6   1   0   0 392]]
  
############## LogisticRegression #############
accuracy:  0.966
              precision    recall  f1-score   support

           0       0.98      0.89      0.93       399
           1       0.98      0.97      0.98       399
           2       0.97      0.99      0.98   