In [1]:
import pandas as pd
import numpy as np
import glob
import matplotlib.pyplot as plt
import datetime

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler

# regressors
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import BayesianRidge
from xgboost import XGBRegressor

# classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
import lightgbm as lgb

# for results
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import max_error
from sklearn.metrics import r2_score

pd.set_option("display.max_rows", 10)

In [2]:
# load data
division = 's'  # s or m (s = small-sized meeting room, m = medium-sized seminar room)

# session 1
if division == 's':
    l = glob.glob(r'..\..\datasets\small-room\sess1\P=*.csv')
elif division == 'm':
    l = glob.glob(r'..\..\datasets\medium-room\sess1\P=*.csv')
l.sort()

df_fea_sess1 = []
for i in l:
    df_fea_sess1.append(pd.read_csv(i, header=None))

# session 2
if division == 's':
    l = glob.glob(r'..\..\datasets\small-room\sess2\P=*.csv')
elif division == 'm':
    l = glob.glob(r'..\..\datasets\medium-room\sess2\P=*.csv')
l.sort()

df_fea_sess2 = []
for i in l:
    df_fea_sess2.append(pd.read_csv(i, header=None))
    
# session 3
if division == 's':
    l = glob.glob(r'..\..\datasets\small-room\sess3\P=*.csv')
elif division == 'm':
    l = glob.glob(r'..\..\datasets\medium-room\sess3\P=*.csv')
l.sort()

df_fea_sess3 = []
for i in l:
    df_fea_sess3.append(pd.read_csv(i, header=None))

In [3]:
l

['..\\..\\datasets\\small-room\\sess3\\P=0.csv',
 '..\\..\\datasets\\small-room\\sess3\\P=1.csv',
 '..\\..\\datasets\\small-room\\sess3\\P=2.csv',
 '..\\..\\datasets\\small-room\\sess3\\P=3.csv',
 '..\\..\\datasets\\small-room\\sess3\\P=4.csv',
 '..\\..\\datasets\\small-room\\sess3\\P=5.csv']

In [4]:
# In case of medium-room, bring P=10 dataset to behind

if division == 'm':
    # session 1
    temp1 = df_fea_sess1[2]
    del df_fea_sess1[2]
    df_fea_sess1.append(temp1)

    # session 2
    temp2 = df_fea_sess2[2]
    del df_fea_sess2[2]
    df_fea_sess2.append(temp2)

    # session 3
    temp3 = df_fea_sess3[2]
    del df_fea_sess3[2]
    df_fea_sess3.append(temp3)
    
    print('done.')

In [5]:
# select session (sess1, sess2, sess3)

df_fea = df_fea_sess1

In [6]:
df_fea[0].shape

(199, 420)

In [7]:
# create column label (feature name)
# l(N1)_xxx(N2) >> N1: link number, N2: subcarrier number.

nof_link = 4
nof_usedsubc = 13

col_label = []

for i in range(nof_link):
    
    for j in range(nof_usedsubc):
        col_label.append('l%d_std%d' %(i+1,j+1))
        col_label.append('l%d_min%d' %(i+1,j+1))
        col_label.append('l%d_max%d' %(i+1,j+1))
        col_label.append('l%d_qtl%d' %(i+1,j+1))
        col_label.append('l%d_qtu%d' %(i+1,j+1))
        col_label.append('l%d_avg%d' %(i+1,j+1))
        col_label.append('l%d_iqr%d' %(i+1,j+1))
        
    for j in range(nof_usedsubc-1):
        col_label.append('l%d_adj%d' %(i+1,j+1))
        
    col_label.append('l%d_euc' %(i+1))
    col_label.append('l%d_rss' %(i+1))

len(col_label)

420

In [8]:
# change column label

for i in range(len(df_fea)):
    df_fea[i].columns = col_label

In [9]:
df_fea[0]

Unnamed: 0,l1_std1,l1_min1,l1_max1,l1_qtl1,l1_qtu1,l1_avg1,l1_iqr1,l1_std2,l1_min2,l1_max2,...,l4_adj5,l4_adj6,l4_adj7,l4_adj8,l4_adj9,l4_adj10,l4_adj11,l4_adj12,l4_euc,l4_rss
0,0.579245,0.590528,0.465022,0.539145,0.396148,0.454259,0.364344,0.451833,0.414886,0.419528,...,1.181186,2.544215,1.248014,1.204240,1.439531,1.244940,2.025076,1.269062,1.241139,1.122604
1,0.555057,0.509983,0.473291,0.460764,0.434179,0.394327,0.375031,0.393596,0.373684,0.384711,...,1.195286,2.110777,1.196166,1.070712,1.192843,1.315626,1.559272,1.228392,1.505176,0.611211
2,0.459749,0.445092,0.492385,0.480850,0.459144,0.415320,0.329702,0.436089,0.475927,0.429373,...,1.256296,2.879833,1.248596,1.331100,1.568482,1.351303,2.310752,1.350138,1.369714,1.344241
3,0.488519,0.477486,0.474437,0.468533,0.439576,0.424415,0.305615,0.475285,0.523427,0.472617,...,1.192813,2.699522,1.213124,1.287003,1.437217,1.279635,2.182681,1.319379,1.339355,1.307026
4,0.531140,0.562913,0.542084,0.476537,0.454736,0.445256,0.304397,0.475096,0.578374,0.481708,...,1.143808,1.875198,1.129517,1.032512,1.027724,1.278242,1.422185,1.216394,1.443487,0.212954
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
194,0.533440,0.530056,0.481699,0.407438,0.376391,0.409826,0.388772,0.398905,0.446871,0.421669,...,1.176691,1.726087,1.237604,1.068228,1.021038,1.035873,1.474633,1.252343,1.150770,0.497048
195,0.566003,0.610318,0.472996,0.444268,0.426559,0.442905,0.407152,0.409039,0.455480,0.420535,...,1.206792,1.931635,1.329326,1.097927,1.203568,1.171545,1.585152,1.382298,1.221690,0.972795
196,0.615311,0.747305,0.528381,0.493453,0.508952,0.490040,0.354299,0.519829,0.554000,0.520394,...,1.273121,2.652459,1.284114,1.310528,1.570716,1.159018,2.395588,1.589232,1.248171,1.412303
197,0.545751,0.661330,0.515602,0.483101,0.499383,0.437842,0.318861,0.500243,0.561087,0.553691,...,1.192449,2.502690,1.205278,1.212069,1.382317,1.034018,2.217331,1.466100,1.192218,1.529909


In [10]:
# Ground truth

for i in range(len(df_fea)):
    df_fea[i]['P'] = i

In [11]:
df_fea[0]

Unnamed: 0,l1_std1,l1_min1,l1_max1,l1_qtl1,l1_qtu1,l1_avg1,l1_iqr1,l1_std2,l1_min2,l1_max2,...,l4_adj6,l4_adj7,l4_adj8,l4_adj9,l4_adj10,l4_adj11,l4_adj12,l4_euc,l4_rss,P
0,0.579245,0.590528,0.465022,0.539145,0.396148,0.454259,0.364344,0.451833,0.414886,0.419528,...,2.544215,1.248014,1.204240,1.439531,1.244940,2.025076,1.269062,1.241139,1.122604,0
1,0.555057,0.509983,0.473291,0.460764,0.434179,0.394327,0.375031,0.393596,0.373684,0.384711,...,2.110777,1.196166,1.070712,1.192843,1.315626,1.559272,1.228392,1.505176,0.611211,0
2,0.459749,0.445092,0.492385,0.480850,0.459144,0.415320,0.329702,0.436089,0.475927,0.429373,...,2.879833,1.248596,1.331100,1.568482,1.351303,2.310752,1.350138,1.369714,1.344241,0
3,0.488519,0.477486,0.474437,0.468533,0.439576,0.424415,0.305615,0.475285,0.523427,0.472617,...,2.699522,1.213124,1.287003,1.437217,1.279635,2.182681,1.319379,1.339355,1.307026,0
4,0.531140,0.562913,0.542084,0.476537,0.454736,0.445256,0.304397,0.475096,0.578374,0.481708,...,1.875198,1.129517,1.032512,1.027724,1.278242,1.422185,1.216394,1.443487,0.212954,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
194,0.533440,0.530056,0.481699,0.407438,0.376391,0.409826,0.388772,0.398905,0.446871,0.421669,...,1.726087,1.237604,1.068228,1.021038,1.035873,1.474633,1.252343,1.150770,0.497048,0
195,0.566003,0.610318,0.472996,0.444268,0.426559,0.442905,0.407152,0.409039,0.455480,0.420535,...,1.931635,1.329326,1.097927,1.203568,1.171545,1.585152,1.382298,1.221690,0.972795,0
196,0.615311,0.747305,0.528381,0.493453,0.508952,0.490040,0.354299,0.519829,0.554000,0.520394,...,2.652459,1.284114,1.310528,1.570716,1.159018,2.395588,1.589232,1.248171,1.412303,0
197,0.545751,0.661330,0.515602,0.483101,0.499383,0.437842,0.318861,0.500243,0.561087,0.553691,...,2.502690,1.205278,1.212069,1.382317,1.034018,2.217331,1.466100,1.192218,1.529909,0


In [12]:
df_fea_all = pd.DataFrame()

for i in range(len(df_fea)):
    df_fea_all = pd.concat([df_fea_all, df_fea[i]], axis=0)

df_fea_all.reset_index(drop=True)

Unnamed: 0,l1_std1,l1_min1,l1_max1,l1_qtl1,l1_qtu1,l1_avg1,l1_iqr1,l1_std2,l1_min2,l1_max2,...,l4_adj6,l4_adj7,l4_adj8,l4_adj9,l4_adj10,l4_adj11,l4_adj12,l4_euc,l4_rss,P
0,0.579245,0.590528,0.465022,0.539145,0.396148,0.454259,0.364344,0.451833,0.414886,0.419528,...,2.544215,1.248014,1.204240,1.439531,1.244940,2.025076,1.269062,1.241139,1.122604,0
1,0.555057,0.509983,0.473291,0.460764,0.434179,0.394327,0.375031,0.393596,0.373684,0.384711,...,2.110777,1.196166,1.070712,1.192843,1.315626,1.559272,1.228392,1.505176,0.611211,0
2,0.459749,0.445092,0.492385,0.480850,0.459144,0.415320,0.329702,0.436089,0.475927,0.429373,...,2.879833,1.248596,1.331100,1.568482,1.351303,2.310752,1.350138,1.369714,1.344241,0
3,0.488519,0.477486,0.474437,0.468533,0.439576,0.424415,0.305615,0.475285,0.523427,0.472617,...,2.699522,1.213124,1.287003,1.437217,1.279635,2.182681,1.319379,1.339355,1.307026,0
4,0.531140,0.562913,0.542084,0.476537,0.454736,0.445256,0.304397,0.475096,0.578374,0.481708,...,1.875198,1.129517,1.032512,1.027724,1.278242,1.422185,1.216394,1.443487,0.212954,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1189,3.701647,3.592159,3.336933,3.361590,3.504554,3.740507,3.797289,3.377996,2.588687,1.881520,...,3.647540,1.986958,1.834716,2.372234,1.699732,3.419391,2.123907,2.871195,3.701198,5
1190,2.835873,2.796866,2.550490,2.573745,2.527001,2.783263,3.061806,2.791892,2.331941,1.706890,...,3.798074,1.915546,1.795722,2.478701,1.727398,3.614637,2.210107,3.009515,4.959107,5
1191,2.139021,2.001566,1.639007,1.658234,1.569986,1.625831,1.648079,1.352616,1.245641,1.225599,...,3.979956,2.085157,1.743442,2.524065,1.602903,3.625238,1.764593,3.229761,4.801689,5
1192,1.725523,1.792047,1.672065,1.718735,1.709904,1.707449,1.600284,1.232880,0.925690,0.800569,...,3.410398,1.875781,1.655271,2.101263,1.536997,2.964088,1.559484,2.782080,3.138428,5


In [13]:
# set k for k-hold cross-validation
k = 7
skf = StratifiedKFold(n_splits=k)

X = df_fea_all.drop('P', axis=1)
y = df_fea_all['P']

X = X.reset_index()
X = X.drop('index', axis = 1)
y = y.reset_index()
y = y.drop('index', axis = 1)

train_index = [[] for _ in range(k)]
test_index = [[] for _ in range(k)]

n = 0
for tra_idx, tes_idx in skf.split(X,y):
    
    print('##### Stratified k-fold',n+1,'回目 #####')
    print("TRAIN: ",len(tra_idx))
    print("TEST: ",len(tes_idx))
    
    train_index[n] = tra_idx
    test_index[n] = tes_idx
    
    n += 1

##### Stratified k-fold 1 回目 #####
TRAIN:  1023
TEST:  171
##### Stratified k-fold 2 回目 #####
TRAIN:  1023
TEST:  171
##### Stratified k-fold 3 回目 #####
TRAIN:  1023
TEST:  171
##### Stratified k-fold 4 回目 #####
TRAIN:  1023
TEST:  171
##### Stratified k-fold 5 回目 #####
TRAIN:  1024
TEST:  170
##### Stratified k-fold 6 回目 #####
TRAIN:  1024
TEST:  170
##### Stratified k-fold 7 回目 #####
TRAIN:  1024
TEST:  170


In [14]:
# learning

reg_1 = LinearRegression()
reg_2 = RandomForestRegressor()
reg_3 = XGBRegressor()
reg_4 = lgb.LGBMRegressor()

y_test_list = [[] for _ in range(k)]

y_pred_1_list = [[] for _ in range(k)]
y_pred_2_list = [[] for _ in range(k)]
y_pred_3_list = [[] for _ in range(k)]
y_pred_4_list = [[] for _ in range(k)]

for i in range(k):
    
    X_train = X.iloc[train_index[i]]
    X_test = X.iloc[test_index[i]]
    y_train = y.iloc[train_index[i]]
    y_test = y.iloc[test_index[i]]
    
    # Feature standardization
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train=scaler.transform(X_train)
    X_test=scaler.transform(X_test)

    X_train = pd.DataFrame(X_train)
    X_test = pd.DataFrame(X_test)
    
    # fit
    reg_1.fit(X_train, y_train)
    reg_2.fit(X_train, y_train)
    reg_3.fit(X_train, y_train)
    reg_4.fit(X_train, y_train)
    
    # prediction
    y_pred_1 = reg_1.predict(X_test)
    y_pred_2 = reg_2.predict(X_test)
    y_pred_3 = reg_3.predict(X_test)
    y_pred_4 = reg_4.predict(X_test)
    
    y_test_list[i] = y_test
    y_pred_1_list[i] = y_pred_1
    y_pred_2_list[i] = y_pred_2
    y_pred_3_list[i] = y_pred_3
    y_pred_4_list[i] = y_pred_4
    
    print(i+1)

  reg_2.fit(X_train, y_train)


1


  reg_2.fit(X_train, y_train)


2


  reg_2.fit(X_train, y_train)


3


  reg_2.fit(X_train, y_train)


4


  reg_2.fit(X_train, y_train)


5


  reg_2.fit(X_train, y_train)


6


  reg_2.fit(X_train, y_train)


7


In [15]:
for i in range(len(y_test_list)):
    y_test_list[i] = y_test_list[i].values

In [16]:
y_test_tot = []

y_pred_1_tot = []
y_pred_2_tot = []
y_pred_3_tot = []
y_pred_4_tot = []

for i in range(k):
    y_test_tot.extend(y_test_list[i])
    
    y_pred_1_tot.extend(y_pred_1_list[i])
    y_pred_2_tot.extend(y_pred_2_list[i])
    y_pred_3_tot.extend(y_pred_3_list[i])
    y_pred_4_tot.extend(y_pred_4_list[i])

In [17]:
# result scores

print('Regressor 1')
print('###########################')
print('Med-abs-err: ',round(median_absolute_error(y_test_tot, y_pred_1_tot),4))
print('Max-err: ',round(max_error(y_test_tot, y_pred_1_tot),4))
print('R2_score: ',round(r2_score(y_test_tot, y_pred_1_tot),4))
print('###########################')

print(' ')

print('Regressor 2')
print('###########################')
print('Med-abs-err: ',round(median_absolute_error(y_test_tot, y_pred_2_tot),4))
print('Max-err: ',round(max_error(y_test_tot, y_pred_2_tot),4))
print('R2_score: ',round(r2_score(y_test_tot, y_pred_2_tot),4))
print('###########################')

print(' ')

print('Regressor 3')
print('###########################')
print('Med-abs-err: ',round(median_absolute_error(y_test_tot, y_pred_3_tot),4))
print('Max-err: ',round(max_error(y_test_tot, y_pred_3_tot),4))
print('R2_score: ',round(r2_score(y_test_tot, y_pred_3_tot),4))
print('###########################')

print(' ')

print('Regressor 4')
print('###########################')
print('Med-abs-err: ',round(median_absolute_error(y_test_tot, y_pred_4_tot),4))
print('Max-err: ',round(max_error(y_test_tot, y_pred_4_tot),4))
print('R2_score: ',round(r2_score(y_test_tot, y_pred_4_tot),4))
print('###########################')

Regressor 1
###########################
Med-abs-err:  0.2174
Max-err:  1.585
R2_score:  0.9465
###########################
 
Regressor 2
###########################
Med-abs-err:  0.17
Max-err:  2.81
R2_score:  0.9289
###########################
 
Regressor 3
###########################
Med-abs-err:  0.1433
Max-err:  2.8859
R2_score:  0.9233
###########################
 
Regressor 4
###########################
Med-abs-err:  0.1578
Max-err:  2.38
R2_score:  0.9407
###########################
