### Load Libraries and Train Data

In [1]:
import glob
import math
import os
import sys
import time
from collections import Counter
from collections import defaultdict
from random import randint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
import seaborn as sns
from imblearn.over_sampling import RandomOverSampler
from kneed import KneeLocator
from numpy import sqrt, square
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [2]:
## Load Training Data
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
path =r"E:\PROJECT\VSCode Python Project\Sensor Network\Uji Coba\Progress Skripsi\Data\Data Nico\Data 20 April Fluctuative Noise Quadrant 3 (3,3)\Train"
# Sort the matches so the order in which frames are concatenated (and hence the
# result of the seeded shuffle applied to them) does not depend on the order in
# which the file system happens to list the files.
globbed_files = sorted(glob.glob(os.path.join(path, "*.csv")))
if not globbed_files:
    raise FileNotFoundError("No training CSV files found in: " + path)
data = []
for csv_path in globbed_files:
    frame = pd.read_csv(csv_path)
    # The file stem encodes the position: first character -> x, second -> y
    # (kept as strings, exactly as read from the name).
    stem = os.path.basename(csv_path).split('.')[0]
    frame['x'] = stem[0]
    frame['y'] = stem[1]
    data.append(frame)

In [3]:
# ## Load Training Data 2
# path2 =r"E:\PROJECT\VSCode Python Project\Sensor Network\Uji Coba\Progress Skripsi\Data\Data Nico\Data 15 April Tanpa Gangguan\Train 1"
# globbed_files2 = glob.glob(path2 + "/*.csv")
# for csv in globbed_files2:
#     frame = pd.read_csv(csv)
#     frame['x'] = os.path.basename(csv).split('.')[0][0]
#     frame['y'] = os.path.basename(csv).split('.')[0][1]
#     data.append(frame)

In [4]:
## Build the shuffled training set from every per-location frame
# (an earlier variant capped each location with .head(32); that cap is not applied here)
router_cols = ['Router 1','Router 2','Router 3','Router 4']
attempt = [frame[router_cols + ['x','y']] for frame in data]
attempt_concat = pd.concat(attempt)
# Seeded shuffle, then renumber rows 0..n-1.
data_train = attempt_concat.sample(frac=1, random_state=42).reset_index(drop=True)
# Work with the magnitude of the router readings.
data_train[router_cols] = data_train[router_cols].abs()
# One integer code per distinct "x_y" position.
data_train = data_train.assign(Unique_ID = (data_train['x'].astype(str) + '_' + data_train['y'].astype(str)).astype('category').cat.codes)
x_train = data_train.iloc[:,0:4].values
# Copy: a 'dbscan' column is assigned onto y_train further down, which would
# otherwise be a write onto a slice of data_train (SettingWithCopyWarning).
y_train = data_train.iloc[:,4:].copy()
# Lookup table of the distinct (x, y, Unique_ID) combinations.
ref_table = y_train.iloc[:, [0,1,2]].drop_duplicates().reset_index(drop=True)

In [5]:
# # scaler = StandardScaler()
# scaler = MinMaxScaler()
# scaler.fit(x_train)
# x_train = scaler.transform(x_train)

### Left Testing

In [6]:
# NOTE(review): the variables are named *_left but the folder is "Right Test";
# confirm which side this cell is meant to evaluate.
path_left =r"E:\PROJECT\VSCode Python Project\Sensor Network\Uji Coba\Progress Skripsi\Data\Data Nico\Data 20 April Fluctuative Noise Quadrant 3 (3,3)\Test\Right Test"
# Sorted so the seeded shuffle below sees the frames in a deterministic order.
globbed_files_left = sorted(glob.glob(os.path.join(path_left, "*.csv")))
if not globbed_files_left:
    raise FileNotFoundError("No test CSV files found in: " + path_left)
data_left = []
for csv_path in globbed_files_left:
    frame = pd.read_csv(csv_path)
    # The file stem encodes the position: first character -> x, second -> y.
    stem = os.path.basename(csv_path).split('.')[0]
    frame['x'] = stem[0]
    frame['y'] = stem[1]
    data_left.append(frame)

router_cols = ['Router 1','Router 2','Router 3','Router 4']
attempt_left = [frame[router_cols + ['x','y']] for frame in data_left]
attempt_concat_left = pd.concat(attempt_left)
data_test_left = attempt_concat_left.sample(frac=1, random_state=42).reset_index(drop=True)
# Same preprocessing as the training readings: keep magnitudes only.
data_test_left[router_cols] = data_test_left[router_cols].abs()
# Keep the last 562 shuffled rows (tail() returns every row when fewer exist).
# NOTE(review): 562 is unexplained in this notebook; consider a named constant.
data_test_left = data_test_left.tail(562).reset_index(drop=True)

In [7]:
## split features (first four columns) and labels (remaining columns)
x_test_left = data_test_left.iloc[:,0:4].values
# Copy: a 'Unique_ID' column is assigned onto y_test_left in the online phase,
# which would otherwise be a write onto a slice of data_test_left.
y_test_left = data_test_left.iloc[:,4:].copy()

### Normalization
# x_test_left = scaler.transform(x_test_left)

### Perform DBSCAN

In [8]:
# k-distance curve: for every training sample take the distance to the farthest
# of the 5 neighbours returned, sort those distances ascending, and locate the
# knee of the resulting convex, increasing curve.
nearest_neighbors = NearestNeighbors(n_neighbors=5)
neighbors = nearest_neighbors.fit(x_train)
distances, indices = neighbors.kneighbors(x_train)
# Last of the 5 columns == column index 4.
distances = np.sort(distances[:, -1])
i = np.arange(distances.size)
knee = KneeLocator(
    i,
    distances,
    S=1,
    curve='convex',
    direction='increasing',
    interp_method='polynomial',
)

In [9]:
# DBSCAN radius = sorted k-distance at the detected knee position.
# NOTE(review): kneed is documented to leave `knee` as None when no knee is
# found; fail with a clear message instead of an obscure indexing/eps error.
if knee.knee is None:
    raise ValueError("KneeLocator found no knee in the k-distance curve; choose eps manually.")
eps = float(distances[knee.knee])
# min_samples=5 mirrors the 5 neighbours used to build the k-distance curve.
db = DBSCAN(eps=eps, min_samples=5).fit(x_train)
# One cluster label per training sample; -1 is treated as noise further down.
labels = db.labels_

In [10]:
# Number of clusters: distinct labels, not counting the noise label (-1) when it occurs.
cluster_ids = set(labels)
cluster_ids.discard(-1)
n_clusters_ = len(cluster_ids)
# Number of samples carrying the noise label.
n_noise_ = int(np.count_nonzero(labels == -1))

In [11]:
# Report how many clusters were formed and how many samples were labelled as noise.
print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)

Estimated number of clusters: 22
Estimated number of noise points: 88


In [12]:
# Feature frame for the training samples.
# NOTE(review): the columns are called PC_* although no PCA is applied in the
# visible cells; the names are kept because later cells select columns by them.
x_train_df = pd.DataFrame(x_train,columns=['PC_1','PC_2','PC_3','PC_4'])
# Distinct (x, y, Unique_ID) rows, renumbered 0..m-1.
ref_table = y_train.iloc[:, [0,1,2]].drop_duplicates().reset_index(drop=True)
# Attach the DBSCAN label of every sample; assign() builds a new frame instead
# of writing a column onto a frame that was sliced out of data_train.
y_train = y_train.assign(dbscan=labels)

In [13]:
# Put features and labels side by side (column-wise concat, aligned on the row index).
combined = pd.concat((x_train_df,y_train),axis=1)

In [14]:
# Drop the samples labelled as noise (-1) and renumber the remaining rows.
combined_core = combined.loc[combined['dbscan'] != -1].reset_index(drop=True)

In [15]:
# Split the noise-free frame back into features (first four columns)
# and label columns (everything after them).
x_train_cr = combined_core.iloc[:,0:4]
y_train_cr = combined_core.iloc[:,4:]

### Handle Imbalanced Data

In [16]:
# For every DBSCAN cluster (ascending label order) collect:
#   dict_[label]    -> list of the Unique_ID of each member sample
#   clusters[label] -> list of the members' feature rows (PC_1..PC_4)
dict_ = {}
clusters = {}
for cluster_label, members in combined_core.groupby('dbscan'):
    dict_[cluster_label] = list(members['Unique_ID'].values)
    clusters[cluster_label] = list(members[['PC_1', 'PC_2', 'PC_3', 'PC_4']].values)

In [17]:
def strategy_dict(y):
    '''
    Build a balancing strategy for oversampling.

    Parameters
    ----------
    y : iterable of hashable
        Class label of every sample.

    Returns
    -------
    dict
        Maps each distinct label in ``y`` to the size of the most frequent
        class, i.e. the sample count every class should reach. Empty when
        ``y`` is empty.
    '''
    counts = Counter(y)
    if not counts:
        # max() of an empty sequence would raise; nothing to balance.
        return {}
    target = max(counts.values())
    return dict.fromkeys(counts, target)

## helper for training the regressors
def over_sampling(clusters, dict_, index, random_state=42):
    '''
    Randomly oversample the minority classes inside every cluster.

    Parameters
    ----------
    clusters : dict
        Cluster key -> sequence of feature rows.
    dict_ : dict
        Cluster key -> sequence of class labels, one per feature row.
    index : sequence of sequences
        For each cluster, by position, its distinct class labels. The three
        arguments are matched by position, so they must share the same order.
    random_state : int or None, default 42
        Seed handed to RandomOverSampler so repeated runs duplicate the same
        rows. Pass None for unseeded sampling.

    Returns
    -------
    (x_total, y_total) : tuple of dict
        Both keyed by the keys of ``clusters``; values are numpy arrays of the
        (possibly oversampled) feature rows and labels. Clusters with a single
        class are returned unchanged.
    '''
    x_total = {}
    y_total = {}
    for cluster_key, label_key, classes in zip(list(clusters), list(dict_), index):
        x = list(clusters[cluster_key])
        y = list(dict_[label_key])
        if len(classes) > 1:
            # Bring every class up to the size of the largest one.
            strategy = strategy_dict(y)
            oversample = RandomOverSampler(sampling_strategy=strategy,
                                           random_state=random_state)
            x, y = oversample.fit_resample(x, y)
        x_total[cluster_key] = np.array(x)
        y_total[cluster_key] = np.array(y)
    return x_total, y_total

In [18]:
# Distinct Unique_ID values present in each cluster, in the iteration order of dict_.
unique = [list(np.unique(x)) for x in dict_.values()]

In [19]:
# Oversample inside every cluster that holds more than one Unique_ID;
# both results are dicts keyed by cluster.
x_total, y_total = over_sampling(clusters, dict_, unique)

### Merge Dataframe

In [20]:
## matching process between new id and ref table
# One entry per cluster, keyed by the cluster key rendered as a string and
# holding that cluster's balanced ID array.
data_balanced = {
    "{0}".format(cluster_key): y_total[label_key]
    for cluster_key, label_key in zip(x_total, y_total)
}
# Rows-per-cluster frame, transposed so every cluster becomes one column.
data_df_balanced = pd.DataFrame.from_dict(data_balanced, orient='index').T

In [21]:
def filter_list_id(df_id, ref_table):
    '''
    Translate per-cluster ID columns into coordinate arrays.

    Parameters
    ----------
    df_id : pandas.DataFrame
        One column per cluster holding Unique_ID values; missing entries (NaN)
        are ignored wherever they occur and whatever the row index looks like.
    ref_table : pandas.DataFrame
        Lookup table whose first two columns are the coordinates and which has
        a 'Unique_ID' column.

    Returns
    -------
    (id_total, poses) : tuple of list
        ``id_total[k]`` is the integer ID array of the k-th column of ``df_id``;
        ``poses[k]`` is the array of coordinate rows looked up for those IDs.
        An ID that is absent from ``ref_table`` yields ``None`` in ``poses``.
    '''
    # Unique_ID -> coordinate row (first two columns of ref_table).
    coord_rows = ref_table.iloc[:, 0:2].to_numpy()
    dict_loc = {int(uid): row for uid, row in zip(ref_table['Unique_ID'], coord_rows)}

    id_total = []
    poses = []
    for c in df_id:
        # Select the non-null values themselves rather than "the first n row
        # labels", which only works for trailing NaNs on a default index.
        ids = df_id[c].dropna().astype(int).to_numpy()
        id_total.append(ids)
        poses.append(np.array([dict_loc.get(int(uid)) for uid in ids]))
    return id_total, poses

In [22]:
# Per-cluster ID arrays plus the coordinates looked up for each ID in ref_table.
id_total_balanced, poses_balanced = filter_list_id(data_df_balanced, ref_table)

In [23]:
def make_df(id_total,poses,i):
    '''
    Wrap the i-th ID array and the i-th coordinate array in DataFrames.

    Returns a pair ``(positions, identifiers)``: the coordinates with columns
    'x' and 'y', and the IDs in a single column named ``str(i)``.
    '''
    ids_for_entry, coords_for_entry = id_total[i], poses[i]
    positions = pd.DataFrame(coords_for_entry, columns=['x','y'])
    identifiers = pd.DataFrame(ids_for_entry, columns=[str(i)])
    return positions, identifiers

# Per-cluster training targets: coordinates ('x', 'y') next to the ID column,
# keyed by the cluster keys of x_total in order.
df = {
    cluster_key: pd.concat(make_df(id_total_balanced, poses_balanced, slot), axis=1)
    for slot, cluster_key in zip(range(len(id_total_balanced)), x_total)
}

### Pre-Trained and Tuned

##### -- Regressor

In [24]:
## regression model for pose prediction
def tuning_regr_knn(x_train,y_train,cv=5):
    '''
    Fit a 1-nearest-neighbour regressor mapping features to (x, y) coordinates.

    Parameters
    ----------
    x_train : array-like
        Feature rows.
    y_train : pandas.DataFrame
        Must contain the columns 'x' and 'y'; they are cast to float64.
    cv : int, default 5
        Maximum number of cross-validation folds. It is capped at the number
        of samples so that small inputs do not make the splitter fail.

    Returns
    -------
    (best_estimator, best_score, run_time)
        The refitted estimator, its mean cross-validated negative MSE, and the
        fitting time in minutes.

    Raises
    ------
    ValueError
        If fewer than 2 samples are supplied (cross-validation is impossible).
    '''
    QUANTITATIVE_COLUMNS = ['x', 'y']
    targets = y_train[QUANTITATIVE_COLUMNS].values.astype(np.float64)
    n_samples = len(targets)
    if n_samples < 2:
        raise ValueError("tuning_regr_knn needs at least 2 samples, got %d" % n_samples)
    n_splits = min(cv, n_samples)

    regr = KNeighborsRegressor(n_neighbors=1)

    metric = ['euclidean']
    hyperparameters = {'metric': metric}

    grid = GridSearchCV(estimator = regr,
                        param_grid = hyperparameters,
                        scoring = 'neg_mean_squared_error',
                        cv = n_splits,
                        n_jobs = -1)

    # perf_counter is monotonic, so the interval cannot be skewed by clock adjustments.
    tic = time.perf_counter()
    grid_result_regr = grid.fit(x_train, targets)
    toc = time.perf_counter()
    run_time = (toc - tic)/60
    return grid_result_regr.best_estimator_, grid_result_regr.best_score_, run_time

In [25]:
# One tuned regressor per cluster, stored under "regr_<cluster key>" as the
# tuple (estimator, CV score, fit time in minutes) returned by tuning_regr_knn.
regr_tuned = {}
for feature_key, target_key in zip(x_total, df):
    regr_tuned["regr_{0}".format(feature_key)] = tuning_regr_knn(x_total[feature_key], df[target_key])

In [26]:
# Inspect the result: each value is (estimator, CV score, fit time in minutes).
regr_tuned

{'regr_0': (KNeighborsRegressor(metric='euclidean', n_neighbors=1),
  0.0,
  0.16595844427744547),
 'regr_1': (KNeighborsRegressor(metric='euclidean', n_neighbors=1),
  -0.032,
  0.000977484385172526),
 'regr_2': (KNeighborsRegressor(metric='euclidean', n_neighbors=1),
  0.0,
  0.0009176214536031087),
 'regr_3': (KNeighborsRegressor(metric='euclidean', n_neighbors=1),
  0.0,
  0.000900721549987793),
 'regr_4': (KNeighborsRegressor(metric='euclidean', n_neighbors=1),
  0.0,
  0.0008660157521565756),
 'regr_5': (KNeighborsRegressor(metric='euclidean', n_neighbors=1),
  0.0,
  0.0008728384971618652),
 'regr_6': (KNeighborsRegressor(metric='euclidean', n_neighbors=1),
  -0.0125,
  0.0008770545323689778),
 'regr_7': (KNeighborsRegressor(metric='euclidean', n_neighbors=1),
  0.0,
  0.0008669416109720866),
 'regr_8': (KNeighborsRegressor(metric='euclidean', n_neighbors=1),
  0.0,
  0.0008833885192871093),
 'regr_9': (KNeighborsRegressor(metric='euclidean', n_neighbors=1),
  0.0,
  0.000962698

### Testing Procedure (Online Phase)

In [27]:
# Attach the training Unique_ID of every test position.
# A single (x, y) -> Unique_ID lookup replaces one boolean scan of ref_table
# (and an int() conversion of a one-element array) per test row.
coord_to_id = {
    (str(ref_x), str(ref_y)): int(uid)
    for ref_x, ref_y, uid in ref_table[['x', 'y', 'Unique_ID']].itertuples(index=False, name=None)
}
test_coords = list(zip(y_test_left['x'].astype(str), y_test_left['y'].astype(str)))
unknown_coords = sorted(set(test_coords) - set(coord_to_id))
if unknown_coords:
    raise KeyError("Test positions missing from ref_table: %s" % unknown_coords)
# assign() returns a new frame, so nothing is written onto a slice of data_test_left.
y_test_left = y_test_left.assign(Unique_ID=[coord_to_id[c] for c in test_coords])

In [28]:
def nn_modified(test_noises, x_train, y_train):
    '''
    Return the label of the training row closest to one query sample.

    Parameters
    ----------
    test_noises : array-like
        A single feature vector (any shape; it is flattened).
    x_train : array-like, shape (n_samples, n_features)
        Training feature rows.
    y_train : sequence
        One label per training row.

    Returns
    -------
    The label of the nearest row by Euclidean distance. When several rows are
    equally close, the smallest of their labels is returned.

    Raises
    ------
    ValueError
        If there are no training rows or the label count does not match.
    '''
    train = np.asarray(x_train, dtype=np.float64)
    labels = list(y_train)
    if train.shape[0] == 0:
        raise ValueError("nn_modified needs at least one training row")
    if train.shape[0] != len(labels):
        raise ValueError("x_train and y_train must have the same length")
    train = train.reshape(train.shape[0], -1)
    query = np.asarray(test_noises, dtype=np.float64).ravel()

    # All distances in one vectorised pass instead of one call per row.
    distances = np.linalg.norm(train - query, axis=1)
    nearest = distances.min()
    # Tie-break on the label, matching a sort of (distance, label) pairs.
    candidates = [label for dist, label in zip(distances, labels) if dist == nearest]
    return sorted(candidates)[0]

In [29]:
## Author's note: use WKNN instead (returns the label); the original remark is cut off.
# Give every test sample the DBSCAN label of its nearest noise-free training sample.
core_features = x_train_cr.values
core_labels = y_train_cr['dbscan'].values
clus_knn = [nn_modified(n, core_features, core_labels) for n in x_test_left]

In [30]:
# Append the assigned cluster label as an extra, last column of the test features.
cluster_column = np.array(clus_knn).reshape(len(clus_knn),1).astype(int)
prep_core = np.concatenate((x_test_left, cluster_column), axis=1)

In [31]:
class POS_Regr():
    '''
    Holder object for one coordinate result.

    The value passed to the constructor is stored on ``regr`` and handed back
    unchanged by ``getRegr``.
    '''
    def __init__(self, regr):
        self.regr = regr

    def getRegr(self):
        '''Return the stored result.'''
        return self.regr

In [32]:
def test_prediction(prep_core, regr_tuned):
    buff = []
    for i,item in enumerate(prep_core):
        N = len(item)-1
        data = item[0:N].reshape(1,N)
        hasil = regr_tuned['regr_{0}'.format(item[-1].astype(int))][0].predict(data)
        value = POS_Regr(hasil)
        buff.append(value)
    return buff

In [33]:
# Predict a coordinate for every test row using the regressor of its assigned cluster.
pred = test_prediction(prep_core, regr_tuned)

In [34]:
def df_prediction(pred):
    '''
    Collect predictions into a DataFrame with columns 'x_pred' and 'y_pred'.

    ``pred`` is a sequence of objects exposing ``getRegr()``; the first row of
    each returned result supplies one output row.
    '''
    rows = [list(holder.getRegr()[0]) for holder in pred]
    return pd.DataFrame(rows, columns=['x_pred','y_pred'])

In [35]:
# Tabulate the predictions as x_pred / y_pred columns.
df_pred = df_prediction(pred)

### Calculate Regression Error

In [36]:
def calculate_regr_error(df_final,label_core):
    '''
    Euclidean error between predicted and true coordinates.

    ``df_final`` supplies the columns 'x_pred' and 'y_pred'; ``label_core`` is
    a 2-D array whose first two columns are the true x and y (cast to int).
    Returns ``(mean_error, per_row_errors)``.
    '''
    pred_x = df_final['x_pred'].values
    pred_y = df_final['y_pred'].values
    true_x = label_core[:,0].astype(int)
    true_y = label_core[:,1].astype(int)
    delta_x = pred_x - true_x
    delta_y = pred_y - true_y
    coords_error = np.sqrt(delta_x * delta_x + delta_y * delta_y)
    return coords_error.mean(), coords_error

In [37]:
## core point error
# Mean Euclidean distance between predicted and true test coordinates,
# plus the individual per-sample errors.
mean_loc_error, coords_error = calculate_regr_error(df_pred,y_test_left.values)
print(mean_loc_error)

0.17625235563907898


In [38]:
# Per-sample error array (one Euclidean distance per test row).
# NOTE(review): this displays the whole array; a slice or summary would keep the output short.
coords_error

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 1.41421356, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       1.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       1.        , 0.        , 0.        , 0.        , 0.        ,
       1.        , 1.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     