### Load Libraries and Train Data

In [1]:
import os
import sys
import glob
import math
import time
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from random import randint
from numpy import sqrt, square
from kneed import KneeLocator
from collections import Counter
from collections import defaultdict
from imblearn.over_sampling import RandomOverSampler
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier

In [2]:
## Load Training Data
path =r"E:\PROJECT\VSCode Python Project\Sensor Network\Uji Coba\Progress Skripsi\Data\Data Nico\Data 15 April Tanpa Gangguan\Train 2"
globbed_files = glob.glob(path + "/*.csv")
data = []
for csv in globbed_files:
    frame = pd.read_csv(csv)
    frame['x'] = os.path.basename(csv).split('.')[0][0]
    frame['y'] = os.path.basename(csv).split('.')[0][1]
    data.append(frame)

In [3]:
## Take the lowest number of set data (32)
attempt = []
for i, item in enumerate(data):
    attempt.append(data[i][['Router 1','Router 2','Router 3','Router 4','x','y']]) #.head(32)
attempt_concat = pd.concat(attempt)
data_train = attempt_concat.sample(frac=1, random_state=42).reset_index(drop=True)
data_train[['Router 1','Router 2','Router 3','Router 4']] = data_train[['Router 1','Router 2','Router 3','Router 4']].abs()
data_train = data_train.assign(Unique_ID = (data_train['x'].astype(str) + '_' + data_train['y'].astype(str)).astype('category').cat.codes)
x_train = data_train.iloc[:,0:4].values
y_train = data_train.iloc[:,4:]
ref_table = y_train.iloc[:, [0,1,2]].drop_duplicates().reset_index(drop=True)

### Left Testing

In [4]:
path_left =r"E:\PROJECT\VSCode Python Project\Sensor Network\Uji Coba\Progress Skripsi\Data\Data Nico\Data 15 April Tanpa Gangguan\Test\Right Test"
globbed_files_left = glob.glob(path_left + "/*.csv")
data_left = []
for csv in globbed_files_left:
    frame = pd.read_csv(csv)
    frame['x'] = os.path.basename(csv).split('.')[0][0]
    frame['y'] = os.path.basename(csv).split('.')[0][1]
    data_left.append(frame)
    
attempt_left = []
for i, item in enumerate(data_left):
    attempt_left.append(data_left[i][['Router 1','Router 2','Router 3','Router 4','x','y']])
attempt_concat_left = pd.concat(attempt_left)
data_test_left = attempt_concat_left.sample(frac=1, random_state=42).reset_index(drop=True)
data_test_left[['Router 1','Router 2','Router 3','Router 4']] = data_test_left[['Router 1','Router 2','Router 3','Router 4']].abs()
data_test_left = data_test_left.tail(562).reset_index(drop=True)

In [5]:
## split features and label
x_test_left = data_test_left.iloc[:,0:4].values
y_test_left = data_test_left.iloc[:,4:]

### Perform DBSCAN

In [6]:
nearest_neighbors = NearestNeighbors(n_neighbors=5)
neighbors = nearest_neighbors.fit(x_train)
distances, indices = neighbors.kneighbors(x_train)
distances = np.sort(distances[:,4], axis=0)
i = np.arange(len(distances))
knee = KneeLocator(i, distances, S=1, curve='convex', direction='increasing', interp_method='polynomial')

In [7]:
from sklearn.cluster import DBSCAN
db = DBSCAN(eps=distances[knee.knee], min_samples=5).fit(x_train)
labels = db.labels_

In [8]:
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) # jumlah cluster, total klas - 1 (kalo ada noise)
n_noise_ = list(labels).count(-1)

In [9]:
print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)

Estimated number of clusters: 24
Estimated number of noise points: 57


In [10]:
x_train_df = pd.DataFrame(x_train,columns=['PC_1','PC_2','PC_3','PC_4'])
ref_table = y_train.iloc[:, [0,1,2]].drop_duplicates()
y_train['dbscan'] = labels

In [11]:
combined = pd.concat((x_train_df,y_train),axis=1)

In [12]:
combined_core = combined.loc[~(combined['dbscan'] == -1)]
combined_core = combined_core.reset_index(drop=True)

combined_noise = combined.loc[combined['dbscan'] == -1]
combined_noise = combined_noise.reset_index(drop=True)

In [13]:
x_train_cr = combined_core.iloc[:,0:4]
y_train_cr = combined_core.iloc[:,4:]

In [14]:
x_train_n = combined_noise.iloc[:,0:4]
y_train_n = combined_noise.iloc[:,4:]

### Handle Imbalaced Data

In [15]:
dict_ = {}
clusters = {}
for x in np.unique(combined_core.values[:,-1]):
    dict_[x] = list(combined_core[combined_core.dbscan == x]['Unique_ID'].values)
    clusters[x] = list(combined_core[combined_core.dbscan == x][['PC_1', 'PC_2', 'PC_3', 'PC_4']].values)

In [16]:
def strategy_dict(y):
    '''
    balancing for oversampling strategy
    '''
    new_strategy = {}
    keys = Counter(y).keys()
    values = max(Counter(y).values())
    for key in keys:
        new_strategy[key] = values
    return new_strategy

## untuk bantu train regressor
def over_sampling(clusters, dict_, index):
    '''
    random oversampling strategy
    '''
    x_total = {}
    y_total = {}
    for i,item in enumerate(index):
        if len(item) > 1:
            x = [z for z in clusters[list(clusters.keys())[i]]]
            y = [z for z in dict_[list(dict_.keys())[i]]]
            strategy = strategy_dict(y)
            oversample = RandomOverSampler(sampling_strategy=strategy)
            x_over, y_over = oversample.fit_resample(x, y)
            x_total[list(clusters.keys())[i]] = np.array(x_over)
            y_total[list(clusters.keys())[i]] = np.array(y_over)
        else:
            x_total[list(clusters.keys())[i]] = np.array([z for z in clusters[list(clusters.keys())[i]]])
            y_total[list(clusters.keys())[i]] = np.array([z for z in dict_[list(dict_.keys())[i]]])
    return x_total, y_total

In [17]:
unique = [list(np.unique(x)) for x in dict_.values()]

In [18]:
x_total, y_total = over_sampling(clusters, dict_, unique)

### Merge Dataframe

In [19]:
## matching process between new id and ref table
data_balanced = {}
for i, (x,y) in enumerate(zip(x_total, y_total)):
    data_balanced["{0}".format(list(x_total.keys())[i])] = y_total[y]
data_df_balanced = pd.DataFrame.from_dict(data_balanced, orient='index').T

In [20]:
def filter_list_id(df_id, ref_table):
    dict_loc = {}; id_total = []
    m_total = ref_table.shape[0]
    poses = []
    for c in df_id:
        x = []
        for i in range(len(df_id[c].dropna())):
            x.append(int(df_id[c][i]))
        var = np.array(x)
        id_total.append(var)
    for i in range(m_total):
        key = int(ref_table.iloc[i]['Unique_ID'])
        value = ref_table.iloc[i, 0:2].values
        dict_loc[key] = value
    for i in range(len(id_total)):
        pos = []
        for j in range(len(id_total[i])):
            x = id_total[i][j]
            pos.append(dict_loc.get(x))
        pos = np.array(pos)
        poses.append(pos)    
    return id_total, poses

In [21]:
id_total_balanced, poses_balanced = filter_list_id(data_df_balanced, ref_table)

In [22]:
def make_df(id_total,poses,i):
    df_id = pd.DataFrame(id_total[i],columns=[str(i)])
    df_pos = pd.DataFrame(poses[i],columns=['x','y'])
    return df_pos, df_id

df = {}
for i, (_id,pose) in enumerate(zip(id_total_balanced,poses_balanced)):
    df[list(x_total.keys())[i]] = pd.concat((make_df(id_total_balanced,poses_balanced,i)),axis=1)

### Pre-Trained and Tuned

##### -- Regressor

In [23]:
## regression model for pose prediction
def tuning_regr_knn(x_train,y_train):
    QUANTITATIVE_COLUMNS = ['x', 'y']
    regr = KNeighborsRegressor(n_neighbors=1)
    
    metric = ['euclidean']
    hyperparameters = {'metric': metric}
     
    grid = GridSearchCV(estimator = regr,
                        param_grid = hyperparameters,
                        scoring = 'neg_mean_squared_error',
                        cv = 3,
                        n_jobs = -1)

    tic = time.time()
    grid_result_regr = grid.fit(x_train,y_train[QUANTITATIVE_COLUMNS].values.astype(np.float64))
    toc = time.time()
    run_time = (toc - tic)/60
    return grid_result_regr.best_estimator_, grid_result_regr.best_score_, run_time

In [24]:
regr_tuned = {}
for i,(x,y) in enumerate(zip(x_total,df)):
    regr, regr_score, runtime_regr = tuning_regr_knn(x_total[x], df[y])
    regr_tuned["regr_{0}".format(list(x_total.keys())[i])] = regr, regr_score, runtime_regr

### Testing Procedure

In [25]:
y_test_left['Unique_ID'] = [int(ref_table[(ref_table.x == str(x[0])) & (ref_table.y == str(x[1]))]['Unique_ID'].values) 
                              for x in y_test_left.values]

In [26]:
import scipy as sp
def nn_modified(test_noises, x_train, y_train):
    distance_n = []
    for x,y in zip(x_train, y_train):
        dim = np.array(x).shape[0]
        eu_distance = sp.spatial.distance.euclidean(test_noises.reshape(1,dim),np.array(x).reshape(1,dim))
        distance_n.append((eu_distance,y))
    clus = sorted(distance_n)[0][1]
    return clus

In [27]:
clus_knn = []
for n in x_test_left:
    clus_knn.append(nn_modified(n, x_train, y_train['dbscan'].values))

In [28]:
## generate index_core and noises from function with cosine similiarity
def index_nc(labels_):
    index_core = []
    index_noises = []
    for i, x in enumerate(labels_):
        if x == -1:
            index_noises.append(i) 
        else:
            index_core.append(i)
    return index_core, index_noises

In [29]:
index_core, index_noises = index_nc(clus_knn)

test_core = x_test_left[index_core] ## core point testing
test_noises = x_test_left[index_noises]

label_core = y_test_left.values[index_core] ## core point testing
label_noises = y_test_left.values[index_noises]

In [30]:
prep_core = np.append(test_core, np.array(clus_knn)[index_core].reshape(len(np.array(clus_knn)[index_core]),1).astype(int), axis=1)

In [31]:
class POS_Regr(): 
    '''
    Generate object for Coordinate Result
    '''
    def getRegr(self):
        return self.regr
    def __init__(self, regr):
        self.regr = regr

In [32]:
def test_prediction(prep_core, regr_tuned):
    buff = []
    for i,item in enumerate(prep_core):
        N = len(item)-1
        data = item[0:N].reshape(1,N)
        hasil = regr_tuned['regr_{0}'.format(item[-1].astype(int))][0].predict(data)
        value = POS_Regr(hasil)
        buff.append(value)
    return buff

In [33]:
pred = test_prediction(prep_core, regr_tuned)

In [34]:
def df_prediction(pred):
    df_prediction = []
    for i in range(len(pred)):
        xs = list(pred[i].getRegr()[0])
        df_prediction.append(xs)
    dataset = pd.DataFrame(df_prediction,columns=['x_pred','y_pred']) 
    return dataset

In [35]:
df_pred = df_prediction(pred)

### Noise Testing Procedure

In [36]:
x_train_n = x_train_cr.values
y_train_n = y_train_cr['Unique_ID'].values ## label kelas grid

In [37]:
def wknn_modified(test_noises, x_train_n, y_train_n, k=5):
    distance_n = []; freq = defaultdict(float)
    for x,y in zip(x_train_n, y_train_n):
        dim = np.array(x).shape[0]
        eu_distance = sp.spatial.distance.euclidean(test_noises.reshape(1,dim),np.array(x).reshape(1,dim))
        distance_n.append((eu_distance,y))
    distance_n = sorted(distance_n)[:k]
    count = Counter([b[1] for b in distance_n])
    unique_c = list(np.unique(np.array(distance_n)[:,1]).astype(int))
    for x in unique_c:
        freq.setdefault(int(x),0)
    for d in distance_n:
        temp = freq[float(d[1])]
        if d[0] == float(0):
            temp_ = float(temp)
            freq[float(d[1])] = temp_
        else:
            temp_ = float(temp) + (1 / d[0])
            freq[float(d[1])] = temp_
    for key in freq:
        freq[key] = freq[key] / count[key]
    return max(freq, key=lambda key: freq[key])

In [38]:
## use WKNN instead, return label
clus_wknn_n = []
for n in test_noises:
    clus_wknn_n.append(wknn_modified(n, x_train, y_train['Unique_ID'].values))

In [39]:
pred_n = []
for x in clus_wknn_n:
    pred_n.append((int(ref_table['x'][ref_table[ref_table.Unique_ID==x].index]),
                   int(ref_table['y'][ref_table[ref_table.Unique_ID==x].index])))

In [40]:
dataset_n = pd.DataFrame(pred_n,columns=['x_pred','y_pred'])

### Calculate Regression Error

In [41]:
def calculate_regr_error(df_final,label_core):
    x, y  = df_final['x_pred'].values, df_final['y_pred'].values
    x0, y0 = label_core[:,0].astype(int), label_core[:,1].astype(int)
    coords_error = np.sqrt(np.square(x - x0) + np.square(y - y0))
    mean_loc_error = coords_error.mean()
    return mean_loc_error, coords_error

In [42]:
## core point error
mean_loc_error, coords_error = calculate_regr_error(df_pred,label_core)
print(mean_loc_error)

1.4701089027594252


In [43]:
## noises point error
mean_loc_error_n, coords_error_n = calculate_regr_error(dataset_n,label_noises)
print(mean_loc_error_n)

1.832242049571854


In [44]:
### Error gabungan
np.sum((len(label_core) * mean_loc_error)+(len(label_noises) * mean_loc_error_n)) / (len(label_core) + len(label_noises))

1.5448552462296061

In [45]:
np.append(coords_error, coords_error_n)

array([1.41421356, 2.23606798, 3.        , 2.82842712, 2.        ,
       3.16227766, 0.        , 1.41421356, 3.16227766, 0.        ,
       2.        , 0.        , 1.41421356, 1.41421356, 0.        ,
       2.23606798, 2.        , 2.82842712, 2.23606798, 3.        ,
       0.        , 1.41421356, 3.        , 0.        , 1.41421356,
       2.        , 2.        , 3.16227766, 1.41421356, 1.41421356,
       1.41421356, 0.        , 3.        , 3.        , 3.        ,
       1.41421356, 0.        , 3.16227766, 1.41421356, 2.        ,
       0.        , 3.        , 1.41421356, 2.        , 3.16227766,
       1.41421356, 2.        , 2.        , 0.        , 1.41421356,
       2.        , 2.        , 0.        , 2.23606798, 2.        ,
       0.        , 3.        , 1.41421356, 1.41421356, 1.41421356,
       3.        , 3.16227766, 0.        , 2.23606798, 1.41421356,
       2.23606798, 0.        , 1.41421356, 0.        , 2.        ,
       2.23606798, 0.        , 3.        , 0.        , 2.     