### Load Libraries and Train Data

In [1]:
import os
import sys
import glob
import math
import time
import scipy as sp
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from random import randint
from kneed import KneeLocator
from numpy import sqrt, square
from collections import Counter
from collections import defaultdict
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.neighbors import KNeighborsRegressor

In [2]:
## Load Training Data
# Every CSV in the folder is read into its own frame. The first and second
# characters of the file name (without extension) are stored as the 'x' and
# 'y' labels of all rows in that frame.
path =r"E:\PROJECT\VSCode Python Project\Sensor Network\Uji Coba\Progress Skripsi\Data\Data Nico\Data 15 April Tanpa Gangguan\Train 2"
globbed_files = glob.glob(path + "/*.csv")
data = []
for csv_path in globbed_files:
    frame = pd.read_csv(csv_path)
    cell_name = os.path.basename(csv_path).split('.')[0]
    frame['x'] = cell_name[0]
    frame['y'] = cell_name[1]
    data.append(frame)

In [3]:
# ## Load Training Data 2
# path2 =r"E:\PROJECT\VSCode Python Project\Sensor Network\Uji Coba\Progress Skripsi\Data\Data Nico\Data 15 April Tanpa Gangguan\Train 1"
# globbed_files2 = glob.glob(path2 + "/*.csv")
# for csv in globbed_files2:
#     frame = pd.read_csv(csv)
#     frame['x'] = os.path.basename(csv).split('.')[0][0]
#     frame['y'] = os.path.basename(csv).split('.')[0][1]
#     data.append(frame)

In [4]:
## Build the shuffled training set from every loaded recording.
## (The former per-file `.head(32)` cap is disabled, so all rows are kept.)
attempt = [frame[['Router 1','Router 2','Router 3','Router 4','x','y']] for frame in data]
attempt_concat = pd.concat(attempt)
# Fixed seed so the shuffle is reproducible.
data_train = attempt_concat.sample(frac=1, random_state=42).reset_index(drop=True)
# Keep only the magnitude of each router reading.
data_train[['Router 1','Router 2','Router 3','Router 4']] = data_train[['Router 1','Router 2','Router 3','Router 4']].abs()
# One integer code per distinct "x_y" string.
data_train = data_train.assign(
    Unique_ID = (data_train['x'].astype(str) + '_' + data_train['y'].astype(str)).astype('category').cat.codes
)
x_train = data_train.iloc[:,0:4].values
# .copy(): new columns are assigned to y_train later in the notebook; an
# independent frame avoids pandas' SettingWithCopyWarning and guarantees those
# assignments never depend on view/copy semantics of the iloc slice.
y_train = data_train.iloc[:,4:].copy()
# Lookup table: one row per distinct (x, y, Unique_ID) combination.
ref_table = y_train.iloc[:, [0,1,2]].drop_duplicates().reset_index(drop=True)

In [5]:
# scaler = MinMaxScaler()
# scaler.fit(x_train)
# x_train = scaler.transform(x_train)

### Left Testing

In [6]:
# Left-side test recordings: same file layout and labelling as the training set
# (first/second character of the file name become the 'x'/'y' labels).
path_left =r"E:\PROJECT\VSCode Python Project\Sensor Network\Uji Coba\Progress Skripsi\Data\Data Nico\Data 15 April Tanpa Gangguan\Test\Left Test"
globbed_files_left = glob.glob(path_left + "/*.csv")
data_left = []
for csv_path in globbed_files_left:
    frame = pd.read_csv(csv_path)
    cell_name = os.path.basename(csv_path).split('.')[0]
    frame['x'] = cell_name[0]
    frame['y'] = cell_name[1]
    data_left.append(frame)

# Keep the four router columns plus labels, shuffle with a fixed seed,
# take reading magnitudes, then keep the last 562 shuffled rows.
attempt_left = [recording[['Router 1','Router 2','Router 3','Router 4','x','y']] for recording in data_left]
attempt_concat_left = pd.concat(attempt_left)
data_test_left = attempt_concat_left.sample(frac=1, random_state=42).reset_index(drop=True)
data_test_left[['Router 1','Router 2','Router 3','Router 4']] = (
    data_test_left[['Router 1','Router 2','Router 3','Router 4']].abs()
)
data_test_left = data_test_left.tail(562).reset_index(drop=True)

In [7]:
## split features and label
x_test_left = data_test_left.iloc[:,0:4].values
# .copy(): a 'Unique_ID' column is assigned to y_test_left later in the
# notebook; copying the iloc slice avoids SettingWithCopyWarning and makes
# that assignment independent of data_test_left.
y_test_left = data_test_left.iloc[:,4:].copy()

### Normalization
# x_test_left = scaler.transform(x_test_left)

### Perform DB-Kmeans Separate

##### Perform DBSCAN

In [8]:
# k-distance curve used to pick the DBSCAN radius: for every training sample
# take the distance stored in the 5th neighbour column (index 4), sort those
# distances ascending and locate the knee of the resulting curve.
nearest_neighbors = NearestNeighbors(n_neighbors=5)
nearest_neighbors.fit(x_train)
neighbors = nearest_neighbors  # fit() returns the estimator itself
distances, indices = neighbors.kneighbors(x_train)
distances = np.sort(distances[:, 4])
i = np.arange(distances.shape[0])
knee = KneeLocator(
    i, distances,
    S=1, curve='convex', direction='increasing', interp_method='polynomial',
)

In [9]:
from sklearn.cluster import DBSCAN
# eps is the k-distance value at the detected knee. If no knee was found
# (knee.knee is None — presumably what KneeLocator reports in that case; verify
# against kneed), `distances[None]` would silently produce a 2-D array instead
# of a radius, so fail early with a clear message.
if knee.knee is None:
    raise ValueError("No knee found in the k-distance curve; cannot derive DBSCAN eps")
db = DBSCAN(eps=distances[knee.knee], min_samples=5).fit(x_train)
# One label per training row; -1 marks noise points.
labels = db.labels_

In [10]:
# Noise points carry the label -1.
n_noise_ = int(np.count_nonzero(labels == -1))
# Number of clusters = number of distinct labels, minus one when the noise label is present.
n_clusters_ = len(np.unique(labels)) - (1 if n_noise_ else 0)

print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)

Estimated number of clusters: 24
Estimated number of noise points: 57


In [11]:
# Lookup of distinct (x, y, Unique_ID) rows; the original row index is kept here.
ref_table = y_train.iloc[:, :3].drop_duplicates()
# Feature frame; columns are named PC_1..PC_4 although they hold the four raw features.
feature_columns = ['PC_1','PC_2','PC_3','PC_4']
x_train_df = pd.DataFrame(x_train, columns=feature_columns)
# Attach the DBSCAN label of every training row.
y_train['dbscan'] = labels

In [12]:
# Features and labels side by side, one row per training sample.
combined = pd.concat([x_train_df, y_train], axis=1)

# Core set: every row DBSCAN did not flag as noise (-1), re-indexed from 0.
combined_core = combined[combined['dbscan'] != -1].reset_index(drop=True)

# First four columns are features, the remainder are labels.
x_train_cr = combined_core.iloc[:, :4]
y_train_cr = combined_core.iloc[:, 4:]

#### Core and Noises Normalization

In [13]:
# Min-max scaler fitted on the core training features, then applied to them.
scaler_cr = MinMaxScaler().fit(x_train_cr)
x_train_cr = scaler_cr.transform(x_train_cr)

In [14]:
# Noise set: rows DBSCAN labelled -1, re-indexed from 0.
combined_noises = combined[combined['dbscan'] == -1].reset_index(drop=True)

# First four columns are features, the remainder are labels.
x_train_n = combined_noises.iloc[:, :4]
y_train_n = combined_noises.iloc[:, 4:]

# Separate min-max scaler fitted on the noise features only.
scaler_n = MinMaxScaler().fit(x_train_n)
x_train_n = scaler_n.transform(x_train_n)

#### Perform K-Means

In [15]:
# from kneed import KneeLocator
# from sklearn.cluster import KMeans

# ks = range(1,25) ## batas maksimal kelas
# inertias_values = []  ## or so called SSE
# for k in ks:
#     model = KMeans(n_clusters=k)     # Create a KMeans instance with k clusters: model
#     model.fit(x_train_cr)      # Fit model to samples
#     inertias_values.append(model.inertia_)        # Append the inertia to the list of inertias
# kl = KneeLocator(range(1,25), inertias_values, curve="convex", direction="decreasing")
# kl.elbow

In [16]:
# temp = pd.DataFrame(x_train_cr, columns=['PC_1','PC_2','PC_3','PC_4'])
# combined_core = pd.concat((temp, y_train_cr), axis=1)

# dict_pre = {}
# clusters_pre = {}
# for x in np.unique(combined_core['dbscan'].values):
#     dict_pre[x] = list(combined_core[combined_core.dbscan == x]['Unique_ID'].values)
#     clusters_pre[x] = list(combined_core[combined_core.dbscan == x][['PC_1','PC_2','PC_3','PC_4']].values)
    
# centroid_dbscan = []
# for x in clusters_pre:
#     centroid_dbscan.append(np.mean(clusters_pre[x], axis=0))

In [17]:
# from typing import Tuple

# def _initialize_clusters(k: int) -> dict:
#     return {str(i): [] for i in range(1, k + 1)}

# def l2norm(point, center) -> float:
#     """calculate euclidean distance between two n-dimensional points"""
#     s = 0
#     for x, y in zip(point, center):
#         s += (x - y) ** 2
#     d = s[0] ** (1 / 2)
#     return d

# def l2norm_vector(point, centers) -> np.ndarray:
#     """calculate l2-norm between a point and several cluster centers."""
#     l2 = []
#     for center in centers:
#         center = center.reshape(1, len(center)).T  # (n, 1)
#         norm = l2norm(point, center)
#         l2.append(norm)
#     return np.array(l2)

# class Kmeans:
#     def __init__(self,
#                  n_clusters: int,          # number of clusters assignments
#                  data: np.ndarray,         # data matrix (n x d)
#                  iterations: int = 100,    # number of iterations to run.
#                  centroids: np.ndarray = None
#                  ): 
#         self.n_clusters = n_clusters
#         self.data = data
#         self.iterations = iterations
#         self.clusters = _initialize_clusters(self.n_clusters) 
#         self.centroids = np.array(centroids)
# #         index = np.random.choice(self.data.shape[0], self.n_clusters, replace=False)
# #         self.centroids = self.data[:,:4][index]
        
#     def train(self, method: str = "l2", verbose: bool = True) -> Tuple[np.ndarray, dict]:
#         iterations = self.iterations  # allows for multiple training runs if user desires
        
#         if method == "l2":
#             self._train_l2(iterations)
#         elif method == "l1":
#             self._train_l1(iterations, verbose=verbose)
            
#         if verbose:
#             print("Training done...")
#         return self.centroids, self.clusters
    
#     def predict(self, x: np.ndarray, method: str = "l2") -> Tuple[dict, np.ndarray]:
#         """calculate closest distance between input matrix and cluster assignments"""
#         # initialize cluster dictionary.
#         #clusters = _initialize_clusters(self.n_clusters)
#         if method == "l2":
#             for i in range(len(x)):
#                 d = l2norm_vector(x[i], self.centroids)
#                 c = np.argmin(d) + 1
#                 #clusters[str(c)].append(x[i])
#         elif method == "l1":
#             for i in range(len(x)):
#                 d = l1norm_vector(x[i], self.centroids)
#                 c = np.argmin(d) + 1
#                 #clusters[str(c)].append(x[i])            
#         return c #clusters, self.centroids
                
#     def _train_l2(self, iterations: int):
#         """run training using l2-norm"""
#         i = 0
#         while iterations != 0:
#             for j in range(self.n_clusters):
#                 self.clusters[str(j + 1)] = []
#             for i in range(len(self.data)):
#                 d = l2norm_vector(self.data[:,:4][i], self.centroids)
#                 c = np.argmin(d) + 1
#                 self.clusters[str(c)].append((self.data[:,:4][i], self.data[:,4][i]))

#             old_centroids = self.centroids.copy()
#             for i, c in enumerate(self.clusters.keys()):
#                 mean = np.mean(np.array([x[0] for x in self.clusters[str(c)]]), axis=0)
#                 self.centroids[int(i)] = mean

#             print("\riterations: {}...".format(iterations))
#             iterations -= 1
#             i += 1
#             if np.all(old_centroids == self.centroids):
#                 break

In [18]:
# data = np.append(x_train_cr, 
#                  y_train_cr['Unique_ID'].values.astype(int).reshape(y_train_cr['Unique_ID'].shape[0],1), 
#                  axis=1)
# K_l2 = Kmeans(n_clusters = 24, data = data, centroids = centroid_dbscan)
# centroids_l2, clusters_l2 = K_l2.train()

In [19]:
# def core_combine(clusters, ref_table):
#     value_ = []
#     unique_ = []
#     keys_ = []
#     for key,values in clusters.items():
#         for value in values:
#             value_.append(value[0]) 
#             unique_.append(value[1])
#             keys_.append(key)
    
#     df_unique_keys = pd.DataFrame({'Unique_ID': unique_,'kmeans': keys_}).astype(int)

#     coord_ = []
#     for x in df_unique_keys['Unique_ID'].values:
#         coord_.append(ref_table[ref_table.Unique_ID == x][['x','y']].values.reshape(2,))

#     df_coord = pd.DataFrame(np.array(coord_),columns=['x','y'])
#     df_rssi_value = pd.DataFrame(value_,columns=['PC_1','PC_2','PC_3','PC_4'])
#     combined_cr = pd.concat([df_rssi_value, df_coord, df_unique_keys],axis=1)
    
#     return combined_cr

# combined_cr = core_combine(clusters_l2, ref_table)

In [20]:
from sklearn.cluster import KMeans

# K-Means with 4 clusters on the scaled core features (fixed seed).
final_model = KMeans(n_clusters = 4, random_state = 1).fit(x_train_cr)

# Rebuild a frame of the scaled features and attach the K-Means label to the core labels.
x_train_df = pd.DataFrame(x_train_cr, columns=['PC_1','PC_2','PC_3','PC_4'])
y_train_cr['kmeans'] = final_model.labels_
combined_cr = pd.concat([x_train_df, y_train_cr], axis=1)

### Evaluate Clusters (Davies-Bouldin Internal Index)

In [21]:
# def arrayGen(centroids, clusters):
#     dataframe = []
#     labels = []
#     for i, (key,values) in enumerate(clusters.items()):
#         for x, point in enumerate(values):
#             dataframe.append(clusters[key][x])
#             labels.append(key)
#     dataframe_array = np.array(dataframe)
#     labels_array = np.array(labels)
#     return dataframe_array, labels_array

# def compute_s(i, x, labels, clusters):
#     norm_c= len(clusters)
#     s = 0
#     for x in clusters:
#         s += distance.euclidean(x, clusters[i])
#     return s

# def compute_Rij(i, j, x, labels, clusters, nc):
#     Rij = 0
#     try:
#         d = distance.euclidean(clusters[i],clusters[j])
#         Rij = (compute_s(i, x, labels, clusters) + compute_s(j, x, labels, clusters))/d
#     except:
#         Rij = 0
#     return Rij

# def compute_R(i, x, labels, clusters, nc): 
#     list_r = []
#     for i in range(nc):
#         for j in range(nc):
#             if(i!=j):
#                 temp = compute_Rij(i, j, x, labels, clusters, nc)
#                 list_r.append(temp)
#     return max(list_r)

# def compute_DB_index(x, labels, clusters, nc):
#     sigma_R = 0.0
#     for i in range(nc):
#         sigma_R = sigma_R + compute_R(i, x, labels, clusters, nc)
#     DB_index = float(sigma_R)/float(nc)
#     return DB_index

# ## features, labels, centroids, amount of clusters
# n = kl.elbow
# dataframe_array, labels_array = arrayGen(centroid, clusters)
# index_db_val = compute_DB_index(dataframe_array, labels_array, centroid, n)
# print ("The value of Davies Bouldin index for a K-Means cluser of size " + str(n) + " is: " + str(index_db_val))

### Make Dict_ and Clusters

In [22]:
# Per K-Means cluster: dict_ holds the Unique_ID of every member row,
# clusters holds the matching feature vectors (same row order).
dict_, clusters = {}, {}
for cluster_label in np.unique(combined_cr['kmeans'].values):
    members = combined_cr[combined_cr.kmeans == cluster_label]
    dict_[cluster_label] = list(members['Unique_ID'].values)
    clusters[cluster_label] = list(members[['PC_1','PC_2','PC_3','PC_4']].values)

### Handle Imbalanced Data

In [23]:
def strategy_dict(y):
    '''
    Build an oversampling target: every class present in `y` is mapped to the
    size of the largest class.

    Returns a dict {class: max_count}, keys in order of first appearance.
    '''
    counts = Counter(y)
    largest = max(counts.values())
    return dict.fromkeys(counts, largest)

## helper for training the regressors
def over_sampling(clusters, dict_, index, random_state=42):
    '''
    Randomly oversample the clusters that contain more than one Unique_ID.

    Parameters
    ----------
    clusters : dict
        cluster key -> list of feature vectors.
    dict_ : dict
        cluster key -> list of Unique_ID labels, aligned with `clusters`
        both in key order and in row order.
    index : list of lists
        Distinct Unique_IDs per cluster, in the same order as `clusters`.
    random_state : int or None, default 42
        Seed passed to RandomOverSampler so repeated runs produce the same
        resampled data. Pass None for unseeded sampling.

    Returns
    -------
    (x_total, y_total) : two dicts keyed like `clusters`, holding numpy
        arrays of features and labels. Clusters with a single distinct
        Unique_ID are returned unchanged.

    Raises
    ------
    ValueError
        If `index` has more entries than `clusters` or `dict_`.
    '''
    if len(index) > min(len(clusters), len(dict_)):
        raise ValueError("index has more entries than clusters/dict_")

    x_total = {}
    y_total = {}
    # Clusters, labels and distinct-ID lists are paired by position.
    for (key, samples), ids, distinct_ids in zip(clusters.items(), dict_.values(), index):
        x = list(samples)
        y = list(ids)
        if len(distinct_ids) > 1:
            # Bring every Unique_ID in this cluster up to the size of the largest one.
            oversample = RandomOverSampler(sampling_strategy=strategy_dict(y),
                                           random_state=random_state)
            x, y = oversample.fit_resample(x, y)
        x_total[key] = np.array(x)
        y_total[key] = np.array(y)
    return x_total, y_total

In [24]:
# Distinct Unique_ID values found in each K-Means cluster, in dict_ order.
unique = [list(np.unique(x)) for x in dict_.values()]

In [25]:
# Oversample clusters holding more than one Unique_ID; both results are keyed like `clusters`.
x_total, y_total = over_sampling(clusters, dict_, unique)

### Catch Sensitive and Insensitive Region

In [27]:
# "Insensitive" clusters are those containing more than one distinct Unique_ID.
# They are keyed by their position in `unique`.
x_insensitive, y_insensitive = {}, {}
for position, ids_in_cluster in enumerate(unique):
    if len(ids_in_cluster) > 1:
        x_insensitive[position] = x_total[position]
        y_insensitive[position] = y_total[position]

### Merge Dataframe

In [29]:
## One column of (oversampled) Unique_IDs per insensitive cluster; the column
## name is the cluster key as text. Shorter columns are padded with NaN.
data_balanced = {
    "{0}".format(x_key): y_insensitive[y_key]
    for x_key, y_key in zip(x_insensitive, y_insensitive)
}
data_df_balanced = pd.DataFrame.from_dict(data_balanced, orient='index').T

# ## matching process between new id and ref table
# data_balanced = {}
# for i, (x,y) in enumerate(zip(x_total, y_total)):
#     data_balanced["{0}".format(list(x_total.keys())[i])] = y_total[y]
# data_df_balanced = pd.DataFrame.from_dict(data_balanced, orient='index').T

In [30]:
def filter_list_id(df_id, ref_table):
    """Translate columns of Unique_IDs into arrays of ids and their positions.

    Parameters
    ----------
    df_id : DataFrame
        One column per cluster; each column lists Unique_IDs, padded with NaN.
        The first `len(column.dropna())` entries are read by index label 0..n-1.
    ref_table : DataFrame
        Must have a 'Unique_ID' column; its first two columns are taken as the
        position of that id.

    Returns
    -------
    (id_total, poses) : list of integer arrays (one per column) and list of
        arrays holding the looked-up position of every id (None if unknown).
    """
    # Unique_ID -> first two ref_table columns of that row.
    dict_loc = {
        int(ref_table.iloc[row]['Unique_ID']): ref_table.iloc[row, 0:2].values
        for row in range(ref_table.shape[0])
    }

    id_total = []
    for column in df_id:
        valid_count = len(df_id[column].dropna())
        id_total.append(np.array([int(df_id[column][k]) for k in range(valid_count)]))

    poses = [np.array([dict_loc.get(uid) for uid in ids]) for ids in id_total]
    return id_total, poses

In [31]:
# Per insensitive cluster: the oversampled Unique_IDs and the ref_table position of each.
id_total_balanced, poses_balanced = filter_list_id(data_df_balanced, ref_table)

In [32]:
def make_df(id_total,poses,i):
    """Return (positions, ids) frames for the i-th cluster.

    positions has columns 'x' and 'y' built from poses[i];
    ids has a single column named str(i) built from id_total[i].
    """
    positions = pd.DataFrame(poses[i], columns=['x', 'y'])
    ids = pd.DataFrame(id_total[i], columns=[str(i)])
    return positions, ids

# One target frame (x, y, ids) per insensitive cluster, keyed by the cluster key.
insensitive_keys = list(x_insensitive.keys())
df = {}
for i in range(min(len(id_total_balanced), len(poses_balanced))):
    positions_frame, ids_frame = make_df(id_total_balanced, poses_balanced, i)
    df[insensitive_keys[i]] = pd.concat((positions_frame, ids_frame), axis=1)

# df = {}
# for i, (_id,pose) in enumerate(zip(id_total_balanced,poses_balanced)):
#     df[list(x_total.keys())[i]] = pd.concat((make_df(id_total_balanced,poses_balanced,i)),axis=1)

### Pre-Trained and Tuned

##### -- Regressor

In [33]:
## regression model for pose prediction
def tuning_regr_knn(x_train,y_train):
    """Grid-search a 1-nearest-neighbour regressor predicting the 'x' and 'y' columns.

    The grid contains a single candidate (metric='euclidean'), evaluated with
    10-fold cross-validation on negative mean squared error.

    Returns (best_estimator, best_score, run_time) where run_time is the
    wall-clock duration of the search in minutes.
    """
    search = GridSearchCV(
        estimator=KNeighborsRegressor(n_neighbors=1),
        param_grid={'metric': ['euclidean']},
        scoring='neg_mean_squared_error',
        cv=10,
        n_jobs=-1,
    )

    started = time.time()
    targets = y_train[['x', 'y']].values.astype(np.float64)
    fitted = search.fit(x_train, targets)
    elapsed_minutes = (time.time() - started) / 60
    return fitted.best_estimator_, fitted.best_score_, elapsed_minutes

In [None]:
# regr_tuned = {}
# for i,(x,y) in enumerate(zip(x_total,df)):
#     regr, regr_score, runtime_regr = tuning_regr_knn(x_total[x], df[y])
#     regr_tuned["regr_{0}".format(list(x_total.keys())[i])] = regr, regr_score, runtime_regr

# One tuned regressor per insensitive cluster, stored under "regr_<cluster key>"
# as the tuple (estimator, cv score, run time in minutes).
regr_tuned = {}
for x, y in zip(x_insensitive, df):
    regr, regr_score, runtime_regr = tuning_regr_knn(x_insensitive[x], df[y])
    regr_tuned["regr_{0}".format(x)] = (regr, regr_score, runtime_regr)

### Testing Procedure

In [43]:
# Attach the training Unique_ID to every test row by matching its (x, y) label
# against ref_table. A dictionary lookup replaces the per-row boolean scan and
# avoids calling int() on a one-element array, which NumPy >= 1.25 deprecates.
# A test location absent from ref_table raises KeyError naming that location.
id_by_location = {
    (loc_x, loc_y): int(uid)
    for loc_x, loc_y, uid in zip(ref_table['x'], ref_table['y'], ref_table['Unique_ID'])
}
y_test_left['Unique_ID'] = [id_by_location[(str(row[0]), str(row[1]))]
                              for row in y_test_left.values]

In [44]:
import scipy as sp
def nn_modified(test_noises, x_train, y_train):
    """Return the label of the training sample nearest to `test_noises`.

    Parameters
    ----------
    test_noises : array-like
        One query feature vector (any shape; it is flattened to 1-D).
    x_train : iterable of array-like
        Training feature vectors.
    y_train : iterable
        Labels aligned with `x_train`.

    Returns
    -------
    The label paired with the smallest Euclidean distance. Ties on distance
    are broken by the smaller label.

    Raises
    ------
    ValueError
        If there are no training samples.
    """
    # scipy.spatial.distance functions require 1-D vectors; (1, dim) inputs are
    # rejected by recent SciPy releases, so flatten both sides explicitly.
    query = np.asarray(test_noises, dtype=float).ravel()
    candidates = [
        (sp.spatial.distance.euclidean(query, np.asarray(x, dtype=float).ravel()), y)
        for x, y in zip(x_train, y_train)
    ]
    if not candidates:
        raise ValueError("nn_modified needs at least one training sample")
    # min() picks the same tuple a full sort would put first, in a single pass.
    return min(candidates)[1]

In [45]:
## Give every test sample the DBSCAN label of its nearest training sample.
train_dbscan_labels = y_train['dbscan'].values
clus_dbscan = [nn_modified(n, x_train, train_dbscan_labels) for n in x_test_left]

In [46]:
## Split sample positions into core and noise according to their label.
def index_nc(labels_):
    """Return (index_core, index_noises): positions whose label is not -1, and
    positions whose label is -1, each in ascending order."""
    index_core, index_noises = [], []
    for position, label in enumerate(labels_):
        bucket = index_noises if label == -1 else index_core
        bucket.append(position)
    return index_core, index_noises

In [47]:
index_core, index_noises = index_nc(clus_dbscan)

# Test features, split into core points and noise points.
test_core, test_noises = x_test_left[index_core], x_test_left[index_noises]

# Matching test labels, split the same way.
label_core, label_noises = y_test_left.values[index_core], y_test_left.values[index_noises]

#### Core and Noises Test Normalization

In [48]:
# Scale the core test points with the scaler fitted on the core training features.
test_core = scaler_cr.transform(test_core)

In [49]:
# Noise test points are also scaled with scaler_cr (not scaler_n): they are later
# compared against x_train_cr, which was scaled with scaler_cr.
test_noises = scaler_cr.transform(test_noises)

### Core Point Testing

In [50]:
def wknn_modified(test_noises, x_train_n, y_train_n, k):
    """Weighted k-nearest-neighbour vote using cosine distance.

    Parameters
    ----------
    test_noises : array-like
        One query feature vector (flattened to 1-D).
    x_train_n : iterable of array-like
        Training feature vectors.
    y_train_n : iterable of int-like
        Labels aligned with `x_train_n`.
    k : int
        Number of nearest neighbours that take part in the vote (>= 1).

    Returns
    -------
    (label, distance_n) : the winning label (int) and the full list of
        (cosine distance, label) pairs sorted by ascending distance.

    Raises
    ------
    ValueError
        If there are no training samples or k < 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    # scipy.spatial.distance functions require 1-D vectors.
    query = np.asarray(test_noises, dtype=float).ravel()
    distance_n = []
    with np.errstate(divide='ignore', invalid='ignore'):
        for x, y in zip(x_train_n, y_train_n):
            cosine_distance = sp.spatial.distance.cosine(query, np.asarray(x, dtype=float).ravel())
            if np.isnan(cosine_distance):
                # Undefined for zero-length vectors: rank such samples last.
                cosine_distance = np.inf
            distance_n.append((cosine_distance, y))
    if not distance_n:
        raise ValueError("wknn_modified needs at least one training sample")
    distance_n = sorted(distance_n)

    # Inverse-distance vote over the k nearest samples only. The distance is
    # floored at a tiny epsilon so an exact match (distance 0) receives the
    # largest weight instead of contributing nothing.
    freq = defaultdict(float)
    for cosine_distance, label in distance_n[:k]:
        freq[int(label)] += 1.0 / max(cosine_distance, 1e-12)
    for key in freq:
        freq[key] = freq[key] / k
    return max(freq, key=lambda key: freq[key]), distance_n

def test_prediction(x_test_left, clus_, dict_, x_insensitive, regr_tuned, ref_table, x_train_cr, y_train_cr):
    result = []
    for i, (x,y) in enumerate(zip(x_test_left, clus_)):
        if y in list(x_insensitive.keys()):
            data = x.reshape(1,len(x))
            result.append(regr_tuned['regr_{0}'.format(int(y))][0].predict(data).astype(int).reshape(2,))
        else:
            y_ = np.unique(dict_[y])
            result.append(ref_table[ref_table.Unique_ID == int(y_)][['x','y']].values.astype(int).reshape(2,))
    return result

In [52]:
# Assign every core test sample to a K-Means cluster with one batched call.
# (The previous per-row predict returned 1-element arrays, and int() on such
# arrays is deprecated since NumPy 1.25.) An empty core set yields no labels.
clus_kmeans = final_model.predict(test_core) if len(test_core) else []

# clus_kmeans = []
# for x in test_core:
#     clus_kmeans.append(K_l2.predict(x.reshape(1,4)))

clus_core = [int(cluster_label) for cluster_label in clus_kmeans]

result = test_prediction(test_core, clus_core, dict_, x_insensitive, regr_tuned, 
                         ref_table, x_train_cr, y_train_cr)

df_pred = pd.DataFrame(result, columns=['x_pred','y_pred'])

### Noise Point Testing

##### With Fusion

In [53]:
### Physical Distance Calculation
# Per access point (1-4): two (x, y) reference cells, given as strings to
# match the string-typed 'x'/'y' labels of the data frames.
benchmark = {
    1: [('0', '1'), ('1', '0')],
    2: [('3', '0'), ('4', '1')],
    3: [('4', '3'), ('3', '4')],
    4: [('1', '4'), ('0', '3')],
}

def physical_distance(x, data_train, ref_table, benchmark, ap_sel,
                      path_loss_exp=3.1, top_k=25):
    """Rank reference locations by how close their model distance is to the test reading's.

    A reading r is converted to a distance with 10 ** ((bm - r) / (10 * path_loss_exp)),
    where bm is the rounded mean reading of access point `ap_sel` at one of its
    two benchmark cells. The same conversion is applied to the rounded mean
    reading of every reference location, and locations are ranked by the
    absolute difference to the test distance (once per benchmark cell).

    NOTE(review): the readings in this notebook are stored as magnitudes
    (.abs()), so `bm - r` decreases as the signal weakens — confirm this sign
    convention is intended.

    Parameters
    ----------
    x : number
        Test reading of access point `ap_sel`.
    data_train : DataFrame
        Training rows; column `ap_sel - 1` holds that access point's readings,
        and columns 'x', 'y', 'Unique_ID' identify the location.
    ref_table : DataFrame
        Its last column lists the Unique_IDs to rank.
    benchmark : dict
        ap_sel -> [(x, y), (x, y)] benchmark cells.
    ap_sel : int
        1-based access point number.
    path_loss_exp : float, default 3.1
        Exponent used in the reading-to-distance conversion.
    top_k : int, default 25
        Number of best-ranked (difference, Unique_ID) pairs returned.

    Returns
    -------
    list of (difference, Unique_ID) tuples sorted ascending, at most `top_k` long.

    Raises
    ------
    ValueError
        If a benchmark cell or a reference Unique_ID has no rows in `data_train`.
    """
    column = ap_sel - 1

    def _to_distance(reference_reading, reading):
        # Reading-to-distance conversion shared by test and reference values.
        return 1 * (10 ** ((reference_reading - reading) / (10 * path_loss_exp)))

    def _benchmark_reading(cell):
        # Rounded mean reading of the selected access point at one benchmark cell.
        rows = data_train[(data_train.x == cell[0]) & (data_train.y == cell[1])]
        if len(rows) == 0:
            raise ValueError("no training rows for benchmark cell {0}".format(cell))
        return round(np.mean(rows.values[:, column]))

    bm_1 = _benchmark_reading(benchmark[ap_sel][0])
    bm_2 = _benchmark_reading(benchmark[ap_sel][1])

    d_1 = _to_distance(bm_1, x)
    d_2 = _to_distance(bm_2, x)

    clos = []
    for j in ref_table.values[:, -1]:
        rows = data_train[data_train.Unique_ID == j]
        if len(rows) == 0:
            raise ValueError("no training rows for Unique_ID {0}".format(j))
        temp = round(np.mean(rows.values[:, column].astype(int)))
        # Plain absolute difference: the scipy distance functions require 1-D
        # vectors and recent SciPy releases reject scalar arguments.
        clos.append((abs(d_1 - _to_distance(bm_1, temp)), j))
        clos.append((abs(d_2 - _to_distance(bm_2, temp)), j))

    return sorted(clos)[:top_k]


# Readings of the test rows that were classified as noise (first four columns).
phy_test = data_test_left.values[:,:4][index_noises]
# Training rows that DBSCAN did not flag as noise, rebuilt as a frame with the
# original column names (selected by their row positions in `combined`).
data_train_new = pd.DataFrame(data_train.values[combined.loc[~(combined['dbscan'] == -1)]
                                                .index.values.tolist()], columns = data_train.columns)

# phy_dis[sample][ap] = ranked (difference, Unique_ID) list returned by
# physical_distance for that sample's reading of access point ap + 1.
phy_dis = []
for features in phy_test:
    temp = []
    for i, feature in enumerate(features):
        ap_sel = i + 1
        temp.append(physical_distance(feature, data_train_new, ref_table, benchmark, ap_sel))
    phy_dis.append(temp)
    
# Per sample: the (up to) four Unique_IDs that occur most often across all access-point rankings.
unique_phy = [[e[0] for e in Counter([c[1] for co in con for c in co]).most_common(4)] for con in phy_dis]

# Per sample: every (difference, Unique_ID) entry, flattened over access points,
# whose Unique_ID is one of that sample's most frequent ids.
distance_phy_sel = [[d for dis in item for d in dis if d[1] in u] for item, u in zip(phy_dis, unique_phy)]

In [67]:
### Spatial Distance Calculation
# Reference set for the spatial search: scaled core features and their Unique_IDs.
x_train_new = x_train_cr
y_train_new = y_train_cr['Unique_ID'].values 

def spatial(test_noises, x_train_n, y_train_n, k=25):
    distance_n = []
    for x,y in zip(x_train_n, y_train_n):
        dim = np.array(x).shape[0]
        euclidean_distance = sp.spatial.distance.cosine(test_noises.reshape(1,dim),np.array(x).reshape(1,dim))
        distance_n.append((euclidean_distance,y))
    distance_n = sorted(distance_n)[:k]
    return distance_n

# Per noise test sample: its nearest core training samples as (distance, Unique_ID) pairs.
distance_sp_sel = [spatial(n, x_train_new, y_train_new) for n in test_noises]

# Per noise test sample: just the Unique_IDs of those neighbours (duplicates kept).
unique_sp = [[d[1] for d in dis] for dis in distance_sp_sel]

In [68]:
def get_unique(unique_phy, unique_sp, distance_phy_sel, distance_sp_sel):
    """Pick one Unique_ID per sample by fusing physical and spatial candidates.

    For each sample:
      * exactly one id shared by both candidate lists -> that id;
      * several shared ids -> the shared id with the smallest spatial entry;
      * no shared id -> the id of the smallest spatial entry.

    `distance_phy_sel` only takes part in the pairing of the inputs.
    Returns the list of chosen ids.
    """
    chosen = []
    for phy_ids, sp_ids, _phy_entries, sp_entries in zip(
            unique_phy, unique_sp, distance_phy_sel, distance_sp_sel):
        shared = set(phy_ids) & set(sp_ids)
        if len(shared) == 1:
            chosen.append(list(shared)[0])
            continue
        if shared:
            pool = [entry for entry in sp_entries if entry[1] in shared]
        else:
            pool = list(sp_entries)
        chosen.append(sorted(pool)[0][1])
    return chosen

In [69]:
# One fused Unique_ID prediction per noise test sample.
unique_tot = get_unique(unique_phy, unique_sp, distance_phy_sel, distance_sp_sel)

In [70]:
# Translate every predicted Unique_ID into integer (x, y) coordinates.
# A dictionary lookup replaces the repeated boolean scans of ref_table and
# avoids int() on a one-element Series, which recent pandas deprecates.
# An id absent from ref_table raises KeyError naming that id.
coord_by_id = {
    uid: (int(loc_x), int(loc_y))
    for loc_x, loc_y, uid in zip(ref_table['x'], ref_table['y'], ref_table['Unique_ID'])
}
pred_n = [coord_by_id[uid] for uid in unique_tot]

dataset_n = pd.DataFrame(pred_n,columns=['x_pred','y_pred'])

In [71]:
# unique_tot = []
# for p, s in zip(unique_phy, unique_sp):
#     unique_tot.append(p+s)

# clus_wknn_n = []
# dis_wknn_n = []
# for n, up in zip(test_noises, unique_phy):
#     df = combined_cr[combined_cr['Unique_ID'].isin(up)].reset_index(drop=True)
#     x_train_phy = df.values[:,:4].astype(float)
#     y_train_phy = df['Unique_ID'].values.astype(int)
#     clus, distance = wknn_modified(n, x_train_phy, y_train_phy, x_train_phy.shape[0])
#     clus_wknn_n.append(clus)
#     dis_wknn_n.append(distance)

# def get_weight_fuse(distance1, distance2):
#     freq = {}
#     buff = distance1 + distance2
#     count = Counter([b[1] for b in buff])
#     for b in buff:
#         freq.setdefault(b[1],0)
#     for d in buff:
#         temp = freq[float(d[1])]
#         if d[0] == float(0):
#             temp_ = float(temp)
#             freq[float(d[1])] = temp_
#         else:
#             temp_ = float(temp) + (1 / d[0])
#             freq[float(d[1])] = temp_
#     for key in freq:
#         freq[key] = freq[key] / count[key]
#     return max(freq, key=lambda key: freq[key]), freq

# phy_d = {}
# for i,x in enumerate(data_test_left.values[:,:4][index_noises]):
#     phy_d[i] = physical_distance(x, data_train, ref_table, benchmark)
    
# distance_new = {}
# for i,n in enumerate(test_noises):
#     distance_new[i] = wknn_modified_2(n, x_train_new, y_train_new)

# clus_fuse = []
# for x,y in zip(phy_d.items(), distance_new.items()):
#     clus_fuse.append(get_weight_fuse(x[1],y[1]))

# pred_n = []
# for x in clus_fuse:
#     pred_n.append((int(ref_table['x'][ref_table[ref_table.Unique_ID==x].index]),
#                    int(ref_table['y'][ref_table[ref_table.Unique_ID==x].index])))

##### Without Fusion

In [72]:
# x_train_new = x_train_cr
# y_train_new = y_train_cr['Unique_ID'].values 

In [73]:
# ## use WKNN instead, return label
# clus_wknn_n = []
# dis_wknn_n = []
# for n in test_noises:
#     clus, distance = wknn_modified(n, x_train_new, y_train_new, 5)
#     clus_wknn_n.append(clus)
#     dis_wknn_n.append(distance)

In [74]:
# pred_n = []
# for x in clus_wknn_n:
#     pred_n.append((int(ref_table['x'][ref_table[ref_table.Unique_ID==x].index]),
#                    int(ref_table['y'][ref_table[ref_table.Unique_ID==x].index])))

In [75]:
# dataset_n = pd.DataFrame(pred_n,columns=['x_pred','y_pred'])

### Calculate Regression Error

In [76]:
def calculate_regr_error(df_final,label_core):
    """Euclidean localisation error between predictions and true labels.

    df_final   : frame with 'x_pred' and 'y_pred' columns.
    label_core : 2-D array whose first two columns are the true x and y
                 (converted to int).

    Returns (mean error, per-sample error array).
    """
    delta_x = df_final['x_pred'].values - label_core[:, 0].astype(int)
    delta_y = df_final['y_pred'].values - label_core[:, 1].astype(int)
    coords_error = np.sqrt(delta_x ** 2 + delta_y ** 2)
    return coords_error.mean(), coords_error

In [77]:
# Mean and per-sample localisation error of the core test points.
mean_loc_error, coords_error = calculate_regr_error(df_pred,label_core)
print(mean_loc_error)

1.4706382031574066


In [78]:
# Mean and per-sample localisation error of the noise test points.
mean_loc_error_n, coords_error_n = calculate_regr_error(dataset_n,label_noises)
print(mean_loc_error_n)

1.7731625988476911


In [66]:
### Combined error: core and noise mean errors weighted by their sample counts
# NOTE(review): a weighted mean must lie between its two inputs, yet the value
# displayed under this cell is lower than both means printed above — the shown
# output comes from an earlier run (see the out-of-order execution count) and
# should be regenerated.
n_core, n_noise = len(label_core), len(label_noises)
(n_core * mean_loc_error + n_noise * mean_loc_error_n) / (n_core + n_noise)

1.3788616875655784