### Load Libraries and Train Data

In [1]:
import os
import sys
import glob
import math
import time
import scipy as sp
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from random import randint
from kneed import KneeLocator
from numpy import sqrt, square
from collections import Counter
from collections import defaultdict
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier

In [2]:
## Load Training Data
path =r"E:\PROJECT\VSCode Python Project\Sensor Network\Uji Coba\Progress Skripsi\Data\Data Nico\Data 15 April Tanpa Gangguan\Train 2"
globbed_files = glob.glob(path + "/*.csv")
data = []
for csv in globbed_files:
    frame = pd.read_csv(csv)
    frame['x'] = os.path.basename(csv).split('.')[0][0]
    frame['y'] = os.path.basename(csv).split('.')[0][1]
    data.append(frame)

In [3]:
## Take the lowest number of set data (32)
attempt = []
for i, item in enumerate(data):
    attempt.append(data[i][['Router 1','Router 2','Router 3','Router 4','x','y']]) #.head(32)
attempt_concat = pd.concat(attempt)
data_train = attempt_concat.sample(frac=1, random_state=42).reset_index(drop=True)
data_train[['Router 1','Router 2','Router 3','Router 4']] = data_train[['Router 1','Router 2','Router 3','Router 4']].abs()
data_train = data_train.assign(Unique_ID = (data_train['x'].astype(str) + '_' + data_train['y'].astype(str)).astype('category').cat.codes)
x_train = data_train.iloc[:,0:4].values
y_train = data_train.iloc[:,4:]
ref_table = y_train.iloc[:, [0,1,2]].drop_duplicates().reset_index(drop=True)

In [4]:
# scaler = StandardScaler()
scaler = MinMaxScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)

### Left Testing

In [5]:
path_left =r"E:\PROJECT\VSCode Python Project\Sensor Network\Uji Coba\Progress Skripsi\Data\Data Nico\Data 15 April Tanpa Gangguan\Test\Right Test"
globbed_files_left = glob.glob(path_left + "/*.csv")
data_left = []
for csv in globbed_files_left:
    frame = pd.read_csv(csv)
    frame['x'] = os.path.basename(csv).split('.')[0][0]
    frame['y'] = os.path.basename(csv).split('.')[0][1]
    data_left.append(frame)
    
attempt_left = []
for i, item in enumerate(data_left):
    attempt_left.append(data_left[i][['Router 1','Router 2','Router 3','Router 4','x','y']])
attempt_concat_left = pd.concat(attempt_left)
data_test_left = attempt_concat_left.sample(frac=1, random_state=42).reset_index(drop=True)
data_test_left[['Router 1','Router 2','Router 3','Router 4']] = data_test_left[['Router 1','Router 2','Router 3','Router 4']].abs()
data_test_left = data_test_left.tail(562).reset_index(drop=True)

## split features and label
x_test_left = data_test_left.iloc[:,0:4].values
y_test_left = data_test_left.iloc[:,4:]

### Normalization
x_test_left = scaler.transform(x_test_left)

### Perform DB-Kmeans Separate

##### Perform DBSCAN

In [6]:
nearest_neighbors = NearestNeighbors(n_neighbors=5)
neighbors = nearest_neighbors.fit(x_train)
distances, indices = neighbors.kneighbors(x_train)
distances = np.sort(distances[:,4], axis=0)
i = np.arange(len(distances))
knee = KneeLocator(i, distances, S=1, curve='convex', direction='increasing', interp_method='polynomial')

In [7]:
from sklearn.cluster import DBSCAN
db = DBSCAN(eps=distances[knee.knee], min_samples=5).fit(x_train)
labels = db.labels_

In [8]:
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) # jumlah cluster, total klas - 1 (kalo ada noise)
n_noise_ = list(labels).count(-1)

print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)

Estimated number of clusters: 24
Estimated number of noise points: 69


In [9]:
x_train_df = pd.DataFrame(x_train,columns=['PC_1','PC_2','PC_3','PC_4'])
ref_table = y_train.iloc[:, [0,1,2]].drop_duplicates()
y_train['dbscan'] = labels

In [10]:
combined = pd.concat((x_train_df,y_train),axis=1)

combined_core = combined.loc[~(combined['dbscan'] == -1)]
combined_core = combined_core.reset_index(drop=True)

combined_noise = combined.loc[combined['dbscan'] == -1]
combined_noise = combined_noise.reset_index(drop=True)

x_train_cr = combined_core.iloc[:,0:4]
y_train_cr = combined_core.iloc[:,4:]

x_train_n = combined_noise.iloc[:,0:4]
y_train_n = combined_noise.iloc[:,4:]

#### Perform K-Means

In [11]:
from sklearn.cluster import KMeans
ks = range(1, 25)
inertias_values = []  ## or so called SSE
for k in ks:
    model = KMeans(n_clusters=k)     # Create a KMeans instance with k clusters: model
    model.fit(x_train_cr)      # Fit model to samples
    inertias_values.append(model.inertia_)        # Append the inertia to the list of inertias

In [12]:
from kneed import KneeLocator
kl = KneeLocator(range(1, 25), inertias_values, curve="convex", direction="decreasing")

final_model = KMeans(n_clusters = 4,random_state=1)
final_model.fit(x_train_cr)

y_train_cr['kmean'] = final_model.labels_
centroid = final_model.cluster_centers_

### Evaluate Clusters (David Bouldin Internal Index)

In [13]:
## Clusters finals after kmeans
combined_cr = pd.concat([x_train_cr,y_train_cr],axis=1)

In [14]:
dict_ = {}
clusters = {}
for x in np.unique(combined_cr['kmean'].values):
    dict_[x] = list(combined_cr[combined_cr.kmean == x]['Unique_ID'].values)
    clusters[x] = list(combined_cr[combined_cr.kmean == x][['PC_1','PC_2','PC_3','PC_4']].values)

In [15]:
def arrayGen(centroids, clusters):
    dataframe = []
    labels = []
    for i, (key,values) in enumerate(clusters.items()):
        for x, point in enumerate(values):
            dataframe.append(clusters[key][x])
            labels.append(key)
    dataframe_array = np.array(dataframe)
    labels_array = np.array(labels)
    return dataframe_array, labels_array

def compute_s(i, x, labels, clusters):
    norm_c= len(clusters)
    s = 0
    for x in clusters:
        s += distance.euclidean(x, clusters[i])
    return s

def compute_Rij(i, j, x, labels, clusters, nc):
    Rij = 0
    try:
        d = distance.euclidean(clusters[i],clusters[j])
        Rij = (compute_s(i, x, labels, clusters) + compute_s(j, x, labels, clusters))/d
    except:
        Rij = 0
    return Rij

def compute_R(i, x, labels, clusters, nc): 
    list_r = []
    for i in range(nc):
        for j in range(nc):
            if(i!=j):
                temp = compute_Rij(i, j, x, labels, clusters, nc)
                list_r.append(temp)
    return max(list_r)

def compute_DB_index(x, labels, clusters, nc):
    sigma_R = 0.0
    for i in range(nc):
        sigma_R = sigma_R + compute_R(i, x, labels, clusters, nc)
    DB_index = float(sigma_R)/float(nc)
    return DB_index

## features, labels, centroids, amount of clusters
n = kl.elbow
dataframe_array, labels_array = arrayGen(centroid, clusters)
index_db_val = compute_DB_index(dataframe_array, labels_array, centroid, n)
print ("The value of Davies Bouldin index for a K-Means cluser of size " + str(n) + " is: " + str(index_db_val))

The value of Davies Bouldin index for a K-Means cluser of size 8 is: 0.0


### Handle Imbalaced Data

In [16]:
def strategy_dict(y):
    '''
    balancing for oversampling strategy
    '''
    new_strategy = {}
    keys = Counter(y).keys()
    values = max(Counter(y).values())
    for key in keys:
        new_strategy[key] = values
    return new_strategy

## untuk bantu train regressor
def over_sampling(clusters, dict_, index):
    '''
    random oversampling strategy
    '''
    x_total = {}
    y_total = {}
    for i,item in enumerate(index):
        if len(item) > 1:
            x = [z for z in clusters[list(clusters.keys())[i]]]
            y = [z for z in dict_[list(dict_.keys())[i]]]
            strategy = strategy_dict(y)
            oversample = RandomOverSampler(sampling_strategy=strategy)
            x_over, y_over = oversample.fit_resample(x, y)
            x_total[list(clusters.keys())[i]] = np.array(x_over)
            y_total[list(clusters.keys())[i]] = np.array(y_over)
        else:
            x_total[list(clusters.keys())[i]] = np.array([z for z in clusters[list(clusters.keys())[i]]])
            y_total[list(clusters.keys())[i]] = np.array([z for z in dict_[list(dict_.keys())[i]]])
    return x_total, y_total

In [17]:
unique = [list(np.unique(x)) for x in dict_.values()]

In [18]:
x_total, y_total = over_sampling(clusters, dict_, unique)

### Merge Dataframe

In [19]:
## matching process between new id and ref table
data_balanced = {}
for i, (x,y) in enumerate(zip(x_total, y_total)):
    data_balanced["{0}".format(list(x_total.keys())[i])] = y_total[y]
data_df_balanced = pd.DataFrame.from_dict(data_balanced, orient='index').T

In [20]:
def filter_list_id(df_id, ref_table):
    dict_loc = {}; id_total = []
    m_total = ref_table.shape[0]
    poses = []
    for c in df_id:
        x = []
        for i in range(len(df_id[c].dropna())):
            x.append(int(df_id[c][i]))
        var = np.array(x)
        id_total.append(var)
    for i in range(m_total):
        key = int(ref_table.iloc[i]['Unique_ID'])
        value = ref_table.iloc[i, 0:2].values
        dict_loc[key] = value
    for i in range(len(id_total)):
        pos = []
        for j in range(len(id_total[i])):
            x = id_total[i][j]
            pos.append(dict_loc.get(x))
        pos = np.array(pos)
        poses.append(pos)    
    return id_total, poses

In [21]:
id_total_balanced, poses_balanced = filter_list_id(data_df_balanced, ref_table)

In [22]:
def make_df(id_total,poses,i):
    df_id = pd.DataFrame(id_total[i],columns=[str(i)])
    df_pos = pd.DataFrame(poses[i],columns=['x','y'])
    return df_pos, df_id

df = {}
for i, (_id,pose) in enumerate(zip(id_total_balanced,poses_balanced)):
    df[list(x_total.keys())[i]] = pd.concat((make_df(id_total_balanced,poses_balanced,i)),axis=1)

### Pre-Trained and Tuned

##### -- Classifier

In [23]:
## classifier for noise detection
def classifier(x_train,y_train):       
    clf = KNeighborsClassifier(n_neighbors=1)
    
    metric = ['euclidean']
    hyperparameters = {'metric': metric}
    
    grid = GridSearchCV(estimator = clf,
                        param_grid = hyperparameters,
                        scoring = 'accuracy',
                        cv = 10,
                        n_jobs = -1)
    
    tic = time.time()
    grid_result_clf = grid.fit(x_train,y_train)
    toc = time.time()
    run_time = (toc - tic)/60
    return grid_result_clf.best_estimator_, grid_result_clf.best_score_, run_time

#### For Noise Detection

In [24]:
# y_core_nd = np.ones(x_train_cr.values.shape[0])
# y_noises_nd = np.zeros(x_train_n.values.shape[0])

# x_train_nd = np.append(x_train_cr, x_train_n, axis=0)
# y_train_nd = np.append(y_core_nd,y_noises_nd,axis=0)

# ## random oversampling
# strat = strategy_dict(y_train_nd)
# oversample = RandomOverSampler(sampling_strategy=strat)
# x_over, y_over = oversample.fit_resample(x_train_nd, y_train_nd)

# noises_clf_tuned, noises_clf_score, runtime_noises_clf = classifier(x_over,y_over)

#### For Cluster Detection

In [25]:
# ## train core for cluster prediction
# total_c = []
# for keys,values in x_total.items():
#     for z in values:
#         total_c.append(np.append(z.reshape(1,4),keys))
# x_pred_clus = np.array(total_c)[:,0:4]
# y_pred_clus = np.array(total_c)[:,-1].astype(int)

In [26]:
# clf_tuned, clf_score, runtime_clf = classifier(x_pred_clus, y_pred_clus)

##### -- Regressor

In [27]:
## regression model for pose prediction
def tuning_regr_knn(x_train,y_train):
    QUANTITATIVE_COLUMNS = ['x', 'y']
    regr = KNeighborsRegressor(n_neighbors=1)
    
    metric = ['euclidean']
    weights = ['uniform', 'distance']
    algorithm = ['auto', 'ball_tree', 'kd_tree', 'brute']
    param_grid = {'metric': metric}
     
    grid = GridSearchCV(estimator = regr,
                        param_grid = param_grid,
                        scoring = 'neg_mean_squared_error',
                        cv = 10,
                        n_jobs = -1)

    tic = time.time()
    grid_result_regr = grid.fit(x_train,y_train[QUANTITATIVE_COLUMNS].values.astype(np.float64))
    toc = time.time()
    run_time = (toc - tic)/60
    return grid_result_regr.best_estimator_, grid_result_regr.best_score_, run_time

In [28]:
regr_tuned = {}
for i,(x,y) in enumerate(zip(x_total,df)):
    regr, regr_score, runtime_regr = tuning_regr_knn(x_total[x], df[y])
    regr_tuned["regr_{0}".format(list(x_total.keys())[i])] = regr, regr_score, runtime_regr

#### Regressor for Noise

In [29]:
# x_train_n = combined_noise.iloc[:,0:4].values
# y_train_n = combined_noise.iloc[:,4:].drop('dbscan',axis=1)

# regr_n, regr_score_n, runtime_regr_n = tuning_regr_knn(x_train_n, y_train_n)

### Testing Procedure

In [30]:
y_test_left['Unique_ID'] = [int(ref_table[(ref_table.x == str(x[0])) & (ref_table.y == str(x[1]))]['Unique_ID'].values) 
                              for x in y_test_left.values]

In [31]:
import scipy as sp
def nn_modified(test_noises, x_train, y_train):
    distance_n = []
    for x,y in zip(x_train, y_train):
        dim = np.array(x).shape[0]
        eu_distance = sp.spatial.distance.euclidean(test_noises.reshape(1,dim),np.array(x).reshape(1,dim))
        distance_n.append((eu_distance,y))
    distance_n = sorted(distance_n)[0][1]
    return distance_n

## use WKNN instead, return label
clus_dbscan = []
for n in x_test_left:
    clus_dbscan.append(nn_modified(n, x_train, y_train['dbscan'].values))

In [32]:
# clus_dbscan = []
# for n in x_test_left:
#     clus_dbscan.append(int(noises_clf_tuned.predict(n.reshape(1,4))))

In [33]:
## generate index_core and noises
def index_nc(labels_):
    index_core = []
    index_noises = []
    for i, x in enumerate(labels_):
        if x == -1:
            index_noises.append(i) 
        else:
            index_core.append(i)
    return index_core, index_noises

index_core, index_noises = index_nc(clus_dbscan)

test_core = x_test_left[index_core] ## core point testing
test_noises = x_test_left[index_noises]

label_core = y_test_left.values[index_core] ## core point testing
label_noises = y_test_left.values[index_noises]

### Core Point Testing

In [34]:
clus_kmeans = []
for x in test_core:
    clus_kmeans.append(final_model.predict(x.reshape(1,4)))

In [35]:
# clus_kmeans = []
# for x in test_core:
#     clus_kmeans.append(clf_tuned.predict(x.reshape(1,4)))

In [36]:
clus_core = [int(i) for i in clus_kmeans]

In [37]:
result = []
for i, (x,y) in enumerate(zip(test_core, clus_core)):
    data = x.reshape(1, len(x))
    result.append(regr_tuned['regr_{0}'.format(int(y))][0].predict(data).astype(int).reshape(2,))

In [38]:
df_pred = pd.DataFrame(result,columns=['x_pred','y_pred'])

In [39]:
# prep_core = np.append(test_core, np.array(clus_kmeans).reshape(len(clus_kmeans),1).astype(int), axis=1)

In [40]:
class POS_Regr(): 
    '''
    Generate object for Coordinate Result
    '''
    def getRegr(self):
        return self.regr
    def __init__(self, regr):
        self.regr = regr

In [41]:
def test_prediction(prep_core, regr_tuned):
    buff = []
    for i,item in enumerate(prep_core):
        N = len(item)-1
        data = item[0:N].reshape(1,N)
        hasil = regr_tuned['regr_{0}'.format(item[-1].astype(int))][0].predict(data)
        value = POS_Regr(hasil)
        buff.append(value)
    return buff

In [42]:
# pred = test_prediction(prep_core, regr_tuned) ## With built in function DBKmeans

In [43]:
def df_prediction(pred):
    df_prediction = []
    for i in range(len(pred)):
        xs = list(pred[i].getRegr()[0])
        df_prediction.append(xs)
    dataset = pd.DataFrame(df_prediction,columns=['x_pred','y_pred']) 
    return dataset

In [44]:
# df_pred = df_prediction(pred)

### Noise Point Testing

In [45]:
def wknn_modified(test_noises, x_train_n, y_train_n, k=5):
    distance_n = []; freq = defaultdict(float)
    for x,y in zip(x_train_n, y_train_n):
        dim = np.array(x).shape[0]
        euclidean_distance = sp.spatial.distance.cosine(test_noises.reshape(1,dim),np.array(x).reshape(1,dim))
        distance_n.append((euclidean_distance,y))
    distance_n = sorted(distance_n)[:k]
    count = Counter([b[1] for b in distance_n])
    unique_c = list(np.unique(np.array(distance_n)[:,1]).astype(int))
    for x in unique_c:
        freq.setdefault(int(x),0)
    for d in distance_n:
        temp = freq[float(d[1])]
        if d[0] == float(0):
            temp_ = float(temp)
            freq[float(d[1])] = temp_
        else:
            temp_ = float(temp) + (1 / d[0])
            freq[float(d[1])] = temp_
    for key in freq:
        freq[key] = freq[key] / count[key]
    return max(freq, key=lambda key: freq[key])

In [46]:
### noises point testing
x_train_new = x_train_cr.values
y_train_new = y_train_cr['Unique_ID'].values 

## use WKNN instead, return label
clus_wknn_n = []
for n in test_noises:
    clus_wknn_n.append(wknn_modified(n, x_train_new, y_train_new))
     
pred_n = []
for x in clus_wknn_n:
    pred_n.append((int(ref_table['x'][ref_table[ref_table.Unique_ID==x].index]),
                   int(ref_table['y'][ref_table[ref_table.Unique_ID==x].index])))

dataset_n = pd.DataFrame(pred_n,columns=['x_pred','y_pred'])

# clus_regr_n = []
# for n in test_noises:
#     clus_regr_n.append(regr_n.predict(n.reshape(1,4)).reshape(2,)) 
    
# dataset_n = pd.DataFrame(clus_regr_n,columns=['x_pred','y_pred'])

In [47]:
# def wknn_modified(test_noises, x_train_n, y_train_n, k=25):
#     distance_n = []; freq = defaultdict(float)
#     for x,y in zip(x_train_n, y_train_n):
#         dim = np.array(x).shape[0]
#         cosine_distance = sp.spatial.distance.euclidean(test_noises.reshape(1,dim),np.array(x).reshape(1,dim))
#         distance_n.append((cosine_distance,y))
#     distance_n = sorted(distance_n)[:k]
#     distance_n
#     unique_c = list(np.unique(np.array(distance_n)[:,1]).astype(int))
#     for x in unique_c:
#         freq.setdefault(int(x),0)
#     for d in distance_n:
#         temp = freq[float(d[1])]
#         temp_ = temp + (1 / d[0])
#         freq[float(d[1])] = temp_
#     return max(freq, key=lambda key: freq[key])

In [48]:
# ## use WKNN instead, return label
# clus_wknn_n = []
# for n in test_noises:
#     clus_wknn_n.append(wknn_modified(n, x_train_n, y_train_n))

In [49]:
# pred_n = []
# for x in clus_wknn_n:
#     pred_n.append((int(ref_table['x'][ref_table[ref_table.Unique_ID==x].index]),
#                    int(ref_table['y'][ref_table[ref_table.Unique_ID==x].index])))

In [50]:
# dataset_n = pd.DataFrame(pred_n,columns=['x_pred','y_pred'])

### Calculate Regression Error

In [51]:
def calculate_regr_error(df_final,label_core):
    x, y  = df_final['x_pred'].values, df_final['y_pred'].values
    x0, y0 = label_core[:,0].astype(int), label_core[:,1].astype(int)
    coords_error = np.sqrt(np.square(x - x0) + np.square(y - y0))
    mean_loc_error = coords_error.mean()
    return mean_loc_error, coords_error

In [52]:
mean_loc_error, coords_error = calculate_regr_error(df_pred,label_core)
print(mean_loc_error)

1.643231596707641


In [53]:
# noises point error
mean_loc_error_n, coords_error_n = calculate_regr_error(dataset_n,label_noises)
print(mean_loc_error_n)

1.013925358619815


In [54]:
### Error gabungan
np.sum((len(label_core) * mean_loc_error)+(len(label_noises) * mean_loc_error_n)) / (len(label_core) + len(label_noises))

1.5144589679174274