### Load Libraries and Train Data

In [1]:
import os
import sys
import glob
import math
import time
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from random import randint
from numpy import sqrt, square
from kneed import KneeLocator
from collections import Counter
from collections import defaultdict
from imblearn.over_sampling import RandomOverSampler
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier

In [2]:
## Load Training Data
path =r"E:\PROJECT\VSCode Python Project\Sensor Network\Uji Coba\Progress Skripsi\Data\Data Nico\Data 15 April Tanpa Gangguan\Train 1"
globbed_files = glob.glob(path + "/*.csv")
data = []
for csv in globbed_files:
    frame = pd.read_csv(csv)
    frame['x'] = os.path.basename(csv).split('.')[0][0]
    frame['y'] = os.path.basename(csv).split('.')[0][1]
    data.append(frame)

In [3]:
# ## Load Training Data 2
# path2 =r"E:\PROJECT\VSCode Python Project\Sensor Network\Uji Coba\Progress Skripsi\Data\Data Nico\Data 15 April Tanpa Gangguan\Train 1"
# globbed_files2 = glob.glob(path2 + "/*.csv")
# for csv in globbed_files2:
#     frame = pd.read_csv(csv)
#     frame['x'] = os.path.basename(csv).split('.')[0][0]
#     frame['y'] = os.path.basename(csv).split('.')[0][1]
#     data.append(frame)

In [4]:
## Take the lowest number of set data (32)
attempt = []
for i, item in enumerate(data):
    attempt.append(data[i][['Router 1','Router 2','Router 3','Router 4','x','y']]) #.head(32)
attempt_concat = pd.concat(attempt)
data_train = attempt_concat.sample(frac=1, random_state=42).reset_index(drop=True)
data_train[['Router 1','Router 2','Router 3','Router 4']] = data_train[['Router 1','Router 2','Router 3','Router 4']].abs()
data_train = data_train.assign(Unique_ID = (data_train['x'].astype(str) + '_' + data_train['y'].astype(str)).astype('category').cat.codes)
x_train = data_train.iloc[:,0:4].values
y_train = data_train.iloc[:,4:]
ref_table = y_train.iloc[:, [0,1,2]].drop_duplicates().reset_index(drop=True)

In [5]:
# # scaler = StandardScaler()
# scaler = MinMaxScaler()
# scaler.fit(x_train)
# x_train = scaler.transform(x_train)

### Left Testing

In [6]:
path_left =r"E:\PROJECT\VSCode Python Project\Sensor Network\Uji Coba\Progress Skripsi\Data\Data Nico\Data 15 April Tanpa Gangguan\Test\Left Test"
globbed_files_left = glob.glob(path_left + "/*.csv")
data_left = []
for csv in globbed_files_left:
    frame = pd.read_csv(csv)
    frame['x'] = os.path.basename(csv).split('.')[0][0]
    frame['y'] = os.path.basename(csv).split('.')[0][1]
    data_left.append(frame)
    
attempt_left = []
for i, item in enumerate(data_left):
    attempt_left.append(data_left[i][['Router 1','Router 2','Router 3','Router 4','x','y']])
attempt_concat_left = pd.concat(attempt_left)
data_test_left = attempt_concat_left.sample(frac=1, random_state=42).reset_index(drop=True)
data_test_left[['Router 1','Router 2','Router 3','Router 4']] = data_test_left[['Router 1','Router 2','Router 3','Router 4']] .abs()
data_test_left = data_test_left.tail(562).reset_index(drop=True)

In [7]:
## split features and label
x_test_left = data_test_left.iloc[:,0:4].values
y_test_left = data_test_left.iloc[:,4:]

### Normalization
# x_test_left = scaler.transform(x_test_left)

### PCA

In [8]:
class _PCA():
    def __init__(self, dataset, perc_of_var):
        '''
        param perc_of_var : (float) percent of variance from PCA
        return None
        '''
        self.data = dataset
        self.perc_of_var = perc_of_var

    def _train(self):
        '''
        param x_train : (DataFrame) Training Dataset
        return x_train : (DataFrame) Training Dataset after PCA
        return dim_red : param fitter for test data
        '''
        dim_red = PCA(n_components = self.perc_of_var, svd_solver='full')
        x_train = dim_red.fit_transform(self.data)
        return x_train, dim_red

    def _test(self, test, dim_red):
        '''
        Preforms PCA and keeps perc_of_var percent of variance 
        param x_test : (DataFrame) Test Dataset
        param dim_red : (pca) Instance of PCA
        return x_test : (DataFrame) Test Dataset after PCA
        '''
        x_test = dim_red.transform(test)
        return x_test

In [9]:
# pca = _PCA(x_train, 0.95)
# x_train, dim_red = pca._train()
# x_test_left = pca._test(x_test_left,dim_red)

### Perform DBSCAN Stage 1

In [12]:
nearest_neighbors = NearestNeighbors(n_neighbors=5)
neighbors = nearest_neighbors.fit(x_train)
distances, indices = neighbors.kneighbors(x_train)
distances = np.sort(distances[:,4], axis=0)
i = np.arange(len(distances))
knee = KneeLocator(i, distances, S=1, curve='convex', direction='increasing', interp_method='polynomial')

In [17]:
from sklearn.cluster import DBSCAN
db = DBSCAN(eps=distances[knee.knee], min_samples=6).fit(x_train)
labels = db.labels_

In [18]:
np.unique(labels)

array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
       16, 17, 18, 19, 20, 21, 22, 23], dtype=int64)

In [32]:
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) # jumlah cluster, total klas - 1 (kalo ada noise)
n_noise_ = list(labels).count(-1)

print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)

Estimated number of clusters: 1
Estimated number of noise points: 0


In [16]:
x_train_df = pd.DataFrame(x_train,columns=['PC_1','PC_2','PC_3','PC_4'])
y_train['dbscan_1'] = labels

In [17]:
combined = pd.concat((x_train_df,y_train),axis=1)

In [18]:
combined_core = combined.loc[~(combined['dbscan_1'] == -1)]
combined_core = combined_core.reset_index(drop=True)

In [19]:
x_train_cr = combined_core.iloc[:,0:4]
y_train_cr = combined_core.iloc[:,4:]

### Perform DBSCAN Stage 2

In [20]:
x_train_cr

Unnamed: 0,PC_1,PC_2,PC_3,PC_4
0,66,68,32,66
1,48,68,62,66
2,61,35,61,67
3,66,61,52,58
4,66,56,75,32
...,...,...,...,...
91,66,56,75,32
92,66,68,33,66
93,66,68,32,66
94,67,62,62,62


In [21]:
nearest_neighbors_cr = NearestNeighbors(n_neighbors=5)
neighbors_cr = nearest_neighbors_cr.fit(x_train_cr)
distances_cr, indices_cr = neighbors_cr.kneighbors(x_train_cr)
distances_cr = np.sort(distances_cr[:,4], axis=0)
i_cr = np.arange(len(distances_cr))
knee_cr = KneeLocator(i_cr, distances_cr, S=1, curve='convex', direction='increasing', interp_method='polynomial')

  return (a - min(a)) / (max(a) - min(a))
  results &= comparator(main, plus)
  results &= comparator(main, minus)
  results &= comparator(main, plus)
  results &= comparator(main, minus)
The line is probably not polynomial, try plotting
the difference curve with plt.plot(knee.x_difference, knee.y_difference)
Also check that you aren't mistakenly setting the curve argument


In [19]:
db_cr = DBSCAN(eps=distances_cr[knee_cr.knee], min_samples=5).fit(x_train_cr)
labels_cr = db_cr.labels_

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [None]:
np.unique(labels_cr)

In [None]:
x_train_cr_df = pd.DataFrame(x_train_cr,columns=['PC_1','PC_2','PC_3','PC_4'])
y_train_cr['dbscan_2'] = labels_cr

In [None]:
combined_cr = pd.concat((x_train_cr_df,y_train_cr),axis=1)

In [None]:
combined_cr_2 = combined_cr.loc[~(combined_cr['dbscan_2'] == -1)]
combined_cr_2 = combined_cr_2.reset_index(drop=True)

In [None]:
x_train_cr_2 = combined_cr_2.iloc[:,0:4]
y_train_cr_2 = combined_cr_2.iloc[:,4:]

### Handle Imbalaced Data

In [None]:
dict_ = {}
clusters = {}
for x in np.unique(combined_cr_2.values[:,-1]):
    dict_[x] = list(combined_cr_2[combined_cr_2.dbscan_2 == x]['Unique_ID'].values)
    clusters[x] = list(combined_cr_2[combined_cr_2.dbscan_2 == x][['PC_1', 'PC_2', 'PC_3', 'PC_4']].values)

In [None]:
def strategy_dict(y):
    '''
    balancing for oversampling strategy
    '''
    new_strategy = {}
    keys = Counter(y).keys()
    values = max(Counter(y).values())
    for key in keys:
        new_strategy[key] = values
    return new_strategy

## untuk bantu train regressor
def over_sampling(clusters, dict_, index):
    '''
    random oversampling strategy
    '''
    x_total = {}
    y_total = {}
    for i,item in enumerate(index):
        if len(item) > 1:
            x = [z for z in clusters[list(clusters.keys())[i]]]
            y = [z for z in dict_[list(dict_.keys())[i]]]
            strategy = strategy_dict(y)
            oversample = RandomOverSampler(sampling_strategy=strategy)
            x_over, y_over = oversample.fit_resample(x, y)
            x_total[list(clusters.keys())[i]] = np.array(x_over)
            y_total[list(clusters.keys())[i]] = np.array(y_over)
        else:
            x_total[list(clusters.keys())[i]] = np.array([z for z in clusters[list(clusters.keys())[i]]])
            y_total[list(clusters.keys())[i]] = np.array([z for z in dict_[list(dict_.keys())[i]]])
    return x_total, y_total

In [None]:
unique = [list(np.unique(x)) for x in dict_.values()]

In [None]:
x_total, y_total = over_sampling(clusters, dict_, unique)

### Separate Sensitive and Insensitive

In [None]:
x_insensitive = {}
y_insensitive = {}
for x in [i for i, x in enumerate(unique) if len(x) > 1]:
    x_insensitive[x] = x_total[x]
    y_insensitive[x] = y_total[x]

### Merge Dataframe

In [None]:
## matching process between new id and ref table
data_balanced = {}
for i, (x,y) in enumerate(zip(x_insensitive, y_insensitive)):
    data_balanced["{0}".format(list(x_insensitive.keys())[i])] = y_insensitive[y]
data_df_balanced = pd.DataFrame.from_dict(data_balanced, orient='index').T

In [None]:
def filter_list_id(df_id, ref_table):
    dict_loc = {}; id_total = []
    m_total = ref_table.shape[0]
    poses = []
    for c in df_id:
        x = []
        for i in range(len(df_id[c].dropna())):
            x.append(int(df_id[c][i]))
        var = np.array(x)
        id_total.append(var)
    for i in range(m_total):
        key = int(ref_table.iloc[i]['Unique_ID'])
        value = ref_table.iloc[i, 0:2].values
        dict_loc[key] = value
    for i in range(len(id_total)):
        pos = []
        for j in range(len(id_total[i])):
            x = id_total[i][j]
            pos.append(dict_loc.get(x))
        pos = np.array(pos)
        poses.append(pos)    
    return id_total, poses

In [None]:
id_total_balanced, poses_balanced = filter_list_id(data_df_balanced, ref_table)

In [None]:
def make_df(id_total,poses,i):
    df_id = pd.DataFrame(id_total[i],columns=[str(i)])
    df_pos = pd.DataFrame(poses[i],columns=['x','y'])
    return df_pos, df_id

df = {}
for i, (_id,pose) in enumerate(zip(id_total_balanced,poses_balanced)):
    df[list(x_insensitive.keys())[i]] = pd.concat((make_df(id_total_balanced,poses_balanced,i)),axis=1)

### Pre-Trained and Tuned

##### -- Regressor

In [None]:
## regression model for pose prediction
from sklearn.ensemble import RandomForestRegressor
def tuning_regr_rf(x_train,y_train):
    QUANTITATIVE_COLUMNS = ['x', 'y']
    regr = RandomForestRegressor()
    
    n_estimators = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
    hyperparameters = {'n_estimators':n_estimators}
     
    grid = GridSearchCV(estimator = regr,
                        param_grid = hyperparameters,
                        scoring = 'neg_mean_squared_error',
                        cv = 3,
                        n_jobs = -1)

    tic = time.time()
    grid_result_regr = grid.fit(x_train,y_train[QUANTITATIVE_COLUMNS].values.astype(np.float64))
    toc = time.time()
    run_time = (toc - tic)/60
    return grid_result_regr.best_estimator_, grid_result_regr.best_score_, run_time

In [None]:
## regression model for pose prediction
def tuning_regr_knn(x_train,y_train):
    QUANTITATIVE_COLUMNS = ['x', 'y']
    regr = KNeighborsRegressor(n_neighbors=1)
    
    metric = ['manhattan','minkowski','euclidean']
    hyperparameters = {'metric': metric}
     
    grid = GridSearchCV(estimator = regr,
                        param_grid = hyperparameters,
                        scoring = 'neg_mean_squared_error',
                        cv = 3,
                        n_jobs = -1)

    tic = time.time()
    grid_result_regr = grid.fit(x_train,y_train[QUANTITATIVE_COLUMNS].values.astype(np.float64))
    toc = time.time()
    run_time = (toc - tic)/60
    return grid_result_regr.best_estimator_, grid_result_regr.best_score_, run_time

In [None]:
# regr_tuned = {}
# for i,(x,y) in enumerate(zip(x_total,df)):
#     regr, regr_score, runtime_regr = tuning_regr_knn(x_total[x], df[y])
#     regr_tuned["regr_{0}".format(list(x_total.keys())[i])] = regr, regr_score, runtime_regr

In [None]:
regr_tuned = {}
for i,(x,y) in enumerate(zip(x_insensitive,df)):
    regr, regr_score, runtime_regr = tuning_regr_rf(x_insensitive[x], df[y])
    regr_tuned["regr_{0}".format(list(x_insensitive.keys())[i])] = regr, regr_score, runtime_regr

### Testing Procedure

In [None]:
y_test_left['Unique_ID'] = [int(ref_table[(ref_table.x == str(x[0])) & (ref_table.y == str(x[1]))]['Unique_ID'].values) 
                              for x in y_test_left.values]

In [None]:
import scipy as sp
def nn_modified(test_noises, x_train, y_train):
    distance_n = []
    for x,y in zip(x_train, y_train):
        dim = np.array(x).shape[0]
        eu_distance = sp.spatial.distance.euclidean(test_noises.reshape(1,dim),np.array(x).reshape(1,dim))
        distance_n.append((eu_distance,y))
    distance_n = sorted(distance_n)[0][1]
    return distance_n

In [None]:
clus_ = []
for n in x_test_left:
    clus_.append(nn_modified(n, x_train_cr_2.values, y_train_cr_2['dbscan_2'].values))

In [None]:
def test_prediction(x_test_left, clus_, dict_, x_insensitive, regr_tuned, ref_table):
    result = []
    for i, (x,y) in enumerate(zip(x_test_left, clus_)):
        if y in list(x_insensitive.keys()):
            data = x.reshape(1,len(x))
            result.append(regr_tuned['regr_{0}'.format(int(y))][0].predict(data).astype(int).reshape(2,))
        else:
            y_ = np.unique(dict_[y])
            result.append(ref_table[ref_table.Unique_ID == int(y_)][['x','y']].values.astype(int).reshape(2,))
    return result

In [None]:
result = test_prediction(x_test_left, clus_, dict_, x_insensitive, regr_tuned, ref_table)

In [None]:
df_pred = pd.DataFrame(result,columns=['x_pred','y_pred'])

In [None]:
# index_core, index_noises = index_nc(clus_)

# test_core = x_test_left[index_core] ## core point testing
# test_noises = x_test_left[index_noises]

# label_core = y_test_left.values[index_core] ## core point testing
# label_noises = y_test_left.values[index_noises]

In [None]:
# prep_core = np.append(test_core, np.array(clus_knn)[index_core].reshape(len(np.array(clus_knn)[index_core]),1).astype(int), axis=1)

In [None]:
# class POS_Regr(): 
#     '''
#     Generate object for Coordinate Result
#     '''
#     def getRegr(self):
#         return self.regr
#     def __init__(self, regr):
#         self.regr = regr

In [None]:
# def test_prediction(prep_core, regr_tuned):
#     buff = []
#     for i,item in enumerate(prep_core):
#         N = len(item)-1
#         data = item[0:N].reshape(1,N)
#         hasil = regr_tuned['regr_{0}'.format(item[-1].astype(int))][0].predict(data)
#         value = POS_Regr(hasil)
#         buff.append(value)
#     return buff

In [None]:
# pred = test_prediction(prep_core, regr_tuned)

In [None]:
# def df_prediction(pred):
#     df_prediction = []
#     for i in range(len(pred)):
#         xs = list(pred[i].getRegr()[0])
#         df_prediction.append(xs)
#     dataset = pd.DataFrame(df_prediction,columns=['x_pred','y_pred']) 
#     return dataset

In [None]:
# df_pred = df_prediction(pred)

### Noise Testing Procedure

In [None]:
# x_train_n ## data latih untuk noises
# y_train_n ## label latih untuk noises
# test_noises ## data test untuk noises
# label_noises ## label test untuk noises

In [None]:
# ## Balancing data for noises
# dict_n_ = {}
# clusters_n = {}
# for x in np.unique(combined_noise.values[:,-1]):
#     dict_n_[x] = list(combined_noise['Unique_ID'].values)
#     clusters_n[x] = list(combined_noise[['PC_1', 'PC_2', 'PC_3', 'PC_4']].values)

In [None]:
# unique_n = [list(np.unique(x)) for x in dict_n_.values()]
# x_total_n, y_total_n = over_sampling(clusters_n, dict_n_, unique_n)

In [None]:
# ## matching process between new id and ref table
# data_balanced_n = {}
# data_balanced_n["-1"] = y_total_n[-1]
# data_df_balanced_n = pd.DataFrame.from_dict(data_balanced_n, orient='index').T

In [None]:
# id_total_balanced_n, poses_balanced_n = filter_list_id(data_df_balanced_n, ref_table)

In [None]:
# df_n = {}
# df_n[-1] = pd.concat((make_df(id_total_balanced_n,poses_balanced_n,0)),axis=1)

In [None]:
# regr_n, regr_score_n, runtime_regr_n = tuning_regr_rf(x_total_n[-1], df_n[-1])

In [None]:
# ### testing model for unseen data
# pred_n = regr_n.predict(test_noises)

In [None]:
# dataset_n = pd.DataFrame(pred_n,columns=['x_pred','y_pred'])

### Calculate Regression Error

In [None]:
def calculate_regr_error(df_final,label_core):
    x, y  = df_final['x_pred'].values, df_final['y_pred'].values
    x0, y0 = label_core[:,0].astype(int), label_core[:,1].astype(int)
    coords_error = np.sqrt(np.square(x - x0) + np.square(y - y0))
    mean_loc_error = coords_error.mean()
    return mean_loc_error, coords_error

In [None]:
## core point error
mean_loc_error, coords_error = calculate_regr_error(df_pred,y_test_left.values)
print(mean_loc_error)

In [None]:
# ## noises point error
# mean_loc_error_n, coords_error_n = calculate_regr_error(dataset_n,label_noises)
# print(mean_loc_error_n)

In [None]:
# ### Error gabungan
# np.sum((len(label_core) * mean_loc_error)+(len(label_noises) * mean_loc_error_n)) / (len(label_core) + len(label_noises))

The history saving thread hit an unexpected error (OperationalError('database is locked')).History will not be written to the database.
