In [1]:
import os
import random
from pathlib import Path
import numpy as np
import pandas as pd
import sys 

# Project root that contains the `src` package imported further down.
# Set the IPS_PCA_CLUSTERING_DIR environment variable to point at your own checkout;
# when it is unset, the original author's location is used as the fallback.
path_to_dir = os.environ.get(
    "IPS_PCA_CLUSTERING_DIR",
    "C:/Users/Asus/Documents/IPS_PCA_CLUSTERING",
)

# Register the project root on the import path (position 1, as before).
sys.path.insert(1, path_to_dir)


import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, KernelPCA

# custom

from src.utils.file_utils import read_data
from src.utils.helper import normalize_data, separates_data_uts, constant_columns, detect_num_feature_rssi, save_data_npz, load_data_npz, separates_data_uji, separates_data_tampere

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# setup random seed
def set_seed_everything(seed: int):
    """Seed every random number generator this notebook relies on.

    Stores ``seed`` (as a string) in the ``PYTHONHASHSEED`` environment
    variable and seeds Python's ``random`` module, NumPy's global generator,
    and PyTorch (both ``torch.manual_seed`` and ``torch.cuda.manual_seed``).

    Args:
        seed: Integer seed handed to each generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Each of these accepts the integer seed as its only argument.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

# Single seed for the whole notebook; it is also passed as `random_state`
# to every train/validation split below.
SEED = 2023
set_seed_everything(seed=SEED)

## TAMPERE

### Data Path Tampere

In [3]:
# Raw Tampere inputs: RSSI readings and coordinate labels are stored in separate CSV files.
raw_rssi_train_dir = "../../data/tampere_data/raw/Training_rss.csv"
raw_label_train_dir = "../../data/tampere_data/raw/Training_coordinates.csv"

raw_rssi_test_dir = "../../data/tampere_data/raw/Test_rss.csv"
raw_label_test_dir = "../../data/tampere_data/raw/Test_coordinates.csv"

# Destination of the merged, normalized frames written by the processing cell.
process_train_dir = "../../data/tampere_data/processed/TUT_train.csv"
process_test_dir = "../../data/tampere_data/processed/TUT_test.csv"

# Destination folder of the reduced features saved as .npz files.
data_path_save = "../../data/tampere_data/pca/kernal_pca_cosine_75"
# data_path_save = f"../../data/tampere_data/pca/pca_085"

# Create every output folder up front. The processed CSVs are written with
# DataFrame.to_csv, which fails when the parent directory does not exist, so the
# `processed` folder has to be created here as well as the PCA folder.
for output_dir in (
    Path(process_train_dir).parent,
    Path(process_test_dir).parent,
    Path(data_path_save),
):
    output_dir.mkdir(parents=True, exist_ok=True)

In [4]:
# The raw Tampere CSV files are read without a header row, so pandas labels
# the columns with their integer positions.
rssi_train_data = pd.read_csv(raw_rssi_train_dir, header=None)
label_train_data = pd.read_csv(raw_label_train_dir, header=None)
rssi_test_data = pd.read_csv(raw_rssi_test_dir, header=None)
label_test_data = pd.read_csv(raw_label_test_dir, header=None)

# Integer column i of the RSSI frame becomes "WAP" + zero-padded (i + 1);
# the three label columns get fixed names.
rssi_header_dict = {idx: f"WAP{idx + 1:03d}" for idx in rssi_train_data.columns}
label_header_dict = dict(enumerate(['Pos_x', 'Pos_y', 'Floor_ID']))

rssi_train_data = rssi_train_data.rename(columns=rssi_header_dict)
rssi_test_data = rssi_test_data.rename(columns=rssi_header_dict)
label_train_data = label_train_data.rename(columns=label_header_dict)
label_test_data = label_test_data.rename(columns=label_header_dict)

# Put the RSSI readings and their labels side by side, one frame per split.
train_data = pd.concat([rssi_train_data, label_train_data], axis=1)
test_data = pd.concat([rssi_test_data, label_test_data], axis=1)

# Map the raw Floor_ID values onto consecutive floor indices 0..4.
replace_floor_id = {0.0: 0, 3.7:1, 7.4:2, 11.1:3, 14.8:4}
train_data = train_data.replace({"Floor_ID": replace_floor_id})
test_data = test_data.replace({"Floor_ID": replace_floor_id})

# Constant columns are detected on the training frame only; the same list is
# then handed to the normalization of both splits.
# NOTE(review): the exact meaning of `threshold=2` is defined in src.utils.helper - confirm there.
NUM_FEATURES_RSSI = detect_num_feature_rssi(train_data)
const_cols = constant_columns(train_data, threshold=2, column_range=NUM_FEATURES_RSSI)

print(const_cols)

# Normalize both splits with identical settings.
normalize_kwargs = dict(num_feature=NUM_FEATURES_RSSI, removed_columns=None, constant_columns=const_cols)
train_data = normalize_data(train_data, **normalize_kwargs)
test_data = normalize_data(test_data, **normalize_kwargs)

# Save the processed frames ...
train_data.to_csv(process_train_dir, index=False)
test_data.to_csv(process_test_dir, index=False)

# ... and load them back through read_data, so the following cells work on
# exactly what is stored on disk.
train_data = read_data(process_train_dir)
test_data = read_data(process_test_dir)

['WAP003', 'WAP005', 'WAP097', 'WAP120', 'WAP121', 'WAP122', 'WAP123', 'WAP124', 'WAP126', 'WAP135', 'WAP138', 'WAP151', 'WAP152', 'WAP154', 'WAP157', 'WAP159', 'WAP164', 'WAP166', 'WAP167', 'WAP168', 'WAP169', 'WAP173', 'WAP174', 'WAP175', 'WAP177', 'WAP179', 'WAP180', 'WAP181', 'WAP182', 'WAP183', 'WAP184', 'WAP188', 'WAP190', 'WAP191', 'WAP192', 'WAP209', 'WAP210', 'WAP211', 'WAP212', 'WAP214', 'WAP272', 'WAP273', 'WAP274', 'WAP275', 'WAP276', 'WAP310', 'WAP312', 'WAP313', 'WAP315', 'WAP317', 'WAP318', 'WAP319', 'WAP320', 'WAP321', 'WAP322', 'WAP336', 'WAP338', 'WAP348', 'WAP349', 'WAP350', 'WAP351', 'WAP352', 'WAP353', 'WAP354', 'WAP355', 'WAP357', 'WAP367', 'WAP371', 'WAP372', 'WAP373', 'WAP374', 'WAP375', 'WAP378', 'WAP379', 'WAP385', 'WAP388', 'WAP392', 'WAP393', 'WAP499', 'WAP501', 'WAP502', 'WAP506', 'WAP507', 'WAP508', 'WAP509', 'WAP510', 'WAP512', 'WAP514', 'WAP516', 'WAP520', 'WAP524', 'WAP525', 'WAP526', 'WAP532', 'WAP536', 'WAP539', 'WAP540', 'WAP541', 'WAP543', 'WAP544',

In [5]:
# Separate the features from the labels for both splits.
X_train_, Y_train = separates_data_tampere(train_data)
X_test_, Y_test = separates_data_tampere(test_data)

# Hold out the validation rows BEFORE fitting any transform. Previously the scaler
# and KernelPCA were fitted on all training rows and the validation rows were split
# off afterwards, so statistics of the validation set leaked into the features.
# X_train_, X_valid_, Y_train, Y_valid = train_test_split(X_train_, Y_train, test_size=0.04, random_state=SEED)
X_train_, X_valid_, Y_train, Y_valid = train_test_split(X_train_, Y_train, test_size=0.1, random_state=SEED)

scX = StandardScaler()
pca = KernelPCA(n_components=75, kernel="cosine")
# pca = PCA(n_components=0.85)

# Fit the scaler and the projection on the training part only, then apply the
# fitted transforms unchanged to the validation and test parts.
X_train_ = scX.fit_transform(X_train_)
X_valid_ = scX.transform(X_valid_)
X_test_ = scX.transform(X_test_)
X_train = pca.fit_transform(X_train_)
X_valid = pca.transform(X_valid_)
X_test = pca.transform(X_test_)

# Save the six arrays as .npz files (same file names as before).
save_data_npz([
    (os.path.join(data_path_save, "X_train.npz"), X_train),
    (os.path.join(data_path_save, "Y_train.npz"), Y_train),
    (os.path.join(data_path_save, "X_test.npz"), X_test),
    (os.path.join(data_path_save, "Y_test.npz"), Y_test),
    (os.path.join(data_path_save, "X_valid.npz"), X_valid),
    (os.path.join(data_path_save, "Y_valid.npz"), Y_valid)
])

In [6]:
# Reload the six saved arrays. The file order (X then Y, for train, test and
# valid) has to match the tuple unpacking on the left-hand side below.
npz_files = [
    os.path.join(data_path_save, f"{prefix}_{split}.npz")
    for split in ("train", "test", "valid")
    for prefix in ("X", "Y")
]
(X_train, Y_train), (X_test, Y_test), (X_valid, Y_valid) = load_data_npz(npz_files)

In [7]:
# Print feature and label shapes for the train, test and validation splits, in that order.
for features, labels in ((X_train, Y_train), (X_test, Y_test), (X_valid, Y_valid)):
    print(features.shape)
    print(labels.shape)

(627, 75)
(627, 3)
(3951, 75)
(3951, 3)
(70, 75)
(70, 3)


## UTS

### Data Path UTS

In [8]:
# Raw UTS inputs.
raw_train_dir = "../../data/uts_data/raw/UTS_training.csv"
raw_test_dir = "../../data/uts_data/raw/UTS_test.csv"

# Destination of the normalized frames written by the processing cell.
process_train_dir = "../../data/uts_data/processed/UTS_train.csv"
process_test_dir = "../../data/uts_data/processed/UTS_test.csv"

# Destination folder of the reduced features saved as .npz files.
data_path_save = "../../data/uts_data/pca/kernal_pca_cosine_250"
# data_path_save = f"../../data/uts_data/pca/pca_085"

# Create every output folder up front. The processed CSVs are written with
# DataFrame.to_csv, which fails when the parent directory does not exist, so the
# `processed` folder has to be created here as well as the PCA folder.
for output_dir in (
    Path(process_train_dir).parent,
    Path(process_test_dir).parent,
    Path(data_path_save),
):
    output_dir.mkdir(parents=True, exist_ok=True)

### processing data

In [9]:
# Load the raw UTS splits.
train_data = read_data(raw_train_dir)
test_data = read_data(raw_test_dir)

# Constant columns are detected on the training frame only; the same list is
# then handed to the normalization of both splits.
NUM_FEATURES_RSSI = detect_num_feature_rssi(train_data)
const_cols = constant_columns(train_data, threshold=2, column_range=NUM_FEATURES_RSSI)

# Columns passed to normalize_data as `removed_columns` for both splits.
uts_removed_columns = ["User_ID", "Phone_type", "Time"]

# Normalize both splits with identical settings.
train_data = normalize_data(
    train_data,
    num_feature=NUM_FEATURES_RSSI,
    removed_columns=uts_removed_columns,
    constant_columns=const_cols,
)
test_data = normalize_data(
    test_data,
    num_feature=NUM_FEATURES_RSSI,
    removed_columns=uts_removed_columns,
    constant_columns=const_cols,
)

# Save the processed frames.
for frame, csv_path in ((train_data, process_train_dir), (test_data, process_test_dir)):
    frame.to_csv(csv_path, index=False)

In [10]:
# Read the processed UTS frames back from disk so the following cells work on the stored data.
train_data, test_data = (
    read_data(csv_path) for csv_path in (process_train_dir, process_test_dir)
)

In [11]:
# Separate the features from the labels for both splits.
X_train_, Y_train = separates_data_uts(train_data)
X_test_, Y_test = separates_data_uts(test_data)

# Hold out the validation rows BEFORE fitting any transform. Previously the scaler
# and KernelPCA were fitted on all training rows and the validation rows were split
# off afterwards, so statistics of the validation set leaked into the features.
# X_train_, X_valid_, Y_train, Y_valid = train_test_split(X_train_, Y_train, test_size=0.02, random_state=SEED)
# X_train_, X_valid_, Y_train, Y_valid = train_test_split(X_train_, Y_train, test_size=0.04, random_state=SEED)
X_train_, X_valid_, Y_train, Y_valid = train_test_split(X_train_, Y_train, test_size=0.09, random_state=SEED)

scX = StandardScaler()
pca = KernelPCA(n_components=250, kernel="cosine")
# pca = PCA(n_components=0.85)

# Fit the scaler and the projection on the training part only, then apply the
# fitted transforms unchanged to the validation and test parts.
X_train_ = scX.fit_transform(X_train_)
X_valid_ = scX.transform(X_valid_)
X_test_ = scX.transform(X_test_)
X_train = pca.fit_transform(X_train_)
X_valid = pca.transform(X_valid_)
X_test = pca.transform(X_test_)

# Save the six arrays as .npz files (same file names as before).
save_data_npz([
    (os.path.join(data_path_save, "X_train.npz"), X_train),
    (os.path.join(data_path_save, "Y_train.npz"), Y_train),
    (os.path.join(data_path_save, "X_test.npz"), X_test),
    (os.path.join(data_path_save, "Y_test.npz"), Y_test),
    (os.path.join(data_path_save, "X_valid.npz"), X_valid),
    (os.path.join(data_path_save, "Y_valid.npz"), Y_valid)
])

In [12]:
# Reload the six saved arrays. The file order (X then Y, for train, test and
# valid) has to match the tuple unpacking on the left-hand side below.
npz_files = [
    os.path.join(data_path_save, f"{prefix}_{split}.npz")
    for split in ("train", "test", "valid")
    for prefix in ("X", "Y")
]
(X_train, Y_train), (X_test, Y_test), (X_valid, Y_valid) = load_data_npz(npz_files)

In [13]:
# Print feature and label shapes for the train, test and validation splits, in that order.
for features, labels in ((X_train, Y_train), (X_test, Y_test), (X_valid, Y_valid)):
    print(features.shape)
    print(labels.shape)

(8288, 250)
(8288, 3)
(388, 250)
(388, 3)
(820, 250)
(820, 3)


## UJI

### Data Path UJI

In [14]:
# Raw UJI inputs; the file named validationData.csv is used as the test split here.
raw_train_dir = "../../data/uji_data/raw/trainingData.csv"
raw_test_dir = "../../data/uji_data/raw/validationData.csv"

# Destination of the normalized frames written by the processing cell.
process_train_dir = "../../data/uji_data/processed/UJI_train.csv"
process_test_dir = "../../data/uji_data/processed/UJI_test.csv"

# Destination folder of the reduced features saved as .npz files.
data_path_save = "../../data/uji_data/pca/kernal_pca_cosine_100"

# Create every output folder up front. The processed CSVs are written with
# DataFrame.to_csv, which fails when the parent directory does not exist, so the
# `processed` folder has to be created here as well as the PCA folder.
for output_dir in (
    Path(process_train_dir).parent,
    Path(process_test_dir).parent,
    Path(data_path_save),
):
    output_dir.mkdir(parents=True, exist_ok=True)

In [15]:
# Load the raw UJI splits.
train_data = read_data(raw_train_dir)
test_data = read_data(raw_test_dir)

# Fixed offsets subtracted from the coordinates of BOTH splits, so train and test
# stay in the same shifted coordinate frame.
# NOTE(review): presumably the minimum longitude/latitude of the data set - confirm.
LONGITUDE_MIN = -7695.9387549299299
LATITUDE_MIN = 4864745.7450159714
for frame in (train_data, test_data):
    frame["LONGITUDE"] = frame["LONGITUDE"] - LONGITUDE_MIN
    frame["LATITUDE"] = frame["LATITUDE"] - LATITUDE_MIN

# Constant columns are detected on the training frame only; the same list is
# then handed to the normalization of both splits.
NUM_FEATURES_RSSI = detect_num_feature_rssi(train_data)
const_cols = constant_columns(train_data, threshold=2, column_range=NUM_FEATURES_RSSI)

# Columns passed to normalize_data as `removed_columns` for both splits.
uji_removed_columns = ["SPACEID", "RELATIVEPOSITION", "USERID", "PHONEID", "TIMESTAMP"]

# Normalize both splits with identical settings.
train_data = normalize_data(
    train_data,
    num_feature=NUM_FEATURES_RSSI,
    removed_columns=uji_removed_columns,
    constant_columns=const_cols,
)
test_data = normalize_data(
    test_data,
    num_feature=NUM_FEATURES_RSSI,
    removed_columns=uji_removed_columns,
    constant_columns=const_cols,
)

# Save the processed frames.
for frame, csv_path in ((train_data, process_train_dir), (test_data, process_test_dir)):
    frame.to_csv(csv_path, index=False)

In [16]:
# Read the processed UJI frames back from disk so the following cells work on the stored data.
train_data, test_data = (
    read_data(csv_path) for csv_path in (process_train_dir, process_test_dir)
)

In [None]:
# Separate the features from the labels for both splits.
X_train_, Y_train = separates_data_uji(train_data)
X_test_, Y_test = separates_data_uji(test_data)

# Hold out the validation rows BEFORE fitting any transform. Previously the scaler
# and KernelPCA were fitted on all training rows and the validation rows were split
# off afterwards, so statistics of the validation set leaked into the features.
# X_train_, X_valid_, Y_train, Y_valid = train_test_split(X_train_, Y_train, test_size=0.04, random_state=SEED)
X_train_, X_valid_, Y_train, Y_valid = train_test_split(X_train_, Y_train, test_size=0.11, random_state=SEED)

scX = StandardScaler()
pca = KernelPCA(n_components=100, kernel="cosine")
# pca = PCA(n_components=0.75)

# Fit the scaler and the projection on the training part only, then apply the
# fitted transforms unchanged to the validation and test parts.
X_train_ = scX.fit_transform(X_train_)
X_valid_ = scX.transform(X_valid_)
X_test_ = scX.transform(X_test_)
X_train = pca.fit_transform(X_train_)
X_valid = pca.transform(X_valid_)
X_test = pca.transform(X_test_)

# Save the six arrays as .npz files (same file names as before).
save_data_npz([
    (os.path.join(data_path_save, "X_train.npz"), X_train),
    (os.path.join(data_path_save, "Y_train.npz"), Y_train),
    (os.path.join(data_path_save, "X_test.npz"), X_test),
    (os.path.join(data_path_save, "Y_test.npz"), Y_test),
    (os.path.join(data_path_save, "X_valid.npz"), X_valid),
    (os.path.join(data_path_save, "Y_valid.npz"), Y_valid)
])

In [None]:
# Reload the six saved arrays. The file order (X then Y, for train, test and
# valid) has to match the tuple unpacking on the left-hand side below.
npz_files = [
    os.path.join(data_path_save, f"{prefix}_{split}.npz")
    for split in ("train", "test", "valid")
    for prefix in ("X", "Y")
]
(X_train, Y_train), (X_test, Y_test), (X_valid, Y_valid) = load_data_npz(npz_files)

In [None]:
# Print feature and label shapes for the train, test and validation splits, in that order.
for features, labels in ((X_train, Y_train), (X_test, Y_test), (X_valid, Y_valid)):
    print(features.shape)
    print(labels.shape)