# Data prepricessing

### STEP 1 : txt to csv

In [25]:
import re
import csv
import os
import pandas as pd
import numpy as np

In [26]:
date = '2025_02_28'

In [27]:
# Path for input directory
input_directory = "txt_files_RAW_data"  # Replace with your directory path
output_directory = os.path.join(input_directory, f'{date}\\raw data')  # Output directory path

In [28]:
# Create the output directory if it doesn't exist
if not os.path.exists(output_directory):
    os.makedirs(output_directory)


# Regular expression pattern to match each line of data
pattern = re.compile(
    r"Label: (.*?), Timestamp: (.*?), AP SSID: (.*?), BSSID: (.*?), Rssi: (-?\d+), Distance: (-?\d+) mm, StdDev: (\d+) mm, timeStemp: (\d+), mcOn: (true|false)"
)


# Iterate through all files in the input directory
for filename in os.listdir(input_directory):
    if filename.endswith(".txt"):  # Process only .txt files
        input_file_path = os.path.join(input_directory, filename)
        
        # Generate output file name based on input file name (replace .txt with .csv)
        output_file_name = filename.replace(".txt", ".csv")
        output_file_path = os.path.join(output_directory, output_file_name)
        
        # Open the input text file and corresponding output CSV file
        with open(input_file_path, 'r') as infile, open(output_file_path, 'w', newline='') as outfile:
            # Define the CSV writer
            writer = csv.writer(outfile)

            # Write the CSV header
            writer.writerow(["Label", "Timestamp", "AP SSID", "BSSID", "Rssi", "Distance (mm)", "StdDev (mm)", "timeStemp", "mcOn"])

            # Process each line
            for line in infile:
                match = pattern.match(line)
                if match:
                    # Extract data using groups
                    writer.writerow(match.groups())
                    
        # Output the path to the saved CSV file
        print(f"File saved at: {output_file_path}")

File saved at: txt_files_RAW_data\2025_02_28\raw data\1-10_rtt_log.csv
File saved at: txt_files_RAW_data\2025_02_28\raw data\1-11_rtt_log.csv
File saved at: txt_files_RAW_data\2025_02_28\raw data\1-1_rtt_log.csv
File saved at: txt_files_RAW_data\2025_02_28\raw data\1-2_rtt_log.csv
File saved at: txt_files_RAW_data\2025_02_28\raw data\1-3_rtt_log.csv
File saved at: txt_files_RAW_data\2025_02_28\raw data\1-4_rtt_log.csv
File saved at: txt_files_RAW_data\2025_02_28\raw data\1-5_rtt_log.csv
File saved at: txt_files_RAW_data\2025_02_28\raw data\1-6_rtt_log.csv
File saved at: txt_files_RAW_data\2025_02_28\raw data\1-7_rtt_log.csv
File saved at: txt_files_RAW_data\2025_02_28\raw data\1-8_rtt_log.csv
File saved at: txt_files_RAW_data\2025_02_28\raw data\1-9_rtt_log.csv
File saved at: txt_files_RAW_data\2025_02_28\raw data\10-11_rtt_log.csv
File saved at: txt_files_RAW_data\2025_02_28\raw data\10-1_rtt_log.csv
File saved at: txt_files_RAW_data\2025_02_28\raw data\11-10_rtt_log.csv
File saved at

### STEP 2 : using timestamp to alignment

In [29]:
# 設定來源資料夾和目標資料夾
input_folder = output_directory  # 資料來源資料夾路徑
output_folder = f'{date}\\timestamp allign data'  # 處理後檔案的存放資料夾路徑

In [30]:
# 確保目標資料夾存在
os.makedirs(output_folder, exist_ok=True)

# 讀取資料夾中的所有檔案
for file_name in os.listdir(input_folder):
    if file_name.endswith('.csv'):  # 檢查是否為 CSV 檔案
        file_path = os.path.join(input_folder, file_name)
        data = pd.read_csv(file_path)

        # Step 1: 修改 BSSID 對應的 AP SSID
        data.loc[data['BSSID'] == '24:29:34:e2:4c:36', 'AP SSID'] = 'AP1'
        data.loc[data['BSSID'] == '24:29:34:e1:ef:d4', 'AP SSID'] = 'AP2'
        data.loc[data['BSSID'] == 'e4:5e:1b:a0:5e:85', 'AP SSID'] = 'AP3'
        data.loc[data['BSSID'] == 'b0:e4:d5:88:16:86', 'AP SSID'] = 'AP4'

        # Step 2: 忽略 timeStemp 欄位的最後一位數
        data['timeStemp'] = data['timeStemp'].astype(str).str[:-1]  # 刪除最後一位數
        data['timeStemp'] = data['timeStemp'].astype(int)  # 轉回數字型別（如果需要）

        # Step 3: Group by Timestamp 和 AP SSID，計算平均值
        grouped_data = (
            data.groupby(['timeStemp','Label' ,'AP SSID'])
            .agg({
                # 'Label': 'first',
                'Distance (mm)': 'mean',
                'Rssi': 'mean',
                'StdDev (mm)': 'mean'
            })
            .reset_index()
        )

        # Step 3: 將資料轉換成每個 Timestamp 一 row
        pivoted_data = grouped_data.pivot(
            
            index=['timeStemp','Label'],
            columns='AP SSID',
            values=['Distance (mm)', 'Rssi', 'StdDev (mm)']
        )

        # 展平多層欄位名稱
        pivoted_data.columns = [f"{ap}_{metric}" for metric, ap in pivoted_data.columns]
        pivoted_data.reset_index(inplace=True)

        # 將處理後的結果存成新的 CSV 檔案
        output_file_path = os.path.join(output_folder, f"processed_{file_name}")
        pivoted_data.to_csv(output_file_path, index=False)

        print(f"Processed and saved: {output_file_path}")

print("所有檔案處理完成！")


Processed and saved: 2025_02_28\timestamp allign data\processed_1-10_rtt_log.csv
Processed and saved: 2025_02_28\timestamp allign data\processed_1-11_rtt_log.csv
Processed and saved: 2025_02_28\timestamp allign data\processed_1-1_rtt_log.csv
Processed and saved: 2025_02_28\timestamp allign data\processed_1-2_rtt_log.csv
Processed and saved: 2025_02_28\timestamp allign data\processed_1-3_rtt_log.csv
Processed and saved: 2025_02_28\timestamp allign data\processed_1-4_rtt_log.csv
Processed and saved: 2025_02_28\timestamp allign data\processed_1-5_rtt_log.csv
Processed and saved: 2025_02_28\timestamp allign data\processed_1-6_rtt_log.csv
Processed and saved: 2025_02_28\timestamp allign data\processed_1-7_rtt_log.csv
Processed and saved: 2025_02_28\timestamp allign data\processed_1-8_rtt_log.csv
Processed and saved: 2025_02_28\timestamp allign data\processed_1-9_rtt_log.csv
Processed and saved: 2025_02_28\timestamp allign data\processed_10-11_rtt_log.csv
Processed and saved: 2025_02_28\time

### STEP 3 : Combine all csv to one

In [31]:
# 指定資料夾路徑
input_folder = output_folder  # 替換為你的資料夾路徑
output_file = f'{date}\\timestamp_allignment_{date}_rtt_logs.csv'  # 合併後的輸出檔案名稱

# 獲取資料夾內所有 CSV 檔案的路徑
file_paths = [os.path.join(input_folder, file) for file in os.listdir(input_folder) if file.endswith('.csv')]

# 合併所有 CSV 檔案
combined_data = pd.concat([pd.read_csv(file_path) for file_path in file_paths], ignore_index=True)

# 儲存合併後的檔案
combined_data.to_csv(output_file, index=False)

print(f"所有檔案已合併並儲存為: {output_file}")


所有檔案已合併並儲存為: 2025_02_28\timestamp_allignment_2025_02_28_rtt_logs.csv


### STEP 4 : make all number of data in each RP the same

In [34]:
# 讀取 Excel 檔案
file_path = output_file
df = pd.read_csv(file_path)

print(df.head())

# 假設 label 的欄位名稱為 'label'
# 計算每個 label 的資料筆數
label_counts = df['Label'].value_counts()

# 找出最少的資料筆數
min_count = label_counts.min()
max_count = label_counts.max()


# 隨機抽取每個 label 的資料，使其數量等於 min_count
df_balanced = df.groupby('Label').apply(lambda x: x.sample(n=min_count, random_state=42)).reset_index(drop=True)

# 儲存處理後的資料
output_path = f'{date}\\timestamp_allignment_Balanced_{date}_rtt_logs.csv'
df_balanced.to_csv(output_path, index=False)

print(f"處理後的資料已儲存至 {output_path}")

   timeStemp Label  AP1_Distance (mm)  AP2_Distance (mm)  AP3_Distance (mm)  \
0   33824855  1-10             -541.0             9339.0            10213.0   
1   33824868  1-10             -386.0            10796.0            14834.0   
2   33824879  1-10             -180.0             9829.0            12568.0   
3   33824891  1-10             -297.0             9214.0             8688.0   
4   33824920  1-10             -180.0             8071.0             9163.0   

   AP4_Distance (mm)  AP1_Rssi  AP2_Rssi  AP3_Rssi  AP4_Rssi  AP1_StdDev (mm)  \
0             5907.0     -57.0     -80.0     -67.0     -57.0             97.0   
1             6643.0     -57.0     -80.0     -68.0     -56.0            106.0   
2             6730.0     -58.0     -79.0     -67.0     -53.0            375.0   
3             4442.0     -59.0     -77.0     -61.0     -61.0            378.0   
4             5633.0     -59.0     -75.0     -62.0     -56.0            253.0   

   AP2_StdDev (mm)  AP3_StdDev (mm)  A

  df_balanced = df.groupby('Label').apply(lambda x: x.sample(n=min_count, random_state=42)).reset_index(drop=True)


In [35]:
print("Min: " + str(min_count))
print("Max: " + str(max_count))

Min: 406
Max: 615


In [None]:
print("ALL finished")

ALL finished
