# Load the data

In [1]:
import pandas as pd
train_df = pd.read_csv("../../cleaned_dataset/split_data/model_selection/train.csv")
test_df = pd.read_csv("../../cleaned_dataset/split_data/model_selection/test.csv")

# Now check the label and filter before doing anything

In [2]:
# Get all of the unique labels for train / test sets 
train_labels = list(train_df['room'].unique())
test_labels = list(test_df['room'].unique())

# Labels that appear in BOTH train and test
common_labels = list(set(train_labels) & set(test_labels))

# Now filter the train_df and test_df so that
# remove all record in train_df whose labels not in test labels 
# and remove all record in test_df whose labels not in train labels 
# therefore we can have a proper train and test df such that the labels appear 
# in train and test should be the same and vice versa
train_df = train_df[train_df['room'].isin(common_labels)].reset_index(drop=True)
test_df  = test_df[test_df['room'].isin(common_labels)].reset_index(drop=True)

In [3]:
# sanity check for label matching between train and test set
assert set(train_df['room'].unique()) == set(test_df['room'].unique())
print("Train/Test labels are now perfectly aligned")

Train/Test labels are now perfectly aligned


In [4]:
train_df.head()

Unnamed: 0,timestamp,mac address,RSSI,room
0,2023-04-10 14:21:46+09:00,6,-93,kitchen
1,2023-04-10 14:21:46+09:00,6,-93,kitchen
2,2023-04-10 14:21:46+09:00,6,-93,kitchen
3,2023-04-10 14:21:46+09:00,6,-93,kitchen
4,2023-04-10 14:21:46+09:00,6,-93,kitchen


In [5]:
test_df.head()

Unnamed: 0,timestamp,mac address,RSSI,room
0,2023-04-12 10:00:53+09:00,4,-92,cafeteria
1,2023-04-12 10:00:53+09:00,4,-96,cafeteria
2,2023-04-12 10:00:53+09:00,4,-96,cafeteria
3,2023-04-12 10:00:53+09:00,4,-96,cafeteria
4,2023-04-12 10:00:53+09:00,4,-83,cafeteria


# Feature engineering

### Firstly, create a matrix of beacon value for each records

In [6]:
# Function to add 25 beacon_1, beacon_2, ... , beacon_25
def add_beacon_features(df, num_beacons=25):
    df = df.copy()

    for i in range(1, num_beacons + 1):
        df[f'beacon_{i}'] = df['RSSI'].where(df['mac address'] == i, 0)

    return df

# Apply with our current train and test dataframe
train_df = add_beacon_features(train_df)
test_df = add_beacon_features(test_df)

In [7]:
train_df.head()

Unnamed: 0,timestamp,mac address,RSSI,room,beacon_1,beacon_2,beacon_3,beacon_4,beacon_5,beacon_6,...,beacon_16,beacon_17,beacon_18,beacon_19,beacon_20,beacon_21,beacon_22,beacon_23,beacon_24,beacon_25
0,2023-04-10 14:21:46+09:00,6,-93,kitchen,0,0,0,0,0,-93,...,0,0,0,0,0,0,0,0,0,0
1,2023-04-10 14:21:46+09:00,6,-93,kitchen,0,0,0,0,0,-93,...,0,0,0,0,0,0,0,0,0,0
2,2023-04-10 14:21:46+09:00,6,-93,kitchen,0,0,0,0,0,-93,...,0,0,0,0,0,0,0,0,0,0
3,2023-04-10 14:21:46+09:00,6,-93,kitchen,0,0,0,0,0,-93,...,0,0,0,0,0,0,0,0,0,0
4,2023-04-10 14:21:46+09:00,6,-93,kitchen,0,0,0,0,0,-93,...,0,0,0,0,0,0,0,0,0,0


In [8]:
test_df.head()

Unnamed: 0,timestamp,mac address,RSSI,room,beacon_1,beacon_2,beacon_3,beacon_4,beacon_5,beacon_6,...,beacon_16,beacon_17,beacon_18,beacon_19,beacon_20,beacon_21,beacon_22,beacon_23,beacon_24,beacon_25
0,2023-04-12 10:00:53+09:00,4,-92,cafeteria,0,0,0,-92,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2023-04-12 10:00:53+09:00,4,-96,cafeteria,0,0,0,-96,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2023-04-12 10:00:53+09:00,4,-96,cafeteria,0,0,0,-96,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2023-04-12 10:00:53+09:00,4,-96,cafeteria,0,0,0,-96,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2023-04-12 10:00:53+09:00,4,-83,cafeteria,0,0,0,-83,0,0,...,0,0,0,0,0,0,0,0,0,0


### Secondly, now we would perforrm windowing with these aggregated statistics for each window - for each beacon: mean, std, min, max, count

In [9]:
# write a function to group by window size = 1 second 
# (size should be parameter so we can easily change) and for each window we 
# would calculate these 5 aggregated statistics for each beacon_1,...,beacon_25
import pandas as pd

def aggregate_beacons_by_time_window(
    df,
    window_size_seconds=1,
    beacon_prefix="beacon_",
    num_beacons=25
):
    df = df.copy()

    df["timestamp"] = pd.to_datetime(df["timestamp"])
    df = df.set_index("timestamp")

    beacon_cols = [f"{beacon_prefix}{i}" for i in range(1, num_beacons + 1)]

    agg_dict = {}

    # aggregate beacons
    for col in beacon_cols:
        agg_dict[col] = ["mean", "std", "max", "min", "count"]

    # keep room label
    agg_dict["room"] = "first"   # or "mode" if you prefer

    grouped = (
        df
        .groupby(pd.Grouper(freq=f"{window_size_seconds}s"))
        .agg(agg_dict)
    )

    # flatten columns
    grouped.columns = [
        f"{col}_{stat}" if col != "room" else "room"
        for col, stat in grouped.columns
    ]

    grouped = grouped.reset_index()

    return grouped

In [10]:
# apply the function of feature engineering with 2 train + test dataframe
print("Start feature engineering / windowing for both sets")
windowed_train_df = aggregate_beacons_by_time_window(train_df)
print("Finish feature engineering for training df")
windowed_test_df = aggregate_beacons_by_time_window(test_df)
print("Finish feature engineering for testing df")

Start feature engineering / windowing for both sets
Finish feature engineering for training df
Finish feature engineering for testing df


In [11]:
windowed_train_df.head()

Unnamed: 0,timestamp,beacon_1_mean,beacon_1_std,beacon_1_max,beacon_1_min,beacon_1_count,beacon_2_mean,beacon_2_std,beacon_2_max,beacon_2_min,...,beacon_24_std,beacon_24_max,beacon_24_min,beacon_24_count,beacon_25_mean,beacon_25_std,beacon_25_max,beacon_25_min,beacon_25_count,room
0,2023-04-10 14:21:46+09:00,0.0,0.0,0.0,0.0,111,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,111,0.0,0.0,0.0,0.0,111,kitchen
1,2023-04-10 14:21:47+09:00,0.0,0.0,0.0,0.0,99,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,99,0.0,0.0,0.0,0.0,99,kitchen
2,2023-04-10 14:21:48+09:00,0.0,0.0,0.0,0.0,67,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,67,0.0,0.0,0.0,0.0,67,kitchen
3,2023-04-10 14:21:49+09:00,0.0,0.0,0.0,0.0,107,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,107,0.0,0.0,0.0,0.0,107,kitchen
4,2023-04-10 14:21:50+09:00,0.0,0.0,0.0,0.0,98,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,98,0.0,0.0,0.0,0.0,98,kitchen


In [12]:
windowed_test_df.head(10)

Unnamed: 0,timestamp,beacon_1_mean,beacon_1_std,beacon_1_max,beacon_1_min,beacon_1_count,beacon_2_mean,beacon_2_std,beacon_2_max,beacon_2_min,...,beacon_24_std,beacon_24_max,beacon_24_min,beacon_24_count,beacon_25_mean,beacon_25_std,beacon_25_max,beacon_25_min,beacon_25_count,room
0,2023-04-12 10:00:53+09:00,0.0,0.0,0.0,0.0,25,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,25,0.0,0.0,0.0,0.0,25,cafeteria
1,2023-04-12 10:00:54+09:00,0.0,0.0,0.0,0.0,15,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,15,0.0,0.0,0.0,0.0,15,cafeteria
2,2023-04-12 10:00:55+09:00,0.0,0.0,0.0,0.0,15,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,15,0.0,0.0,0.0,0.0,15,cafeteria
3,2023-04-12 10:00:56+09:00,0.0,0.0,0.0,0.0,10,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,10,0.0,0.0,0.0,0.0,10,cafeteria
4,2023-04-12 10:00:57+09:00,0.0,0.0,0.0,0.0,10,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,10,0.0,0.0,0.0,0.0,10,cafeteria
5,2023-04-12 10:00:58+09:00,0.0,0.0,0.0,0.0,20,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,20,0.0,0.0,0.0,0.0,20,cafeteria
6,2023-04-12 10:00:59+09:00,0.0,0.0,0.0,0.0,31,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,31,0.0,0.0,0.0,0.0,31,cafeteria
7,2023-04-12 10:01:00+09:00,0.0,0.0,0.0,0.0,33,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,33,0.0,0.0,0.0,0.0,33,cafeteria
8,2023-04-12 10:01:01+09:00,0.0,0.0,0.0,0.0,18,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,18,0.0,0.0,0.0,0.0,18,cafeteria
9,2023-04-12 10:01:02+09:00,0.0,0.0,0.0,0.0,31,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,31,0.0,0.0,0.0,0.0,31,cafeteria


### Thirdly, check value counts of label to see which is the minority class here

In [13]:
windowed_train_df['room'].value_counts()

room
nurse station    7222
cafeteria        3320
kitchen          2885
hallway           620
523               232
522               181
512               159
cleaning          108
511                96
520                70
506                65
501                50
517                15
513                15
502                 9
518                 5
Name: count, dtype: int64

- We will now determine the "minority class" based on the train set value counts (note that never analyze the test set, that will be a kind of data leakage)
- We will consider these as minority classes first, if we want to try some other way to improve score in the future, change the minority classes list to even be more extremely: ["518", "502", "513", "517"] 

### Fourthly, we would conduct the relabeling method technique here
    - For each minority and also the other classes: define the vector [ fl, sl, s, f, sr, fr] - 6 surroundings beacon, we would use full matching here, not use partial matching
    - Use KL Divergence to determine which is the best match room for each minority class's room

In [14]:
# define the list of minority classes
minority_class = ["518", "502", "513", "517"]

In [15]:
# defining the 6 surrounding beacons vector for each classes

In [16]:
# and then for each room in minority class, perform KL Divergence calculation with 
# the majority classes to find the best match

### Fifthly, we would conduct relabeling and add the relabelled records to original training set

### Sixthly, train and evaluate the Random Forest Model