# Some basic observations

In [39]:
import pandas as pd 
# Load the raw label CSV into `df`
raw_labels_path = "Dataset/5f_label_loc_train.csv"
df = pd.read_csv(raw_labels_path)

In [40]:
# (rows, columns) of the frame as loaded
df.shape

(1334, 11)

In [41]:
# preview the first rows of the frame as loaded
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,activity,started_at,finished_at,deleted_at,updated_at,user_id,user,room,floor
0,170,170,Location,2023-04-10 14:21:46+09:00,2023-04-10 14:21:50+09:00,,2023-04-10 05:22:02 UTC,97,5th-location,kitchen,5th
1,171,171,Location,2023-04-10 14:21:55+09:00,2023-04-10 14:22:16+09:00,,2023-04-10 05:23:02 UTC,97,5th-location,cafeteria,5th
2,172,172,Location,2023-04-10 14:22:23+09:00,2023-04-10 14:23:25+09:00,,2023-04-10 05:24:01 UTC,97,5th-location,kitchen,5th
3,173,173,Location,2023-04-10 14:23:29+09:00,2023-04-10 14:23:44+09:00,,2023-04-10 05:24:02 UTC,97,5th-location,cleaning,5th
4,174,174,Location,2023-04-10 14:23:42+09:00,2023-04-10 14:24:05+09:00,,2023-04-10 05:25:03 UTC,97,5th-location,nurse station,5th


# Data cleaning

In [42]:
# First, count how many records have a missing value in the user_id column
df['user_id'].isna().sum()

np.int64(0)

In [43]:
# Drop the two unnamed columns ('Unnamed: 0.1', 'Unnamed: 0') that came in
# with the CSV (presumably row indexes written by earlier saves -- confirm).
# Nothing is added to the frame here.
# Reassigning instead of using inplace=True, together with errors='ignore',
# lets this cell be re-run without a KeyError once the columns are gone.
df = df.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'], errors='ignore')

In [44]:
# Keep only the rows whose activity is "Location"
is_location = df['activity'] == 'Location'
df = df[is_location]

# preview the filtered frame
df.head()

Unnamed: 0,activity,started_at,finished_at,deleted_at,updated_at,user_id,user,room,floor
0,Location,2023-04-10 14:21:46+09:00,2023-04-10 14:21:50+09:00,,2023-04-10 05:22:02 UTC,97,5th-location,kitchen,5th
1,Location,2023-04-10 14:21:55+09:00,2023-04-10 14:22:16+09:00,,2023-04-10 05:23:02 UTC,97,5th-location,cafeteria,5th
2,Location,2023-04-10 14:22:23+09:00,2023-04-10 14:23:25+09:00,,2023-04-10 05:24:01 UTC,97,5th-location,kitchen,5th
3,Location,2023-04-10 14:23:29+09:00,2023-04-10 14:23:44+09:00,,2023-04-10 05:24:02 UTC,97,5th-location,cleaning,5th
4,Location,2023-04-10 14:23:42+09:00,2023-04-10 14:24:05+09:00,,2023-04-10 05:25:03 UTC,97,5th-location,nurse station,5th


In [45]:
# Keep a record only if both timestamps are present and deleted_at is empty.
# A non-null deleted_at means the record was intended to be deleted and
# must not be used.
has_start = df['started_at'].notna()      # started_at is present
has_finish = df['finished_at'].notna()    # finished_at is present
not_deleted = df['deleted_at'].isna()     # record was not marked as deleted
df = df.loc[has_start & has_finish & not_deleted]

In [46]:
# deleted_at has served its purpose in the previous filter, and updated_at
# is not used anywhere else, so both columns can be dropped now.
# The frame at this point comes from boolean filtering; reassigning the
# result (instead of inplace=True) avoids mutating that filtered frame in
# place, and errors='ignore' keeps the cell safe to re-run.
df = df.drop(columns=['deleted_at', 'updated_at'], errors='ignore')

In [47]:
# Keep only the records of user_id 97, since only the 5th floor
# is of interest here
target_user_id = 97
df = df[df['user_id'] == target_user_id]

df.head()

Unnamed: 0,activity,started_at,finished_at,user_id,user,room,floor
0,Location,2023-04-10 14:21:46+09:00,2023-04-10 14:21:50+09:00,97,5th-location,kitchen,5th
1,Location,2023-04-10 14:21:55+09:00,2023-04-10 14:22:16+09:00,97,5th-location,cafeteria,5th
2,Location,2023-04-10 14:22:23+09:00,2023-04-10 14:23:25+09:00,97,5th-location,kitchen,5th
3,Location,2023-04-10 14:23:29+09:00,2023-04-10 14:23:44+09:00,97,5th-location,cleaning,5th
4,Location,2023-04-10 14:23:42+09:00,2023-04-10 14:24:05+09:00,97,5th-location,nurse station,5th


In [48]:
# List the distinct values of the "user" column (the "floor" column is checked
# in the next cell) to confirm that every remaining row belongs to the 5th floor
df["user"].value_counts()

user
5th-location    451
Name: count, dtype: int64

In [49]:
# NOTE(review): the output recorded below lists 41 rows with floor "2nd,5th"
# in addition to the 410 rows with "5th". None of the cells shown filters or
# relabels them -- confirm that these multi-floor rows are meant to be kept.
df["floor"].value_counts()

floor
5th        410
2nd,5th     41
Name: count, dtype: int64

In [50]:
# (rows, columns) of the frame after the filters and column drops above
df.shape

(451, 7)

In [51]:
# Take one last look at the final df before saving it to
# a cleaned CSV file
df.head()

Unnamed: 0,activity,started_at,finished_at,user_id,user,room,floor
0,Location,2023-04-10 14:21:46+09:00,2023-04-10 14:21:50+09:00,97,5th-location,kitchen,5th
1,Location,2023-04-10 14:21:55+09:00,2023-04-10 14:22:16+09:00,97,5th-location,cafeteria,5th
2,Location,2023-04-10 14:22:23+09:00,2023-04-10 14:23:25+09:00,97,5th-location,kitchen,5th
3,Location,2023-04-10 14:23:29+09:00,2023-04-10 14:23:44+09:00,97,5th-location,cleaning,5th
4,Location,2023-04-10 14:23:42+09:00,2023-04-10 14:24:05+09:00,97,5th-location,nurse station,5th


In [52]:
# Save the cleaned frame to a new CSV file.
# index=False keeps the row index out of the file; writing it would show up
# as an extra "Unnamed: 0" column on the next read_csv, like the unnamed
# columns this notebook had to drop from its own input.
# NOTE(review): if a downstream reader expects that leading index column
# (e.g. index_col=0), update it accordingly.
df.to_csv("cleaned_dataset/cleaned_label_loc.csv", index=False)