The **RealWorld2016** dataset contains some anomalies that must be fixed with this notebook before the pipeline is run.

In [1]:
import shutil
import os
import zipfile
import polars as pl
from glob import glob


def rreplace(s: str, old: str, new: str, occurrence: int = 1):
    """
    Substitute a substring, searching from the right-hand end of the string.

    Args:
        s: string to search in
        old: substring to look for
        new: text inserted in place of each match
        occurrence: maximum number of matches to replace, counted from the end

    Returns:
        the string with the right-most matches replaced
    """
    # Cut the string at (at most) `occurrence` right-most matches, then glue
    # the pieces back together with the replacement text.
    return new.join(s.rsplit(old, maxsplit=occurrence))


# File-name prefixes of the sensor modalities. Session archives are named
# '<modal>_*_csv.zip'; the first entry is the one used to discover sessions,
# and the remaining ones are derived from it by substituting the prefix.
raw_modals = ['acc', 'gyr', 'mag', 'lig', 'mic', 'gps']
# Root folder of the downloaded dataset; archives are looked up under
# 'proband*/data/' inside it.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
raw_folder = '/mnt/data_partition/downloads/realworld2016_dataset'

Get the paths of all zip files.

In [2]:
# Sessions are discovered through the first modality only; the archive paths
# of every other modality are derived by swapping the modality prefix.
first_submodal = raw_modals[0]
anchor_zips = sorted(glob(f'{raw_folder}/proband*/data/{first_submodal}_*_csv.zip'))

# One column per modality, one row per session.
files = {first_submodal: anchor_zips}
files.update({
    other_modal: [rreplace(zip_path, first_submodal, other_modal) for zip_path in anchor_zips]
    for other_modal in raw_modals[1:]
})

df_sessions = pl.DataFrame(files)
print(df_sessions.shape)
df_sessions.head()

(120, 6)


acc,gyr,mag,lig,mic,gps
str,str,str,str,str,str
"""/mnt/data_part…","""/mnt/data_part…","""/mnt/data_part…","""/mnt/data_part…","""/mnt/data_part…","""/mnt/data_part…"
"""/mnt/data_part…","""/mnt/data_part…","""/mnt/data_part…","""/mnt/data_part…","""/mnt/data_part…","""/mnt/data_part…"
"""/mnt/data_part…","""/mnt/data_part…","""/mnt/data_part…","""/mnt/data_part…","""/mnt/data_part…","""/mnt/data_part…"
"""/mnt/data_part…","""/mnt/data_part…","""/mnt/data_part…","""/mnt/data_part…","""/mnt/data_part…","""/mnt/data_part…"
"""/mnt/data_part…","""/mnt/data_part…","""/mnt/data_part…","""/mnt/data_part…","""/mnt/data_part…","""/mnt/data_part…"


Find the anomalous zip files and move them to the 'anomaly' folder.<br>
Unlike the others, these abnormal zip files do not contain csv files directly; they contain further zip files instead.

In [3]:
# Quarantine every archive that wraps further zip files instead of CSV files.
# The moved archives keep their 'proband*/data/<file>' layout under 'anomaly'.
anomaly_folder = os.path.join(raw_folder, 'anomaly')
abnormal_files = []

for session in df_sessions.iter_rows():
    for submodal_file in session:
        # Only the member listing is needed; the archive is closed again
        # before it is (possibly) moved.
        with zipfile.ZipFile(submodal_file, 'r') as F:
            compressed_list = F.namelist()

        # A single nested zip is enough to mark the archive as abnormal.
        if not any(item.endswith('.zip') for item in compressed_list):
            continue

        # Mirror the file's position relative to the raw folder. relpath is
        # used rather than splitting the path on the text 'proband', which
        # selects the wrong fragment whenever that text also occurs elsewhere
        # in the path (e.g. inside raw_folder).
        dst = os.path.join(anomaly_folder, os.path.relpath(submodal_file, raw_folder))
        os.makedirs(os.path.dirname(dst), exist_ok=True)
        shutil.move(submodal_file, dst)

        abnormal_files.append(dst)
        print(f'moved {submodal_file}\nto {dst}\n')

moved /mnt/data_partition/downloads/realworld2016_dataset/proband14/data/acc_climbingdown_csv.zip
to /mnt/data_partition/downloads/realworld2016_dataset/anomaly/proband14/data/acc_climbingdown_csv.zip

moved /mnt/data_partition/downloads/realworld2016_dataset/proband14/data/gyr_climbingdown_csv.zip
to /mnt/data_partition/downloads/realworld2016_dataset/anomaly/proband14/data/gyr_climbingdown_csv.zip

moved /mnt/data_partition/downloads/realworld2016_dataset/proband14/data/mag_climbingdown_csv.zip
to /mnt/data_partition/downloads/realworld2016_dataset/anomaly/proband14/data/mag_climbingdown_csv.zip

moved /mnt/data_partition/downloads/realworld2016_dataset/proband14/data/lig_climbingdown_csv.zip
to /mnt/data_partition/downloads/realworld2016_dataset/anomaly/proband14/data/lig_climbingdown_csv.zip

moved /mnt/data_partition/downloads/realworld2016_dataset/proband14/data/mic_climbingdown_csv.zip
to /mnt/data_partition/downloads/realworld2016_dataset/anomaly/proband14/data/mic_climbingdown

Extract the abnormal files.

In [4]:
for abn_file in abnormal_files:
    # Park the outer archive under a different name first, so that a member
    # with the same file name cannot collide with it during extraction.
    stem, _ = os.path.splitext(abn_file)
    parked_zip = stem + 'org.zip'
    os.rename(abn_file, parked_zip)

    # Unpack next to the parked archive, then discard the outer wrapper.
    target_dir = os.path.dirname(parked_zip)
    with zipfile.ZipFile(parked_zip, 'r') as outer:
        outer.extractall(target_dir)
    os.remove(parked_zip)

    print(f'extracted {abn_file}')

extracted /mnt/data_partition/downloads/realworld2016_dataset/anomaly/proband14/data/acc_climbingdown_csv.zip
extracted /mnt/data_partition/downloads/realworld2016_dataset/anomaly/proband14/data/gyr_climbingdown_csv.zip
extracted /mnt/data_partition/downloads/realworld2016_dataset/anomaly/proband14/data/mag_climbingdown_csv.zip
extracted /mnt/data_partition/downloads/realworld2016_dataset/anomaly/proband14/data/lig_climbingdown_csv.zip
extracted /mnt/data_partition/downloads/realworld2016_dataset/anomaly/proband14/data/mic_climbingdown_csv.zip
extracted /mnt/data_partition/downloads/realworld2016_dataset/anomaly/proband14/data/gps_climbingdown_csv.zip
extracted /mnt/data_partition/downloads/realworld2016_dataset/anomaly/proband14/data/acc_climbingup_csv.zip
extracted /mnt/data_partition/downloads/realworld2016_dataset/anomaly/proband14/data/gyr_climbingup_csv.zip
extracted /mnt/data_partition/downloads/realworld2016_dataset/anomaly/proband14/data/mag_climbingup_csv.zip
extracted /mnt/d

Each anomalous session is split into 3 sub-sessions [1, 2, 3], but some [1] sub-sessions don't have the number "1" in their file names.<br>
Add the missing "1" to those zip file names.

In [5]:
# A file name made of exactly three '_'-separated parts (i.e. two underscores,
# such as 'gps_climbingdown_csv.zip') carries no sub-session number.
sessions_wo_number1 = []
for candidate in glob(f'{anomaly_folder}/proband*/data/*.zip'):
    if os.path.basename(candidate).count('_') == 2:
        sessions_wo_number1.append(candidate)

# Insert the missing "_1" right before the trailing '_csv.zip'.
for session_path in sessions_wo_number1:
    new_name = rreplace(session_path, '_csv.zip', '_1_csv.zip')
    os.rename(session_path, new_name)
    print(f'renamed {session_path}\nto {new_name}\n')

renamed /mnt/data_partition/downloads/realworld2016_dataset/anomaly/proband14/data/gps_climbingdown_csv.zip
to /mnt/data_partition/downloads/realworld2016_dataset/anomaly/proband14/data/gps_climbingdown_1_csv.zip

renamed /mnt/data_partition/downloads/realworld2016_dataset/anomaly/proband14/data/gps_climbingup_csv.zip
to /mnt/data_partition/downloads/realworld2016_dataset/anomaly/proband14/data/gps_climbingup_1_csv.zip

renamed /mnt/data_partition/downloads/realworld2016_dataset/anomaly/proband14/data/gyr_climbingdown_csv.zip
to /mnt/data_partition/downloads/realworld2016_dataset/anomaly/proband14/data/gyr_climbingdown_1_csv.zip

renamed /mnt/data_partition/downloads/realworld2016_dataset/anomaly/proband14/data/gyr_climbingup_csv.zip
to /mnt/data_partition/downloads/realworld2016_dataset/anomaly/proband14/data/gyr_climbingup_1_csv.zip

renamed /mnt/data_partition/downloads/realworld2016_dataset/anomaly/proband14/data/lig_climbingdown_csv.zip
to /mnt/data_partition/downloads/realworld20

Move the extracted files back to the original raw folder.

In [6]:
# Every zip goes back to the location it would occupy under the raw folder:
# dropping the last 'anomaly/' from its path yields that destination.
new_zips = glob(f'{anomaly_folder}/proband*/data/*.zip')
for new_zip in new_zips:
    path_pieces = new_zip.rsplit('anomaly/', 1)
    dst = ''.join(path_pieces)
    shutil.move(new_zip, dst)
    print(f'moved {new_zip}\nto {dst}\n')

# The quarantine folder has served its purpose once the loop has finished.
shutil.rmtree(anomaly_folder)

moved /mnt/data_partition/downloads/realworld2016_dataset/anomaly/proband14/data/gyr_climbingup_3_csv.zip
to /mnt/data_partition/downloads/realworld2016_dataset/proband14/data/gyr_climbingup_3_csv.zip

moved /mnt/data_partition/downloads/realworld2016_dataset/anomaly/proband14/data/acc_climbingdown_1_csv.zip
to /mnt/data_partition/downloads/realworld2016_dataset/proband14/data/acc_climbingdown_1_csv.zip

moved /mnt/data_partition/downloads/realworld2016_dataset/anomaly/proband14/data/acc_climbingdown_2_csv.zip
to /mnt/data_partition/downloads/realworld2016_dataset/proband14/data/acc_climbingdown_2_csv.zip

moved /mnt/data_partition/downloads/realworld2016_dataset/anomaly/proband14/data/acc_climbingdown_3_csv.zip
to /mnt/data_partition/downloads/realworld2016_dataset/proband14/data/acc_climbingdown_3_csv.zip

moved /mnt/data_partition/downloads/realworld2016_dataset/anomaly/proband14/data/acc_climbingup_1_csv.zip
to /mnt/data_partition/downloads/realworld2016_dataset/proband14/data/acc_