#### __Preparation__

##### __Dependencies__

In [1]:
import copy  # copy big/deep objects by value
import os  # OS operations (read/write files/folders)

# process parallelization
from multiprocessing import Manager, Pool, cpu_count

import numpy as np  # array/matrix operations (e.g. linear algebra)
import pandas as pd  # operate with dataframes
from IPython.display import display  # print nicely
from tqdm.notebook import tqdm  # mother of progressbars for Python

##### __Options and constants__

In [2]:
# filename format f"Path_ MS Wissenschaft-{condition}-{uid}.{extension}"

# progress bar customized format
B_FORMAT = """📄 {n_fmt} of {total_fmt} {desc} processed: {bar} 
            {percentage:3.0f}% ⏱️{elapsed} ⏳{remaining} ⚙️{rate_fmt}{postfix}"""

# specify decimals format on pandas tables
pd.options.display.float_format = "{:.2f}".format

# specify data paths
PATH_R = "./data/raw"  # raw data path (root)
PATH_RC = f"{PATH_R}/CsvData"  # ray-casted path (from event + experiment)
PATH_CO = f"{PATH_R}/Corrupted"  # failed RT (corrupted event or experiment)
PATH_QS = f"{PATH_R}/Questionnaires"  # questionnaire data path
BMBF_Q = f"{PATH_QS}/bmbf-answers.csv"
MSW_Q = f"{PATH_QS}/msw-answers.csv"

# dfs (vr and questionnaire) to store and display counters and percentages
COLS = ["msw", "bmbf", "avas", "radio", "taxi", "left", "right", "total"]
p_stats = pd.DataFrame(columns=COLS)
q_stats = pd.DataFrame(columns=COLS)
# add distinctive index names
p_stats.index.name = "rides"
q_stats.index.name = "questionnaires"


CORES = cpu_count()  # number of cpu threads for multiprocessing
print(f"Total CPU threads: {CORES}")

Total CPU threads: 16


##### __Helper functions__

In [3]:
def pbar_fork_hack():
    """
    Hack to enforce progress bars to be displayed by fork processes on
    IPython Apps like Jupyter Notebooks.

    Avoids [IPKernelApp] WARNING | WARNING: attempted to send message from fork

    Important: pass this function as argument for the initializer parameter
    while initializing a multiprocessing pool to make it work. E.g.:

    pool = Pool(processes=N_CORES, initializer=pbar_fork_hack)

    Source:
     - https://github.com/ipython/ipython/issues/11049#issue-306086846
     - https://github.com/tqdm/tqdm/issues/485#issuecomment-473338308
    """
    print(" ", end="", flush=True)


def update_stats(df, label, stats):
    """
    Update the given dataframe with counters and percentages.

    Parameters:
        df (pandas.DataFrame): Dataframe to calculate the counters and %.
        label (str): Label to use for counter and % rows to include on the df.
        stats (pandas.DataFrame): Dataframe containing current stats.
    Returns:
        pandas.DataFrame: Updated dataframe with new stats.
    """

    # get counters
    total = df.index.size
    total_old = stats.iloc[0].total if stats.index.size > 0 else total
    n_msw = df[df.expo == "MSW"].index.size
    n_bmbf = df[df.expo == "BMBF"].index.size
    n_avas = df[df.condition == "AVAS"].index.size
    n_radio = df[df.condition == "RadioTalk"].index.size
    n_taxi = df[df.condition == "TaxiDriver"].index.size
    n_left = df[df.side == "left"].index.size
    n_right = df[df.side == "right"].index.size

    # store counters
    stats.loc[f"{label}"] = [
        n_msw,
        n_bmbf,
        n_avas,
        n_radio,
        n_taxi,
        n_left,
        n_right,
        total,
    ]

    # get percentages
    p_total = total * 100 / total_old
    p_msw = n_msw * 100 / total
    p_bmbf = n_bmbf * 100 / total
    p_avas = n_avas * 100 / total
    p_radio = n_radio * 100 / total
    p_taxi = n_taxi * 100 / total
    p_left = n_left * 100 / total
    p_right = n_right * 100 / total
    # total_old used to calculate the total % in respect of the original total

    # store percentages
    stats.loc[f"%{label}"] = [
        p_msw,
        p_bmbf,
        p_avas,
        p_radio,
        p_taxi,
        p_left,
        p_right,
        p_total,
    ]

    return stats

#### __VR rides cleanup__

##### __Load raw participants dataframe__

Load all participants ids (unique identifier), dates (experiment date), exhibition (MSW/BMBF) and side (left/right seat of the car) from the compilation `participants_raw.csv`.

This compilation was generated from reading all raw data from the original backup dataset (MSW and BMBF left/right folders).
The notebooks used to achieve that are:
1. [checks/ids/ids_expos_sides_bmbf-left.ipynb](./checks/ids/ids_expos_sides_bmbf-left.ipynb)
2. [checks/ids/ids_expos_sides_bmbf-right.ipynb](./checks/ids/ids_expos_sides_bmbf-right.ipynb)
3. [checks/ids/ids_expos_sides_msw-left.ipynb](./checks/ids/ids_expos_sides_msw-left.ipynb)
4. [checks/ids/ids_expos_sides_msw-right.ipynb](./checks/ids/ids_expos_sides_msw-right.ipynb)
5. [checks/ids/unify_ids_checks_and_notes.ipynb](./checks/ids/unify_ids_checks_and_notes.ipynb)

Later on a bug by collecting the date of the participant experiment was detected. Thus the following notebook was used to fix the date, and include the condition (AVAS, RadioTalk, TaxiDriver):

[checks/fix_dates.ipynb](./checks/fix_dates.ipynb)

In [4]:
# "participants_full.csv"
part_raw = pd.read_csv("./participants_raw.csv")
part_raw = part_raw.set_index("id")

# set date column as standard datetime format
part_raw.date = pd.to_datetime(part_raw.date)

# ensure participants ordering by date
part_raw = part_raw.sort_values(by="date")

part_raw

Unnamed: 0_level_0,date,expo,side,condition
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d2ae34df3118440cb66b5c27ade904f1,2019-05-09 19:34:00,MSW,right,TaxiDriver
b8d1f4aa336e433891d26271ee3f2e8f,2019-05-09 19:56:00,MSW,right,TaxiDriver
3622dbec36ac48bd9ad1159270e8d01f,2019-05-09 19:59:00,MSW,left,TaxiDriver
10c30561d9fb4f40bd4b95da3264f421,2019-05-09 19:59:00,MSW,right,TaxiDriver
1f55858f8a7d4d8da568cc93ee196f43,2019-05-09 20:48:00,MSW,right,AVAS
...,...,...,...,...
611d35a7c3e940cc82495e53c2a8532d,2020-01-03 16:12:00,BMBF,right,TaxiDriver
3b6fda285d9e412eb081986b2f22a2e3,2020-01-03 16:13:00,BMBF,left,AVAS
18ffb0abdc8642098c479380bfa533d1,2020-01-03 16:15:00,BMBF,left,RadioTalk
c06f123b35b74bb489ec239b1cac9eb4,2020-01-03 16:16:00,BMBF,right,TaxiDriver


##### __Check participant UIDs__

Check all raycasted filenames, compare to all recordings list, and display the missing/corrupted ones if any.

In [5]:
# get all raycasted data filenames
rc = os.listdir(PATH_RC)
for f in rc:  # iterate over participant filenames
    # filter hidden/config files and folders
    if f.startswith(".") or not f.endswith(".csv"):
        rc.remove(f)  # remove hidden/config file

# generate uid-filename pairs on a dict
rc = {f.split("-")[2].split(".")[0]: f for f in rc}
rc_uids = list(rc.keys())

# display length difference between full participant list and all raycasts
print(f"Total raycasts: {len(rc_uids)}")
print(f"Total uid list: {part_raw.index.size}")
print()

# same for all corrupted files if any
co = os.listdir(PATH_CO)
for f in co:  # iterate over corrupted participant recordings
    # filter hidden/config files and folders
    if f.startswith(".") or not f.endswith(".bin") and not f.endswith(".raw"):
        co.remove(f)  # remove hidden/config file

# get only the uid from the filename
co_uids = [f.split("-")[2].split(".")[0] for f in co]
# make sure they don't repeat between .raw and .bin
co_uids = list(set(co_uids))

# display stored corrupted recordings
print("Missing/corrupted participant recording/s stored:")
[print(c) for c in co_uids]
print()  # linebreak

# check and compare between raycasted and full lists
uids = part_raw.index.tolist()

# raycast progress bar
uids_pbar = tqdm(
    uids,
    total=len(uids),
    desc="🧾 participants",
    dynamic_ncols=True,
    bar_format=B_FORMAT,
)

corrupted = []
for uid in uids_pbar:  # iterate over participants uids
    if uid not in rc_uids:
        corrupted.append(uid)  # collect corrupted
        part_raw = part_raw.drop(uid)  # remove participant from df

# display corrupted found and final list
print()
print("Missing/corrupted participant recording/s found:")
[print(c) for c in corrupted]
print()
print(f"Final uid list: {part_raw.index.size}")

Total raycasts: 26571
Total uid list: 26572

Missing/corrupted participant recording/s stored:
e915c692a27b4f09a9fda2ec8f53b0b5



📄 0 of 26572 🧾 participants processed:                                                                        …


Missing/corrupted participant recording/s found:
e915c692a27b4f09a9fda2ec8f53b0b5

Final uid list: 26571


##### __Add total frames on participant dataframe__

In [6]:
def collect_frames(f):
    """Collect total number of frames per participant.

    Parameters:
        f (str): Recording filename."""

    # set path and load file
    f_path = f"{PATH_RC}/{f}"
    f_df = pd.read_csv(f_path)

    # parse uid and condition from filename
    uid = f.split("-")[2].split(".")[0]

    # total frames = last stored frame - 400 (start frame)
    n_frames = f_df["frameNumber"].iloc[-1] - 400

    # store total number of frames
    frames[uid] = n_frames


# add condition and frames (total) columns
part_raw["frames"] = 0
# ensure frames columns as int32 type
part_raw["frames"] = part_raw["frames"].astype("int32")

# files and uids to iterate
files = list(rc.values())
uids = part_raw.index.tolist()

manager = Manager()  # manage shared memory types
# proxy objects progress bar
ob_pbar = tqdm(
    iterable=uids,
    total=len(uids),
    desc="🧾 proxy objects",
    dynamic_ncols=True,
    bar_format=B_FORMAT,
)
# convert into memory-shared dict of lists for multiprocessing
frames = manager.dict({uid: None for uid in ob_pbar})

# initialize pool of processes according to the available cpu core threads
pool = Pool(processes=CORES, initializer=pbar_fork_hack)

# raycast progress bar
rc_pbar = tqdm(
    iterable=pool.imap(func=collect_frames, iterable=files),
    total=len(files),
    desc="📂 files",
    dynamic_ncols=True,
    bar_format=B_FORMAT,
)

# loop necessary for displaying properly the progressbar with multiprocessing
# source: https://stackoverflow.com/a/40133278
for _ in rc_pbar:
    pass

# close pool instance, no more work to submit
pool.close()
# wait for the worker processes to terminate
pool.join()

# cast proxy object to dict (otherwise they can't be accessed)
frames = dict(frames)

# total frames progress bar
frames_pbar = tqdm(
    iterable=uids,
    total=len(uids),
    desc="🧾 participants",
    dynamic_ncols=True,
    bar_format=B_FORMAT,
)

for uid in frames_pbar:  # iterate over collected total frames
    part_raw.loc[uid, "frames"] = frames[uid]  # store them into the df

📄 0 of 26571 🧾 proxy objects processed:                                                                       …

                

📄 0 of 26571 📂 files processed:                                                                               …

📄 0 of 26571 🧾 participants processed:                                                                        …

Store and preview the enhanced VR participant list.

In [7]:
part_raw.to_csv("./participants_full.csv")
part_raw

Unnamed: 0_level_0,date,expo,side,condition,frames
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
d2ae34df3118440cb66b5c27ade904f1,2019-05-09 19:34:00,MSW,right,TaxiDriver,2670
b8d1f4aa336e433891d26271ee3f2e8f,2019-05-09 19:56:00,MSW,right,TaxiDriver,2670
3622dbec36ac48bd9ad1159270e8d01f,2019-05-09 19:59:00,MSW,left,TaxiDriver,2670
10c30561d9fb4f40bd4b95da3264f421,2019-05-09 19:59:00,MSW,right,TaxiDriver,2670
1f55858f8a7d4d8da568cc93ee196f43,2019-05-09 20:48:00,MSW,right,AVAS,2670
...,...,...,...,...,...
611d35a7c3e940cc82495e53c2a8532d,2020-01-03 16:12:00,BMBF,right,TaxiDriver,2670
3b6fda285d9e412eb081986b2f22a2e3,2020-01-03 16:13:00,BMBF,left,AVAS,2670
18ffb0abdc8642098c479380bfa533d1,2020-01-03 16:15:00,BMBF,left,RadioTalk,2670
c06f123b35b74bb489ec239b1cac9eb4,2020-01-03 16:16:00,BMBF,right,TaxiDriver,2670


##### __Stats before selecting finished experiments__

In [8]:
p_stats = update_stats(part_raw, "raw", p_stats)
p_stats

Unnamed: 0_level_0,msw,bmbf,avas,radio,taxi,left,right,total
rides,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
raw,24335.0,2236.0,8896.0,8796.0,8879.0,15067.0,11504.0,26571.0
%raw,91.58,8.42,33.48,33.1,33.42,56.7,43.3,100.0


##### __Stats after selecting finished experiments__

In [9]:
part_clean = part_raw[part_raw.frames == 2670]
# get rid of frames column since now all are finished experiments
part_clean = part_clean.drop(columns=["frames"])

# update stats
p_stats = update_stats(part_clean, "finished", p_stats)
p_stats

Unnamed: 0_level_0,msw,bmbf,avas,radio,taxi,left,right,total
rides,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
raw,24335.0,2236.0,8896.0,8796.0,8879.0,15067.0,11504.0,26571.0
%raw,91.58,8.42,33.48,33.1,33.42,56.7,43.3,100.0
finished,22381.0,2070.0,8288.0,7928.0,8235.0,13785.0,10666.0,24451.0
%finished,91.53,8.47,33.9,32.42,33.68,56.38,43.62,92.02


##### __Stats after selecting exhibition time (no tests)__

In [10]:
# ensure date column as datetime type
part_clean.date = pd.to_datetime(part_clean.date)

# select only data from the start of each expo
msw_p = (part_clean.expo == "MSW") & (part_clean.date > "2019-05-14")
bmbf_p = (part_clean.expo == "BMBF") & (part_clean.date > "2019-07-10")

# select msw + bmbf exhibition time data
part_clean = part_clean[msw_p | bmbf_p].sort_values(by="date")

# store and preview
part_clean.to_csv("./participants_clean.csv")
display(part_clean)

# update stats
p_stats = update_stats(part_clean, "expo", p_stats)
p_stats

Unnamed: 0_level_0,date,expo,side,condition
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4f583872ffed48249874b00f3d389cfc,2019-05-14 13:21:00,MSW,right,TaxiDriver
33d68ad55ef544fab9fd08108e3913ca,2019-05-14 16:44:00,MSW,right,AVAS
8598a83e2af441b8bcd0ae5d84beb875,2019-05-14 16:51:00,MSW,right,RadioTalk
37d7e67934974217830bb429bba7fd76,2019-05-14 16:54:00,MSW,left,AVAS
42bac596059749b5b8e8e83ae61de9b4,2019-05-14 16:56:00,MSW,right,TaxiDriver
...,...,...,...,...
cfe9482181f74f80b88cd4b1c048ab94,2019-12-30 15:29:00,BMBF,right,AVAS
611d35a7c3e940cc82495e53c2a8532d,2020-01-03 16:12:00,BMBF,right,TaxiDriver
3b6fda285d9e412eb081986b2f22a2e3,2020-01-03 16:13:00,BMBF,left,AVAS
18ffb0abdc8642098c479380bfa533d1,2020-01-03 16:15:00,BMBF,left,RadioTalk


Unnamed: 0_level_0,msw,bmbf,avas,radio,taxi,left,right,total
rides,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
raw,24335.0,2236.0,8896.0,8796.0,8879.0,15067.0,11504.0,26571.0
%raw,91.58,8.42,33.48,33.1,33.42,56.7,43.3,100.0
finished,22381.0,2070.0,8288.0,7928.0,8235.0,13785.0,10666.0,24451.0
%finished,91.53,8.47,33.9,32.42,33.68,56.38,43.62,92.02
expo,22359.0,1997.0,8251.0,7908.0,8197.0,13737.0,10619.0,24356.0
%expo,91.8,8.2,33.88,32.47,33.65,56.4,43.6,91.66


#### __Questionnaires cleanup__

##### __Load raw MSW and BMBF questionnaire data__

In [11]:
# load questionnaire data
msw_q = pd.read_csv(MSW_Q)
bmbf_q = pd.read_csv(BMBF_Q)

# show preview
display(msw_q)
display(bmbf_q)

Unnamed: 0,id,uid,date,Intention to Use 1,Perceived Usefulness 4,Perceived Ease of Use 2,Trust,Sex,Age,Aviophobia,Driving Frequency,Playing Hours,VR Playing Frequency
0,1,1234567890qwertyuiop,2019-04-23 13:25:43,100,0,50,,Keine Angabe,50,0,0,60,mehr als 10 mal
1,2,1234567890qwertyuiop,2019-04-23 14:13:16,100,52,53,,Intersex,43,1,47,60,unter 10
2,3,3b90576c-0c96-46ba-b07a-9b3c8fc6a5c0,2019-04-23 18:43:07,100,51,49,,Intersex,43,0,0,60,unter 10
3,4,3b90576c-0c96-46ba-b07a-9b3c8fc6a5c5,2019-04-24 18:35:22,74,100,91,,Intersex,34,0,17,45,mehr als 10 mal
4,5,3b90576c-0c96-46ba-b07a-9b3c8fc6a5c0,2019-04-24 18:45:33,100,18,19,,Keine Angabe,0,1,0,0,Ein mal
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8868,8869,bdbdee21cfd241e0b55645b783eb6cd6,2019-10-24 17:31:19,25,25,75,72.00,keine Angabe,46,0,10,0,Ein mal
8869,8870,852bb085e7a54eae8a9d547e65f8b1a4,2019-10-24 17:35:45,0,0,50,50.00,männlich,38,0,20,0,Ein mal
8870,8871,b00ce9f2ce144880a1f7d563928f7111,2019-10-24 17:39:22,100,100,100,100.00,weiblich,23,0,5,0,Noch nie
8871,8872,6dd167e98afb43d3add70997cd8ecd48,2019-10-24 17:47:12,98,50,50,22.00,männlich,56,0,35,0,mehr als 10 mal


Unnamed: 0,id,uid,date,Intention to Use 1,Perceived Usefulness 4,Perceived Ease of Use 2,Trust,Sex,Age,Aviophobia,Driving Frequency,Playing Hours,VR Playing Frequency
0,1,99,2019-07-02 10:24:12,75,50,43,100,intersex,12,1,3,2,Ein mal
1,2,99,2019-07-02 10:27:51,50,100,0,97,weiblich,86,1,6,53,Ein mal
2,3,99,2019-07-02 10:56:13,83,100,82,7,weiblich,1,1,2,2,Ein mal
3,4,99,2019-07-02 10:58:10,77,33,43,16,weiblich,1,1,2,1,Ein mal
4,5,99,2019-07-02 11:00:09,79,78,90,61,weiblich,1,1,3,3,unter 10
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1204,1205,eda9175e9e1d4153ac2ca3038b2ab008,2019-12-23 09:23:08,65,30,33,20,männlich,12,0,0,25,Ein mal
1205,1206,b8c9e625f07444cc8d4a0fc245bab5dd,2019-12-23 09:28:25,100,50,50,100,weiblich,12,1,99,9,unter 10
1206,1207,2cd78ad9a37945de9f8cfd5f81c42082,2019-12-27 12:56:23,86,20,71,69,männlich,57,0,39,0,Noch nie
1207,1208,dba04b626e4c4c6e89aea4fb1f9cc32b,2019-12-30 13:50:37,84,89,61,92,männlich,73,0,55,0,Noch nie


##### __Fix id column, add expo column, merge dataframes, rename cols, translate German->English values__

In [12]:
# remove column id (autoincrement)
msw_q.drop(columns=["id"], inplace=True)
bmbf_q.drop(columns=["id"], inplace=True)

# set uid as index
msw_q.set_index("uid", inplace=True)
bmbf_q.set_index("uid", inplace=True)

# short column names mapping
columns_short = {
    "Intention to Use 1": "Intention",
    "Perceived Usefulness 4": "Usefulness",
    "Perceived Ease of Use 2": "Ease",
    "Trust": "Trust",
    "Driving Frequency": "Driving",
    "Playing Hours": "Play",
    "VR Playing Frequency": "VR",
    "condition": "Condition",
    "Sex": "Gender",
}

# gender and vr values translation (DE->EN)
gender_translation = {
    "männlich": "Male",
    "weiblich": "Female",
    "intersex": "intersex",
    "keine Angabe": "N/A",
}
vr_translation = {
    "Noch nie": "never",
    "Ein mal": "once",
    "unter 10": "less than 10 times",
    "mehr als 10 mal": "more than 10 times",
}

# add expo identifier column
msw_q["expo"] = ["MSW" for row in range(msw_q.index.size)]
bmbf_q["expo"] = ["BMBF" for row in range(bmbf_q.index.size)]

# merge data
quest = msw_q.append(bmbf_q)

# rename index
quest.index.name = "id"

# rename (shorten) column namnes
quest.rename(columns=columns_short, inplace=True)

# translate column values
quest.Gender = quest.Gender.map(gender_translation)
quest.VR = quest.VR.map(vr_translation)

quest  # arranged dataframe preview

Unnamed: 0_level_0,date,Intention,Usefulness,Ease,Trust,Gender,Age,Aviophobia,Driving,Play,VR,expo
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1234567890qwertyuiop,2019-04-23 13:25:43,100,0,50,,,50,0,0,60,more than 10 times,MSW
1234567890qwertyuiop,2019-04-23 14:13:16,100,52,53,,,43,1,47,60,less than 10 times,MSW
3b90576c-0c96-46ba-b07a-9b3c8fc6a5c0,2019-04-23 18:43:07,100,51,49,,,43,0,0,60,less than 10 times,MSW
3b90576c-0c96-46ba-b07a-9b3c8fc6a5c5,2019-04-24 18:35:22,74,100,91,,,34,0,17,45,more than 10 times,MSW
3b90576c-0c96-46ba-b07a-9b3c8fc6a5c0,2019-04-24 18:45:33,100,18,19,,,0,1,0,0,once,MSW
...,...,...,...,...,...,...,...,...,...,...,...,...
eda9175e9e1d4153ac2ca3038b2ab008,2019-12-23 09:23:08,65,30,33,20.00,Male,12,0,0,25,once,BMBF
b8c9e625f07444cc8d4a0fc245bab5dd,2019-12-23 09:28:25,100,50,50,100.00,Female,12,1,99,9,less than 10 times,BMBF
2cd78ad9a37945de9f8cfd5f81c42082,2019-12-27 12:56:23,86,20,71,69.00,Male,57,0,39,0,never,BMBF
dba04b626e4c4c6e89aea4fb1f9cc32b,2019-12-30 13:50:37,84,89,61,92.00,Male,73,0,55,0,never,BMBF


##### __Include side (left/right), condition (AVAS/RadioTalk/TaxiDriver) and finished (2670 frames) on the dataframe__

In [13]:
# add new empty columns to the questionnaire dataframe
quest["side"] = ""
quest["condition"] = ""
quest["finished"] = ""

# get questionnaire uids
uids = quest.index.tolist()

# collector progress bar
quest_pbar = tqdm(
    iterable=uids,
    total=len(uids),
    desc="🧾 questionnaires",
    dynamic_ncols=True,
    bar_format=B_FORMAT,
)

# get vr rides (raw) uids
vr_uids = part_raw.index.tolist()

# iterate over questionnaire uids
for uid in quest_pbar:
    # if questionnare uid has a matching vr ride
    if uid in vr_uids:
        cur = part_raw.loc[uid]
        side = cur.side
        cond = cur.condition
        ended = "Yes" if cur.frames == 2670 else "No"
    # otherwise set the fields empty
    else:
        side = ""
        cond = ""
        ended = ""
    # store the collected (vr ride) fields into the questionnaire df
    quest.loc[uid, ["side", "condition", "finished"]] = [side, cond, ended]

quest

📄 0 of 10082 🧾 questionnaires processed:                                                                      …

Unnamed: 0_level_0,date,Intention,Usefulness,Ease,Trust,Gender,Age,Aviophobia,Driving,Play,VR,expo,side,condition,finished
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1234567890qwertyuiop,2019-04-23 13:25:43,100,0,50,,,50,0,0,60,more than 10 times,MSW,,,
1234567890qwertyuiop,2019-04-23 14:13:16,100,52,53,,,43,1,47,60,less than 10 times,MSW,,,
3b90576c-0c96-46ba-b07a-9b3c8fc6a5c0,2019-04-23 18:43:07,100,51,49,,,43,0,0,60,less than 10 times,MSW,,,
3b90576c-0c96-46ba-b07a-9b3c8fc6a5c5,2019-04-24 18:35:22,74,100,91,,,34,0,17,45,more than 10 times,MSW,,,
3b90576c-0c96-46ba-b07a-9b3c8fc6a5c0,2019-04-24 18:45:33,100,18,19,,,0,1,0,0,once,MSW,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
eda9175e9e1d4153ac2ca3038b2ab008,2019-12-23 09:23:08,65,30,33,20.00,Male,12,0,0,25,once,BMBF,right,RadioTalk,Yes
b8c9e625f07444cc8d4a0fc245bab5dd,2019-12-23 09:28:25,100,50,50,100.00,Female,12,1,99,9,less than 10 times,BMBF,right,TaxiDriver,Yes
2cd78ad9a37945de9f8cfd5f81c42082,2019-12-27 12:56:23,86,20,71,69.00,Male,57,0,39,0,never,BMBF,right,RadioTalk,Yes
dba04b626e4c4c6e89aea4fb1f9cc32b,2019-12-30 13:50:37,84,89,61,92.00,Male,73,0,55,0,never,BMBF,left,TaxiDriver,Yes


##### __Query questionnaires without ride, and unfinished rides__

How many and which questionnaire participants were not found on the VR participants dataframe?

In [14]:
# select "test" questionnaires
selection = quest[quest.condition == ""]
print(selection.index.size)  # how many

selection

44


Unnamed: 0_level_0,date,Intention,Usefulness,Ease,Trust,Gender,Age,Aviophobia,Driving,Play,VR,expo,side,condition,finished
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1234567890qwertyuiop,2019-04-23 13:25:43,100,0,50,,,50,0,0,60,more than 10 times,MSW,,,
1234567890qwertyuiop,2019-04-23 14:13:16,100,52,53,,,43,1,47,60,less than 10 times,MSW,,,
3b90576c-0c96-46ba-b07a-9b3c8fc6a5c0,2019-04-23 18:43:07,100,51,49,,,43,0,0,60,less than 10 times,MSW,,,
3b90576c-0c96-46ba-b07a-9b3c8fc6a5c5,2019-04-24 18:35:22,74,100,91,,,34,0,17,45,more than 10 times,MSW,,,
3b90576c-0c96-46ba-b07a-9b3c8fc6a5c0,2019-04-24 18:45:33,100,18,19,,,0,1,0,0,once,MSW,,,
66a8e6c1a1f64b1ba2d17d275ebd3c16,2019-04-26 12:35:46,77,18,68,,,98,1,0,15,more than 10 times,MSW,,,
6c0a6c4add0e4ec8b100e6c2532ee023,2019-04-29 12:26:12,66,83,32,,,22,0,0,37,more than 10 times,MSW,,,
f88488a1aa40424b94e8588b7ed9c176,2019-04-29 12:40:48,39,56,80,,,63,0,27,0,never,MSW,,,
b1bc0d0cd4a04e90ac7c3bb934fda45c,2019-04-29 16:59:22,70,65,67,,,22,0,50,22,more than 10 times,MSW,,,
e5b706acbd784ea5803f143a22e6224f,2019-04-29 17:00:35,42,60,25,,,32,1,49,7,once,MSW,,,


About these not matching participants:
- `3b90576c-0c96-46ba-b07a-9b3c8fc6a9c9` contains "-", unlike the rest of uids, so it was probably entered manually for testing. Same for the `undefined`, repeated, or odd (numeric only) ones.
- `e915c692a27b4f09a9fda2ec8f53b0b5` was removed from the VR dataset since the Event/Experiment data was corrupted, so the raycast could not be computed.
- `eefe3e4bcb5f4cfb86eba0f03a35ca6e` and `312581bb08594c4eb1f884926a5ea3c8` still unclear why they were not found.

How many and which questionnaire participants did not finish the VR ride?

In [15]:
# select unfinished rides
selection = quest[quest.finished == "No"]
print(selection.index.size)

selection

7


Unnamed: 0_level_0,date,Intention,Usefulness,Ease,Trust,Gender,Age,Aviophobia,Driving,Play,VR,expo,side,condition,finished
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
7d056a15febf472fb4e9d27aa0e4eeba,2019-05-18 15:34:56,100,100,50,,Male,10,0,0,2,less than 10 times,MSW,left,TaxiDriver,No
dad926695bc540a8a5c2c785d071e432,2019-07-16 11:06:48,87,86,79,84.0,Male,21,0,36,0,never,MSW,right,AVAS,No
bb7d8a7cf87b47168e46e83020c3dde0,2019-09-25 11:30:46,0,0,0,0.0,intersex,99,1,99,99,more than 10 times,MSW,left,RadioTalk,No
1d6295b6d42b45118cea5ff51c2db64f,2019-09-25 12:02:17,100,86,84,0.0,Female,13,0,55,5,never,MSW,right,RadioTalk,No
adc13a555a854d6ab195e6e499f43793,2019-09-26 11:18:29,38,92,100,76.0,,66,0,4,5,more than 10 times,MSW,left,RadioTalk,No
eb8e81579d6f4e8dbf4c7cd31f3ad584,2019-11-08 13:27:19,27,65,39,64.0,Male,45,0,25,0,once,BMBF,right,RadioTalk,No
c8e3da30839048cca44ece8832fc6972,2019-12-20 09:42:59,50,50,50,50.0,Male,15,0,11,12,never,BMBF,left,AVAS,No


##### __Remove unfinished rides and questionnaires witout ride__

In [16]:
quest = quest[quest.finished != "No"]
quest = quest[quest.side != ""]

# also drop finished column since it is useless from now on
quest = quest.drop(columns=["finished"])

##### __Stats with finished rides__

In [17]:
q_stats = update_stats(quest, "finished", q_stats)
q_stats

Unnamed: 0_level_0,msw,bmbf,avas,radio,taxi,left,right,total
questionnaires,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
finished,8832.0,1199.0,3451.0,3223.0,3357.0,5781.0,4250.0,10031.0
%finished,88.05,11.95,34.4,32.13,33.47,57.63,42.37,100.0


##### __Stats after selecting exhibition time__

In [18]:
# ensure standard datetime series format
quest.date = pd.to_datetime(quest.date)

# select only data from the start of each expo
msw_q = (quest.expo == "MSW") & (quest.date > "2019-05-14")
bmbf_q = (quest.expo == "BMBF") & (quest.date > "2019-07-10")

# include former selections together and show results preview
quest = quest[msw_q | bmbf_q]
display(quest)

# update and show stats
q_stats = update_stats(quest, "expo", q_stats)
q_stats

Unnamed: 0_level_0,date,Intention,Usefulness,Ease,Trust,Gender,Age,Aviophobia,Driving,Play,VR,expo,side,condition
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
8598a83e2af441b8bcd0ae5d84beb875,2019-05-14 15:54:53,100,0,100,,Male,30,0,5,10,once,MSW,right,RadioTalk
42bac596059749b5b8e8e83ae61de9b4,2019-05-14 15:58:49,50,34,83,,Female,47,0,30,0,once,MSW,right,TaxiDriver
586c107173344c59aa4f71e3573233f0,2019-05-14 15:59:00,2,3,1,,Female,37,0,0,0,more than 10 times,MSW,left,AVAS
9cdd85098b0b4ad5ab2282a5ac371a5e,2019-05-14 16:02:09,19,21,50,,Female,40,0,22,0,never,MSW,right,AVAS
ff846d92c7e6471183595bd2678f29f6,2019-05-14 16:40:20,0,0,50,,Female,40,0,22,2,more than 10 times,MSW,right,AVAS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
eda9175e9e1d4153ac2ca3038b2ab008,2019-12-23 09:23:08,65,30,33,20.00,Male,12,0,0,25,once,BMBF,right,RadioTalk
b8c9e625f07444cc8d4a0fc245bab5dd,2019-12-23 09:28:25,100,50,50,100.00,Female,12,1,99,9,less than 10 times,BMBF,right,TaxiDriver
2cd78ad9a37945de9f8cfd5f81c42082,2019-12-27 12:56:23,86,20,71,69.00,Male,57,0,39,0,never,BMBF,right,RadioTalk
dba04b626e4c4c6e89aea4fb1f9cc32b,2019-12-30 13:50:37,84,89,61,92.00,Male,73,0,55,0,never,BMBF,left,TaxiDriver


Unnamed: 0_level_0,msw,bmbf,avas,radio,taxi,left,right,total
questionnaires,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
finished,8832.0,1199.0,3451.0,3223.0,3357.0,5781.0,4250.0,10031.0
%finished,88.05,11.95,34.4,32.13,33.47,57.63,42.37,100.0
expo,8823.0,1150.0,3430.0,3209.0,3334.0,5755.0,4218.0,9973.0
%expo,88.47,11.53,34.39,32.18,33.43,57.71,42.29,99.42


##### __Stats after deleting duplicated ids__

In [19]:
# get duplicated IDs and preview them
duplicated = quest[quest.index.duplicated(keep=False)]
display(duplicated)

# drop them and preview the rest
quest = quest.drop(duplicated.index)
display(quest)

# update stats
q_stats = update_stats(quest, "no duplicates", q_stats)
q_stats

Unnamed: 0_level_0,date,Intention,Usefulness,Ease,Trust,Gender,Age,Aviophobia,Driving,Play,VR,expo,side,condition
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
18881d3ec46e4a34bf3d1ff80de5109a,2019-05-16 09:13:04,100,100,100,,Male,50,0,4,0,more than 10 times,MSW,left,AVAS
18881d3ec46e4a34bf3d1ff80de5109a,2019-05-16 09:14:03,100,100,80,,Male,10,0,0,1,more than 10 times,MSW,left,AVAS
4a81013077fc458f8b22ea3c9cc686fa,2019-05-16 10:34:02,50,50,50,,intersex,0,1,0,99,less than 10 times,MSW,right,TaxiDriver
4a81013077fc458f8b22ea3c9cc686fa,2019-05-16 10:34:13,94,50,50,,Male,8,1,0,14,never,MSW,right,TaxiDriver
1ba35aa9ceac4c6a85f55dcee0e8ded2,2019-05-18 18:09:23,0,0,0,,,0,0,0,0,,MSW,left,AVAS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
847d87275dd1461e8aa87f41cc4a70a8,2019-10-18 15:34:09,50,80,60,70.00,Male,14,0,0,20,less than 10 times,MSW,left,AVAS
4885a09a14d2454c8f06bee7ee0a5b47,2019-10-21 10:20:29,100,100,100,100.00,Male,11,0,8,0,never,MSW,right,TaxiDriver
4885a09a14d2454c8f06bee7ee0a5b47,2019-10-21 10:21:22,0,50,29,100.00,Female,12,1,3,1,less than 10 times,MSW,right,TaxiDriver
b347ec7e17384c2987f4a48cfd85cb3c,2019-10-23 09:45:09,48,6,27,100.00,Female,12,0,0,85,once,MSW,left,AVAS


Unnamed: 0_level_0,date,Intention,Usefulness,Ease,Trust,Gender,Age,Aviophobia,Driving,Play,VR,expo,side,condition
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
8598a83e2af441b8bcd0ae5d84beb875,2019-05-14 15:54:53,100,0,100,,Male,30,0,5,10,once,MSW,right,RadioTalk
42bac596059749b5b8e8e83ae61de9b4,2019-05-14 15:58:49,50,34,83,,Female,47,0,30,0,once,MSW,right,TaxiDriver
586c107173344c59aa4f71e3573233f0,2019-05-14 15:59:00,2,3,1,,Female,37,0,0,0,more than 10 times,MSW,left,AVAS
9cdd85098b0b4ad5ab2282a5ac371a5e,2019-05-14 16:02:09,19,21,50,,Female,40,0,22,0,never,MSW,right,AVAS
ff846d92c7e6471183595bd2678f29f6,2019-05-14 16:40:20,0,0,50,,Female,40,0,22,2,more than 10 times,MSW,right,AVAS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
eda9175e9e1d4153ac2ca3038b2ab008,2019-12-23 09:23:08,65,30,33,20.00,Male,12,0,0,25,once,BMBF,right,RadioTalk
b8c9e625f07444cc8d4a0fc245bab5dd,2019-12-23 09:28:25,100,50,50,100.00,Female,12,1,99,9,less than 10 times,BMBF,right,TaxiDriver
2cd78ad9a37945de9f8cfd5f81c42082,2019-12-27 12:56:23,86,20,71,69.00,Male,57,0,39,0,never,BMBF,right,RadioTalk
dba04b626e4c4c6e89aea4fb1f9cc32b,2019-12-30 13:50:37,84,89,61,92.00,Male,73,0,55,0,never,BMBF,left,TaxiDriver


Unnamed: 0_level_0,msw,bmbf,avas,radio,taxi,left,right,total
questionnaires,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
finished,8832.0,1199.0,3451.0,3223.0,3357.0,5781.0,4250.0,10031.0
%finished,88.05,11.95,34.4,32.13,33.47,57.63,42.37,100.0
expo,8823.0,1150.0,3430.0,3209.0,3334.0,5755.0,4218.0,9973.0
%expo,88.47,11.53,34.39,32.18,33.43,57.71,42.29,99.42
no duplicates,8723.0,1150.0,3396.0,3173.0,3304.0,5697.0,4176.0,9873.0
%no duplicates,88.35,11.65,34.4,32.14,33.47,57.7,42.3,98.42


##### __Stats after deleting NaNs__

Drop NaNs and store clean questionnaires

In [20]:
quest = quest.dropna()  # drop NaNs (inplace=True throws warning)

# set Trust column type to int (read as float)
quest.Trust = quest.Trust.astype("int64")

# store clean df into CSV
quest.to_csv("./questionnaires.csv")

# update stats
q_stats = update_stats(quest, "no NaNs", q_stats)
q_stats

Unnamed: 0_level_0,msw,bmbf,avas,radio,taxi,left,right,total
questionnaires,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
finished,8832.0,1199.0,3451.0,3223.0,3357.0,5781.0,4250.0,10031.0
%finished,88.05,11.95,34.4,32.13,33.47,57.63,42.37,100.0
expo,8823.0,1150.0,3430.0,3209.0,3334.0,5755.0,4218.0,9973.0
%expo,88.47,11.53,34.39,32.18,33.43,57.71,42.29,99.42
no duplicates,8723.0,1150.0,3396.0,3173.0,3304.0,5697.0,4176.0,9873.0
%no duplicates,88.35,11.65,34.4,32.14,33.47,57.7,42.3,98.42
no NaNs,8357.0,1150.0,3284.0,3046.0,3177.0,5513.0,3994.0,9507.0
%no NaNs,87.9,12.1,34.54,32.04,33.42,57.99,42.01,94.78


##### __Check for wrong values across all stored__

In [21]:
filter_out = ["date", "expo"]  # dates and exhibition columns to filter out
for column in quest.columns.to_list():  # for each column of the df
    if column not in filter_out:  # apply filter
        # display column name and inputted unique values
        print(f"{column}: \n{sorted(quest[column].unique())}\n")

Intention: 
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100]

Usefulness: 
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100]

Ease: 
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,

#### __Save the final participants dataset__

Add `questionnare` (Yes/No) column into `participants_clean` dataframe and store it.

In [22]:
part_clean["questionnaire"] = ""

# get uids to iterate over
uids_p = part_clean.index.tolist()
uids_q = quest.index.tolist()

# collector progress bar
uids_pbar = tqdm(
    iterable=uids_p,
    total=len(uids_p),
    desc="🧾 participants",
    dynamic_ncols=True,
    bar_format=B_FORMAT,
)

# iterate over vr ride uids
for uid in uids_pbar:
    # store if they answered the questionnare or not
    q = "Yes" if uid in uids_q else "No"
    part_clean.loc[uid, "questionnaire"] = q

📄 0 of 24356 🧾 participants processed:                                                                        …

Collect percentage of null hit data per participant. Include it into participants_clean dataframe.

In [23]:
def collect_nulls(uid):
    """Collect total number of frames per participant.

    Parameters:
        uid (str): Participant UID."""

    # get condition and filename from particimant uid
    cond = part_clean.loc[uid].condition
    file = f"Path_ MS Wissenschaft-{cond}-{uid}.csv"

    # set path and load file
    f_path = f"{PATH_RC}/{file}"
    df = pd.read_csv(f_path)

    # calculate percentage of null hits
    total = df.index.size
    nulls_cnt = df[pd.isnull(df.hitObjectNames)].index.size
    nulls_per = nulls_cnt * 100 / total

    # store percentage of null hits
    nulls[uid] = nulls_per


# uids to iterate
uids = part_clean.index.tolist()

manager = Manager()  # manage shared memory types

# convert into memory-shared dict of lists for multiprocessing
nulls = manager.dict({uid: None for uid in uids})

# initialize pool of processes according to the available cpu core threads
pool = Pool(processes=CORES, initializer=pbar_fork_hack)

# raycast progress bar
rc_pbar = tqdm(
    iterable=pool.imap(func=collect_nulls, iterable=uids),
    total=len(uids),
    desc="📂 participants",
    dynamic_ncols=True,
    bar_format=B_FORMAT,
)

# loop necessary for displaying properly the progressbar with multiprocessing
# source: https://stackoverflow.com/a/40133278
for _ in rc_pbar:
    pass

# close pool instance, no more work to submit
pool.close()
# wait for the worker processes to terminate
pool.join()

# cast proxy object to dict (otherwise they can't be accessed)
nulls = dict(nulls)
part_clean["nulls_%"] = 0

# total frames progress bar
nulls_pbar = tqdm(
    iterable=uids,
    total=len(uids),
    desc="🧾 participants",
    dynamic_ncols=True,
    bar_format=B_FORMAT,
)

for uid in nulls_pbar:  # iterate over collected total frames
    part_clean.loc[uid, "nulls_%"] = nulls[uid]  # store them into the df

# store and preview the enhanced participants list
part_clean.to_csv("./participants_clean.csv")
part_clean

                

📄 0 of 24356 📂 participants processed:                                                                        …

📄 0 of 24356 🧾 participants processed:                                                                        …

Unnamed: 0_level_0,date,expo,side,condition,questionnaire,nulls_%
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
4f583872ffed48249874b00f3d389cfc,2019-05-14 13:21:00,MSW,right,TaxiDriver,No,6.74
33d68ad55ef544fab9fd08108e3913ca,2019-05-14 16:44:00,MSW,right,AVAS,No,0.22
8598a83e2af441b8bcd0ae5d84beb875,2019-05-14 16:51:00,MSW,right,RadioTalk,No,46.24
37d7e67934974217830bb429bba7fd76,2019-05-14 16:54:00,MSW,left,AVAS,No,8.05
42bac596059749b5b8e8e83ae61de9b4,2019-05-14 16:56:00,MSW,right,TaxiDriver,No,26.88
...,...,...,...,...,...,...
cfe9482181f74f80b88cd4b1c048ab94,2019-12-30 15:29:00,BMBF,right,AVAS,No,34.18
611d35a7c3e940cc82495e53c2a8532d,2020-01-03 16:12:00,BMBF,right,TaxiDriver,No,7.41
3b6fda285d9e412eb081986b2f22a2e3,2020-01-03 16:13:00,BMBF,left,AVAS,No,7.15
18ffb0abdc8642098c479380bfa533d1,2020-01-03 16:15:00,BMBF,left,RadioTalk,Yes,13.89


How many participants have the % of null data bigger than...?

In [24]:
# take the max as starting point and we kept checking until 0%
start = round(part_clean["nulls_%"].max(), 2)

print(f"The participant with the biggest % of nulls has {start}%")
print()

total = part_clean.index.size

for i in range(0, int(start) + 1):
    cur = start - i if start - i >= 1 else 0
    cnt = part_clean[part_clean["nulls_%"] >= cur].index.size
    per = cnt * 100 / total
    print(f"{cnt}/{total} -> {per:.2f}% participants have {cur:.2f}% or more of null data.")

The participant with the biggest % of nulls has 66.04%

1/24356 -> 0.00% participants have 66.04% or more of null data.
2/24356 -> 0.01% participants have 65.04% or more of null data.
2/24356 -> 0.01% participants have 64.04% or more of null data.
3/24356 -> 0.01% participants have 63.04% or more of null data.
4/24356 -> 0.02% participants have 62.04% or more of null data.
5/24356 -> 0.02% participants have 61.04% or more of null data.
6/24356 -> 0.02% participants have 60.04% or more of null data.
8/24356 -> 0.03% participants have 59.04% or more of null data.
9/24356 -> 0.04% participants have 58.04% or more of null data.
10/24356 -> 0.04% participants have 57.04% or more of null data.
16/24356 -> 0.07% participants have 56.04% or more of null data.
24/24356 -> 0.10% participants have 55.04% or more of null data.
34/24356 -> 0.14% participants have 54.04% or more of null data.
42/24356 -> 0.17% participants have 53.04% or more of null data.
60/24356 -> 0.25% participants have 52.04% 

#### __Save and show the final stats__

In [25]:
p_stats.to_csv("./ride_stats.csv")
q_stats.to_csv("./quest_stats.csv")

display(p_stats)
display(q_stats)

Unnamed: 0_level_0,msw,bmbf,avas,radio,taxi,left,right,total
rides,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
raw,24335.0,2236.0,8896.0,8796.0,8879.0,15067.0,11504.0,26571.0
%raw,91.58,8.42,33.48,33.1,33.42,56.7,43.3,100.0
finished,22381.0,2070.0,8288.0,7928.0,8235.0,13785.0,10666.0,24451.0
%finished,91.53,8.47,33.9,32.42,33.68,56.38,43.62,92.02
expo,22359.0,1997.0,8251.0,7908.0,8197.0,13737.0,10619.0,24356.0
%expo,91.8,8.2,33.88,32.47,33.65,56.4,43.6,91.66


Unnamed: 0_level_0,msw,bmbf,avas,radio,taxi,left,right,total
questionnaires,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
finished,8832.0,1199.0,3451.0,3223.0,3357.0,5781.0,4250.0,10031.0
%finished,88.05,11.95,34.4,32.13,33.47,57.63,42.37,100.0
expo,8823.0,1150.0,3430.0,3209.0,3334.0,5755.0,4218.0,9973.0
%expo,88.47,11.53,34.39,32.18,33.43,57.71,42.29,99.42
no duplicates,8723.0,1150.0,3396.0,3173.0,3304.0,5697.0,4176.0,9873.0
%no duplicates,88.35,11.65,34.4,32.14,33.47,57.7,42.3,98.42
no NaNs,8357.0,1150.0,3284.0,3046.0,3177.0,5513.0,3994.0,9507.0
%no NaNs,87.9,12.1,34.54,32.04,33.42,57.99,42.01,94.78
