# Classification Tasks with Kinematic Time Series from Head-Mounted Displays

## **Load and preprocess datasets**

In [1]:
# Add files to sys.path
from pathlib import Path
import sys,os
# Resolve the folder containing this notebook/script so the local `kinemats`
# folder can be added to the import path.
try:
    # `__file__` is defined when the code runs as a regular Python module/script
    this_path = str(os.path.dirname(__file__))
except NameError:
    # `__file__` is not defined (e.g. in a notebook kernel): use the current
    # working directory instead. Only NameError is caught so that any other
    # unexpected failure is not silently hidden.
    this_path = str(Path().absolute())+"/"
print("File Path:", this_path)
sys.path.append(os.path.join(this_path, "kinemats"))

# Enable debugger in IPython with command set_trace()
#from IPython.core.debugger import set_trace

# Import classes
import utils  # Utils for generation of files and paths

from data_loader import dataset_ucr,\
                        dataset_IMT,\
                        dataset_Tsinghua

from data_loader.dataset_IMT import VideoList

# Import data science libs
import numpy as np
import pandas as pd

import matplotlib
#matplotlib.rcParams['text.usetex'] = True
#%matplotlib inline
import matplotlib.pyplot as plt

File Path: d:\dsv\dev\git_repos\headmov-classif-360videos/


---
# SETUP

In [2]:
# CONSTANTS
import experiment_config
from experiment_config import Datasets


---
# UTILITY FUNCTIONS

Generate paths to write output files

In [3]:
# Subfolder prefix (name of the main dataset plus a trailing "/") shared by
# the path helpers defined in this cell.
STR_DATASET = f"{experiment_config.DATASET_MAIN!s}/"
print(STR_DATASET)
def gen_path_plot(filename):
    """Generate the full path of a PLOT file just by specifying its name.

    The path is built by `utils.generate_complete_path` under
    `experiment_config.PLOT_FOLDER`, inside the dataset subfolder
    (`STR_DATASET`) followed by the notebook-specific subfolder, using the
    image extension `experiment_config.IMG_FORMAT`.

    Args:
        filename: Name of the plot file.

    Returns:
        Whatever `utils.generate_complete_path` returns (presumably the
        complete path -- confirm against `utils`).
    """
    # NOTEBOOK_SUBFOLDER_NAME is not assigned in any visible cell of this
    # notebook, so referencing it directly raises NameError. Look it up at call
    # time and fall back to "no extra subfolder" when it has not been defined.
    notebook_subfolder = globals().get("NOTEBOOK_SUBFOLDER_NAME", "")
    return utils.generate_complete_path(filename,
                                        main_folder=experiment_config.PLOT_FOLDER,
                                        subfolders=STR_DATASET+notebook_subfolder,
                                        file_extension=experiment_config.IMG_FORMAT,
                                        save_files=experiment_config.EXPORT_PLOTS)

def gen_path_temp(filename, subfolders="", extension=experiment_config.TEMP_FORMAT):
    """Generate the full path of a TEMP FILE just by specifying its name.

    Args:
        filename: Name of the temporary file.
        subfolders: Optional subfolder(s) appended after the dataset subfolder.
        extension: File extension; defaults to `experiment_config.TEMP_FORMAT`.

    Returns:
        The result of `utils.generate_complete_path` for the temp folder.
    """
    dataset_subfolders = STR_DATASET+subfolders
    return utils.generate_complete_path(
        filename,
        main_folder=experiment_config.TEMP_FOLDER,
        subfolders=dataset_subfolders,
        file_extension=extension,
    )

def gen_path_dataset(filename, subfolders="", extension=""):
    """Generate the full path of a RESULTS FILE (like pandas dataframes).

    Args:
        filename: Name of the results file.
        subfolders: Optional subfolder(s) appended after the dataset subfolder.
        extension: File extension; empty by default.

    Returns:
        The result of `utils.generate_complete_path` for the dataset folder.
    """
    dataset_subfolders = STR_DATASET+subfolders
    return utils.generate_complete_path(
        filename,
        main_folder=experiment_config.DATASET_FOLDER,
        subfolders=dataset_subfolders,
        file_extension=extension,
    )

Tsinghua/


# Load and preprocess datasets

# 1. Dataset UCR

In [4]:
# if experiment_config.DATASET_MAIN == Datasets.UCR:
#     # keep_sets define whether merging or not the training and test set.
#     d = data_loader.dataset_ucr.load_ucr_dataset("UWaveGestureLibrary", keep_sets=True, root_folder = "dataset/UCR/", suffix_folders = ["X","Y","Z"], sets = ["TRAIN","TEST"], file_format = ".tsv", missing_val = 0)

#     for el in d:
#         print(el.shape)

# 2. Dataset Head Movements IMT

The dataset files are available at http://dash.ipv6.enstb.fr/headMovements/ and the dataset is described in the paper at http://doi.org/10.1145/3083187.3083215

## Extract data from Tar file

In [5]:
if experiment_config.DATASET_MAIN == Datasets.IMT:
    # Compressed file holding the original IMT dataset
    dataset_path = experiment_config.DATASET_IMT_TAR
    # JSON file (in the temp folder) used to store the data index per user
    dict_json_name = gen_path_temp("files_index_per_user", extension=".json")

    # Object that performs all the file manipulation for the IMT dataset
    data = dataset_IMT.DatasetHeadMovIMT(dataset_path, dict_json_name)

In [6]:
if experiment_config.DATASET_MAIN == Datasets.IMT:
    # Cached files: demographics table (CSV) and HMD movements (pickle)
    general_data_filename = experiment_config.DATASET_DEMOGRAPHICS
    movement_data_filename = gen_path_temp("hmd_movements", extension=".pickle")

    # The dataset initially has 63 users. Data from 5 users is skipped due to
    # missing values in the main videos, leaving 58 users. The videos used for
    # familiarizing users with VR (`VideoList.Elephant` and `VideoList.Rhino`)
    # are deleted as well.
    skip_users_indices = [14, 33, 52, 61, 62]
    videos_to_delete = [VideoList.Elephant, VideoList.Rhino]

    ### INPUTS / OUTPUTS
    """EDIT CUSTOM FILENAMES"""
    input_files = [general_data_filename, movement_data_filename]

    RELOAD_TRIES = experiment_config.RELOAD_TRIES
    # Load the cached files; when one is missing, rebuild both and try again
    # (at most RELOAD_TRIES attempts in total).
    for tries in range(RELOAD_TRIES):
        try:
            ### LOAD FILE
            print(f"Trying {tries+1}/{RELOAD_TRIES} to load files: {input_files}")

            ### CUSTOM SECTION TO READ FILES
            """EDIT CUSTOM READ"""
            data.general = pd.read_csv(input_files[0])  # pd.DataFrame
            print(f"File {input_files[0]} was successfully loaded")
            data.movement = utils.load_pickle(input_files[1])  # dictionary, per the original note
            print(f"File {input_files[1]} was successfully loaded")

        except FileNotFoundError as e:
            ### CREATE FILE
            print(f"File not found. Creating again! {e}")

            ### CUSTOM SECTION TO CREATE FILES
            """EDIT CUSTOM WRITE"""
            # Create the JSON file with the dictionary of structured data...
            data.generate_file_index()
            # ...and read it back
            files_index = utils.load_json(dict_json_name)
            print("Number of users in file index:", len(files_index.keys()))
            # Transform the paths in the compressed file into bytes.
            # (Pass `debug_users = 15` to load just those users for test purposes.)
            data.uncompress_data(
                files_index,
                list_unprocessed_users = skip_users_indices,  # User IDs with empty data
            )

            # Delete head-movement data of specific video keys
            data.delete_data_from_videos(videos_to_delete)
            print("Removing data from specific video keys... Done!")

            # Save both files so that the next attempt can load them
            data.general.to_csv(input_files[0], index=False)
            utils.create_pickle(data.movement, input_files[1])

            ### ---- CONTROL RETRIES
            # Out of attempts: propagate the original FileNotFoundError
            if tries+1 >= RELOAD_TRIES:
                raise
        else:
            # Both files were loaded: stop retrying
            break

# 3. Tsinghua

In [7]:
if experiment_config.DATASET_MAIN == Datasets.Tsinghua:
    # Compressed file holding the original Tsinghua dataset
    dataset_path = experiment_config.DATASET_TSINGHUA_ZIP
    # JSON file (in the temp folder) used to store the data index per user
    dict_json_name = gen_path_temp("files_index_per_user", extension=".json")

    # Object that handles the Tsinghua dataset files
    data = dataset_Tsinghua.DatasetHeadMovTsinghua(dataset_path, dict_json_name)

In [8]:
if experiment_config.DATASET_MAIN == Datasets.Tsinghua:
    # Cached files: demographics table (CSV) and original HMD movements (pickle)
    demographics_data_filename = experiment_config.DATASET_DEMOGRAPHICS
    original_data_filename = gen_path_temp("hmd_movements", extension=".pickle")

    ### INPUTS / OUTPUTS
    """EDIT CUSTOM FILENAMES"""
    input_files = [demographics_data_filename, original_data_filename]

    RELOAD_TRIES = experiment_config.RELOAD_TRIES
    # Load the cached files; when one is missing, rebuild both and try again
    # (at most RELOAD_TRIES attempts in total).
    for tries in range(RELOAD_TRIES):
        try:
            ### LOAD FILE
            print(f"Trying {tries+1}/{RELOAD_TRIES} to load files: {input_files}")

            ### CUSTOM SECTION TO READ FILES
            """EDIT CUSTOM READ"""
            data.demographics = pd.read_csv(input_files[0])  # pd.DataFrame
            print(f"File {input_files[0]} was successfully loaded")
            data.original_data = utils.load_pickle(input_files[1])  # unpickled movement data
            print(f"File {input_files[1]} was successfully loaded")

        except FileNotFoundError as e:
            ### CREATE FILE
            print(f"File not found. Creating again! {e}")

            ### CUSTOM SECTION TO CREATE FILES
            """EDIT CUSTOM WRITE"""
            # Create the JSON file with the dictionary of structured data
            data.generate_file_index()
            # Transform the paths in the compressed file into bytes.
            # (Unlike the IMT cell, no users are skipped and no videos are
            # deleted here; `debug_users` / `list_unprocessed_users` are unused.)
            data.uncompress_data()

            # Save both files so that the next attempt can load them
            data.demographics.to_csv(input_files[0], index=False)
            utils.create_pickle(data.original_data, input_files[1])

            ### ---- CONTROL RETRIES
            # Out of attempts: propagate the original FileNotFoundError
            if tries+1 >= RELOAD_TRIES:
                raise
        else:
            # Both files were loaded: stop retrying
            break

Trying 1/2 to load files: ['./dataset/Tsinghua/demographics.csv', './temp/Tsinghua/hmd_movements.pickle']
File not found. Creating again! [Errno 2] File b'./dataset/Tsinghua/demographics.csv' does not exist: b'./dataset/Tsinghua/demographics.csv'
JSON file was created in ./temp/Tsinghua/files_index_per_user.json
Loading... 1
Loading... 10
Loading... 11
Loading... 12
Loading... 13
Loading... 14
Loading... 15
Loading... 16
Loading... 17
Loading... 18
Loading... 19
Loading... 2
Loading... 20
Loading... 21
Loading... 22
Loading... 23
Loading... 24
Loading... 25
Loading... 26
Loading... 27
Loading... 28
Loading... 29
Loading... 3
Loading... 30
Loading... 31
Loading... 32
Loading... 33
Loading... 34
Loading... 35
Loading... 36
Loading... 37
Loading... 38
Loading... 39
Loading... 4
Loading... 40
Loading... 41
Loading... 42
Loading... 43
Loading... 44
Loading... 45
Loading... 46
Loading... 47
Loading... 48
Loading... 5
Loading... 6
Loading... 7
Loading... 8
Loading... 9
Loading... 1
Loading...

---
## Data Synchronization with data interpolation {COMMON FOR ALL DATASETS}
***Generate CSV file with summary of sampling frequency and duration***: The CSV file defines the criteria used to resample all time series to a common length.

In [9]:
# File with the summary of the original sampling (frequency, duration) per time series
sampling_stats_filename = experiment_config.DATASET_SUMMARY # Original sampling stats

### INPUTS / OUTPUTS
# EDIT CUSTOM FILENAMES
input_files = [sampling_stats_filename]

# Read the number of attempts from the configuration here, instead of relying
# on the value left behind by the dataset-specific loading cells (which only
# assign it inside their `if` branch).
RELOAD_TRIES = experiment_config.RELOAD_TRIES

# Try to load the file at most RELOAD_TRIES times
for tries in range(RELOAD_TRIES):
    try:
        ### LOAD FILE
        print(f"Trying {tries+1}/{RELOAD_TRIES} to load files: {input_files}")

        ### CUSTOM SECTION TO READ FILES
        # EDIT CUSTOM READ
        sampling_stats = pd.read_csv(input_files[0]) # sampling_stats is a pd.DataFrame
        print(f"File {input_files[0]} was successfully loaded")

    except FileNotFoundError as e:
        ### CREATE FILE
        print(f"File not found. Creating again! {e}")

        ### CUSTOM SECTION TO CREATE FILES
        # EDIT CUSTOM WRITE
        # Summary of original sampling frequencies, built from the loaded dataset
        sampling_stats = data.create_original_sampling_summary()
        sampling_stats.to_csv(input_files[0], index=False)

        ### ---- CONTROL RETRIES
        # Retry the load while attempts remain; otherwise re-raise the
        # FileNotFoundError that triggered the creation.
        if tries+1 < RELOAD_TRIES:
            continue
        else:
            raise
    break

print(sampling_stats.head())

Trying 1/2 to load files: ['./dataset/Tsinghua/summary_timeseries.csv']
File not found. Creating again! [Errno 2] File b'./dataset/Tsinghua/summary_timeseries.csv' does not exist: b'./dataset/Tsinghua/summary_timeseries.csv'
Trying 2/2 to load files: ['./dataset/Tsinghua/summary_timeseries.csv']
File ./dataset/Tsinghua/summary_timeseries.csv was successfully loaded
   experiment  user  video  startingTime  endTime      N   magQuat  \
0           0     1      0         1.247  164.203  14726  1.000005   
1           0     1      1         0.000  201.141  18180  0.999997   
2           0     1      2         0.021  293.239  26272  1.000006   
3           0     1      3         0.000  172.577  15478  0.999998   
4           0     1      4         0.021  205.708  18443  1.000005   

   avTsampling  avFsampling  
0     0.011066    90.367952  
1     0.011064    90.384357  
2     0.011161    89.598865  
3     0.011150    89.687502  
4     0.011153    89.665365  


## SLERP (Spherical Linear Interpolation)

Slerp is shorthand for spherical linear interpolation. It refers to constant-speed motion along a unit-radius great circle arc, given the ends and an interpolation parameter. "A major appeal is that interpolation is carried out as a rotation about a fixed axis at constant angular velocity" [REF,pg.18](http://web.cs.iastate.edu/~cs577/handouts/quaternion.pdf)

Let $p_{0}$ and $p_{1}$ be the first and last points of the arc, and let $t$ be the interpolation parameter, where $0 \le t \le 1$. Compute $\Omega$ as the angle subtended by the arc, so that $\cos \Omega = p_{0} \cdot p_{1}$.

$\mathrm{Slerp}(p_{0},p_{1};t) = \frac{\sin[(1-t)\Omega]}{\sin(\Omega)}\cdot p_{0} + \frac{\sin(t\Omega)}{\sin(\Omega)}\cdot p_{1}$



In [10]:
# Resampling parameters per dataset: target sampling frequency and the time
# window (start/end, in seconds) kept from every video.
if experiment_config.DATASET_MAIN == Datasets.IMT:
    SAMPLING_FREQUENCY, STARTING_TIME_SECS, ENDING_TIME_SECS = 30, 5, 35
elif experiment_config.DATASET_MAIN == Datasets.Tsinghua:
    SAMPLING_FREQUENCY, STARTING_TIME_SECS, ENDING_TIME_SECS = 30, 35, 155

    # Experiment to resample (check the dataset paper for the description):
    #   0: Experiment_1 - no instructions to look at the video ROI
    #   1: Experiment_2 - instruction to focus on the video ROI
    EXPERIMENT_ID = 0

This interpolation step is common to both head-movement datasets (IMT and Tsinghua).

In [11]:
# Pickle file holding the structure with the resampled time series
movement_resampled_data_filename = gen_path_temp("hmd_movements_resampled", extension=".pickle")

### INPUTS / OUTPUTS
# EDIT CUSTOM FILENAMES
input_files = [movement_resampled_data_filename]

# Read the number of attempts from the configuration here, instead of relying
# on the value left behind by the dataset-specific loading cells (which only
# assign it inside their `if` branch).
RELOAD_TRIES = experiment_config.RELOAD_TRIES

# Try to load the file at most RELOAD_TRIES times
for tries in range(RELOAD_TRIES):
    try:
        ### LOAD FILE
        print(f"Trying {tries+1}/{RELOAD_TRIES} to load files: {input_files}")

        ### CUSTOM SECTION TO READ FILES
        # EDIT CUSTOM READ
        data.processed = utils.load_pickle(input_files[0])
        print(f"File {input_files[0]} was successfully loaded")

    except FileNotFoundError as e:
        ### CREATE FILE
        print(f"File not found. Creating again! {e}")

        ### CUSTOM SECTION TO CREATE FILES
        # EDIT CUSTOM WRITE
        # Resample with the parameters of the active dataset; the Tsinghua
        # loader additionally needs the experiment to resample.
        if experiment_config.DATASET_MAIN == Datasets.IMT:
            data.resample_movement(sampling_frequency = SAMPLING_FREQUENCY, starting_time = STARTING_TIME_SECS, end_time = ENDING_TIME_SECS)
        elif experiment_config.DATASET_MAIN == Datasets.Tsinghua:
            data.resample_movement(experiment_id = EXPERIMENT_ID, sampling_frequency = SAMPLING_FREQUENCY, starting_time = STARTING_TIME_SECS, end_time = ENDING_TIME_SECS)

        # Create pickle file with resampled head-movement data
        utils.create_pickle(data.processed, input_files[0])

        ### ---- CONTROL RETRIES
        # Retry the load while attempts remain; otherwise re-raise the
        # FileNotFoundError that triggered the creation.
        if tries+1 < RELOAD_TRIES:
            continue
        else:
            raise
    break

Trying 1/2 to load files: ['./temp/IMT/hmd_movements_resampled.pickle']
File not found. Creating again! [Errno 2] No such file or directory: './temp/IMT/hmd_movements_resampled.pickle'
Each numpy array will be resampled to 901 samples. Timestamps from 5 to 35 seconds
USER: 0 VIDEO: Diving | NEW SHAPE: (901, 5) idx_resampled 901
USER: 0 VIDEO: Paris | NEW SHAPE: (901, 5) idx_resampled 901
USER: 0 VIDEO: Rollercoaster | NEW SHAPE: (901, 5) idx_resampled 901
USER: 0 VIDEO: Timelapse | NEW SHAPE: (901, 5) idx_resampled 901
USER: 0 VIDEO: Venice | NEW SHAPE: (901, 5) idx_resampled 901
USER: 1 VIDEO: Diving | NEW SHAPE: (901, 5) idx_resampled 901
USER: 1 VIDEO: Paris | NEW SHAPE: (901, 5) idx_resampled 901
USER: 1 VIDEO: Rollercoaster | NEW SHAPE: (901, 5) idx_resampled 901
USER: 1 VIDEO: Timelapse | NEW SHAPE: (901, 5) idx_resampled 901
USER: 1 VIDEO: Venice | NEW SHAPE: (901, 5) idx_resampled 901
USER: 2 VIDEO: Diving | NEW SHAPE: (901, 5) idx_resampled 901
USER: 2 VIDEO: Paris | NEW SHAPE

## Validate that interpolation is working properly

In [12]:
if experiment_config.DATASET_MAIN == Datasets.IMT:
    # Spot-check the interpolation on a single (user, video) pair
    userId = 7
    video = VideoList.Rollercoaster # Other options: VideoList.Paris, ...

    # Movement restricted to the resampling time window (column 0 is filtered
    # between the start/end seconds), without column 1 (the frameId column)
    data_orig = np.delete(
        data.get_movement_filtered(userId, video, column_to_filter=0, min_value=STARTING_TIME_SECS, max_value=ENDING_TIME_SECS),
        1,
        axis=1,
    )
    # Resampled counterpart of the same user and video
    data_processed = data.processed[userId][video.value]

    # Compare shapes, then the first ten values of column 0 of each array
    print(data_orig.shape, data_processed.shape, sep="\n")
    print(data_orig[:10,0], data_processed[:10,0], sep="\n")

    # Plot time series (disabled; switch the condition to enable)
    if False:
        fig, (ax_orig, ax_resampled) = plt.subplots(1, 2, facecolor='w', edgecolor='k', sharex=True, sharey=True, figsize=(12, 4))

        ax_orig.plot(data_orig[:,0], data_orig[:,1:],'-')
        ax_orig.set(xlabel="Elapsed video - Time (s)", ylabel="Quaternion", title="Original")

        ax_resampled.plot(data_processed[:,0], data_processed[:,1:],'--')
        ax_resampled.set(xlabel="Elapsed video - Time (s)", ylabel="Quaternion", title="Resampled")

        plt.suptitle('Visual inspection to resampling process')
        fig.tight_layout()
        plt.show()

(901, 4)
(901, 5)
[5.         5.03333333 5.06666667 5.1        5.13333333 5.16666667
 5.2        5.23333333 5.26666667 5.3       ]
[5.         5.03333333 5.06666667 5.1        5.13333333 5.16666667
 5.2        5.23333333 5.26666667 5.3       ]


## Create dataset in structured format

In [13]:
if experiment_config.DATASET_MAIN == Datasets.IMT:
    # Summary of resampled head movement data
    num_users = len(data.processed)
    videos_per_user = len(data.processed[0].keys())
    # Assumes every user has the same number of videos as user 0
    total_trajectories = num_users * videos_per_user
    # Index with the enum's `.value`, the key form used by every other access
    # to `data.processed` in this notebook, instead of `str(VideoList.Diving)`,
    # which only matches the key if the enum's string form equals its value.
    video_data_rows, video_data_cols = data.processed[0][VideoList.Diving.value].shape

    print("Total number of users",num_users)
    print("Total number of videos per user",videos_per_user)
    print("Total number of time series", total_trajectories )
    print("Head movement per video has size:", (video_data_rows, video_data_cols))

Total number of users 58
Total number of videos per user 5
Total number of time series 290
Head movement per video has size: (901, 5)


In [14]:
if experiment_config.DATASET_MAIN == Datasets.IMT:
    # Files of the structured dataset (combined time series to cluster)
    labels_filename = experiment_config.DATASET_LABELS # Cluster index TRUE_LABEL
    timestamps_filename = experiment_config.DATASET_TIMESTAMPS # Timestamps
    dataset_filename = experiment_config.DATASET_DATA # Resampled data

    # Filled below, either by loading the files or by building them from
    # `data.processed` (58 users, 5 videos)
    labels = None
    timestamps = None
    dataset = None

    ### INPUTS / OUTPUTS
    # EDIT CUSTOM FILENAMES
    input_files = [labels_filename, timestamps_filename, dataset_filename]

    # Try to load the files at most RELOAD_TRIES times
    for tries in range(RELOAD_TRIES):
        try:
            ### LOAD FILE
            print(f"Trying {tries+1}/{RELOAD_TRIES} to load files: {input_files}")

            ### CUSTOM SECTION TO READ FILES
            # EDIT CUSTOM READ
            labels = pd.read_csv(input_files[0])
            print(f"File {input_files[0]} was successfully loaded")
            timestamps = np.loadtxt(input_files[1])
            print(f"File {input_files[1]} was successfully loaded")
            dataset = utils.load_binaryfile_npy(input_files[2])
            print(f"File {input_files[2]} was successfully loaded")

        # NOTE(review): any loading error (not only a missing file) triggers the
        # regeneration below, although the message says "File not found".
        except Exception as e:
            ### CREATE FILE
            print(f"File not found. Creating again! {e}")

            ### CUSTOM SECTION TO CREATE FILES
            # EDIT CUSTOM WRITE
            ## Create array with labels (one row per time series)
            labels_cols = ["id","user","videoId"]
            labels = np.empty((total_trajectories, len(labels_cols)))

            # All time series are resampled with the same timestamps, just pick one!
            timestamps = data.processed[0][VideoList.Paris.value][:,0]

            # Contains all the trajectories in one array; the timestamp column
            # is kept in a separate array, hence `video_data_cols - 1`
            dataset = np.empty((total_trajectories, video_data_rows, video_data_cols - 1))

            # Convert the enum of the videos to an index
            videolist_converter = {
                                    VideoList.Diving.value: 1,
                                    VideoList.Paris.value: 2,
                                    VideoList.Rollercoaster.value: 3,
                                    VideoList.Timelapse.value: 4,
                                    VideoList.Venice.value: 5, }

            # Time series index, used to map them back to the original series with their respective user and video.
            ts_idx = 0
            # Put together all the structured time series in one numpy array to do distance calculations
            for user in range(num_users):
                for video in data.processed[user].keys():
                    series = data.processed[user][video]

                    ## CHECK THAT ALL THE QUATERNIONS IN THE VIDEO HAVE MAGNITUDE 1. [Unit Quaternions]
                    # Row-wise norm of every column except the first (timestamp)
                    magnitudes = np.linalg.norm(series[:,1:], axis=1)
                    for i,val in enumerate(magnitudes):
                        if (val > 1.01 or val < 0.99):
                            print("Quaternion norm not equal 1+/-0.01",val, "user:", user, "video", video, "row", i)

                    # Index of which time series corresponded to which video and which user
                    labels[ts_idx] = [ts_idx, user, videolist_converter[video]]

                    # Copy the structured data without its first column (the timestamps)
                    dataset[ts_idx,:,:] = series[:,1:]

                    # Time-series Index, combining the structure per user, per video.
                    ts_idx += 1

            ## SAVE FILES
            # Create dataframe with the labels
            labels = pd.DataFrame(data=labels, columns=labels_cols)
            labels.to_csv(input_files[0], index=False)
            print("Cluster index created at", input_files[0])

            # Save timestamps
            np.savetxt(input_files[1], timestamps, fmt='%f') # Suppress scientific notation
            print("Timestamps created at",input_files[1])

            # Save the combined time series as a binary file
            utils.save_binaryfile_npy(dataset, input_files[2])
            print("Head movement resampled created at", input_files[2])


            ### ---- CONTROL RETRIES
            # Retry the load while attempts remain; otherwise re-raise the
            # error that triggered the creation.
            if tries+1 < RELOAD_TRIES:
                continue
            else:
                raise

        # Finish iteration
        break


Trying 1/2 to load files: ['./dataset/IMT/labels.csv', './dataset/IMT/timestamps.csv', './dataset/IMT/dataset.npy']
File not found. Creating again! [Errno 2] File b'./dataset/IMT/labels.csv' does not exist: b'./dataset/IMT/labels.csv'
Cluster index created at ./dataset/IMT/labels.csv
Timestamps created at ./dataset/IMT/timestamps.csv
Head movement resampled created at ./dataset/IMT/dataset.npy
Trying 2/2 to load files: ['./dataset/IMT/labels.csv', './dataset/IMT/timestamps.csv', './dataset/IMT/dataset.npy']
File ./dataset/IMT/labels.csv was successfully loaded
File ./dataset/IMT/timestamps.csv was successfully loaded
File ./dataset/IMT/dataset.npy was successfully loaded


## Summary data structure of Dataset 1: IMT

**NOT VERY USEFUL**
- `data.general` is a `pd.DataFrame`
- `data.movement[0]` = returns a dictionary for the user id `0` with keys `{'video_id': <ndarray>}`; `video_id` is obtained automatically from the Enum `VideoList`
- `data.movement[0][VideoList.Paris.value]` = returns a ndarray with hmd movement for a specific user and specific video
- `data.processed[0][VideoList.Paris.value]` = returns a ndarray after applying the resampling

**MOST USEFUL**
- The files `labels`, `timestamps`, and `dataset` are used to load the structured dataset in any other data processing file.

See example below:

# 3. Dataset Tsinghua


In [15]:
if experiment_config.DATASET_MAIN == Datasets.Tsinghua:
    # Summary of resampled head movement data (user 1 / video 0 used as reference)
    reference_user = data.processed[1]
    num_users = len(data.processed)
    videos_per_user = len(reference_user)
    total_trajectories = num_users * videos_per_user
    video_data_rows, video_data_cols = reference_user[0].shape

    print("Total number of users",num_users)
    print("Total number of videos per user",videos_per_user)
    print("Total number of time series", total_trajectories )
    print("Head movement per video has size:", (video_data_rows, video_data_cols))

In [16]:
if experiment_config.DATASET_MAIN == Datasets.Tsinghua:
    # Files of the structured dataset (combined time series to cluster)
    labels_filename = experiment_config.DATASET_LABELS # Cluster index TRUE_LABEL
    timestamps_filename = experiment_config.DATASET_TIMESTAMPS # Timestamps
    dataset_filename = experiment_config.DATASET_DATA # Resampled data

    # Filled below, either by loading the files or by building them from
    # `data.processed`
    labels = None
    timestamps = None
    dataset = None

    ### INPUTS / OUTPUTS
    # EDIT CUSTOM FILENAMES
    input_files = [labels_filename, timestamps_filename, dataset_filename]

    # Try to load the files at most RELOAD_TRIES times
    for tries in range(RELOAD_TRIES):
        try:
            ### LOAD FILE
            print(f"Trying {tries+1}/{RELOAD_TRIES} to load files: {input_files}")

            ### CUSTOM SECTION TO READ FILES
            # EDIT CUSTOM READ
            labels = pd.read_csv(input_files[0])
            print(f"File {input_files[0]} was successfully loaded")
            timestamps = np.loadtxt(input_files[1])
            print(f"File {input_files[1]} was successfully loaded")
            dataset = utils.load_binaryfile_npy(input_files[2])
            print(f"File {input_files[2]} was successfully loaded")

        # NOTE(review): any loading error (not only a missing file) triggers the
        # regeneration below, although the message says "File not found".
        except Exception as e:
            ### CREATE FILE
            print(f"File not found. Creating again! {e}")

            ### CUSTOM SECTION TO CREATE FILES
            # EDIT CUSTOM WRITE
            ## Create array with labels (one row per time series)
            labels_cols = ["id","user","videoId"]
            labels = np.empty((total_trajectories, len(labels_cols)))

            # All time series are resampled with the same timestamps, just pick one!
            timestamps = data.processed[1][0][:,0]

            # Contains all the trajectories in one array; the timestamp column
            # is kept in a separate array, hence `video_data_cols - 1`
            dataset = np.empty((total_trajectories, video_data_rows, video_data_cols - 1))

            # Time series index, used to map them back to the original series with their respective user and video.
            ts_idx = 0
            # Put together all the structured time series in one numpy array to do distance calculations
            for user in range(1,num_users+1): ### USERS exist in original data from 1-48, not 0-47
                for video in data.processed[user].keys():
                    series = data.processed[user][video]

                    ## CHECK THAT ALL THE QUATERNIONS IN THE VIDEO HAVE MAGNITUDE 1. [Unit Quaternions]
                    # Row-wise norm of every column except the first (timestamp)
                    magnitudes = np.linalg.norm(series[:,1:], axis=1)
                    for i,val in enumerate(magnitudes):
                        if (val > 1.01 or val < 0.99):
                            print("Quaternion norm not equal 1+/-0.01",val, "user:", user, "video", video, "row", i)

                    # Index of which time series corresponded to which video and which user
                    labels[ts_idx] = [ts_idx, user, video]

                    # Copy the structured data without its first column (the timestamps)
                    dataset[ts_idx,:,:] = series[:,1:]

                    # Time-series Index, combining the structure per user, per video.
                    ts_idx += 1

            ## SAVE FILES
            # Create dataframe with the labels
            labels = pd.DataFrame(data=labels, columns=labels_cols)
            labels.to_csv(input_files[0], index=False)
            print("Cluster index created at", input_files[0])

            # Save timestamps
            np.savetxt(input_files[1], timestamps, fmt='%f') # Suppress scientific notation
            print("Timestamps created at",input_files[1])

            # Save the combined time series as a binary file
            utils.save_binaryfile_npy(dataset, input_files[2])
            print("Head movement resampled created at", input_files[2])


            ### ---- CONTROL RETRIES
            # Retry the load while attempts remain; otherwise re-raise the
            # error that triggered the creation.
            if tries+1 < RELOAD_TRIES:
                continue
            else:
                raise

        # Finish iteration
        break


In [17]:
# Final marker: printed once execution reaches the last cell of the notebook
print(">> FINISHED WITHOUT ERRORS!!")

>> FINISHED WITHOUT ERRORS!!
