# Scan Path Analysis pt4
#### @tutor: Ms SHARAFI Zohreh
#### @student: Mr SHAW Oscar

#### The purpose of this notebook is only to clean and transform the data so that it can be analysed with the ScanMatch Toolbox for MATLAB, which can be found here:

F. Cristino, S. Mathôt, J. Theeuwes & I. D. Gilchrist (2010). ScanMatch: A Novel Method for Comparing Fixation Sequences. Behaviour Research Methods, 42, 692-700. (pdf)

https://seis.bristol.ac.uk/~psidg/ScanMatch/#FileDes

The MATLAB toolbox takes the x, y coordinates and the fixation duration as input, so we need to clean our dataframe and keep only those columns. We then export them to separate CSV files.

In [2]:
# import libraries
# !pip install pandas

import os
import pandas as pd
import numpy as np

## 1- Data Import

In [3]:
# Path of the folder which contains the files
FOLDER_PATH = "C:\\Users........"
DATA_FOLDER = os.path.join(FOLDER_PATH, 'data_files')
DESTINATION_FOLDER = os.path.join(FOLDER_PATH, 'destination_files')


# Dictionary mapping a file id to the DataFrame read from that file,
# i.e. data{key = file_id, value = object (dataframe)}.
# The file id is the part of the filename before its first underscore,
# so two files sharing that prefix overwrite each other (the last one read wins).
data = {}


# Read every CSV file of the data folder into the dictionary.
# sorted() makes the reading order deterministic; entries that are not CSV files
# (e.g. the .ipynb_checkpoints folder Jupyter may create) are skipped instead of
# being handed to read_csv.
for filename in sorted(os.listdir(DATA_FOLDER)):
    if not filename.lower().endswith('.csv'):
        continue
    df = pd.read_csv(os.path.join(DATA_FOLDER, filename), delimiter=',')
    data[filename.split('_')[0]] = df

## 2- Cleaning & Preprocessing


In [4]:
# Split a DataFrame into one sub-DataFrame per phase number
def divide(df, features, i):
    """Split ``df`` into one DataFrame per phase, keeping only ``features``.

    input:  df       -> object (dataframe) with a 'phase_number' column
            features -> column labels to keep in every returned DataFrame
            i        -> number of phases; phases 1..i are extracted. An integral
                        float such as 3.0 is accepted and converted with int().
    output: list of i DataFrames; element k holds the rows whose 'phase_number'
            equals k + 1 (possibly no row at all). The original row labels are
            kept. Every element is an independent copy, so modifying it in place
            leaves ``df`` untouched.
    """
    # range() rejects floats, and a 'phase_number' column stored as float
    # yields a float maximum, so normalise the count first.
    phase_count = int(i)
    return [
        df.loc[df['phase_number'] == phase, features].copy()
        for phase in range(1, phase_count + 1)
    ]

In [5]:
# Strip the incomplete rows from a DataFrame
def cleanNA(df):
    """Drop, in place, every row of ``df`` that holds an NA value, then renumber the rows.

    input:  df -> object (dataframe) that may contain NA values; it is modified in place
    output: df -> the same object (dataframe), without NA rows and with a fresh 0..n-1 index
    """
    df.dropna(axis='index', how='any', inplace=True)
    # drop=True discards the old row labels instead of keeping them as a column
    df.reset_index(inplace=True, drop=True)
    return df

In [6]:
# The ScanMatch toolbox works in ms, so fix_dur is converted from ns to ms
def convertToMillisec(df):
    """Overwrite the 'fix_dur' column of ``df`` with its value floor-divided by 1,000,000.

    input:  df -> object (dataframe) with a 'fix_dur' column; it is modified in place
    output: df -> the same object (dataframe), with 'fix_dur' in whole milliseconds
    """
    ns_per_ms = 1000000
    df['fix_dur'] = df['fix_dur'].floordiv(ns_per_ms)
    return df

## 3- Pipeline

In [7]:
# Iterate over each dataframe of data, split it by phase, then clean and convert each part.
# The result goes into a new dictionary so the imported data stays available under its own name.
def pipeline(data):
    """Build, for every file, the list of cleaned per-phase DataFrames.

    input:  data     -> dictionary{key = file_id, value = object (dataframe)}; every
                        dataframe needs the columns 'phase_number', 'pixel_x',
                        'pixel_y' and 'fix_dur'
    output: new_data -> dictionary{key = file_id, value = [object (dataframe)]}; element k
                        of the list is phase k + 1, reduced to pixel_x / pixel_y / fix_dur,
                        without NA rows and with fix_dur converted by convertToMillisec.
                        A dataframe without any valid phase number gives an empty list.
    """
    features = ['pixel_x', 'pixel_y', 'fix_dur']
    new_data = {}
    for file_id, frame in data.items():
        # Builtin max() over a Series depends on where NaN values sit and raises on
        # an empty frame; Series.max() on the NaN-free values avoids both problems.
        phases = frame['phase_number'].dropna()
        phase_count = 0 if phases.empty else int(phases.max())
        new_data[file_id] = [
            convertToMillisec(cleanNA(part))
            for part in divide(frame, features, phase_count)
        ]
    return new_data

# Run the pipeline on every imported file. The bare name on the last line
# makes the notebook display the resulting dictionary.
new_data = pipeline(data)
new_data

{'P102': [     pixel_x  pixel_y  fix_dur
  0        407      244      155
  1        425      185      116
  2        364      126      101
  3        388      252      224
  4        470      342      115
  ..       ...      ...      ...
  449      437      260      193
  450      421      241      108
  451      416      291      208
  452      523      422      102
  453      481      249      122
  
  [454 rows x 3 columns],
       pixel_x  pixel_y  fix_dur
  0        440      287      148
  1        473      291      593
  2        465      328      463
  3        428      335      150
  4        496      318       90
  ..       ...      ...      ...
  618      443      384      483
  619      435      395     2740
  620      517      387      241
  621      410      390      316
  622      438      392      308
  
  [623 rows x 3 columns],
       pixel_x  pixel_y  fix_dur
  0        425      403      506
  1        549      407      325
  2        452      420      266
  3       

## 4- Load Data

In [8]:
# Create the destination folder if needed; nothing happens when it already exists
os.makedirs(DESTINATION_FOLDER, exist_ok=True)

# Write every dataframe to its own file, named <file_id>_<phase number>_data.csv.
# Phase numbers start at 1 and follow the order of the per-phase list.
for file_id, phase_frames in new_data.items():
    for phase, dt in enumerate(phase_frames, start=1):
        # An empty dataframe produces no csv
        if dt.empty:
            continue
        filename = '{}_{}_data.csv'.format(file_id, phase)
        # Neither the index nor the header is written: only the raw values
        dt.to_csv(os.path.join(DESTINATION_FOLDER, filename), index=False, header=False)