# Preprocess data

In [None]:
import numpy as np
import os
import pandas as pd
import seaborn as sns
import skimage.io

import config
from utils.normalize2 import normalize_features02, normalize_labels

%matplotlib inline

## Load dataset

In [None]:
from utils.features02_dlib import FEATURES, TARGETS

In [None]:
# Start the column -> type map: every column named in FEATURES is declared as int32.
feat_types = dict.fromkeys(FEATURES, np.int32)

In [None]:
# Extend the column -> type map with the non-feature columns of the CSV.
# Builtin `str` / `bool` / `float` are used instead of the `np.str` / `np.bool` /
# `np.float` aliases: those were deprecated in NumPy 1.20 and removed in
# NumPy 1.24, where evaluating them raises AttributeError. On older NumPy the
# aliases were the very same builtin objects, so the mapping is unchanged.
feat_types.update({
    'eye_right_image': str,
    'eye_left_image': str,
    # Shot
    'y': np.int32,
    'x': np.int32,
    'score': np.int32,
    'timestamp': np.int32,
    'img_path': str,
    'img': str,  # Relative to the raw dataset

    # Game
    'game_id': str,
    'glasses': bool,
    'screen_diagonal': float,
    'camera_position': str,
    'screen_height': np.int32,
    'screen_width': np.int32,
})

In [None]:
# Read the augmented features CSV, keeping only the columns listed in `feat_types`.
# NOTE(review): only the *keys* of `feat_types` are used here (as the column
# filter); the declared types are never passed as `dtype=`, so pandas infers
# the column types itself — confirm this is intended.
columns_to_load = list(feat_types)
data = pd.read_csv(config.PATH_DATA_FEATURES02_DLIB_AUGMENTED_CSV, usecols=columns_to_load)

## Scale

* x axis: [-1, 1]
* y axis: [-1, 1]

In [None]:
# Normalise feature columns against the webcam dimensions and target columns
# against the screen dimensions.
# NOTE(review): presumably `normalize_features02` rescales `data` in place
# (its return value is not captured) — confirm against utils.normalize2.
webcam_size = (config.WEBCAM_WIDTH, config.WEBCAM_HEIGHT)
normalize_features02(data, FEATURES, *webcam_size)
screen_size = (config.SCREEN_WIDTH, config.SCREEN_HEIGHT)
normalize_features02(data, TARGETS, *screen_size)

## Delete rows not in images

To avoid regenerating all the augmented data, I only delete the wrong samples from the 'raw' dataset and then keep just the augmented samples whose source image is still present in 'raw'.

In [None]:
from utils.data import Data

# Materialise every sample yielded by the raw dataset, then collect the set of
# their `img_path` values for membership tests.
raw_data = Data(config.PATH_DATA_RAW)
raw_data_list = [sample for sample in raw_data.iterate()]
raw_imgs = set(sample['img_path'] for sample in raw_data_list)

In [None]:
# Number of rows before filtering against the raw image set.
data.shape[0]

In [None]:
# Keep only the rows whose `img_path` is one of the raw dataset's image paths.
# `.copy()` makes the result an independent frame, so later in-place edits of
# `data` do not act on a filtered slice of the loaded frame (which pandas
# reports as SettingWithCopyWarning).
data = data[data['img_path'].isin(raw_imgs)].copy()

In [None]:

# Number of rows remaining after filtering against the raw image set.
data.shape[0]

## Drop unneeded columns

In [None]:
# Drop the columns listed below from the frame.
# The result is rebound to `data` rather than using `inplace=True`: the frame
# was produced by boolean filtering, and mutating such a frame in place can
# trigger pandas' SettingWithCopyWarning. The resulting columns are identical.
data = data.drop(
    columns=[
        'timestamp', 'img_path', 'img', 'game_id', 'screen_diagonal',
        'camera_position', 'screen_height', 'screen_width', 'score',
    ]
)

## Clean data

In [None]:
# Display only the per-column minimum and maximum from the summary statistics.
summary_stats = data.describe()
summary_stats.loc[['min', 'max']]

Everything makes sense.

## Target stats

In [None]:
# Joint scatter plot of the `x` / `y` columns with their marginal distributions.
# `size` was renamed to `height` in seaborn 0.9. `stat_func=None` is dropped:
# no statistic annotation is drawn by default since seaborn 0.9, and recent
# seaborn releases no longer accept the argument.
sns.jointplot(x="x", y="y", data=data, kind="scatter", height=12, ratio=10)

In [None]:
# Count histogram of the `y` column (no KDE, no normalisation).
# `sns.distplot` is deprecated since seaborn 0.11; `histplot` is its replacement
# for plain histograms. NOTE: the default bin selection may differ slightly
# from what `distplot` produced.
sns.histplot(data['y'], stat='count', kde=False)

In [None]:
# Count histogram of the `x` column (no KDE, no normalisation).
# `sns.distplot` is deprecated since seaborn 0.11; `histplot` is its replacement
# for plain histograms. NOTE: the default bin selection may differ slightly
# from what `distplot` produced.
sns.histplot(data['x'], stat='count', kde=False)

## Eye images structure

In [None]:
def _load_scaled_image(path):
    """Read the image at `path` and rescale its values with (v / 255 - 0.5) * 2.

    For pixel values in [0, 255] this maps them to the range [-1, 1].
    """
    return (skimage.io.imread(path) / 255 - 0.5) * 2


imgs_left = []
imgs_right = []
errors = []  # Paths of the images that could not be loaded

# `row` is deliberately the loop variable of `iterrows()`: the following cell
# reads the last `row` after this loop finishes.
for i, row in data.iterrows():
    left_path = config.PATH_DATA_FEATURES02_DLIB_AUGMENTED + row['eye_left_image']
    right_path = config.PATH_DATA_FEATURES02_DLIB_AUGMENTED + row['eye_right_image']

    loaded = []
    for path in (left_path, right_path):
        try:
            loaded.append(_load_scaled_image(path))
        except Exception:
            # Best effort: remember which file failed and carry on. (The
            # original handler referenced an undefined `path` and raised
            # NameError on the first failure.)
            errors.append(path)

    # Only keep the row when BOTH eyes loaded, so that imgs_left[k] and
    # imgs_right[k] always belong to the same sample.
    if len(loaded) != 2:
        continue
    left_img, right_img = loaded
    imgs_left.append(left_img)
    imgs_right.append(right_img)

if errors:
    # Skipped rows mean the image lists are shorter than `data`; make that visible.
    print('{} eye image(s) could not be loaded; the image lists no longer '
          'line up row-for-row with `data` (see `errors`).'.format(len(errors)))

In [None]:
# Load and rescale the left-eye image of the last row visited by the loading loop.
# NOTE(review): relies on `row` leaking from that loop, and `q` is not used
# anywhere else in the visible notebook — looks like a leftover scratch cell.
last_left_path = config.PATH_DATA_FEATURES02_DLIB_AUGMENTED + row['eye_left_image']
q = (skimage.io.imread(last_left_path) / 255 - 0.5) * 2

## Save things

In [None]:
# Ensure the output directory exists. `exist_ok=True` replaces the separate
# existence check, which could race with another process creating the
# directory between the check and the `makedirs` call.
os.makedirs(config.PATH_DATA_FEATURES02_DLIB_AUGMENTED_NORM, exist_ok=True)

# CSV (the index is not written)
data.to_csv(config.PATH_DATA_FEATURES02_DLIB_AUGMENTED_NORM_CSV, index=False)
# Images: each list of arrays is converted to a single array by np.save
np.save(file=config.PATH_DATA_FEATURES02_DLIB_AUGMENTED_NORM_IMGS_LEFT, arr=imgs_left)
np.save(file=config.PATH_DATA_FEATURES02_DLIB_AUGMENTED_NORM_IMGS_RIGHT, arr=imgs_right)