In [3]:
%matplotlib notebook
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

import glob
import string
import itertools
#Included metrics are Levenshtein, Hamming, Jaccard, and Sorensen distance, plus some bonuses
import distance
import math

# From configuration file
from configparser import ConfigParser
import codecs

import collections

from fastdtw import fastdtw # Dynamic time warping
from IPython.display import clear_output
# Shapely - for points and polygons (AOIS)
from shapely.geometry import Point, Polygon

# Distances
from scipy.spatial.distance import *
from scipy import stats

# Machine learning and clustering (scikit-learn)
from sklearn.decomposition import PCA 
from sklearn.metrics import silhouette_samples, silhouette_score 
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn import preprocessing # predspracovanie dat - scale
from sklearn import metrics
from sklearn.datasets.samples_generator import make_blobs
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import LocalOutlierFactor

# Custom modules
from data_validation import data_validation_filter as dvf
from data_processing import aoi as ac
from data_processing import data_preprocessing as dp

# RQA
from algorithms.RQA import DynamicalSystemsModule as RecurrenceFunctions
from algorithms.RQA import SpatioTemporalEyeTrackingModule as ReoccurrenceFunctions

# Clustering from pyclustering
import pyclustering.cluster.dbscan as cluster
import pyclustering.cluster.kmedoids as kmedoid

from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from os import listdir
from os.path import isfile, join

import pickle

In [2]:
# Colour palette used as the default `colors` argument of the clustering plots,
# which index it with `label + 1`.
COLORS = ["cyan", "yellow","red","blue", "green","magenta","black","white"]

# ANSI escape sequences that turn bold text on and reset it again.
BOLD_START = '\033[1m'
BOLD_END = '\033[0m'

### trieda Participant
Reprezentacia participanta

In [3]:
class Participant:
    """Holds one participant's computed features, raw fixations and filter flags.

    ``data`` is the flat feature record (every numeric feature starts at 0, both
    scanpath strings start empty). The remaining attributes keep the raw
    fixation data and the results of the individual filtration steps.
    """

    def __init__(self, name):
        # Key order is kept exactly as-is: these dicts are later turned into
        # DataFrame rows by prepare_data().
        self.data = {
            "Name": name,
            "Scanpath": "",
            "ScanpathShort": "",
            "NumberOfAoisHits": 0,
            "NumberOfAoisHitsShort": 0,
            # Fixations
            "NumberOfFixations": 0,
            "FixationsDuration": 0,
            "AverageFixationDuration": 0,
            # Saccades (averages)
            "AverageSaccadeSpeed": 0,
            "AverageSaccadeLength": 0,
            # RQA - reoccurrence
            "Reoccurrence": 0,
            "ReoccurrenceRate": 0,
            "ReoccurrenceDeterminism": 0,
            "ReoccurrenceLaminarity": 0,
            "ReoccurrenceCORM": 0,
            # RQA - recurrence
            "Recurrence": 0,
            "RecurrenceRate": 0,
            "RecurrenceMeanX": 0,
            "RecurrenceMeanY": 0,
            "RecurrenceStandardDeviationX": 0,
            "RecurrenceStandardDeviationY": 0,
            # DTW - dynamic time warping, average score against every other scanpath
            "DTW": 0,
            # LCS - longest common subsequence
            "MaxLcs": 0,
            "MeanLcs": 0,
            "MinLcs": 0,
            # LEV - Levenshtein distance
            "MaxLev": 0,
            "MeanLev": 0,
            "MinLev": 0,
        }

        # Raw fixation data (used by RQA and DTW)
        self.fixations = []
        self.fixationsToAoi = []
        self.outlierScore = 0

        # Filtration results; every flag starts as "not passed"
        self.completeTask = False
        self.passCalibration = False
        self.passValidation = False
        self.hasEnoughData = False

    def set_data(self, key, value):
        """Store ``value`` under ``key`` in the feature record."""
        self.data[key] = value

Rozne pomocne metody

In [4]:
def clusters_to_labels(X, clusters, noise = []):
    """Convert index-based clusters into a flat label array.

    Every position of ``X`` starts with label 0. Indexes listed in ``noise``
    get -1, then each cluster's indexes get the cluster's position in
    ``clusters`` (cluster labels overwrite noise labels on conflict).

    Returns:
        numpy int64 array with one label per element of ``X``.
    """
    label_of = [0] * len(X)

    for noisy_index in noise:
        label_of[noisy_index] = -1

    for cluster_id, members in enumerate(clusters):
        for member_index in members:
            label_of[member_index] = cluster_id

    return np.array(label_of, dtype="int64")


def pair_labels_with_participants_dict(labels, df_names):
    """Return ``{name: label}`` with each label rounded to three decimals.

    ``labels`` and ``df_names`` are read position by position; only the first
    ``len(df_names)`` labels are used.
    """
    return {
        df_names[position]: round(labels[position], 3)
        for position in range(len(df_names))
    }

def pair_scores_with_participants(scores, df_names):
    """Return ``[name, score]`` pairs with each score rounded to three decimals.

    Pairs follow the order of ``df_names``; scores are matched by position.
    """
    return [
        [df_names[position], round(scores[position], 3)]
        for position in range(len(df_names))
    ]

def pair_multiple_scores_with_participants(scores, df_names):
    """Pair every participant with the scores they received in each run.

    ``scores[run][participant]`` is transposed into one row per participant
    (values rounded to three decimals).

    Returns:
        list of ``[name, row]`` where ``row`` is a numpy array holding one
        value per run.
    """
    participant_count = len(df_names)
    table = np.zeros((participant_count, len(scores)))

    for run in range(len(scores)):
        run_scores = scores[run]
        for participant in range(len(run_scores)):
            table[participant][run] = round(run_scores[participant], 3)

    return [[df_names[row], table[row]] for row in range(participant_count)]

def pair_labels_with_participants(labels, df_names):
    """Return ``[name, label]`` pairs, matched by position, in ``df_names`` order."""
    return [
        [df_names[position], labels[position]]
        for position in range(len(df_names))
    ]


def get_participants_with_label(labels, label, testers_names):
    """Return the Participant objects whose cluster label equals ``label``.

    Args:
        labels: one label per participant (numpy array, list or tuple).
        label: the label value to select.
        testers_names: participant names, aligned by position with ``labels``.

    Returns:
        list of ``ALL_PARTICIPANTS`` entries, in position order.
    """
    # Coerce to an array first: comparing a plain list with a scalar yields a
    # single False rather than an element-wise mask, so nothing would match.
    matching_positions = np.where(np.asarray(labels) == label)[0]
    return [ALL_PARTICIPANTS[testers_names[position]] for position in matching_positions]


def transform_data(data):
    """Collect the ``.data`` attribute of every item in ``data`` into a list."""
    return [item.data for item in data]


def outliers_nd_sd(X_scores, paired_scores):
    """Return the pairs whose score lies two or more standard deviations from the mean.

    Mean and standard deviation are taken from ``X_scores``; each entry of
    ``paired_scores`` is tested on its second element. Entries at or below the
    lower bound come first in the result, followed by entries at or above the
    upper bound; each group keeps its input order.

    References:
        https://www.kdnuggets.com/2017/02/removing-outliers-standard-deviation-python.html
        http://colingorrie.github.io/outlier-detection.html
    """
    centre = np.mean(X_scores, axis=0)
    spread = np.std(X_scores, axis=0)

    below, above = [], []
    for pair in paired_scores:
        if not (pair[1] > centre - 2 * spread):
            below.append(pair)
        elif not (pair[1] < centre + 2 * spread):
            above.append(pair)

    return below + above

def outliers_z_score(ys):
    """Return ``np.where`` positions of values whose absolute z-score exceeds 3."""
    threshold = 3

    centre = np.mean(ys)
    spread = np.std(ys)
    z_scores = (np.asarray(ys) - centre) / spread
    return np.where(np.abs(z_scores) > threshold)

def loopRange(start, end):
    """Return a range from ``start`` to ``end`` with ``end`` included."""
    inclusive_stop = end + 1
    return range(start, inclusive_stop)

## Features
_________________________

### LCS 
Longest common subsequence - z dvoch stringov najde najdlhsu podpostupnost (to znamena, ze neberie do uvahy "noisy" znaky)

Implementacia Lcs() prevzata z https://rosettacode.org/wiki/Longest_common_subsequence#Python

In [20]:
def lcs(a, b):
    """Return one longest common subsequence of ``a`` and ``b`` as a string.

    Dynamic-programming implementation adapted from
    https://rosettacode.org/wiki/Longest_common_subsequence#Python
    """
    rows, cols = len(a), len(b)

    # table[r][c] holds the LCS length of a[:r] and b[:c]; row 0 and column 0 stay 0.
    table = [[0] * (cols + 1) for _ in range(rows + 1)]
    for r in range(1, rows + 1):
        for c in range(1, cols + 1):
            if a[r - 1] == b[c - 1]:
                table[r][c] = table[r - 1][c - 1] + 1
            else:
                table[r][c] = max(table[r][c - 1], table[r - 1][c])

    # Walk back from the bottom-right corner, collecting matched symbols
    # (they are discovered last-to-first, hence the final reversal).
    collected = []
    r, c = rows, cols
    while r != 0 and c != 0:
        current = table[r][c]
        if current == table[r - 1][c]:
            r -= 1
        elif current == table[r][c - 1]:
            c -= 1
        else:
            assert a[r - 1] == b[c - 1]
            collected.append(a[r - 1])
            r -= 1
            c -= 1

    return "".join(reversed(collected))

def get_LCS_features(participant, all_participants_features_names):
    """Store the MaxLcs / MeanLcs / MinLcs features of one participant.

    The participant's short scanpath is compared with the short scanpath of
    every other name in ``all_participants_features_names``; the LCS lengths
    are divided by the length of the participant's own short scanpath.
    """
    record = ALL_PARTICIPANTS[participant]
    own_path = record.data["ScanpathShort"]

    overlap_lengths = [
        len(lcs(own_path, ALL_PARTICIPANTS[other].data["ScanpathShort"]))
        for other in all_participants_features_names
        if other != participant
    ]

    own_length = len(own_path)
    highest = np.max(overlap_lengths) / own_length
    average = np.mean(overlap_lengths) / own_length
    lowest = np.min(overlap_lengths) / own_length

    record.set_data("MaxLcs", highest)
    record.set_data("MeanLcs", average)
    record.set_data("MinLcs", lowest)
    

### Levenshtein

In [6]:
def get_LEV_features(participant, all_participants_features_names):
    """Store the MaxLev / MeanLev / MinLev features of one participant.

    The values are the maximum, mean and minimum Levenshtein distance between
    the participant's short scanpath and the short scanpath of every other
    name in ``all_participants_features_names`` (not normalised).
    """
    record = ALL_PARTICIPANTS[participant]
    own_path = record.data["ScanpathShort"]

    edit_distances = [
        distance.levenshtein(own_path, ALL_PARTICIPANTS[other].data["ScanpathShort"])
        for other in all_participants_features_names
        if other != participant
    ]

    farthest = np.max(edit_distances)
    average = np.mean(edit_distances)
    nearest = np.min(edit_distances)

    record.set_data("MaxLev", farthest)
    record.set_data("MeanLev", average)
    record.set_data("MinLev", nearest)

### Dynamic Time Warping

In [7]:
def get_DTW(participant, all_participants_features_names):
    """Store the participant's mean DTW distance to every other participant.

    ``fastdtw`` (with ``minkowski`` as the point distance) is run on the
    ``fixationsToAoi`` sequences; the mean of the resulting distances is
    written to the "DTW" feature.
    """
    warp_costs = []
    for other in all_participants_features_names:
        if other == participant:
            continue
        cost, _warp_path = fastdtw(
            ALL_PARTICIPANTS[participant].fixationsToAoi,
            ALL_PARTICIPANTS[other].fixationsToAoi,
            dist=minkowski,
        )
        warp_costs.append(cost)

    ALL_PARTICIPANTS[participant].set_data("DTW", np.mean(warp_costs))

In [21]:
### RQA

In [8]:
def get_RQA_features(participant):
    """Compute the RQA (reoccurrence and recurrence) features of one participant.

    The participant's fixations are truncated to integer [x, y] pairs and fed
    to the project's RQA modules. Every resulting value is written into the
    participant via ``set_data``.

    Returns:
        False (after clearing ``participant.hasEnoughData``) when the
        reoccurrence matrix has fewer than 8 rows; otherwise nothing.
    """
    # Fixed analysis parameters (an earlier revision read them from the
    # 'RQA' section of a config parser).
    reoccurrence_radius = 70
    delay_step = 1
    delay_samples = 3
    phase_space_radius = 0.5

    points = [[int(fixation.x), int(fixation.y)] for fixation in participant.fixations]

    # --- Reoccurrence -----------------------------------------------------
    reoccurrence_matrix = ReoccurrenceFunctions.CreateReoccurrenceMatrix(
        points, clusteringDistanceThreshold=reoccurrence_radius)
    if len(reoccurrence_matrix) < 8:
        participant.hasEnoughData = False
        return False

    features = {}
    features["Reoccurrence"] = ReoccurrenceFunctions.getReoccurrence(reoccurrence_matrix)
    features["ReoccurrenceRate"] = ReoccurrenceFunctions.getReoccurrenceRate(reoccurrence_matrix)
    features["ReoccurrenceDeterminism"] = ReoccurrenceFunctions.getDeterminism(reoccurrence_matrix)
    features["ReoccurrenceLaminarity"] = ReoccurrenceFunctions.getLaminarity(reoccurrence_matrix)
    features["ReoccurrenceCORM"] = ReoccurrenceFunctions.getCORM(reoccurrence_matrix)

    # --- Recurrence -------------------------------------------------------
    phase_space = RecurrenceFunctions.TimeDelayEmbedding(
        timeSeriesObservations=points,
        delayStep=delay_step,
        delaySamples=delay_samples)
    recurrence_matrix = RecurrenceFunctions.CreateRecurrenceMatrix(
        phaseSpaceData=phase_space,
        clusteringDistanceThreshold=phase_space_radius)

    features["Recurrence"] = RecurrenceFunctions.getRecurrence(recurrence_matrix, delay_samples)
    features["RecurrenceRate"] = RecurrenceFunctions.getRecurrenceRate(recurrence_matrix, delay_samples)

    mean_x, mean_y = RecurrenceFunctions.getRecurrenceMean(recurrence_matrix, delay_samples)
    features["RecurrenceMeanX"] = mean_x
    features["RecurrenceMeanY"] = mean_y

    deviation_x, deviation_y = RecurrenceFunctions.getRecurrenceStandardDeviation(
        recurrence_matrix, delay_samples)
    features["RecurrenceStandardDeviationX"] = deviation_x
    features["RecurrenceStandardDeviationY"] = deviation_y

    for feature_name, feature_value in features.items():
        participant.set_data(feature_name, feature_value)

## User features 
Vypocitaju sa crty pre participantov, okrem crt ktore si vyzaduju vsetky fixacie pre pouzivatelov, napriklad RQA alebo parove porovnania ako DTW, LEV, LCS

In [9]:
def get_user_features(dataset, _participant, fil = False):
    """Compute the per-participant features that need only that participant's rows.

    Reads the columns 'ParticipantName', 'FixationPointX (MCSpx)',
    'FixationPointY (MCSpx)', 'RecordingTimestamp' and 'GazeEventDuration'
    from ``dataset`` and fills the scanpath, fixation and saccade features of
    ``_participant``. RQA features are added through ``get_RQA_features`` when
    the collapsed scanpath has more than one AOI symbol.

    Args:
        dataset: DataFrame with the recording of all participants.
        _participant: Participant to update (modified in place).
        fil: when truthy, rows are first restricted with
            ``dp.filter_users_data_to_task``.

    Returns:
        The same ``_participant`` object. ``hasEnoughData`` is False when no
        usable rows remain or the collapsed scanpath is too short.
    """
    print("Working on " + _participant.data["Name"])

    # Rows of this participant only
    rows = dataset[dataset['ParticipantName'] == _participant.data["Name"]]
    # Optionally keep only the task itself (no instructions / questionnaires)
    if fil:
        rows = dp.filter_users_data_to_task(rows)
    # Drop fixations that were not recorded (NaN) and cannot be worked with
    rows = dp.filter_users_fixations_wod(rows)

    if len(rows) == 0:
        _participant.hasEnoughData = False
        return _participant

    fixation_points = []
    aoi_fixation_coords = []
    scanpath = ""
    aoi_hits = 0
    fixation_count = 0
    total_duration = 0
    saccade_lengths = []
    saccade_speeds = []

    previous_point = None
    previous_timestamp = None
    for _, row in rows.iterrows():
        point = Point(row['FixationPointX (MCSpx)'], row['FixationPointY (MCSpx)'])
        timestamp = row['RecordingTimestamp']
        fixation_points.append(point)

        total_duration += row['GazeEventDuration']
        fixation_count += 1

        # A fixation contributes the first AOI that contains it, if any
        for aoi in AOIS:
            if AOIS[aoi].contains(point):
                scanpath += aoi
                aoi_hits += 1
                aoi_fixation_coords.append([point.x, point.y])
                break

        # A saccade exists from the second fixation onwards. (Starting at the
        # third one would silently drop the first saccade of every recording.)
        if previous_point is not None:
            saccade_length = previous_point.distance(point)
            saccade_lengths.append(saccade_length)
            elapsed = timestamp - previous_timestamp
            # Two rows sharing a timestamp have no defined speed; skipping
            # them keeps inf out of the average.
            if elapsed != 0:
                saccade_speeds.append(saccade_length / elapsed)

        previous_point = point
        previous_timestamp = timestamp

    # Collapse runs of the same AOI symbol ("AAABBA" -> "ABA")
    scanpath_short = ''.join(symbol for symbol, _ in itertools.groupby(scanpath))
    # Durations are divided by 1000 (the feature is reported "in seconds";
    # presumably the column is in milliseconds - verify against the export).
    average_fixation_duration = ((total_duration / fixation_count) / 1000) if fixation_count > 0 else 0

    _participant.set_data("Scanpath", scanpath)
    _participant.set_data("ScanpathShort", scanpath_short)
    _participant.set_data("NumberOfFixations", fixation_count)
    _participant.set_data("NumberOfAoisHits", aoi_hits)
    _participant.set_data("NumberOfAoisHitsShort", len(scanpath_short))
    _participant.set_data("FixationsDuration", total_duration / 1000) # in seconds
    _participant.set_data("AverageFixationDuration", average_fixation_duration)
    _participant.set_data("AverageSaccadeLength", np.mean(np.array(saccade_lengths)) if len(saccade_lengths) > 0 else 0)
    _participant.set_data("AverageSaccadeSpeed", np.mean(np.array(saccade_speeds)) if len(saccade_speeds) > 0 else 0)

    _participant.fixations = fixation_points
    _participant.fixationsToAoi = np.array(aoi_fixation_coords)

    if len(scanpath_short) > 1:
        # get_RQA_features may reset the flag when there are too few fixations
        _participant.hasEnoughData = True
        get_RQA_features(_participant)
    else:
        _participant.hasEnoughData = False

    return _participant
    

_______________
### Filtration
Implementacia filtracii v metode

In [10]:
# Filter based on the calibration values at the beginning of the experiment
def calibration_filter(accur=0.85, precision=0.85):
    """Flag the participants that pass the calibration check.

    Runs ``dvf.calibration_filter`` on ``DATASET`` with the given accuracy and
    precision thresholds, sets ``passCalibration`` on every participant that
    survives it and prints the before -> after counts.

    Returns:
        list of names from ``ALL_PARTICIPANTS_NAMES`` that did not pass.
    """
    calibrated = dvf.calibration_filter(DATASET, "begin", PATH_CALIB_DATA,  2, accur, precision)
    passed_names = calibrated['ParticipantName'].unique()

    dont_pass_calibration = list(set(ALL_PARTICIPANTS_NAMES) - set(passed_names))

    for tester, record in ALL_PARTICIPANTS.items():
        if tester in dont_pass_calibration:
            continue
        record.passCalibration = True

    total = len(ALL_PARTICIPANTS)
    print("{} -> {}".format(total, total - len(dont_pass_calibration)))
    return dont_pass_calibration


# Filter dataset based on validation
def validation_filter(percent = 25):
    """Flag the participants that pass the eye-validity check.

    Runs ``dvf.eyes_validity_filter`` on ``DATASET`` with ``percent``, sets
    ``passValidation`` on every participant that survives it and prints the
    before -> after counts.

    Returns:
        list of names from ``ALL_PARTICIPANTS_NAMES`` that did not pass.
    """
    validated = dvf.eyes_validity_filter(DATASET, 2, percent)
    passed_names = validated['ParticipantName'].unique()

    dont_pass_validation = list(set(ALL_PARTICIPANTS_NAMES) - set(passed_names))

    for tester, record in ALL_PARTICIPANTS.items():
        if tester in dont_pass_validation:
            continue
        record.passValidation = True

    total = len(ALL_PARTICIPANTS)
    print("{} -> {}".format(total, total - len(dont_pass_validation)))
    return dont_pass_validation
    
    
# Filter users who don't complete task
def completion_filter():
    """Set ``completeTask`` on every participant not listed in ``DONT_COMPLETE_TASK``.

    Prints the before -> after participant counts.
    """
    for tester, record in ALL_PARTICIPANTS.items():
        if tester in DONT_COMPLETE_TASK:
            continue
        record.completeTask = True

    total = len(ALL_PARTICIPANTS)
    print("{} -> {}".format(total, total - len(DONT_COMPLETE_TASK)))

    
# Get users who pass
def filter_participants(validationPercentage, calibration_check = True, validation_check = True, completion_check = True, accur=0.85, precision=0.85):
    """Run the enabled filters and record which participants pass all of them.

    Names of passing participants are appended to the global
    ``ALL_PARTICIPANTS_AFTER_COMPLETE_CHECK_NAMES``.

    Args:
        validationPercentage: threshold forwarded to ``validation_filter``.
        calibration_check / validation_check / completion_check: enable the
            respective filter; a disabled filter is ignored entirely.
        accur, precision: thresholds forwarded to ``calibration_filter``.

    Returns:
        ``(dont_pass_calibration, dont_pass_validation)``; an element is None
        when its filter was disabled.
    """
    dont_pass_calibration = None
    dont_pass_validation = None

    if calibration_check:
        print("Calibration filter: ")
        dont_pass_calibration = calibration_filter(accur, precision)

    if validation_check:
        print("Validation filter: ")
        dont_pass_validation = validation_filter(validationPercentage)

    if completion_check:
        print("Completion filter: ")
        completion_filter()

    for tester, record in ALL_PARTICIPANTS.items():
        # A disabled check must not constrain the participant. Requiring
        # `flag == check` would reject everyone whose flag is still True from
        # an earlier run (cells are commonly re-executed in a notebook).
        passes = ((not completion_check or record.completeTask)
                  and (not validation_check or record.passValidation)
                  and (not calibration_check or record.passCalibration))
        # Skip names already recorded so that re-running does not duplicate them.
        if passes and tester not in ALL_PARTICIPANTS_AFTER_COMPLETE_CHECK_NAMES:
            ALL_PARTICIPANTS_AFTER_COMPLETE_CHECK_NAMES.append(tester)

    print("Participants after all filtrations: ")
    print(str(len(ALL_PARTICIPANTS)) + " -> " + str(len(ALL_PARTICIPANTS_AFTER_COMPLETE_CHECK_NAMES)))

    return dont_pass_calibration, dont_pass_validation

## Machine learning
_________
Zhlukovacie algoritmy a LOF

### Data preprocessing

In [11]:
def prepare_data(participants_names, columns_to_drop = ['Name','Scanpath','ScanpathShort']):
    """Build the standardised feature matrix for the given participants.

    Returns:
        X: features after ``preprocessing.scale``.
        df_names: copy of the 'Name' column.
        df_all_data: DataFrame of every participant's full ``data`` record.
        df_: ``df_all_data`` without ``columns_to_drop``.
    """
    records = [ALL_PARTICIPANTS[name].data for name in participants_names]
    df_all_data = pd.DataFrame(records)

    df_names = df_all_data['Name'].copy()
    df_features = df_all_data.drop(columns=columns_to_drop)

    feature_rows = df_features.values.tolist()
    X = preprocessing.scale(feature_rows)

    return X, df_names, df_all_data, df_features

def split_data_to_2_clusters(labels, df_names, df_all_data):
    """Split ``df_all_data`` into the rows of cluster 0 and the rows of cluster 1.

    Membership is resolved through ``get_participants_with_label`` and matched
    on the 'Name' column.

    Returns:
        ``(df_cluster_0, df_cluster_1)``.
    """
    def _rows_with_label(label):
        members = get_participants_with_label(labels, label, df_names)
        member_names = [member.data["Name"] for member in members]
        return df_all_data.loc[df_all_data['Name'].isin(member_names)]

    return _rows_with_label(0), _rows_with_label(1)

### Feature Importance

https://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html

In [12]:
def featureImportance(X, labels, df_, estimators = 200, typeOfBar = 1):
    """Rank features by random-forest importance, print the ranking and plot it.

    Args:
        X: feature matrix (one column per feature).
        labels: class label of every row in ``X``.
        df_: DataFrame whose columns name the features of ``X``.
        estimators: number of trees in the forest.
        typeOfBar: 1 draws a horizontal matplotlib bar chart, any other value
            a vertical pandas bar chart.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    forest = RandomForestClassifier(n_estimators=estimators)
    forest.fit(X, labels)

    importances = forest.feature_importances_
    # Feature indexes, most important first
    ranking = np.argsort(importances)[::-1]
    feature_count = X.shape[1]

    print("Feature ranking:")
    for position in range(feature_count):
        feature_index = ranking[position]
        print("%d. feature %d, %s (%f)" % (position + 1, feature_index, df_.columns[feature_index], importances[feature_index]))

    ranked_names = [df_.columns[ranking[position]] for position in range(feature_count)]

    if typeOfBar == 1:
        plt.figure()
        plt.title("Feature importances")
        plt.barh(range(feature_count), importances[ranking], color="r", align="center")
        plt.yticks(range(feature_count), ranked_names)
        plt.ylim([-1, feature_count])
        # Most important feature at the top
        plt.gca().invert_yaxis()
        plt.gcf().subplots_adjust(left=0.4)
        plt.tight_layout()
        plt.show()
    else:
        importance_frame = pd.DataFrame(importances[ranking])
        importance_frame.columns = ["Dôležitost črty"]
        axis = importance_frame.plot(kind='bar', color='red', figsize=(10,10))
        axis.set_xticklabels(ranked_names, rotation='vertical', fontsize=10)

    return forest

### Local Outlier Factor

https://scikit-learn.org/stable/auto_examples/neighbors/plot_lof_outlier_detection.html

In [13]:
def run_LOF(X, df_names, k_neighbors = 15, m_metric = "minkowski", with_plot = False, with_text = True, x=-10, y=25, name_index = 6):
    """Fit a Local Outlier Factor model and optionally plot the scores.

    Args:
        X: feature matrix.
        df_names: participant names, aligned with the rows of ``X``.
        k_neighbors: ``n_neighbors`` of the LOF model.
        m_metric: distance metric of the LOF model.
        with_plot: draw a 2-D PCA projection with circles whose size grows
            with the outlier score.
        with_text: annotate every point with ``name[name_index:]``.
        x, y: lower and upper limit applied to BOTH plot axes.
        name_index: number of leading characters dropped from each name label.

    Returns:
        ``(X_scores, clf)`` - the ``negative_outlier_factor_`` array and the
        fitted model.
    """
    clf = LocalOutlierFactor(n_neighbors=k_neighbors, contamination=0.05, metric=m_metric)
    # In outlier-detection mode LOF has no separate predict/score methods;
    # fit_predict is what populates negative_outlier_factor_.
    clf.fit_predict(X)
    X_scores = clf.negative_outlier_factor_

    if with_plot:
        projected = PCA(n_components=2).fit(X).transform(X)

        plt.title("Local Outlier Factor (LOF)")
        plt.scatter(projected[:, 0], projected[:, 1], color='k', s=3., label='Data points')
        # Circles with radius proportional to the outlier scores
        radius = (X_scores.max() - X_scores) / (X_scores.max() - X_scores.min())
        plt.scatter(projected[:, 0], projected[:, 1], s=1000 * radius, edgecolors='r',
                    facecolors='none', label='Outlier scores')
        plt.axis('tight')
        plt.xlim((x, y))
        plt.ylim((x, y))

        legend = plt.legend(loc='upper left')
        # Matplotlib 3.7 renamed Legend.legendHandles to legend_handles and
        # later removed the old name; support both.
        handles = getattr(legend, "legend_handles", None)
        if handles is None:
            handles = legend.legendHandles
        handles[0]._sizes = [10]
        handles[1]._sizes = [20]

        if with_text:
            # Separate names for the point coordinates so the x/y limit
            # parameters are not overwritten.
            for i, name in enumerate(df_names):
                point_x = projected[i][0]
                point_y = projected[i][1]
                plt.text(point_x + 0.2, point_y + 0.05, name[name_index:], fontsize=8)

        plt.show()

    return X_scores, clf

### K-means

In [14]:
def run_KMEANS(X, df_names, num_clusters = 2, name_Index = 1, with_plot = True, with_text = True, colors = COLORS):
    """Cluster ``X`` with k-means, print the silhouette score and optionally plot.

    Args:
        X: feature matrix.
        df_names: participant names, aligned with the rows of ``X``.
        num_clusters: number of clusters to form.
        name_Index: number of leading characters dropped from each name label.
        with_plot: draw a 2-D PCA projection coloured by cluster.
        with_text: annotate every point with its (shortened) name.
        colors: palette; cluster ``k`` uses entry ``k + 1``, wrapping around
            when there are more clusters than colours.

    Returns:
        ``(labels, kmeans)`` - the label array and the fitted model.
    """
    # `algorithm` is left at the library default: the explicit "auto" value
    # was deprecated and is rejected by current scikit-learn releases.
    kmeans = KMeans(n_clusters=num_clusters, init="k-means++", n_init=15, random_state=None).fit(X)

    labels = kmeans.labels_
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

    print("Silhouette Coefficient: %0.3f" % silhouette_score(X, labels))

    if with_plot:
        plt.figure()
        # Fit the PCA model and project the data to 2-D
        projected = PCA(n_components=2).fit(X).transform(X)
        point_colors = [colors[(int(label) + 1) % len(colors)] for label in labels]
        plt.scatter(x=projected[:, 0], y=projected[:, 1], c=point_colors, s=50)
        plt.xlim((-6, 12))
        plt.ylim((-6, 12))

        if with_text:
            for i, name in enumerate(df_names):
                plt.text(projected[i][0] + 0.2, projected[i][1] + 0.05, name[name_Index:], fontsize=8)

        plt.title("Number of clusters: %d" % n_clusters_)

    return labels, kmeans

### K-medoids

In [15]:
def run_KMEDOIDS(X, df, init_medoids = [10, 30], name_Index = 1, with_plot = True, with_text = True, colors = COLORS):
    """Cluster ``X`` with k-medoids, print the silhouette score and optionally plot.

    Args:
        X: feature matrix.
        df: participant names aligned with the rows of ``X``; a DataFrame
            with a 'Name' column is accepted as well.
        init_medoids: row indexes of the initial medoids (one per cluster).
        name_Index: number of leading characters dropped from each name label.
        with_plot: draw a 2-D PCA projection coloured by cluster.
        with_text: annotate every point with its (shortened) name.
        colors: palette; cluster ``k`` uses entry ``k + 1``, wrapping around
            when there are more clusters than colours.

    Returns:
        ``(labels, kmedoids_instance)``.
    """
    kmedoids_instance = kmedoid.kmedoids(X, init_medoids)

    # Run the cluster analysis and convert index clusters to a label array
    kmedoids_instance.process()
    clusters = kmedoids_instance.get_clusters()
    labels = clusters_to_labels(X, clusters)

    print("Silhouette Coefficient: %0.3f" % silhouette_score(X, labels))

    if with_plot:
        plt.figure()
        # Fit the PCA model and project the data to 2-D
        projected = PCA(n_components=2).fit(X).transform(X)
        point_colors = [colors[(int(label) + 1) % len(colors)] for label in labels]
        plt.scatter(x=projected[:, 0], y=projected[:, 1], c=point_colors, s=50)

        if with_text:
            # Label points from the argument that was passed in; the function
            # must not depend on a notebook-global `df_names` being defined.
            names = df
            columns = getattr(df, "columns", None)
            if columns is not None and "Name" in columns:
                names = df["Name"]
            for i, name in enumerate(names):
                plt.text(projected[i][0] + 0.2, projected[i][1] + 0.05, name[name_Index:], fontsize=8)

        plt.title("Number of clusters: %d" % len(clusters))

    return labels, kmedoids_instance

## Calculation of user's features

Participant musi mat dostatok dat pre vypocet crt

Vypocet vsetkych crt pre vsetkych participantov

In [16]:
def set_participants_features(w = True, fil = False):
    """Compute the features of every participant that passed the filters.

    Participants without enough data are appended to the global
    ``DONT_HAVE_ENOUGH_DATA``. When ``w`` is True the pairwise features (DTW,
    Levenshtein, LCS) are computed among the remaining participants.

    Args:
        w: compute the pairwise features.
        fil: forwarded to ``get_user_features``.

    Returns:
        list of names of participants that have enough data.
    """
    names_with_features = []
    for tester in ALL_PARTICIPANTS_AFTER_COMPLETE_CHECK_NAMES:
        ALL_PARTICIPANTS[tester] = get_user_features(DATASET, ALL_PARTICIPANTS[tester], fil)
        if ALL_PARTICIPANTS[tester].hasEnoughData == True:
            names_with_features.append(tester)
        else:
            DONT_HAVE_ENOUGH_DATA.append(tester)

    print("-----------------------------------------------------------------------")
    print("Other features")
    if w == True:
        # Pairwise features need every participant's fixations and scanpath,
        # so they are computed only after the loop above has finished.
        for tester in names_with_features:
            print("Working on " + tester)
            get_DTW(tester, names_with_features)
            get_LEV_features(tester, names_with_features)
            get_LCS_features(tester, names_with_features)

    print("Participants after feature calculations: ")
    print(len(ALL_PARTICIPANTS_AFTER_COMPLETE_CHECK_NAMES) - len(DONT_HAVE_ENOUGH_DATA))
    print("-----------------------------------------------------------------------")

    return names_with_features

### See dropouts (participants)

In [17]:
def get_overall_dropouts(all_participants_names):
    """Print a summary of the participants dropped at each filtration step.

    The overall dropouts are the names in ``all_participants_names`` that are
    not in the global ``ALL_PARTICIPANTS_FEATURES_NAMES``. Per-step lists come
    from the ``DONT_*`` globals; a list that is None is skipped.
    """
    print("Overal dropouts: ")
    overall_dropouts = list(set(all_participants_names) - set(ALL_PARTICIPANTS_FEATURES_NAMES))

    # (label, dropped names) for every filtration step, in print order.
    # Identity comparison with None is used so that array-like containers do
    # not trigger element-wise comparison.
    steps = (
        ("Participants who didn't pass through calibration = ", DONT_PASS_CALIBRATION),
        ("Participants who didn't complete task = ", DONT_COMPLETE_TASK),
        ("Participants who didn't pass through validation = ", DONT_PASS_VALIDATION),
        ("Participants who didn't have enough data = ", DONT_HAVE_ENOUGH_DATA),
    )
    for label, dropped in steps:
        if dropped is not None:
            print(label + str(len(dropped)) + " -> " + str(dropped))

    print("------------------")
    print("All Participants who didnt pass = " + str(len(overall_dropouts)) + " -> " + str(overall_dropouts))

### LOF in iterations and finding outliers

In [None]:
def run_lof_iterations(X,df_names_lof, lower, upper, metric="minkowski"):
    """Run LOF for every neighbour count in ``lower..upper`` (inclusive).

    Each participant keeps the minimum score received across all runs.

    Returns:
        scoores: list of the per-participant minimum scores.
        pso_max: list of ``[name, minimum score]``.
        sorted_scores: ``pso_max`` as a DataFrame sorted by the score column.
    """
    scores_per_run = []
    for neighbours in loopRange(lower, upper):
        run_scores, _model = run_LOF(X, df_names_lof, neighbours, metric)
        scores_per_run.append(run_scores)

    paired = pair_multiple_scores_with_participants(scores_per_run, np.array(df_names_lof))

    pso_max = [[entry[0], np.min(entry[1])] for entry in paired]
    scoores = [lowest for _name, lowest in pso_max]

    sorted_scores = pd.DataFrame(pso_max).sort_values(1)
    return scoores, pso_max, sorted_scores

def get_outliers_based_on_score(scoores,pso_max):
    """Return (and print) the names that ``outliers_nd_sd`` flags as outliers.

    Args:
        scoores: scores from which mean and standard deviation are taken.
        pso_max: ``[name, score]`` pairs to test.
    """
    flagged_pairs = outliers_nd_sd(np.array(scoores), pso_max)
    outlier_names = [pair[0] for pair in flagged_pairs]
    print("Outliers in data -> " + str(outlier_names))
    return outlier_names

def get_outliers(X, df_names, cluster=False,metric="correlation", num_of_outliers_to_find = 2):
    """Find outliers by running LOF over a range of neighbour counts.

    The upper neighbour count is ``len(df_names) - num_of_outliers_to_find``.
    The lower one is three quarters of ``len(df_names)`` when ``cluster`` is
    true; otherwise it is the size of the larger of the globals
    ``df_names_cluster_0`` / ``df_names_cluster_1`` minus
    ``num_of_outliers_to_find``.

    Args:
        X: feature matrix.
        df_names: participant names, aligned with the rows of ``X``.
        cluster: whether ``X`` holds a single cluster (see above).
        metric: LOF distance metric.
        num_of_outliers_to_find: offset applied to the neighbour bounds.

    Returns:
        list of outlier names.
    """
    if cluster:
        lower = int(len(df_names) - (len(df_names) / 4))
    else:
        # The cluster-size globals are read only on this path, so the
        # cluster=True path works before any clustering cell has been run.
        biggest_cluster = max(len(df_names_cluster_0), len(df_names_cluster_1))
        lower = biggest_cluster - num_of_outliers_to_find

    upper = len(df_names) - num_of_outliers_to_find

    print("** " + metric + " **")
    scoores, pso_max, _sorted_scores = run_lof_iterations(X, df_names, lower, upper, metric)
    return get_outliers_based_on_score(scoores, pso_max)

### Plotting participant's fixations

In [18]:
def plot_participant_fixations_to_aoi(testerName, one = False, testerName2 = "", with_image = False, imgPath = ""):
    """Plot the AOI-hitting fixation path of one or two participants.

    Args:
        testerName: first participant (drawn in blue when two are plotted).
        one: True plots only ``testerName``; False also plots ``testerName2``
            in red.
        testerName2: second participant, used when ``one`` is False.
        with_image: draw the image at ``imgPath`` underneath the paths.
        imgPath: path of the background image.
    """
    if with_image == True:
        background = plt.imread(imgPath)
        _figure, axis = plt.subplots()
        axis.imshow(background)

    first_path = ALL_PARTICIPANTS[testerName].fixationsToAoi
    if one == False:
        plt.plot(first_path[:, 0], first_path[:, 1], c='blue')
        second_path = ALL_PARTICIPANTS[testerName2].fixationsToAoi
        plt.plot(second_path[:, 0], second_path[:, 1], c='red')
    else:
        plt.plot(first_path[:, 0], first_path[:, 1])

    plt.show()

def plot_participant_fixations(testerName, one = False, testerName2 = "",  with_image = False, imgPath = ""):
    """Plot the raw fixation path of one or two participants.

    Participants are looked up by name in the notebook global
    ``ALL_PARTICIPANTS``; the ``x`` / ``y`` attributes of every element of
    their ``fixations`` collection are used as coordinates.

    Parameters
    ----------
    testerName : str
        Key of the first participant.
    one : bool, default False
        When equal to ``False`` both participants are drawn (first in blue,
        second in red); otherwise only ``testerName`` is drawn with the
        default colour.
    testerName2 : str, default ""
        Key of the second participant; only read when ``one`` equals ``False``.
    with_image : bool, default False
        When equal to ``True`` the image at ``imgPath`` is shown on a new
        figure before the lines are plotted.
    imgPath : str, default ""
        Path of the background image; only read when ``with_image`` is set.
    """
    def _coordinates(name):
        # Split a participant's fixations into parallel x and y lists.
        fixations = ALL_PARTICIPANTS[name].fixations
        return [fix.x for fix in fixations], [fix.y for fix in fixations]

    if with_image == True:
        background = plt.imread(imgPath)
        _, axes = plt.subplots()
        axes.imshow(background)

    first_x, first_y = _coordinates(testerName)
    if one == False:
        plt.plot(first_x, first_y, c='blue')
        second_x, second_y = _coordinates(testerName2)
        plt.plot(second_x, second_y, c='red')
    else:
        plt.plot(first_x, first_y)

    plt.show()



### Save / load all main variables 

In [19]:
def save_variables(TASK_NUMBER, path):
    """Pickle the main notebook-level variables of one task to disk.

    Each variable is written to its own file named after the variable inside
    the directory ``path + TASK_NUMBER + '/'``. The directory must already
    exist; it is not created here.

    Parameters
    ----------
    TASK_NUMBER : str
        Task identifier, concatenated directly onto ``path``.
    path : str
        Base directory prefix. It is joined by plain string concatenation, so
        it has to end with a path separator.

    Notes
    -----
    The values are read from the notebook globals ``ALL_PARTICIPANTS``,
    ``ALL_PARTICIPANTS_FEATURES_NAMES``, ``ALL_PARTICIPANTS_NAMES``,
    ``ALL_PARTICIPANTS_AFTER_COMPLETE_CHECK_NAMES``, ``DONT_HAVE_ENOUGH_DATA``,
    ``DONT_COMPLETE_TASK``, ``DONT_PASS_CALIBRATION`` and
    ``DONT_PASS_VALIDATION``; all of them must be defined before the call.
    """
    # Imported locally so the function does not depend on another cell having
    # imported pickle (it is not part of the notebook's main import cell).
    import pickle

    # (file name, value) pairs, in the same order load_variables returns them.
    variables_to_save = (
        ('ALL_PARTICIPANTS', ALL_PARTICIPANTS),
        ('ALL_PARTICIPANTS_FEATURES_NAMES', ALL_PARTICIPANTS_FEATURES_NAMES),
        ('ALL_PARTICIPANTS_NAMES', ALL_PARTICIPANTS_NAMES),
        ('ALL_PARTICIPANTS_AFTER_COMPLETE_CHECK_NAMES', ALL_PARTICIPANTS_AFTER_COMPLETE_CHECK_NAMES),
        ('DONT_HAVE_ENOUGH_DATA', DONT_HAVE_ENOUGH_DATA),
        ('DONT_COMPLETE_TASK', DONT_COMPLETE_TASK),
        ('DONT_PASS_CALIBRATION', DONT_PASS_CALIBRATION),
        ('DONT_PASS_VALIDATION', DONT_PASS_VALIDATION),
    )

    target_prefix = path + TASK_NUMBER + '/'
    for file_name, value in variables_to_save:
        # The with-block closes the file; no explicit close() is needed.
        with open(target_prefix + file_name, 'wb') as f:
            pickle.dump(value, f)
        
def load_variables(TASK_NUMBER, path):
    """Load the pickled main variables of one task from disk.

    Reads one pickle file per variable from the directory
    ``path + TASK_NUMBER + '/'`` (the layout written by ``save_variables``).

    Parameters
    ----------
    TASK_NUMBER : str
        Task identifier, concatenated directly onto ``path``.
    path : str
        Base directory prefix. It is joined by plain string concatenation, so
        it has to end with a path separator.

    Returns
    -------
    tuple
        Eight objects in this order: ``ALL_PARTICIPANTS``,
        ``ALL_PARTICIPANTS_FEATURES_NAMES``, ``ALL_PARTICIPANTS_NAMES``,
        ``ALL_PARTICIPANTS_AFTER_COMPLETE_CHECK_NAMES``,
        ``DONT_HAVE_ENOUGH_DATA``, ``DONT_COMPLETE_TASK``,
        ``DONT_PASS_CALIBRATION``, ``DONT_PASS_VALIDATION``.

    Notes
    -----
    The notebook globals of the same names are NOT updated by this function;
    the caller has to unpack the returned tuple.

    SECURITY: ``pickle.load`` can execute arbitrary code contained in the
    file. Only load files that were produced by a trusted ``save_variables``
    run.
    """
    # Imported locally so the function does not depend on another cell having
    # imported pickle (it is not part of the notebook's main import cell).
    import pickle

    # File names in the order of the returned tuple.
    file_names = (
        'ALL_PARTICIPANTS',
        'ALL_PARTICIPANTS_FEATURES_NAMES',
        'ALL_PARTICIPANTS_NAMES',
        'ALL_PARTICIPANTS_AFTER_COMPLETE_CHECK_NAMES',
        'DONT_HAVE_ENOUGH_DATA',
        'DONT_COMPLETE_TASK',
        'DONT_PASS_CALIBRATION',
        'DONT_PASS_VALIDATION',
    )

    source_prefix = path + TASK_NUMBER + '/'
    loaded = []
    for file_name in file_names:
        # The with-block guarantees the handle is closed even when unpickling
        # raises, which the previous open()/close() pairs did not.
        with open(source_prefix + file_name, 'rb') as infile:
            loaded.append(pickle.load(infile))

    return tuple(loaded)


In [62]:
## Functions needed exclusively for MSNV
def parse_tsv_by_screen():
    """Split every participant's gaze export by screen and write one TSV per screen.

    For each participant the segment file (comma separated, columns
    ``mmd_id``, ``start``, ``end``) and the gaze export (tab separated) are
    read. For every segment the gaze rows whose ``RecordingTimestamp`` lies in
    ``[start, end]`` (both inclusive) are collected under the segment's
    ``mmd_id``. Finally one file ``<mmd_id>.tsv`` per known screen id is
    written, containing the rows of all participants.

    Notes
    -----
    * Relies on the notebook globals ``path``, ``segfiles_path``,
      ``gazes_path``, ``data_by_screen_path``, ``gaze_exports_names`` and
      ``seg_files_names``. ``seg_files_names[i]`` must belong to the same
      participant as ``gaze_exports_names[i]``.
    * A segment with an ``mmd_id`` outside the known screen ids raises
      ``KeyError``.
    * The previous version read ``data_mmd.columns`` on its first line while
      also assigning ``data_mmd`` further down, which makes the name local and
      raised ``UnboundLocalError`` on every call. Column names are now taken
      from the first gaze export that is read.
    """
    screen_ids = (3, 5, 9, 11, 18, 20, 27, 28, 30, 60, 62, 66, 72, 74, 76)

    # Row slices are collected per screen and concatenated once at the end;
    # growing a DataFrame with append() inside the loop is quadratic and
    # DataFrame.append no longer exists in pandas 2.x.
    slices_by_screen = {screen_id: [] for screen_id in screen_ids}
    columns = None

    for position, gaze_file_name in enumerate(gaze_exports_names):
        participant_seg_files = pd.read_csv(path + segfiles_path + seg_files_names[position], low_memory=False, sep=",")
        participant = pd.read_csv(path + gazes_path + gaze_file_name, low_memory=False, sep="\t")
        if columns is None:
            # Header for screens that end up without any rows.
            columns = participant.columns

        print("Working on " + participant.iloc[0]['ParticipantName'])
        timestamps = participant['RecordingTimestamp']
        for _, segment in participant_seg_files.iterrows():
            mmd_id = segment['mmd_id']
            print("--" + str(mmd_id))

            in_segment = (timestamps >= segment['start']) & (timestamps <= segment['end'])
            slices_by_screen[mmd_id].append(participant[in_segment])
        print("--------------------------")

    for screen_id, slices in slices_by_screen.items():
        if slices:
            screen_data = pd.concat(slices)
        else:
            screen_data = pd.DataFrame(columns=columns)
        screen_data.to_csv(path + data_by_screen_path + str(screen_id) + ".tsv", sep="\t")
        
        
def load_aois(aois_index):
    """Load the areas of interest (AOIs) of one file as four-corner polygons.

    The tab separated file ``aois_files_names[aois_index]`` is read, only its
    first five columns are kept, and columns 2-5 of every row are parsed as
    ``"x,y"`` corner coordinates of a polygon.

    Parameters
    ----------
    aois_index : int
        Index into the notebook global ``aois_files_names``.

    Returns
    -------
    dict
        Maps ``aois_names[k]`` to the ``Polygon`` built from the k-th row of
        the file.

    Notes
    -----
    * Relies on the notebook globals ``path``, ``aois_path``,
      ``aois_files_names`` and ``aois_names``. AOI names are assigned by row
      position, so ``aois_names`` needs at least as many entries as the file
      has rows.
    * Cells are accessed by column label. The previous ``row[1]`` style used
      integer keys on a string-labelled Series, which only worked through
      pandas' deprecated positional fallback.
    """
    corner_columns = ["1", "2", "3", "4"]

    aois_file = pd.read_csv(path + aois_path + aois_files_names[aois_index], sep="\t").iloc[:, : 5]
    aois_file.columns = ["Typ"] + corner_columns

    def _parse_corner(cell):
        # "x,y" -> (x, y) as floats; anything after the second value is ignored.
        parts = cell.split(",")
        return (float(parts[0]), float(parts[1]))

    all_aois = {}
    for position, (_, row) in enumerate(aois_file.iterrows()):
        corners = tuple(_parse_corner(row[column]) for column in corner_columns)
        all_aois[aois_names[position]] = Polygon(corners)

    return all_aois

In [63]:
def get_data_sorted_by(df_all_data, value="FixationsDuration"):
    """Return the ``value`` and ``"Name"`` columns, ordered ascending by ``value``.

    Parameters
    ----------
    df_all_data : pandas.DataFrame
        Must contain the column named by ``value`` and a ``"Name"`` column.
    value : str, default "FixationsDuration"
        Column to sort by.

    Returns
    -------
    pandas.DataFrame
        Two columns (``value``, ``"Name"``); the original index labels are
        kept, so they show each row's position in the input.
    """
    wanted_columns = [value, "Name"]
    ordered = df_all_data.sort_values(by=[value])
    return ordered[wanted_columns]

In [1]:
def get_correlation_matrix(df_dropped_columns):
    """Draw a heatmap of the pairwise correlations between the columns.

    Parameters
    ----------
    df_dropped_columns : pandas.DataFrame
        Frame whose ``corr()`` result is plotted.
        NOTE(review): the name suggests irrelevant columns were dropped
        beforehand -- confirm against the caller.

    NOTE(review): no ``return`` appears in the visible body, so despite the
    ``get_`` prefix the correlation matrix is only plotted, not handed back.
    """
    # Pairwise correlation with the default arguments of DataFrame.corr().
    corr = df_dropped_columns.corr()
    # sns.set is not scoped to this call: the 8x6 figure size also applies to
    # figures created afterwards.
    sns.set(rc={'figure.figsize':(8,6)})
    # Label both axes with the column names of the correlation matrix.
    sns.heatmap(corr, 
                xticklabels=corr.columns.values,
                yticklabels=corr.columns.values)