In [12]:
import pandas as pd

from os.path import join as pjoin
from typing import Dict
from pandas.core.frame import DataFrame

class ScibotParagraphFeaturesLoader:
    """
    Loads per-paragraph feature tables from CSV files into nested dictionaries.

    After construction the loaded features are available as
    ``grel_par_features`` and ``google_nq_par_features``, each shaped as
    ``{user_id: {document_id: {paragraph_id: {feature_name: value}}}}``.
    Only the columns listed in ``_COLUMN_SELECTION`` are kept.
    """
    _GREL_FILE_EXT = "_LongestVisitFeatureExtractor_20210826-1344_3s.csv"
    _NQ_FILE_EXT = "_LongestVisitFeatureExtractor_20210826-1345_3s.csv"
    _EXCLUDE_USER_LIST = []  # ["A02", "A05"]  # exclude A02 for g-REL only
    _INCLUDE_USER_LIST = []  # if empty every user besides excluded will be analysed, "A01", "B01"
    _INCLUDE_DATA_SOURCE = []  # "g-REL", "GoogleNQ"
    _STUDY_TYPE = ["main"]  # "main", "train"
    _COLUMN_SELECTION = ["f_total_time",
                         "f_fixn_n",
                         "f_fixn_dur_sum",
                         "f_fixn_dur_avg",
                         "f_fixn_dur_sd",
                         "f_scan_distance_h",
                         "f_scan_distance_v",
                         "f_scan_distance_euclid",
                         "f_scan_hv_ratio",
                         "f_avg_sacc_length",
                         "f_scan_speed_h",
                         "f_scan_speed_v",
                         "f_scan_speed",
                         "f_box_area",
                         "f_box_area_per_time",
                         "f_fixns_per_box_area",
                         "f_hull_area_per_time",
                         "f_fixns_per_hull_area"]
    # Columns that together identify one paragraph record.
    _KEY_COLUMNS = ["user", "document", "paragraph"]

    # Class-level defaults; every instance replaces them with its own dicts.
    grel_par_features = {}
    google_nq_par_features = {}

    def __init__(self, data_dir: str, googleNQ: bool = True, gREL: bool = True,
                 include_users=None, exclude_users=None):
        """
        Configure the loader and immediately read the selected feature files.

        Args:
            data_dir: directory containing the "GoogleNQ" and "g-REL" sub-directories.
            googleNQ: determines if the features of the GoogleNQ files are loaded.
            gREL: determines if the features of the g-REL articles are loaded.
            include_users: optional list of user ids to keep; if omitted (or empty)
                every user that is not excluded is kept.
            exclude_users: optional list of user ids to drop.

        Raises:
            AssertionError: if include_users / exclude_users is given but is not a list.
        """
        super().__init__()

        self.data_dir = data_dir

        # Work on per-instance copies: appending to the class-level list would
        # leak the selection of one loader into every other loader.
        self._INCLUDE_DATA_SOURCE = list(type(self)._INCLUDE_DATA_SOURCE)
        if googleNQ and "GoogleNQ" not in self._INCLUDE_DATA_SOURCE:
            self._INCLUDE_DATA_SOURCE.append("GoogleNQ")
        if gREL and "g-REL" not in self._INCLUDE_DATA_SOURCE:
            self._INCLUDE_DATA_SOURCE.append("g-REL")

        if include_users is not None:
            assert isinstance(include_users, list)
            self._INCLUDE_USER_LIST = list(include_users)
        if exclude_users is not None:
            assert isinstance(exclude_users, list)
            self._EXCLUDE_USER_LIST = list(exclude_users)

        # Instance-owned result containers (never the shared class-level dicts).
        self.grel_par_features = {}
        self.google_nq_par_features = {}

        self._load_data()

    def _load_data(self):
        """ Loads the specified data"""
        if "g-REL" in self._INCLUDE_DATA_SOURCE:
            self._load_grel()
        if "GoogleNQ" in self._INCLUDE_DATA_SOURCE:
            self._load_google_nq()

    def _load_grel(self):
        """ Loads the features of the g-REL files into ``grel_par_features`` """
        self.grel_par_features = self._load_source("g-REL", 'g-rel' + self._GREL_FILE_EXT)
        # Older private name, kept as an alias of the same dict.
        self._grel_para_features = self.grel_par_features

    def _load_google_nq(self):
        """ Loads the features of the GoogleNQ files into ``google_nq_par_features`` """
        self.google_nq_par_features = self._load_source("GoogleNQ", 'nq' + self._NQ_FILE_EXT)
        # Older private name, kept as an alias of the same dict.
        self._google_nq_par_features = self.google_nq_par_features

    def _load_source(self, source_dir: str, file_name: str) -> Dict[str, Dict[str, Dict[int, Dict[str, float]]]]:
        """
        Reads ``file_name`` for every configured study type and merges the results.

        Args:
            source_dir: sub-directory of ``data_dir`` that holds the study type folders.
            file_name: name of the CSV file inside each study type folder.

        Returns:
            Nested dict user -> document -> paragraph -> features. If the same
            paragraph occurs in several study types, the later study type wins.
        """
        merged = {}
        for study_type in self._STUDY_TYPE:
            data = self._load_mapping(pjoin(self.data_dir, source_dir, study_type, file_name))
            for user_id, documents in self._extract_featues(data).items():
                user_docs = merged.setdefault(user_id, {})
                for doc_id, paragraphs in documents.items():
                    user_docs.setdefault(doc_id, {}).update(paragraphs)
        return merged

    def _extract_featues(self, data: DataFrame) -> Dict[str, Dict[str, Dict[int, Dict[str, float]]]]:
        """
        Extracts the features of every paragraph.

        The method name keeps its historical spelling so existing callers keep working.

        Args:
            data: dataframe with the columns "user", "document", "paragraph"
                and every column listed in ``_COLUMN_SELECTION``.

        Returns:
            Nested dict user -> document -> paragraph -> {feature name: value}.
            Users rejected by ``_is_valid_user`` are left out, rows with a
            missing user/document/paragraph are skipped, and if a paragraph
            occurs in several rows only the first row is used.
        """
        data_dict = {}

        # One row per (user, document, paragraph): the first occurrence in file order.
        keyed_rows = data.dropna(subset=self._KEY_COLUMNS)
        first_rows = keyed_rows.drop_duplicates(subset=self._KEY_COLUMNS, keep="first")

        keys = first_rows[self._KEY_COLUMNS].itertuples(index=False, name=None)
        records = first_rows[self._COLUMN_SELECTION].to_dict('records')

        for (user_id, doc_id, par_id), record in zip(keys, records):
            if not self._is_valid_user(user_id):
                continue
            data_dict.setdefault(user_id, {}).setdefault(doc_id, {})[par_id] = record

        return data_dict

    def _is_valid_user(self, user: str) -> bool:
        """
        Determine if a certain user has to be included

        Args:
            user: user's id

        Returns:
            True if the user is not excluded and, when an include list is set,
            is part of that list.
        """
        return user not in self._EXCLUDE_USER_LIST and (
                not self._INCLUDE_USER_LIST or user in self._INCLUDE_USER_LIST)

    @staticmethod
    def _load_mapping(path: str) -> pd.DataFrame:
        """ Reads a semicolon-separated file into a Pandas dataframe; the text "None" is read as missing """
        return pd.read_csv(path, delimiter=";", encoding='utf-8', float_precision='round_trip', na_values="None")


In [18]:
# Raw GoogleNQ paragraph-feature export of the "main" study, read without the loader class.
path = "./data/paragraph_features/GoogleNQ/main/nq_LongestVisitFeatureExtractor_20210826-1345_3s.csv"
data = pd.read_csv(
    path,
    delimiter=";",
    encoding='utf-8',
    float_precision='round_trip',
    na_values="None",  # cells holding the text "None" are read as missing values
)
data

Unnamed: 0,user,document,corpus,paragraph,visit,system_relevance,perceived_relevance,system_relevance_type,method,f_total_time,f_fixn_n,f_fixn_dur_sum,f_fixn_dur_avg,f_fixn_dur_sd,f_scan_distance_h,f_scan_distance_v,f_scan_distance_euclid,f_scan_hv_ratio,f_avg_sacc_length,f_scan_speed_h,f_scan_speed_v,f_scan_speed,f_box_area,f_box_area_per_time,f_fixns_per_box_area,f_hull_area_per_time,f_fixns_per_hull_area
0,A01,nq_7p_a1_Mzgy,nq,0,,False,True,i,LongestVisitFeatureExtractor,4.699329,17.0,3.530522,0.207678,0.083126,3.402483,0.835150,3.525169,4.074098,0.220323,0.724036,0.177717,0.750143,2.841583,0.604678,0.167152,0.064017,56.509287
1,A01,nq_7p_a1_Mzgy,nq,1,,True,True,r,LongestVisitFeatureExtractor,13.459361,49.0,11.157894,0.227712,0.113799,8.410302,1.928477,8.845093,4.361112,0.184273,0.624866,0.143281,0.657170,16.219071,1.205040,0.331001,0.045958,79.216128
2,A01,nq_7p_a1_Mzgy,nq,2,,False,False,i,LongestVisitFeatureExtractor,10.001136,40.0,7.892463,0.197312,0.094675,7.131122,2.790773,8.393745,2.555250,0.215224,0.713031,0.279046,0.839279,19.901342,1.989908,0.497534,0.066115,60.493638
3,A01,nq_7p_a1_Mzgy,nq,3,,False,False,i,LongestVisitFeatureExtractor,13.844947,55.0,10.615664,0.193012,0.100460,11.732750,2.039616,12.165714,5.752431,0.225291,0.847439,0.147318,0.878712,23.930305,1.728450,0.435096,0.045298,87.698505
4,A01,nq_7p_a1_Mzgy,nq,4,,False,False,i,LongestVisitFeatureExtractor,5.000568,21.0,3.783562,0.180170,0.068008,3.613019,1.462188,4.096578,2.470968,0.204829,0.722522,0.292404,0.819223,5.282912,1.056462,0.251567,0.079055,53.121441
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1499,B13,nq_6p_a3_MzA5,nq,1,,False,False,i,LongestVisitFeatureExtractor,3.639460,12.0,2.603058,0.216921,0.069925,2.049840,1.197677,2.671602,1.711514,0.242873,0.563226,0.329081,0.734065,2.455046,0.674563,0.204587,0.091070,36.204877
1500,B13,nq_6p_a3_MzA5,nq,2,,False,False,i,LongestVisitFeatureExtractor,6.363030,23.0,4.796375,0.208538,0.074070,4.799293,1.424340,5.137655,3.369486,0.233530,0.754246,0.223846,0.807423,6.835824,1.074303,0.297210,0.072162,50.090369
1501,B13,nq_6p_a3_MzA5,nq,3,,True,True,r,LongestVisitFeatureExtractor,7.929685,30.0,5.760470,0.192016,0.088226,5.907418,1.775284,6.519479,3.327590,0.224810,0.744975,0.223878,0.822161,10.487343,1.322542,0.349578,0.058863,64.272470
1502,B13,nq_6p_a3_MzA5,nq,4,,False,False,i,LongestVisitFeatureExtractor,6.326876,26.0,4.242020,0.163155,0.062640,4.692586,2.825167,6.548736,1.660994,0.261949,0.741691,0.446534,1.035066,13.257341,2.095401,0.509898,0.086914,47.281755


In [22]:
# Feature columns that are copied into the nested dictionary.
selection = [
    "f_total_time",
    "f_fixn_n",
    "f_fixn_dur_sum",
    "f_fixn_dur_avg",
    "f_fixn_dur_sd",
    "f_scan_distance_h",
    "f_scan_distance_v",
    "f_scan_distance_euclid",
    "f_scan_hv_ratio",
    "f_avg_sacc_length",
    "f_scan_speed_h",
    "f_scan_speed_v",
    "f_scan_speed",
    "f_box_area",
    "f_box_area_per_time",
    "f_fixns_per_box_area",
    "f_hull_area_per_time",
    "f_fixns_per_hull_area",
]

# Result layout: user id -> document id -> paragraph id -> {feature name: value}.
dataset = {}
for user_id in data.user.unique():
    # Narrow the frame once per user instead of re-filtering it in every inner step.
    user_rows = data.loc[data['user'] == user_id]
    dataset[user_id] = {}
    for doc_id in user_rows.document.unique():
        doc_rows = user_rows.loc[user_rows['document'] == doc_id]
        dataset[user_id][doc_id] = {}
        for par_id in doc_rows.paragraph.unique():
            features = doc_rows.loc[doc_rows['paragraph'] == par_id, selection]
            # Only the first matching row is kept; an empty match leaves an empty dict.
            dataset[user_id][doc_id][par_id] = features.to_dict('records')[0] if not features.empty else {}
            
            

In [21]:
# Display the nested dict (user -> document -> paragraph -> feature values) built by the loop cell.
# NOTE(review): this cell's execution count (21) is lower than the builder cell's (22),
# so the output recorded below presumably comes from an earlier run - re-run both in order.
dataset

{'A01': {'nq_7p_a1_Mzgy': {0: {'f_total_time': 4.69932910956748,
    'f_fixn_n': 17.0,
    'f_fixn_dur_sum': 3.5305216130853134,
    'f_fixn_dur_avg': 0.2076777419461949,
    'f_fixn_dur_sd': 0.083125992322929,
    'f_scan_distance_h': 3.4024826005043396,
    'f_scan_distance_v': 0.8351498955150551,
    'f_scan_distance_euclid': 3.5251694316118556,
    'f_scan_hv_ratio': 4.074098097570801,
    'f_avg_sacc_length': 0.22032308947574097,
    'f_scan_speed_h': 0.7240358189804458,
    'f_scan_speed_v': 0.1777168349019763,
    'f_scan_speed': 0.7501431266933137,
    'f_box_area': 2.841582988302992,
    'f_box_area_per_time': 0.6046784385706767,
    'f_fixns_per_box_area': 0.1671519404884113,
    'f_hull_area_per_time': 0.06401669267593076,
    'f_fixns_per_hull_area': 56.509286910564064},
   1: {'f_total_time': 13.459360552274042,
    'f_fixn_n': 49.0,
    'f_fixn_dur_sum': 11.157894244767926,
    'f_fixn_dur_avg': 0.22771212744424338,
    'f_fixn_dur_sd': 0.11379895379821185,
    'f_scan_di

In [13]:

# Create the loader; with the default arguments the constructor reads both the
# GoogleNQ and the g-REL feature files below data_dir.
# NOTE(review): this rebinds `dataset`, which the loop cell above uses for a plain dict.
dataset = ScibotParagraphFeaturesLoader(data_dir="./data/paragraph_features")


In [6]:
# Pick the GoogleNQ features of user 'A01' for document 'nq_7p_a1_Mzgy'.
# NOTE(review): the recorded output is a DataFrame with a 'paragraph' column, whereas the
# class shown in this notebook builds nested dicts - this cell was presumably executed
# against an earlier version of the loader (execution count 6); confirm before relying on it.
df = dataset.google_nq_par_features['A01']['nq_7p_a1_Mzgy']
df

Unnamed: 0,paragraph,f_total_time,f_fixn_n,f_fixn_dur_sum,f_fixn_dur_avg,f_fixn_dur_sd,f_scan_distance_h,f_scan_distance_v,f_scan_distance_euclid,f_scan_hv_ratio,f_avg_sacc_length,f_scan_speed_h,f_scan_speed_v,f_scan_speed,f_box_area,f_box_area_per_time,f_fixns_per_box_area,f_hull_area_per_time,f_fixns_per_hull_area
0,0,4.699329,17.0,3.530522,0.207678,0.083126,3.402483,0.83515,3.525169,4.074098,0.220323,0.724036,0.177717,0.750143,2.841583,0.604678,0.167152,0.064017,56.509287
1,1,13.459361,49.0,11.157894,0.227712,0.113799,8.410302,1.928477,8.845093,4.361112,0.184273,0.624866,0.143281,0.65717,16.219071,1.20504,0.331001,0.045958,79.216128
2,2,10.001136,40.0,7.892463,0.197312,0.094675,7.131122,2.790773,8.393745,2.55525,0.215224,0.713031,0.279046,0.839279,19.901342,1.989908,0.497534,0.066115,60.493638
3,3,13.844947,55.0,10.615664,0.193012,0.10046,11.73275,2.039616,12.165714,5.752431,0.225291,0.847439,0.147318,0.878712,23.930305,1.72845,0.435096,0.045298,87.698505
4,4,5.000568,21.0,3.783562,0.18017,0.068008,3.613019,1.462188,4.096578,2.470968,0.204829,0.722522,0.292404,0.819223,5.282912,1.056462,0.251567,0.079055,53.121441
5,5,6.771854,24.0,5.121064,0.213378,0.099453,5.444154,1.243186,5.758191,4.379195,0.250356,0.803938,0.183581,0.850312,6.768096,0.999445,0.282004,0.069956,50.661652
6,6,12.242355,41.0,8.916676,0.21748,0.100953,11.130555,2.891875,11.677578,3.848906,0.291939,0.909184,0.236219,0.953867,32.18817,2.629247,0.785077,0.046571,71.913035


In [11]:
# Maps every paragraph id of `df` to its feature values (without the 'paragraph' column itself).
df_dict = {}

for par_id in df['paragraph']:
    # First row belonging to this paragraph, as a plain {column: value} mapping.
    first_row = df.loc[df['paragraph'] == par_id].to_dict('records')[0]
    features = {name: value for name, value in first_row.items() if name != 'paragraph'}
    df_dict[par_id] = features
    print(features)

{'f_total_time': 4.69932910956748, 'f_fixn_n': 17.0, 'f_fixn_dur_sum': 3.5305216130853134, 'f_fixn_dur_avg': 0.2076777419461949, 'f_fixn_dur_sd': 0.083125992322929, 'f_scan_distance_h': 3.4024826005043396, 'f_scan_distance_v': 0.8351498955150551, 'f_scan_distance_euclid': 3.5251694316118556, 'f_scan_hv_ratio': 4.074098097570801, 'f_avg_sacc_length': 0.22032308947574097, 'f_scan_speed_h': 0.7240358189804458, 'f_scan_speed_v': 0.1777168349019763, 'f_scan_speed': 0.7501431266933137, 'f_box_area': 2.841582988302992, 'f_box_area_per_time': 0.6046784385706767, 'f_fixns_per_box_area': 0.1671519404884113, 'f_hull_area_per_time': 0.06401669267593076, 'f_fixns_per_hull_area': 56.509286910564064}
{'f_total_time': 13.459360552274042, 'f_fixn_n': 49.0, 'f_fixn_dur_sum': 11.157894244767926, 'f_fixn_dur_avg': 0.22771212744424338, 'f_fixn_dur_sd': 0.11379895379821185, 'f_scan_distance_h': 8.410302121527668, 'f_scan_distance_v': 1.9284766144146896, 'f_scan_distance_euclid': 8.84509332746213, 'f_scan_hv