In [118]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import entropy
from shapely.geometry import Polygon, Point

def load_participant_figure_data(base_dir, participant_id=None, figure_name=None) -> dict:
    participants = [d for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d))]
    participant_id = participants[0] if participants else None

    if not participant_id:
        raise RuntimeError("No participant folders found.")

    print(f"[INFO] Testing participant: {participant_id}")
    participant_path = os.path.join(base_dir, participant_id)
    figure_names = [d for d in os.listdir(participant_path) if os.path.isdir(os.path.join(participant_path, d))]

    if not figure_names:
        raise RuntimeError(f"No figure folders found for {participant_id}")
    
    figure_name = figure_names[0]
    print(f"[INFO] Testing figure: {figure_name}")

    # Paths
    fig_path = os.path.join(participant_path, figure_name)
    screen_path = os.path.join(fig_path, 'screen')
    table_path = os.path.join(fig_path, 'table')

    # Load gaze data
    df_screen = pd.read_csv(os.path.join(screen_path, 'gazepoints.csv'))
    df_table = pd.read_csv(os.path.join(table_path, 'gazepoints.csv'))
    print(f"[SUCCESS] Loaded screen gaze: {df_screen.shape}")
    print(f"[SUCCESS] Loaded table gaze: {df_table.shape}")

    # Load pupil data (required)
    pupil_path = os.path.join(screen_path, 'pupil_info.csv')
    if not os.path.exists(pupil_path):
        raise FileNotFoundError(f"[ERROR] Missing pupil_info.csv for {participant_id} → {figure_name}")
    
    pupil_df = pd.read_csv(pupil_path)
    pupil_df = pupil_df.dropna(subset=['timestamp', 'diameter_right'])
    mean_d = pupil_df['diameter_right'].mean()
    std_d = pupil_df['diameter_right'].std()
    pupil_df['diameter_z'] = (pupil_df['diameter_right'] - mean_d) / std_d
    print(f"[INFO] Loaded pupil data: {pupil_df.shape}")

    # Load screen events and states
    try:
        screen_events = pd.read_csv(os.path.join(screen_path, 'events.csv'))
        screen_states = pd.read_csv(os.path.join(screen_path, 'states.csv'))
        print("[INFO] Loaded screen events and states")
    except Exception as e:
        screen_events, screen_states = None, None
        print(f"[WARNING] Could not load screen events/states: {e}")

    # Load table events and states
    try:
        table_events = pd.read_csv(os.path.join(table_path, 'events.csv'))
        table_states = pd.read_csv(os.path.join(table_path, 'states.csv'))
        print("[INFO] Loaded table events and states")
    except Exception as e:
        table_events, table_states = None, None
        print(f"[WARNING] Could not load table events/states: {e}")

    return {
        'participant_id': participant_id,
        'figure_name': figure_name,
        'df_screen': df_screen,
        'df_table': df_table,
        'pupil_df': pupil_df,
        'screen_events': screen_events,
        'screen_states': screen_states,
        'table_events': table_events,
        'table_states': table_states
    }

data = load_participant_figure_data("gaipat_data/participants")
df_screen = data['df_screen']
df_table = data['df_table']
pupil_df = data['pupil_df']

# check if it works
print(df_screen.head())
print(df_table.head())
print(pupil_df.head())


[INFO] Testing participant: 87891249
[INFO] Testing figure: sc
[SUCCESS] Loaded screen gaze: (4525, 3)
[SUCCESS] Loaded table gaze: (3432, 3)
[INFO] Loaded pupil data: (533, 6)
[INFO] Loaded screen events and states
[INFO] Loaded table events and states
       timestamp   x   y
0  1706261126399 NaN NaN
1  1706261126410 NaN NaN
2  1706261126420 NaN NaN
3  1706261126432 NaN NaN
4  1706261126443 NaN NaN
       timestamp         x         y
0  1706261126399  0.345623  0.728385
1  1706261126417  0.343288  0.725673
2  1706261126435  0.339605  0.725688
3  1706261126448  0.336395  0.721700
4  1706261126466       NaN       NaN
         timestamp  confidence_right  confidence_left  diameter_right  \
449  1706261131918                 1                1        4.481079   
450  1706261131927                 1                1        4.499557   
451  1706261131938                 1                1        4.486954   
452  1706261131950                 1                1        4.486954   
453  1706

In [119]:
def merge_preprocess_gaze(df_screen, df_table) -> pd.DataFrame:
    """
    Merges and preprocesses gaze data from screen and table into a single dataframe.
    - add source column
    - drop NaN
    - convert to seconds
    - sort by timestamp
    """

    # assign source labels
    df_screen['source'] = 'screen'
    df_table['source'] = 'table'

    # drop NaN, specifying columns in abundance of caution
    df_screen = df_screen.dropna(subset=['x', 'y', 'timestamp'])
    df_table = df_table.dropna(subset=['x', 'y', 'timestamp'])

    # merge dataframes
    df = pd.concat([df_screen, df_table], ignore_index=True)

    # sort by timestamp
    df = df.sort_values('timestamp').reset_index(drop=True)

    return df

df_merged = merge_preprocess_gaze(df_screen, df_table)

### Gaze Entropy

In [120]:
def calc_gaze_entropy(xy_points, bins=10) -> float:
    """
    Calculate spatial entropy of gaze data.
    Shannon entropy quantifies how unpredictable the location of a point is based on its x and y values

    Parameters:
        xy_points (np.ndarray): 2D array of shape (N, 2) for gaze coordinates
        bins (int): number of bins per axis for histogram
    Returns:
        float: Shannon entropy in bits
    """
    if xy_points.shape[0] < 2:
        return np.nan  # when there's not enough data to compute entropy

    # 2D histogram over gaze space
    H, _, _ = np.histogram2d(xy_points[:, 0], xy_points[:, 1], bins=bins)

    # Flatten and normalize to get probabilities
    p = H.flatten() / np.sum(H)
    # remove zero bins to avoid log(0)
    p = p[p > 0]  

    # compute Shannon entropy in bits
    return entropy(p, base=2)

# calculating entropy in 2 second chunks (2000 ms) with 500 ms steps
def compute_entropy_over_time(df, window_size=2000, step_size=500, bins=10) -> pd.DataFrame:
    """
    Slides a time window over gaze data and computes spatial entropy per window.
    Returns a DataFrame with: start_time, end_time, entropy
    helps understand how gaze patterns change over time
    """
    results = []
    start_time = df['timestamp'].min()
    end_time = df['timestamp'].max()
    current = start_time

    while current + window_size <= end_time:
        window = df[(df['timestamp'] >= current) & (df['timestamp'] < current + window_size)]
        if len(window) >= 2:
            xy = window[['x', 'y']].to_numpy()
            ent = calc_gaze_entropy(xy, bins=bins)
            results.append({
                'start_time': current,
                'end_time': current + window_size,
                'entropy': ent
            })
        current += step_size

    return pd.DataFrame(results)

entropy_df = compute_entropy_over_time(df_merged)

# check it worked
print(entropy_df.head())

      start_time       end_time   entropy
0  1706261126399  1706261128399  3.489089
1  1706261126899  1706261128899  3.360848
2  1706261127399  1706261129399  2.654981
3  1706261127899  1706261129899  2.052651
4  1706261128399  1706261130399  2.584189


Setting thresholds, 3 distraction levels (could make it more in the future if we have more physiological signals)

In [121]:
def assign_distraction_levels(entropy_df):
    """
    Assigns distraction levels to entropy values using fixed thresholds at 1/3 and 2/3 quantiles.
    Returns: entropy_df with a new column 'distraction_level' (int: 0, 1, 2)
    """
    # split into quartiles, we want 3 attention categories: low, medium, high
    thresholds = entropy_df['entropy'].quantile([1/3, 2/3]).values

    entropy_df['distraction_level'] = pd.cut(
        entropy_df['entropy'],
        bins=[-np.inf, thresholds[0], thresholds[1], np.inf],
        labels=[0, 1, 2],
        include_lowest=True
    ).astype(int)

    return entropy_df

# check if it worked
entropy_df = assign_distraction_levels(entropy_df)
print(entropy_df.head())

      start_time       end_time   entropy  distraction_level
0  1706261126399  1706261128399  3.489089                  2
1  1706261126899  1706261128899  3.360848                  1
2  1706261127399  1706261129399  2.654981                  0
3  1706261127899  1706261129899  2.052651                  0
4  1706261128399  1706261130399  2.584189                  0


### Pupil Data

In [122]:
def preprocess_pupil_data(pupil_df) -> pd.DataFrame:
    """
    Drops low-confidence or missing samples and normalizes diameter within-participant.
    """
    # drops missing/invalid values
    pupil_df = pupil_df.dropna(subset=['timestamp', 'diameter_right', 'confidence_right'])

    # filter by confidence threshold, if exists
    if 'confidence_right' in pupil_df.columns:
        pupil_df = pupil_df[pupil_df['confidence_right'] >= 0.6]

    # Normalize diameter (z-score)
    mean_d = pupil_df['diameter_right'].mean()
    std_d = pupil_df['diameter_right'].std()
    pupil_df['diameter_z'] = (pupil_df['diameter_right'] - mean_d) / std_d

    return pupil_df

In [123]:
def compute_pupil_features(entropy_df, pupil_df) -> pd.DataFrame:
    """
    Computes mean and std of pupil diameter (z-scored) per entropy window.
    Adds valid sample ratio as a quality metric.
    """
    means, stds, valid_ratios = [], [], []

    for _, row in entropy_df.iterrows():
        start, end = row['start_time'], row['end_time']
        window = pupil_df[(pupil_df['timestamp'] >= start) & (pupil_df['timestamp'] < end)]

        if len(window) == 0:
            means.append(np.nan)
            stds.append(np.nan)
            valid_ratios.append(0.0)
        else:
            means.append(window['diameter_z'].mean())
            stds.append(window['diameter_z'].std())
            # pupil_valid_ratio = number of pupil samples in window / (window duration in milliseconds)
            valid_ratios.append(len(window) / (end - start))  # samples per ms

    entropy_df['pupil_mean'] = means
    entropy_df['pupil_std'] = stds
    entropy_df['pupil_valid_ratio'] = valid_ratios

    return entropy_df


In [124]:
# now add pupil features to entropy_df
entropy_df = compute_pupil_features(entropy_df, pupil_df)

# testing to see if its working, some are NaN but maybe no pupil data in that window
print(entropy_df.head())
print("[PUPIL] range:", pupil_df['timestamp'].min(), "→", pupil_df['timestamp'].max())
print("[ENTROPY] range:", entropy_df['start_time'].min(), "→", entropy_df['end_time'].max())
entropy_df[['start_time', 'end_time', 'pupil_mean', 'pupil_valid_ratio']].tail(10)


      start_time       end_time   entropy  distraction_level  pupil_mean  \
0  1706261126399  1706261128399  3.489089                  2         NaN   
1  1706261126899  1706261128899  3.360848                  1         NaN   
2  1706261127399  1706261129399  2.654981                  0         NaN   
3  1706261127899  1706261129899  2.052651                  0         NaN   
4  1706261128399  1706261130399  2.584189                  0         NaN   

   pupil_std  pupil_valid_ratio  
0        NaN                0.0  
1        NaN                0.0  
2        NaN                0.0  
3        NaN                0.0  
4        NaN                0.0  
[PUPIL] range: 1706261131918 → 1706261182072
[ENTROPY] range: 1706261126399 → 1706261181899


Unnamed: 0,start_time,end_time,pupil_mean,pupil_valid_ratio
98,1706261175399,1706261177399,-0.108267,0.006
99,1706261175899,1706261177899,,0.0
100,1706261176399,1706261178399,,0.0
101,1706261176899,1706261178899,,0.0
102,1706261177399,1706261179399,,0.0
103,1706261177899,1706261179899,,0.0
104,1706261178399,1706261180399,,0.0
105,1706261178899,1706261180899,,0.0
106,1706261179399,1706261181399,-0.116332,0.014
107,1706261179899,1706261181899,-0.166275,0.035


Prep data for training

In [125]:
# define features to train model on
features = ['entropy', 'pupil_mean', 'pupil_std', 'pupil_valid_ratio']
# define target variable
target = 'distraction_level'

# drop all NaNs, based on features defined above
train_df = entropy_df.dropna(subset=features + [target])

print(train_df.head())

       start_time       end_time   entropy  distraction_level  pupil_mean  \
8   1706261130399  1706261132399  3.490442                  2    1.464984   
9   1706261130899  1706261132899  3.585266                  2    1.464984   
10  1706261131399  1706261133399  3.472622                  2    1.464984   
11  1706261131899  1706261133899  3.296836                  1    1.464984   
24  1706261138399  1706261140399  2.369924                  0    1.065059   

    pupil_std  pupil_valid_ratio  
8    0.156792             0.0090  
9    0.156792             0.0090  
10   0.156792             0.0090  
11   0.156792             0.0090  
24   0.280486             0.0165  


### Train
#### Naive Bayes and Logistic Regression

In [126]:
# train, test, split first
from sklearn.model_selection import train_test_split

# define X and y from train_df
X = train_df[features]
y = train_df[target]

# set aside 20% for final testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [127]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

# Naive Bayes Classification
clf_nb = GaussianNB()
clf_nb.fit(X_train, y_train)
y_pred_nb = clf_nb.predict(X_test)
print("Naive Bayes Classification Report:")
print(classification_report(y_test, y_pred_nb))


Naive Bayes Classification Report:
              precision    recall  f1-score   support

           0       0.67      1.00      0.80         2
           1       0.60      0.75      0.67         4
           2       1.00      0.50      0.67         4

    accuracy                           0.70        10
   macro avg       0.76      0.75      0.71        10
weighted avg       0.77      0.70      0.69        10



In [128]:
from sklearn.linear_model import LogisticRegression

clf_lr = LogisticRegression(solver='lbfgs', max_iter=1000)
clf_lr.fit(X_train, y_train)
y_pred_lr = clf_lr.predict(X_test)
print("Logistic Regression Classification Report:")
print(classification_report(y_test, y_pred_lr))


Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.67      1.00      0.80         2
           1       1.00      0.75      0.86         4
           2       1.00      1.00      1.00         4

    accuracy                           0.90        10
   macro avg       0.89      0.92      0.89        10
weighted avg       0.93      0.90      0.90        10



LR performs slightly better

### Test

In [129]:
# Naive Bayes model
from sklearn.metrics import classification_report, confusion_matrix

# predict on test set
y_pred_nb = clf_nb.predict(X_test)

print("[TEST] Naive Bayes Results:")
print(classification_report(y_test, y_pred_nb))

# confusion matrix, but joke is im the one who is confused
print("Confusion Matrix (Naive Bayes):")
print(confusion_matrix(y_test, y_pred_nb))

[TEST] Naive Bayes Results:
              precision    recall  f1-score   support

           0       0.67      1.00      0.80         2
           1       0.60      0.75      0.67         4
           2       1.00      0.50      0.67         4

    accuracy                           0.70        10
   macro avg       0.76      0.75      0.71        10
weighted avg       0.77      0.70      0.69        10

Confusion Matrix (Naive Bayes):
[[2 0 0]
 [1 3 0]
 [0 2 2]]


In [130]:
# Logistic Regression model
# Predict on test set
y_pred_lr = clf_lr.predict(X_test)

print("[TEST] Logistic Regression Results:")
print(classification_report(y_test, y_pred_lr))

# confusion matrix
print("Confusion Matrix (LogReg):")
print(confusion_matrix(y_test, y_pred_lr))


[TEST] Logistic Regression Results:
              precision    recall  f1-score   support

           0       0.67      1.00      0.80         2
           1       1.00      0.75      0.86         4
           2       1.00      1.00      1.00         4

    accuracy                           0.90        10
   macro avg       0.89      0.92      0.89        10
weighted avg       0.93      0.90      0.90        10

Confusion Matrix (LogReg):
[[2 0 0]
 [1 3 0]
 [0 0 4]]


In [131]:
def process_all_participants(base_dir="gaipat_data/participants"):
    import os
    all_dfs = []

    participants = [p for p in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, p))]

    for pid in participants:
        try:
            data = load_participant_figure_data(base_dir, participant_id=pid)
            df_screen = data['df_screen']
            df_table = data['df_table']
            pupil_df = data['pupil_df']

            df_merged = merge_preprocess_gaze(df_screen, df_table)
            entropy_df = compute_entropy_over_time(df_merged)
            entropy_df = assign_distraction_levels(entropy_df)

            pupil_df = preprocess_pupil_data(pupil_df)
            entropy_df = compute_pupil_features(entropy_df, pupil_df)

            # Final cleanup
            features = ['entropy', 'pupil_mean', 'pupil_std', 'pupil_valid_ratio']
            entropy_df = entropy_df.dropna(subset=features + ['distraction_level'])

            if not entropy_df.empty:
                all_dfs.append(entropy_df)

        except Exception as e:
            print(f"[SKIPPED] {pid}: {e}")

    return pd.concat(all_dfs, ignore_index=True) if all_dfs else pd.DataFrame()

full_df = process_all_participants()
print(full_df.shape)
full_df.head()


[INFO] Testing participant: 87891249
[INFO] Testing figure: sc
[SUCCESS] Loaded screen gaze: (4525, 3)
[SUCCESS] Loaded table gaze: (3432, 3)
[INFO] Loaded pupil data: (533, 6)
[INFO] Loaded screen events and states
[INFO] Loaded table events and states
[INFO] Testing participant: 87891249
[INFO] Testing figure: sc
[SUCCESS] Loaded screen gaze: (4525, 3)
[SUCCESS] Loaded table gaze: (3432, 3)
[INFO] Loaded pupil data: (533, 6)
[INFO] Loaded screen events and states
[INFO] Loaded table events and states
[INFO] Testing participant: 87891249
[INFO] Testing figure: sc
[SUCCESS] Loaded screen gaze: (4525, 3)
[SUCCESS] Loaded table gaze: (3432, 3)
[INFO] Loaded pupil data: (533, 6)
[INFO] Loaded screen events and states
[INFO] Loaded table events and states
[INFO] Testing participant: 87891249
[INFO] Testing figure: sc
[SUCCESS] Loaded screen gaze: (4525, 3)
[SUCCESS] Loaded table gaze: (3432, 3)
[INFO] Loaded pupil data: (533, 6)
[INFO] Loaded screen events and states
[INFO] Loaded table ev

Unnamed: 0,start_time,end_time,entropy,distraction_level,pupil_mean,pupil_std,pupil_valid_ratio
0,1706261130399,1706261132399,3.490442,2,1.464984,0.156792,0.009
1,1706261130899,1706261132899,3.585266,2,1.464984,0.156792,0.009
2,1706261131399,1706261133399,3.472622,2,1.464984,0.156792,0.009
3,1706261131899,1706261133899,3.296836,1,1.464984,0.156792,0.009
4,1706261138399,1706261140399,2.369924,0,1.065059,0.280486,0.0165


Note to self for later to improve model performance:
- add pupil info for better means of interpreting cognitive state via pupil dilation
- add AOIs for both screen and table (need to calculate from slides provided)
- Add events.csv logic to: Track task steps, extract errors or redundant actions, add task_phase, action_count, or task_efficiency features