In [20]:
import pandas as pd
import json
from pathlib import Path
from datetime import datetime

from ds import Participant, GazePath

Populate dictionary with names and emails of participants

In [21]:
EXCLUDE_NAMES = ["Amogh Mannekote", "Mansi Singh", "kmkm", "ash", "mansi", "hju bjh", "Sankalp Mathur"]

In [22]:
participants = []
idx = 1
for p in Path("../name-emails/").glob("*.json"):
    with open(p) as f:
        name_email = json.load(f)
        if "calibration_score" not in name_email or name_email["name"] in EXCLUDE_NAMES:
            continue
        participants.append(Participant(
            name=name_email["name"],
            email=name_email["email_address"],
            calibaration_quality=int(name_email["calibration_score"]),
            start_time=datetime.utcfromtimestamp(float(p.stem)),
        ))

In [23]:
participants

[Participant(name='John Ng', email='johnng@ufl.edu', start_time=datetime.datetime(2022, 3, 24, 20, 58, 19, 116706), calibaration_quality=3, gaze_paths=[]),
 Participant(name='Mariana Molano', email='marianamolano@ufl.edu', start_time=datetime.datetime(2022, 3, 22, 17, 5, 41, 671388), calibaration_quality=3, gaze_paths=[]),
 Participant(name='Jayavidhi Kumar', email='kumar.j@ufl.edu', start_time=datetime.datetime(2022, 3, 25, 18, 22, 16, 811646), calibaration_quality=3, gaze_paths=[]),
 Participant(name='Nishant Agrawal', email='nag2965@gmail.com', start_time=datetime.datetime(2022, 3, 22, 19, 45, 52, 168187), calibaration_quality=2, gaze_paths=[]),
 Participant(name='Jasmine McKenzie', email='jasminemckenzie@ufl.edu', start_time=datetime.datetime(2022, 3, 25, 13, 1, 20, 479027), calibaration_quality=3, gaze_paths=[]),
 Participant(name='Katarina Jurczyk', email='kjurczyk@ufl.edu', start_time=datetime.datetime(2022, 3, 23, 20, 37, 14, 681310), calibaration_quality=3, gaze_paths=[]),
 Pa

In [24]:
# Only retain the latest log for duplicate logs from a single name
filtered_participants = []
for p in sorted(participants, key=lambda p: p.start_time, reverse=True):
    if p.name not in [pp.name for pp in filtered_participants]:
        filtered_participants.append(p)
print(len(filtered_participants))
[(p.name, str(p.start_time)) for p in filtered_participants]

19


[('Heting Wang', '2022-03-25 21:03:46.248357'),
 ('Raghav Gupta', '2022-03-25 19:39:57.838126'),
 ('Eric Navar', '2022-03-25 19:02:59.850827'),
 ('Jayavidhi Kumar', '2022-03-25 18:22:16.811646'),
 ('Yingbo Ma', '2022-03-25 14:14:15.168916'),
 ('Jasmine McKenzie', '2022-03-25 13:01:20.479027'),
 ('John Ng', '2022-03-24 20:58:19.116706'),
 ('Patriel Stapleton', '2022-03-24 18:59:05.612296'),
 ('Amal Hashky', '2022-03-24 17:02:48.240804'),
 ('Katarina Jurczyk', '2022-03-23 20:59:18.731983'),
 ('Nishant Agrawal', '2022-03-22 19:45:52.168187'),
 ('Monica Bhargavi Kodali', '2022-03-22 18:52:09.250428'),
 ('Mariana Molano', '2022-03-22 17:05:41.671388'),
 ('Josh Abraham', '2022-03-22 16:06:29.154850'),
 ('Shaina Murphy', '2022-03-22 14:56:39.521831'),
 ('Hengxu You', '2022-03-21 21:28:04.134762'),
 ('jahnavi Paruchuri', '2022-03-21 21:14:20.903205'),
 ('Sri Chaitanya Nulu', '2022-03-21 18:56:43.324268'),
 ('Anviksha Sharma', '2022-03-21 03:00:02.342730')]

Add gaze paths information to each participant's dict

In [25]:
for idx, participant in enumerate(filtered_participants):
    gaze_dir = Path("../gaze-paths") / participant.email
    for dialogue_file in gaze_dir.glob("*.json"):
        gaze_log = json.load(open(dialogue_file, "r"))
        gaze_path = GazePath(
            submit_time=gaze_log["timestamp"],
            dst=gaze_log["dst"],
            turns_time_series={
                float(timestamp) / 1000.0: int(turn)
                for timestamp, turn in gaze_log["turnsTimeSeries"].items()
            },
            prompt=gaze_log["prompt"]
        )
        participant.gaze_paths.append(gaze_path)

In [26]:
# Separate into semi and goal trajectories
semi_trajectories = []
goal_trajectories = []
for participant in filtered_participants:
    for gaze_path in participant.gaze_paths:
        if gaze_path.prompt == "semi":
            semi_trajectories.append(gaze_path)
        elif gaze_path.prompt == "goal":
            goal_trajectories.append(gaze_path)

semi_trajectories = pd.DataFrame({"trajectory": semi_trajectories, "prompt": "semi"})
goal_trajectories = pd.DataFrame({"trajectory": goal_trajectories, "prompt": "goal"})
trajectories = pd.concat([semi_trajectories, goal_trajectories])

In [27]:
trajectories.head()

Unnamed: 0,trajectory,prompt
0,"GazePath(submit_time=1648243465776, dst={'area...",semi
1,"GazePath(submit_time=1648243136082, dst={'area...",semi
2,"GazePath(submit_time=1648243035969, dst={'area...",semi
3,"GazePath(submit_time=1648243317732, dst={'area...",semi
4,"GazePath(submit_time=1648243279922, dst={'area...",semi


In [29]:
from collections import defaultdict

def duration_spent_on_turns(time_series):
    durations = defaultdict(int)
    # Convert dict to list of tuples - (timestamp, turn_idx)
    time_series = sorted([(k, v) for k, v in time_series.turns_time_series.items()], key=lambda x: x[0])
    if len(time_series) > 1:
        prev_timestamp = time_series[0][0]
        prev_turn = time_series[0][1]
        idx = 0
        while True:
            timestamp, turn = time_series[idx]
            if timestamp - prev_timestamp < 0.5 and prev_turn == turn:
                durations[turn] += timestamp - prev_timestamp
            prev_turn = turn
            prev_timestamp = timestamp
            idx += 1
            if idx >= len(time_series):
                break
    return durations

In [30]:
trajectories["turn_durations"] = trajectories["trajectory"].apply(duration_spent_on_turns)

In [31]:
trajectories["turn_durations"] = trajectories["turn_durations"].apply(json.dumps)
trajectories.head()

Unnamed: 0,trajectory,prompt,turn_durations
0,"GazePath(submit_time=1648243465776, dst={'area...",semi,"{""2"": 14.796999454498291, ""1"": 5.4239993095397..."
1,"GazePath(submit_time=1648243136082, dst={'area...",semi,"{""0"": 10.121999979019165, ""4"": 19.889999866485..."
2,"GazePath(submit_time=1648243035969, dst={'area...",semi,"{""4"": 33.115999937057495, ""0"": 34.605999708175..."
3,"GazePath(submit_time=1648243317732, dst={'area...",semi,"{""0"": 9.204999923706055, ""1"": 3.39199995994567..."
4,"GazePath(submit_time=1648243279922, dst={'area...",semi,"{""0"": 14.164000034332275, ""4"": 9.2470004558563..."


In [32]:
if not Path("processed/").exists():
    Path("processed/").mkdir()
trajectories.to_excel("processed/msamogh-turn-durations-seconds.xlsx", index=False)