## EDA TAL

### Import Libraries

In [1]:
import pandas as pd
import re
import ast

### Read CSVs

In [2]:
survey_df = pd.read_csv('../data/survey_results_20250205_154644.csv')
users_df = pd.read_csv('../data/users_data_20250205_154700.csv')

### Convert strings into their appropriate type

AKA convert strings that look like dicts/arrays to actual dicts and arrays

In [3]:
def clean_and_convert(entry):
    # Replace ObjectId with the string version of the ID
    cleaned_entry = re.sub(r"ObjectId\('(.*?)'\)", r"'\1'", entry)
    # Convert the string to Python object
    return ast.literal_eval(cleaned_entry)


## Users DF

### Rename email column to userId for consistency between DFs and drop the _id column since it's wrong

In [4]:
# Apply the function to the corresponding columns for the user df
users_df['form'] = users_df['form'].apply(clean_and_convert)

final_users_df = pd.DataFrame()
final_users_df['userId'] = users_df['email']

final_users_df = pd.concat([final_users_df[['userId']], users_df['form'].apply(pd.Series)], axis=1)
final_users_df = final_users_df.drop(columns=['_id'])

## Survey DF

### Convert the appropriate columns from strings to their respective data structure

In [5]:
# Apply the function to the columns for the survey df
survey_df['windowDimensions'] = survey_df['windowDimensions'].apply(clean_and_convert)
survey_df['gaze'] = survey_df['gaze'].apply(clean_and_convert)
survey_df['formData'] = survey_df['formData'].apply(clean_and_convert)

### Convert the formData and windowDimensions columns in the survey df from a dict to being their own individual columns

In [6]:
final_survey_df = pd.concat([survey_df, survey_df['formData'].apply(pd.Series), survey_df['windowDimensions'].apply(pd.Series)], axis=1)
final_survey_df = final_survey_df.drop(columns=['formData', '__v', '_id'])

### Drop extra start times if the duration exceeds 15 seconds since the videos are only 15 seconds long

In [7]:
for i in range(final_survey_df.shape[0]):
    if len(final_survey_df['gaze'].iloc[i]) > 0:
        endTime = final_survey_df['gaze'].iloc[i][-1]['time']
        startTime = final_survey_df['gaze'].iloc[i][0]['time']

        endTime = pd.to_datetime(endTime / 1000, unit='s')
        startTime = pd.to_datetime(startTime / 1000, unit='s')
        
        duration = (endTime - startTime).total_seconds()

        while len(final_survey_df['gaze'].iloc[i]) > 0 and duration > 15:
            final_survey_df['gaze'].iloc[i].pop(0)

            if len(final_survey_df['gaze'].iloc[i]) > 0:
                endTime = final_survey_df['gaze'].iloc[i][-1]['time']
                startTime = final_survey_df['gaze'].iloc[i][0]['time']

                endTime = pd.to_datetime(endTime / 1000, unit='s')
                startTime = pd.to_datetime(startTime / 1000, unit='s')
                
                duration = (endTime - startTime).total_seconds()
            else:
                duration = 0

### Convert start and end times to a meaningful duration and drop the start/end time columns<br>and the windowDimensions

*Note: Currently the duration column is being dropped later, but it's here in case we want to incorporate this later*

In [8]:
final_survey_df['startTime'] = final_survey_df['startTime'] / 1000
final_survey_df['endTime'] = final_survey_df['endTime'] / 1000
final_survey_df['startTime'] = pd.to_datetime(final_survey_df['startTime'], unit='s')
final_survey_df['endTime'] = pd.to_datetime(final_survey_df['endTime'], unit='s')
final_survey_df['duration'] = final_survey_df['endTime'] - final_survey_df['startTime']
final_survey_df['duration'] = final_survey_df['duration'].dt.total_seconds()
final_survey_df = final_survey_df.drop(columns=['startTime', 'endTime', 'windowDimensions'])

### Merge the users df with the survey df

In [9]:
merged_df = final_survey_df.merge(final_users_df, on='userId', how='left')

### Add a key to the gaze dictionaries for if a hazard was present at that specific timestamp

In [10]:
for i in range(merged_df.shape[0]):
    if len(merged_df['gaze'][i]) == 0:
        continue

    min_time = min([gaze_point['time'] for gaze_point in merged_df['gaze'][i]])

    for j in range(len(merged_df['gaze'][i])):
        if merged_df['hazardDetected'][i] == False or len(merged_df['spacebarTimestamps'][i]) == 0:
            merged_df['gaze'][i][j]['hazard'] = False
        else:
            k = 1
            while k < len(merged_df['spacebarTimestamps'][i]):
                time = merged_df['gaze'][i][j]['time']
                time_during_hazard = time > merged_df['spacebarTimestamps'][i][k-1] and time < merged_df['spacebarTimestamps'][i][k]
                if merged_df['hazardDetected'][i] == True and time_during_hazard:
                    merged_df['gaze'][i][j]['hazard'] = True
                else:
                    merged_df['gaze'][i][j]['hazard'] = True
                k += 2
        
        merged_df['gaze'][i][j]['time'] = (merged_df['gaze'][i][j]['time'] - min_time) / 1000

### Split the entire dataframe to have one row per timestamp with a value of if it's hazardous or not which will be the label

In [11]:
merged_df = merged_df.explode('gaze', ignore_index=True)
normalized = pd.json_normalize(merged_df['gaze'])
merged_df = merged_df.drop(columns=['gaze']).join(normalized)
merged_df.drop(columns=['duration'])

Unnamed: 0,userId,videoId,hazardDetected,noDetectionReason,detectionConfidence,hazardSeverity,attentionFactors,spacebarTimestamps,width,height,...,licenseAge,age,ethnicity,gender,visuallyImpaired,x,y,time,_id,hazard
0,jonahmulcrone@gmail.com,video219,False,noHazards,5,0,[],[],1470,797,...,17,24.0,White,male,True,500.496763,499.953378,0.000,679564a790793f4f61cbbbd4,False
1,jonahmulcrone@gmail.com,video219,False,noHazards,5,0,[],[],1470,797,...,17,24.0,White,male,True,500.898921,497.460984,0.197,679564a790793f4f61cbbbd5,False
2,jonahmulcrone@gmail.com,video219,False,noHazards,5,0,[],[],1470,797,...,17,24.0,White,male,True,514.789690,496.588843,0.396,679564a790793f4f61cbbbd6,False
3,jonahmulcrone@gmail.com,video219,False,noHazards,5,0,[],[],1470,797,...,17,24.0,White,male,True,553.986331,478.562368,0.595,679564a790793f4f61cbbbd7,False
4,jonahmulcrone@gmail.com,video219,False,noHazards,5,0,[],[],1470,797,...,17,24.0,White,male,True,606.061360,461.811118,0.794,679564a790793f4f61cbbbd8,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40873,andersonlennox381@outlook.com,video401,False,noHazards,5,0,[],[],1512,860,...,18,23.0,Black or African American,male,False,890.081049,120.326971,13.774,67a3ca8590793f4f61cc7022,False
40874,andersonlennox381@outlook.com,video401,False,noHazards,5,0,[],[],1512,860,...,18,23.0,Black or African American,male,False,1022.138557,145.055342,14.076,67a3ca8590793f4f61cc7023,False
40875,andersonlennox381@outlook.com,video401,False,noHazards,5,0,[],[],1512,860,...,18,23.0,Black or African American,male,False,888.994377,276.047783,14.376,67a3ca8590793f4f61cc7024,False
40876,andersonlennox381@outlook.com,video401,False,noHazards,5,0,[],[],1512,860,...,18,23.0,Black or African American,male,False,748.477336,311.445999,14.678,67a3ca8590793f4f61cc7025,False


### Save The File (This will be the baseline before more EDA)

In [12]:
#merged_df.to_csv('cleaned_survey_user_data.csv')

### One hot encode the necessary columns

*Note: Can also create embeddings or other encoding methods if too many cols*

In [13]:
attention_factors_exploded = merged_df.explode('attentionFactors')
attn_factor_cols = pd.get_dummies(attention_factors_exploded['attentionFactors'], prefix='attentionFactors')
merged_df = merged_df.drop(columns=['attentionFactors', 'spacebarTimestamps', '_id']).join(attn_factor_cols.groupby(level=0).max())

columns_to_clean = ['noDetectionReason', 'country', 'state', 'city', 'ethnicity', 'gender']

for col in columns_to_clean:
    merged_df[col] = merged_df[col].replace('', pd.NA).fillna('ignore')
    if merged_df[col].dtype == 'object':
        merged_df[col] = merged_df[col].str.lower().replace('', pd.NA).fillna('ignore')

merged_df['city'] = merged_df['city'].replace({'boca': 'boca raton'})

encoded_columns = pd.get_dummies(merged_df, columns=columns_to_clean, prefix=columns_to_clean)
merged_df = encoded_columns
merged_df = merged_df.drop(merged_df.filter(like='_ignore').columns, axis=1)

print(merged_df.columns)


Index(['userId', 'videoId', 'hazardDetected', 'detectionConfidence',
       'hazardSeverity', 'width', 'height', 'duration', 'licenseAge', 'age',
       'visuallyImpaired', 'x', 'y', 'time', 'hazard',
       'attentionFactors_construction', 'attentionFactors_environment',
       'attentionFactors_motion', 'attentionFactors_other',
       'attentionFactors_pedestrian', 'attentionFactors_proximity',
       'attentionFactors_velocity', 'noDetectionReason_nohazards',
       'noDetectionReason_subtlehazards', 'noDetectionReason_uncertain',
       'country_ar', 'country_fr', 'country_tn', 'country_us',
       'state_california', 'state_florida', 'state_massachusetts',
       'state_north carolina', 'state_oregon', 'state_south carolina',
       'state_washington', 'city_boca raton', 'city_boston',
       'city_chapel hill', 'city_charlotte', 'city_coconut creek',
       'city_delray beach', 'city_durham', 'city_los angeles', 'city_miami',
       'city_olympia', 'city_puyallup', 'city_raleigh

### Drop Rows with Missing Data (Because Ahmed's Bitchass didn't fill out the survey right)

In [14]:
merged_df = merged_df.dropna()
#merged_df.to_csv('final_user_survey_data.csv')

In [15]:
for col in merged_df.columns:
    nan_rows = merged_df[merged_df[col].isna()]
    print(f"Row indexes with NaN values in {col}: {len(nan_rows) / merged_df.shape[0]}")

Row indexes with NaN values in userId: 0.0
Row indexes with NaN values in videoId: 0.0
Row indexes with NaN values in hazardDetected: 0.0
Row indexes with NaN values in detectionConfidence: 0.0
Row indexes with NaN values in hazardSeverity: 0.0
Row indexes with NaN values in width: 0.0
Row indexes with NaN values in height: 0.0
Row indexes with NaN values in duration: 0.0
Row indexes with NaN values in licenseAge: 0.0
Row indexes with NaN values in age: 0.0
Row indexes with NaN values in visuallyImpaired: 0.0
Row indexes with NaN values in x: 0.0
Row indexes with NaN values in y: 0.0
Row indexes with NaN values in time: 0.0
Row indexes with NaN values in hazard: 0.0
Row indexes with NaN values in attentionFactors_construction: 0.0
Row indexes with NaN values in attentionFactors_environment: 0.0
Row indexes with NaN values in attentionFactors_motion: 0.0
Row indexes with NaN values in attentionFactors_other: 0.0
Row indexes with NaN values in attentionFactors_pedestrian: 0.0
Row indexes

### Convert video data to 0.5s splits and replace hazard binary data by majority vote per video per time bin

In [16]:
df_copy = merged_df.copy()

time_split=0.28

df_copy['time_bin'] = (df_copy['time'] // time_split).astype(int)  # Create bins of 0.5 seconds

user_grouped = (
    df_copy.groupby(['userId', 'videoId', 'time_bin'])
    .agg({
        'x': 'mean',  # Average x position in the interval
        'y': 'mean',  # Average y position in the interval
        'hazard': 'any'  # True if any row in the interval is hazardous
    })
    .reset_index()
)

# Step 2: Calculate majority hazard vote per videoId and time_bin
majority_hazard = (
    user_grouped.groupby(['videoId', 'time_bin'])['hazard']
    .apply(lambda hazards: hazards.mean() > 0.5)  # True if majority voted True
    .reset_index(name='majority_hazard')
)

# Step 3: Merge back to individual user data, replacing the original hazard value
final_grouped = pd.merge(user_grouped, majority_hazard, on=['videoId', 'time_bin'])
final_grouped['hazard'] = final_grouped['majority_hazard']  # Replace with majority decision
final_grouped = final_grouped.drop(columns=['majority_hazard'])

# Step 4: Create time column representing the start of the interval
final_grouped['time'] = final_grouped['time_bin'] * time_split

# Drop the 'time_bin' column if not needed
final_grouped = final_grouped.drop('time_bin', axis=1)
final_grouped.to_csv('binned_video_data.csv')


In [18]:
import torch
from torch.utils.data import DataLoader
from torchvision.models.video import r3d_18, R3D_18_Weights
from sklearn.model_selection import train_test_split
from training import VideoDataset, evaluate, training_loop
from sklearn.model_selection import StratifiedShuffleSplit

def stratify_split_data(X, y, train_size=20):
    sss = StratifiedShuffleSplit(n_splits=1, train_size=train_size, random_state=42)
    for subset_idx, _ in sss.split(X, y):
        stratified_df = training_df.iloc[subset_idx]
        stratified_labels = labels.iloc[subset_idx]

    X_train, X_test, y_train, y_test = train_test_split(
        stratified_df, stratified_labels, test_size=0.2, stratify=stratified_labels, random_state=42)
    
    return X_train, X_test, y_train, y_test

# Load the data
df = pd.read_csv('./binned_video_data.csv')  # Replace with your actual file
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

time_split = df.iloc[1]['time']
train_size = 50

training_df = df.drop(columns=['hazard'])
labels = df['hazard']

X_train, X_test, y_train, y_test = stratify_split_data(training_df, labels, train_size)

train_dataset = VideoDataset(X_train, y_train, time_splits=time_split, frames_per_clip=540, clip_size=36*time_split)
test_dataset = VideoDataset(X_test, y_test, time_splits=time_split, frames_per_clip=540, clip_size=36*time_split)

train_loader = DataLoader(train_dataset, batch_size=5, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=5, shuffle=False)

# Load the pre-trained I3D model
model = r3d_18(weights=R3D_18_Weights.KINETICS400_V1)

# Freeze weights
'''
for param in model.parameters():
    param.requires_grad = False
'''

# Modify the final layer for binary classification
model.fc = torch.nn.Linear(model.fc.in_features, 2)

training_loop(model, train_loader, device)

accuracy = evaluate(model, test_loader, device)
print(f"Test Accuracy: {accuracy * 100:.2f}%")  

Current Loss: 0.7569016218185425
Current Loss: 0.7755458652973175
Current Loss: 0.7832516630490621
Current Loss: 0.7302255928516388
Current Loss: 0.726823377609253
Current Loss: 0.7324932714303335
Current Loss: 0.7592461960656303
Current Loss: 0.7286547943949699
Epoch 1, Loss: 0.7287
Current Loss: 0.062136292457580566
Current Loss: 0.20388907194137573


KeyboardInterrupt: 

In [546]:
print(df[df['hazard'] == True].shape[0] / df.shape[0])

0.2001796676149124


### Save All Videos Per User Per Video with Eye tracking overlayed

In [3]:
import numpy as np
import cv2
import os

# Iterate through all unique userId and videoId combinations
for (selected_user, selected_video) in merged_df[['userId', 'videoId']].drop_duplicates().values:
    # Filter the data for the current userId and videoId
    filtered_data = merged_df[(merged_df['userId'] == selected_user) & (merged_df['videoId'] == selected_video)].reset_index(drop=True)
    
    # Skip if no data for the current combination
    if filtered_data.empty:
        continue
    
    # Interpolation function
    def interpolate_positions(times, xs, ys, fps):
        new_times = np.arange(times[0], times[-1], 1 / fps)
        new_xs = np.interp(new_times, times, xs)
        new_ys = np.interp(new_times, times, ys)
        return new_times, new_xs, new_ys

    # Extract time, x, y columns for interpolation
    times = filtered_data['time'].values
    xs = filtered_data['x'].values
    ys = filtered_data['y'].values

    # Load the video
    video_path = f'./data/driving_videos/{selected_video}.mp4'
    cap = cv2.VideoCapture(video_path)

    # Check if video is loaded correctly
    if not cap.isOpened():
        print(f"Warning: Could not open video {video_path}. Skipping...")
        continue

    # Get video properties
    fps = cap.get(cv2.CAP_PROP_FPS)
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    # Output video setup
    output_file = f'./data/training_videos/{selected_user}_{selected_video}.mp4'
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_file, fourcc, fps, (frame_width, frame_height))

    # Interpolate the positions
    interpolated_times, interpolated_xs, interpolated_ys = interpolate_positions(times, xs, ys, fps)

    # Initialize current frame index
    frame_num = 0

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret or frame_num >= len(interpolated_times):
            break

        # Get the interpolated position for the current frame
        x, y = int(interpolated_xs[frame_num]), int(interpolated_ys[frame_num])

        # Draw a circle at the interpolated position
        cv2.circle(frame, (x, y), 10, (0, 255, 0), -1)  # Green circle for normal

        # Write the current frame to the output
        out.write(frame)
        frame_num += 1

    # Release resources
    cap.release()
    out.release()

cv2.destroyAllWindows()

NameError: name 'merged_df' is not defined

### Save Example Vid

In [None]:
import cv2
import numpy as np

data = merged_df.copy()

selected_user = 'Avimarzini123@gmail.com'  # Replace with userId you want
selected_video = 'video423'  # Replace with videoId you want

filtered_data = data[(data['userId'] == selected_user) & (data['videoId'] == selected_video)].reset_index(drop=True)

# Interpolation function between (x, y) points
def interpolate_positions(times, xs, ys, fps):
    new_times = np.arange(times[0], times[-1], 1 / fps)
    new_xs = np.interp(new_times, times, xs)
    new_ys = np.interp(new_times, times, ys)
    return new_times, new_xs, new_ys

# Extract time, x, y columns for interpolation
times = filtered_data['time'].values
xs = filtered_data['x'].values
ys = filtered_data['y'].values

# Load the video
video_path = './data/driving_videos/' + selected_video + '.mp4'  # Replace with the video path
cap = cv2.VideoCapture(video_path)

# Get video properties
fps = cap.get(cv2.CAP_PROP_FPS)
print(fps)
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

# Output video setup
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter('output_video.mp4', fourcc, fps, (frame_width, frame_height))

# Interpolate the positions
interpolated_times, interpolated_xs, interpolated_ys = interpolate_positions(times, xs, ys, fps)

# Initialize current frame index
frame_num = 0

while cap.isOpened():
    ret, frame = cap.read()
    if not ret or frame_num >= len(interpolated_times):
        break

    # Get the interpolated position for the current frame
    x, y = int(interpolated_xs[frame_num]), int(interpolated_ys[frame_num])

    # Draw a circle at the interpolated position
    cv2.circle(frame, (x, y), 10, (0, 255, 0), -1)  # Green circle for normal

    # Write the current frame to the output
    out.write(frame)
    frame_num += 1

# Release resources
cap.release()
out.release()
cv2.destroyAllWindows()