In [1]:
import os
import random

import imagesize
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import model_selection
from tqdm import tqdm

In [2]:
def train_test_split(df, on, ratio, seed=None):
    """
    Splits the df into train/test, disjoint on values in columns given by param 'on'.
    The param 'ratio' splits the unique values in each column by that ratio, so the returned
    sets may have a different ratio overall.

    When more than one column is given, a row whose value falls on the train side for one
    column and on the test side for another is excluded from both returned sets.

    :param df: pandas dataframe
    :param on: list or str, column names
    :param ratio: float, fraction of each column's unique values assigned to train
    :param seed: int, random seed for shuffling
    :return: tuple (train_df, test_df) of filtered dataframes
    """

    if isinstance(on, str):
        on = [on]
    assert isinstance(on, list)

    train_df = df.copy()
    test_df = df.copy()

    # Use a private generator so the caller's global `random` state is left untouched.
    # random.Random(seed) yields the same shuffles that random.seed(seed) would.
    rng = random.Random(seed)
    for col in on:
        unique_elems = df[col].unique().tolist()
        rng.shuffle(unique_elems)

        # The first `ratio` share of the shuffled unique values goes to train, the rest to test
        train_num = int(ratio * len(unique_elems))
        train = unique_elems[:train_num]
        test = unique_elems[train_num:]

        train_df = train_df[train_df[col].isin(train)]
        test_df = test_df[test_df[col].isin(test)]

    return train_df, test_df

In [3]:
# Define 'apply' funcs
def stdev_2d(df):
    """
    Root-mean-square distance of 2-D points from their centroid.

    :param df: pandas dataframe with exactly two columns (first is x, second is y)
    :return: float, sqrt of the mean squared distance to the mean point, computed
        after dividing both coordinates by 22
    """
    assert df.shape[1] == 2

    coords = df.values
    # 22 is presumably the ridge width in pixels -- TODO confirm
    xs = coords[:, 0] / 22
    ys = coords[:, 1] / 22

    dx = xs - xs.mean()
    dy = ys - ys.mean()
    mean_sq_dist = np.mean(np.square(dx) + np.square(dy))

    return np.sqrt(mean_sq_dist)

def compute_fixes_before_switch(df):
    """
    Mean gap, measured in index labels, between consecutive rows flagged as switches.

    :param df: pandas dataframe with a "Switched" column; rows equal to 1 mark a switch.
        The index is assumed to hold integer row labels that increase by one per
        fixation within a trial -- TODO confirm against the caller
    :return: float, mean difference between successive switch index labels, or NaN
        when there are fewer than two switches (there is no gap to average)
    """
    is_switch = (df["Switched"] == 1).values
    switch_positions = df.index.values[is_switch]

    if len(switch_positions) < 2:
        # np.mean of an empty array also yields NaN but emits a RuntimeWarning
        # ("Mean of empty slice"); return NaN directly to keep the output clean.
        return np.nan

    return np.mean(np.diff(switch_positions))

In [4]:
# Suffix shared by the link-based statistic columns; only the commented-out
# entries of stats_cols below refer to it.
links_str = "".join([
    "TempSeq",
    "Fits_",
    "DetailBF88_",
    "IncRLTrans_",
    "UseLstFrstClust_",
    "BW22",
])

# Columns to load from the per-trial statistics file
stats_cols = [
    "ImagePair",
    "Examiner",
    "Prefix",
    "Mating",
    "Outcome",
    "Difficulty",
    "Trial",
    "Conclusion-Simple",
    "AllIndvClustLinksFoundBW60",
    "PctClarRedYellow",
    "PctClarGreen",
    "PctClarBlue",
    "EMDDistanceToCorrect_C_Left",
    "EMDDistanceToCorrect_C_Right",
    "EMDDistanceJustDecidingToCorrect_C_Left",
    "EMDDistanceJustDecidingToCorrect_C_Right",
    "EMDLeftCToRightCSelf",
    "EMDLeftCToRightCSelfDeciding",
#     "AveRawCorrWeight" + links_str,
#     "RawNumHighQualityLinks" + links_str,
#     "RawTotalLinks" + links_str,
#     "RawRatioHighQuality" + links_str,
#     "tCirc" + links_str,
#     "ActivConc" + links_str,
#     "Angle" + links_str,
#     "Scale" + links_str,
#     "RelativeAngle" + links_str,
#     "AveMinDist" + links_str,
    "AveDeviationFromGroundTruthTempSeqFits_Detail_JstUnq_NormTransMat_NoClusterPrune_BW66_Thresh0.3InRidgeWidths",
    "AveDeviationFromGroundTruthTempSeqFits_Detail_JstUnq_NormTransMat_NoClusterPrune_BW66_Thresh0.3",
    "NumHighQualityLinksTempSeqFits_Detail_JstUnq_NormTransMat_NoClusterPrune_BW66_Thresh0.3",
    "AnalysisPercentVisitedCellsVisitedLatent",
    "ComparePercentVisitedCellsVisitedLatent",
    "AnalysisPropFixNearWBMinutiaK=22",
    "ComparisonPropFixNearWBMinutiaK=22"
]

stats_path = os.path.join("..", "data", "CwCeTrialStats_20200324.csv")
df = pd.read_csv(stats_path, usecols=stats_cols)

# Build one key per (image pair, examiner) trial; it replaces the two source columns
# and is the column later cells merge on.
stats_trial_code = df["ImagePair"] + "_" + df["Examiner"]
df = df.assign(TrialCode=stats_trial_code).drop(columns=["ImagePair", "Examiner"])

In [5]:
# Columns to load from the per-fixation file
fixation_cols = [
    "ImagePair",
    "Examiner",
    "Image",
    "Phase",
    "TimeInPhase",
    "Subphase",
    "FixX",
    "FixY",
    "Speed",
    "DistanceOfPriorSaccade"
]

fixations_path = os.path.join("..", "data", "CwCe_OK_Fixations_20180703.csv")
fixations_df = pd.read_csv(fixations_path, usecols=fixation_cols)

# Same "<ImagePair>_<Examiner>" trial key as the trial-stats frame uses
fixation_trial_code = fixations_df["ImagePair"] + "_" + fixations_df["Examiner"]
fixations_df = (
    fixations_df
    .assign(TrialCode=fixation_trial_code)
    .drop(columns=["ImagePair", "Examiner"])
)

In [6]:
# Keep every trial whose Outcome is not "NV"
is_nv = df["Outcome"] == "NV"
df = df[~is_nv]

In [7]:
# Encode the Prefix category as a number: "CE" -> 0, "CW" -> 1.
# Series.map leaves NaN for any value that is not a key of the mapping.
prefix_codes = {"CE": 0, "CW": 1}
df["Prefix"] = df["Prefix"].map(prefix_codes)

In [8]:
# Print each column whose fraction of null rows exceeds 30%
null_counts = df.isnull().sum()
for col, n_null in null_counts.items():
    pct_null = n_null / df.shape[0]
    if pct_null > 0.3:
        print(col, pct_null)

AveDeviationFromGroundTruthTempSeqFits_Detail_JstUnq_NormTransMat_NoClusterPrune_BW66_Thresh0.3InRidgeWidths 0.636978579481398
AveDeviationFromGroundTruthTempSeqFits_Detail_JstUnq_NormTransMat_NoClusterPrune_BW66_Thresh0.3 0.636978579481398
NumHighQualityLinksTempSeqFits_Detail_JstUnq_NormTransMat_NoClusterPrune_BW66_Thresh0.3 0.3337091319052988
AnalysisPropFixNearWBMinutiaK=22 0.39177001127395716
ComparisonPropFixNearWBMinutiaK=22 0.39177001127395716


In [9]:
# Missing subphase labels get their own explicit "na" category
subphase_filled = fixations_df["Subphase"].fillna("na")
fixations_df["Subphase"] = subphase_filled

# A missing high-quality link count is treated as zero links
links_col = "NumHighQualityLinksTempSeqFits_Detail_JstUnq_NormTransMat_NoClusterPrune_BW66_Thresh0.3"
df[links_col] = df[links_col].fillna(0)

In [10]:
# Replace "Subphase" with one indicator column per category ("Subphase_<value>")
categorical_cols = ["Subphase"]
fixations_df = pd.get_dummies(fixations_df, columns=categorical_cols)

In [11]:
# Put the ordinal difficulty labels on an evenly spaced [0, 1] scale.
# Labels missing from the mapping come out as NaN.
difficulty_map = {
    "VeryEasy": 0,
    "Easy": 0.25,
    "Moderate": 0.5,
    "Difficult": 0.75,
    "VeryDifficult": 1,
}
df["Difficulty"] = df["Difficulty"].map(difficulty_map)

In [12]:
# Group the fixation rows by trial for the per-trial aggregates that follow
grouped_fixations = fixations_df.groupby(by="TrialCode")

In [13]:
# Fixation counts
# size() counts the rows of each trial directly. The previous count().max(axis=1)
# produced the same number only because the one-hot Subphase columns are never null.
num_fixations = grouped_fixations.size().reset_index(name="Num Fixations")
df = df.merge(num_fixations, on="TrialCode")

for image, phase in [("Left", "A"), ("Left", "C"), ("Right", "C")]:
    in_image_phase = (fixations_df["Image"] == image) & (fixations_df["Phase"] == phase)
    by_trial = fixations_df[in_image_phase].groupby("TrialCode")

    # Count fixations in each image/phase (turned into a fraction of the trial total below)
    pct_col = f"Pct {phase}-{image} Fixations"
    phase_counts = by_trial.size().reset_index(name=pct_col)

    # Calculate spread of fixations in each image/phase.
    # Columns must be selected with a list: the tuple form ["FixX", "FixY"] was
    # deprecated and raises an error in pandas >= 2.0.
    fix_stdevs = by_trial[["FixX", "FixY"]].apply(stdev_2d).reset_index()
    fix_stdevs.columns = ["TrialCode", f"{phase}-{image} Fix Stdev"]

    # Inner merges: a trial with no fixations in this image/phase is dropped from df
    df = df.merge(phase_counts, on="TrialCode")
    df = df.merge(fix_stdevs, on="TrialCode")

    df[pct_col] /= df["Num Fixations"]
  


In [14]:
# Count the number of times the examiner switched images in a trial.
# The 0/1 image code lives on a temporary Series instead of overwriting and later
# restoring fixations_df["Image"]: that round trip would permanently turn any value
# other than "Left"/"Right" into NaN.
image_code = fixations_df["Image"].map({"Left": 0, "Right": 1})

# 1 where the image differs from the previous row of the same trial, 0 where it is
# unchanged, NaN on the first row of each trial.
fixations_df["Switched"] = image_code.groupby(fixations_df["TrialCode"]).diff().abs()
switches = fixations_df.groupby("TrialCode")["Switched"].sum().reset_index()
switches["Switched"] = switches["Switched"].astype(int)

# Merge into TrialStats and divide by the number of comparison-phase (C) fixations,
# rebuilt from the stored fractions times the trial's total fixation count
df = df.merge(switches, on="TrialCode")
comparison_fixations = (df["Pct C-Left Fixations"] + df["Pct C-Right Fixations"]) * df["Num Fixations"]
df["Switched"] /= comparison_fixations

In [15]:
# Count average fixations before each switch.
# Only the "Switched" column is handed to the helper: applying over the whole frame
# also passes the grouping column, which raises a DeprecationWarning in pandas >= 2.2,
# and copies columns the helper never reads. Row index labels are preserved.
fixes_before_switch = (
    fixations_df.groupby("TrialCode")[["Switched"]]
    .apply(compute_fixes_before_switch)
    .reset_index(name="FixationsBeforeSwitch")
)

df = df.merge(fixes_before_switch, on="TrialCode")

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [16]:
# Duration of each phase: the largest TimeInPhase recorded per trial within that phase
for phase in ["A", "C"]:
    in_phase = fixations_df["Phase"] == phase
    fixations = fixations_df[in_phase]

    time_in_phase = (
        fixations
        .groupby("TrialCode")["TimeInPhase"]
        .max()
        .reset_index()
    )
    time_in_phase.columns = ["TrialCode", f"Phase {phase} Time"]

    # Inner merge: trials without any fixation in this phase are dropped from df
    df = df.merge(time_in_phase, on="TrialCode")

In [17]:
# Share of comparison-phase ("C") fixations spent in each Kundel subphase
subphase_cols = ["Subphase_Deciding", "Subphase_Scanning", "Subphase_na"]
fixations = fixations_df[fixations_df["Phase"] == "C"]

# Summing the indicator columns gives the number of fixations per subphase
fixes_in_subphase = (
    fixations
    .groupby("TrialCode")[subphase_cols]
    .sum()
    .reset_index()
)
fixes_in_subphase.columns = ["TrialCode", "Pct Deciding", "Pct Scanning", "Pct NA"]
df = df.merge(fixes_in_subphase, on="TrialCode")

# Number of comparison-phase fixations, rebuilt from the stored fractions
comparison_fix_count = (df["Pct C-Left Fixations"] + df["Pct C-Right Fixations"]) * df["Num Fixations"]
for behavior in ["Deciding", "Scanning", "NA"]:
    df[f"Pct {behavior}"] /= comparison_fix_count

In [18]:
# Pct in each speed bin
# pd.cut bins are right-inclusive by default: (0, 100] -> Slow, (100, 200] -> Medium,
# (200, inf] -> Fast. Values outside these intervals (including exactly 0) become NaN.
speed_bins = [0, 100, 200, np.inf]
speed_labels = ["Slow", "Medium", "Fast"]
fixations_df["Speed"] = pd.cut(fixations_df["Speed"], bins=speed_bins, labels=speed_labels)

# Per-trial share of fixations in each bin, in long form (TrialCode, Speed, Pct)
speed_shares = (
    fixations_df.groupby("TrialCode")["Speed"]
    .value_counts(normalize=True)
    .rename("Pct")
    .reset_index()
)

# One row per trial, one "Speed<bin>" column per bin; a bin with no rows counts as 0
speeds = speed_shares.pivot(index="TrialCode", columns="Speed", values="Pct")
speeds.columns = [f"Speed{col}" for col in speeds.columns]
speeds = speeds.fillna(0)

df = df.merge(speeds, on="TrialCode")

In [19]:
# Pct in each saccade distance bin
# pd.cut bins are right-inclusive by default: (0, 88] -> Short, (88, 176] -> Medium,
# (176, inf] -> Long. Values outside these intervals (including exactly 0) become NaN.
saccade_bins = [0, 88, 176, np.inf]
saccade_labels = ["Short", "Medium", "Long"]
fixations_df["DistanceOfPriorSaccade"] = pd.cut(
    fixations_df["DistanceOfPriorSaccade"], bins=saccade_bins, labels=saccade_labels
)

# Per-trial share of fixations in each bin, in long form
saccade_shares = (
    fixations_df.groupby("TrialCode")["DistanceOfPriorSaccade"]
    .value_counts(normalize=True)
    .rename("Pct")
    .reset_index()
)

# One row per trial, one "Saccade<bin>" column per bin; a bin with no rows counts as 0
saccades = saccade_shares.pivot(index="TrialCode", columns="DistanceOfPriorSaccade", values="Pct")
saccades.columns = [f"Saccade{col}" for col in saccades.columns]
saccades = saccades.fillna(0)

df = df.merge(saccades, on="TrialCode")

In [20]:
# Express the EMD distances in ridge widths
# NOTE(review): the EMDDistanceJustDecidingToCorrect_C_Left/Right columns loaded from the
# stats file are not in this list and stay in their original units -- confirm intended.
emd_cols = [
    "EMDLeftCToRightCSelf",
    "EMDLeftCToRightCSelfDeciding",
    "EMDDistanceToCorrect_C_Left",
    "EMDDistanceToCorrect_C_Right"
]
image_downscale = 8  # image scaled down by 8
ridge_width = 22  # ridge width = 22

# Undo the downscaling first, then divide by the ridge width
df[emd_cols] = df[emd_cols].mul(image_downscale).div(ridge_width)

In [21]:
# Recover ImagePair and Examiner from the "<ImagePair>_<Examiner>" key, then drop the key
trial_parts = df["TrialCode"].str.split("_", expand=True)
df[["ImagePair", "Examiner"]] = trial_parts
df = df.drop(columns=["TrialCode"])

In [22]:
# Preview the first five rows of the assembled per-trial table
df.head(n=5)

Unnamed: 0,Trial,Prefix,Mating,Conclusion-Simple,Difficulty,Outcome,AllIndvClustLinksFoundBW60,PctClarRedYellow,PctClarGreen,PctClarBlue,...,Pct Scanning,Pct NA,SpeedFast,SpeedMedium,SpeedSlow,SaccadeLong,SaccadeMedium,SaccadeShort,ImagePair,Examiner
0,5,0,Nonmates,Ex,0.25,TN,,0.0,0.0,1.0,...,0.363636,0.636364,0.57377,0.147541,0.278689,0.207792,0.207792,0.584416,CE001,Y503
1,1,0,Nonmates,Ex,0.0,TN,,0.0,0.0,1.0,...,0.428571,0.5,0.642857,0.261905,0.095238,0.297872,0.340426,0.361702,CE001,Y506
2,11,0,Nonmates,ID,0.5,FP,8.0,0.00289,0.144509,0.852601,...,0.13172,0.442204,0.445619,0.274924,0.279456,0.082718,0.175775,0.741507,CE001,Y507
3,10,0,Nonmates,Ex,0.0,TN,4.0,0.0,0.093023,0.906977,...,0.123596,0.370787,0.544304,0.202532,0.253165,0.123457,0.197531,0.679012,CE001,Y513
4,1,0,Nonmates,Ex,0.0,TN,0.0,0.0,0.0,1.0,...,0.291667,0.708333,0.470588,0.147059,0.382353,0.175676,0.202703,0.621622,CE001,Y516


In [23]:
# Report how many rows still contain a null and which columns are responsible
null_mask = df.isna()
rows_with_null = int(null_mask.any(axis=1).sum())

if not rows_with_null:
    print("No null values :)")
else:
    print(f"Null values in {rows_with_null} rows\n")
    columns_with_nulls = df.columns[null_mask.any(axis=0)]
    for col_with_nulls in columns_with_nulls:
        print(col_with_nulls)

Null values in 1229 rows

Difficulty
AllIndvClustLinksFoundBW60
PctClarRedYellow
PctClarGreen
PctClarBlue
EMDDistanceToCorrect_C_Left
EMDDistanceToCorrect_C_Right
EMDDistanceJustDecidingToCorrect_C_Left
EMDDistanceJustDecidingToCorrect_C_Right
EMDLeftCToRightCSelfDeciding
AveDeviationFromGroundTruthTempSeqFits_Detail_JstUnq_NormTransMat_NoClusterPrune_BW66_Thresh0.3InRidgeWidths
AveDeviationFromGroundTruthTempSeqFits_Detail_JstUnq_NormTransMat_NoClusterPrune_BW66_Thresh0.3
AnalysisPropFixNearWBMinutiaK=22
ComparisonPropFixNearWBMinutiaK=22


In [24]:
# Write the cleaned per-trial table next to the raw data, without the row index
cleaned_path = os.path.join("..", "data", "cleaned_data.csv")
df.to_csv(cleaned_path, index=False)

In [25]:
# Number of trials for each (Prefix, Outcome) combination
df.groupby(by=["Prefix", "Outcome"]).size()

Prefix  Outcome
0       FP           4
        TN         300
        TP         243
1       FN         178
        FP           6
        Inc        356
        TN         434
        TP         250
dtype: int64