In [1]:
%load_ext autoreload
%autoreload 2
%config InlineBackend.figure_format = 'retina'
import sys
sys.path.append("../.venv/lib/python3.9/site-packages/")
sys.path.append("..")

In [29]:
from typing import Any, List

import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import ttest_rel, ttest_ind, norm
from sklearn.base import BaseEstimator
from sklearn.linear_model import BayesianRidge
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import mean_squared_error

from coordination.model.beta_coordination_blending_latent_vocalics import BetaCoordinationBlendingLatentVocalics
from coordination.model.utils.coordination_blending_latent_vocalics import LatentVocalicsDataset
from coordination.model.utils.beta_coordination_blending_latent_vocalics import BetaCoordinationLatentVocalicsDataSeries
from scripts.formatting import set_size

In [20]:
PLOTS_DIR = "/Users/paulosoares/manuscript/experimental_agenda/figures"

def save_plot(fig: Any, name: str):
    fig.savefig(f"{PLOTS_DIR}/{name}.pdf", format='pdf', bbox_inches='tight')

In [21]:
def load_datasets(advisor: str):
    with open(f"/Users/paulosoares/data/study-3_2022/datasets/{advisor}/mission1_dataset.pkl", "rb") as f:
        mission1_dataset = pickle.load(f)
    
    with open(f"/Users/paulosoares/data/study-3_2022/datasets/{advisor}/mission2_dataset.pkl", "rb") as f:
        mission2_dataset = pickle.load(f)
    
    with open(f"/Users/paulosoares/data/study-3_2022/datasets/{advisor}/all_missions_dataset.pkl", "rb") as f:
        all_missions_dataset = pickle.load(f)
    
    return mission1_dataset, mission2_dataset, all_missions_dataset

def load_model_inferences(advisor: str, ref_date: str, training_type: str):
    with open(f"/Users/paulosoares/data/study-3_2022/inferences/beta_gaussian/{advisor}/{training_type}/mission1/{ref_date}/inference_summaries.pkl", "rb") as f:
        mission1_summaries = pickle.load(f)
    
    with open(f"/Users/paulosoares/data/study-3_2022/inferences/beta_gaussian/{advisor}/{training_type}/mission2/{ref_date}/inference_summaries.pkl", "rb") as f:
        mission2_summaries = pickle.load(f)
    
    with open(f"/Users/paulosoares/data/study-3_2022/inferences/beta_gaussian/{advisor}/{training_type}/all_missions/{ref_date}/inference_summaries.pkl", "rb") as f:
        all_missions_summaries = pickle.load(f)
    
    return mission1_summaries, mission2_summaries, all_missions_summaries

def cohens_d(x1: np.ndarray, x2: np.ndarray):
    n1 = len(x1)
    n2 = len(x2)
    sp = np.sqrt(((n1 - 1) * np.var(x1) + (n2 - 1) * np.var(x2))/ (n1 + n2 - 2))
    
    return (np.mean(x1) - np.mean(x2)) / sp    

def compare_conditions(x1: np.ndarray, x2: np.ndarray, alternative: str = "greater", paired: bool = False):
    d = cohens_d(x1, x2)
    
    if paired:
        _, p_val = ttest_rel(x1, x2, alternative=alternative)
    else:
        _, p_val = ttest_ind(x1, x2, alternative=alternative)
    
    results = {
        "cohens_d": d, 
        "p_val": p_val
    }    
    
    return results

def team_process_scale_survey_score(series: BetaCoordinationLatentVocalicsDataSeries) -> float:
    return np.mean(np.array([value for value in series.team_process_surveys.values()]))

def team_satisfaction_survey_score(series: BetaCoordinationLatentVocalicsDataSeries) -> float:
    return np.mean(np.array([value for value in series.team_satisfaction_surveys.values()]))

In [22]:
# Loading all datasets
datasets = {
    "no_advisor": load_datasets("no_advisor"),
    "human_advisor": load_datasets("human_advisor"),
    "tomcat_advisor": load_datasets("tomcat_advisor"),
    "all_conditions": load_datasets("all_conditions")
}

In [59]:
# All models and conditions we want to evaluate
models = [
    {"ref_date":"2022.12.02--15", "training_type":"single_execution"},
    {"ref_date":"2022.12.04--12", "training_type":"single_execution_no_self_dep"},
    {"ref_date":"2022.12.04--22", "training_type":"single_execution_intensity_only"},
    {"ref_date":"2022.12.05--09", "training_type":"single_execution_pitch_only"},
    {"ref_date":"2022.12.07--18", "training_type":"single_execution_4_features"}
#       {"ref_date":"2022.12.15--13", "training_type":"single_execution_gendered"},
]

In [60]:
from functools import partial
AGGR_FN = lambda x: np.mean(x[240:-120])
# AGGR_FN = np.max
# AGGR_FN = np.var
# AGGR_FN = partial(np.percentile, q=75)

# Hypothesis I: Predictive Power

Coordination is predictive of outcome measure ($M_i$). For each outcome measure, we train Bayesian linear model that tells us p($M_i$ | $C$) in a model with coordination and in a model without it. We then compute MSE and log-likelihood in a holdout set (using LOO) and report the average. We also compute how many times the model with coordination was better than the model without. We accept the hypothesis that the model with coordination is better than the one without if it is better 95% of the time.

In [61]:
class NullModel(BaseEstimator):
    
    def __init__(self):
        self.mean = None
        self.std = None
    
    def fit(self, X: np.ndarray, y: Any = None):
        self.mean = np.mean(y)
        self.std = np.std(y)
        return self
    
    def predict(self, X: np.ndarray):
        return np.ones(X.shape[0]) * self.mean
    
    def compute_log_likelihood(self, X: np.ndarray, y: np.ndarray, sample_weight=None):
        return norm(loc=self.mean, scale=self.std).logpdf(y)

class CoordinationModel(BaseEstimator):
    
    def __init__(self):
        self.reg = BayesianRidge(tol=1e-6, fit_intercept=True)
    
    def fit(self, X: np.ndarray, y: Any = None):
        self.reg.fit(X, y)
        return self
    
    def predict(self, X: np.ndarray):
        return self.reg.predict(X)
    
    def compute_log_likelihood(self, X: np.ndarray, y: np.ndarray, sample_weight=None):
        mean, std = self.reg.predict(X, return_std=True)
        return norm(loc=mean, scale=std).logpdf(y)

def execute_loo(X: np.ndarray, y: np.ndarray, plot: bool = False):
    null_model = NullModel()
    coord_model = CoordinationModel()
    null_mses = []
    null_nlls = []
    coord_mses = []
    coord_nlls = []
    
    loo = LeaveOneOut()
    for train_index, test_index in loo.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        null_model.fit(X_train, y_train)
        null_mses.append(mean_squared_error(null_model.predict(X_test), y_test))
        null_nlls.append(-null_model.compute_log_likelihood(X_test, y_test)[0])   

        coord_model.fit(X_train, y_train)
        coord_mses.append(mean_squared_error(coord_model.predict(X_test), y_test))
        coord_nlls.append(-coord_model.compute_log_likelihood(X_test, y_test)[0])
        
        if plot:
            print(null_mses[-1])
            print(coord_mses[-1])
            fig = plt.figure()
            xs = np.linspace(0,1, 100)
            plt.scatter(X_train.flatten(), y_train)
            plt.scatter(X_test.flatten(), y_test, color="red")
            plt.plot(xs, null_model.predict(xs[:, np.newaxis]), label="Null")
            plt.plot(xs, coord_model.predict(xs[:, np.newaxis]), label="Coord")
            plt.legend()            
            plt.show()
    
    null_mses = np.array(null_mses)
    coord_mses = np.array(coord_mses)
    null_nlls = np.array(null_nlls)
    coord_nlls = np.array(coord_nlls)
    
    _, p_val_nll = ttest_rel(coord_nlls, null_nlls, alternative="less")
    _, p_val_mse = ttest_rel(coord_mses, null_mses, alternative="less")
    
    results = {
        "avg_mse_null": np.mean(null_mses),
        "std_mse_null": np.std(null_mses),
        "avg_nll_null": np.mean(null_nlls),
        "std_nll_null": np.std(null_nlls),
        "avg_mse_coord": np.mean(coord_mses),
        "std_mse_coord": np.std(coord_mses),
        "avg_nll_coord": np.mean(coord_nlls),
        "std_nll_coord": np.std(coord_nlls),    
        "p_coord_smaller_mse": (100.0 * np.sum(coord_mses < null_mses)) / len(coord_mses),
        "p_coord_smaller_nll": (100.0 * np.sum(coord_nlls < null_nlls)) / len(coord_nlls),
        "p_val_nll": p_val_nll,
        "p_val_mse": p_val_mse,
        "cohens_d_nll": cohens_d(coord_nlls, null_nlls),
        "cohens_d_mse": cohens_d(coord_mses, null_mses),
    }
    
    return results

In [62]:
outcome_measures_labels = ["Score", "Process Scale", "Satisfaction"]

results_list = []
# for condition in ["no_advisor", "human_advisor", "tomcat_advisor", "all_conditions"]:
for condition in ["no_advisor", "tomcat_advisor"]:
    for model in models:
        for i, mission in enumerate(["Mission 1", "Mission 2", "All Missions"]):
            dataset = datasets[condition][i]
            inferences = load_model_inferences(condition, model["ref_date"], model["training_type"])            
            X = np.array([AGGR_FN(summary.coordination_mean) for summary in inferences[i]])[:, np.newaxis]
            outcome_measures = [np.array([s.team_score for s in dataset.series]),
                                np.array([team_process_scale_survey_score(s) for s in dataset.series]),
                                np.array([team_satisfaction_survey_score(s) for s in dataset.series])]
            
            for j, outcome_measure in enumerate(outcome_measures):
                results_entry = model.copy()
                results_entry.update({
                    "condition": condition,
                    "mission": mission,
                    "outcome_measure": outcome_measures_labels[j]
                })
                results_entry.update(execute_loo(X, outcome_measure))
                results_list.append(results_entry)

df_h1 = pd.DataFrame.from_dict(results_list)
df_h1.head()

Unnamed: 0,ref_date,training_type,condition,mission,outcome_measure,avg_mse_null,std_mse_null,avg_nll_null,std_nll_null,avg_mse_coord,std_mse_coord,avg_nll_coord,std_nll_coord,p_coord_smaller_mse,p_coord_smaller_nll,p_val_nll,p_val_mse,cohens_d_nll,cohens_d_mse
0,2022.12.02--15,single_execution,no_advisor,Mission 1,Score,30865.289256,42360.147507,6.769778,1.241946,36498.476997,44430.917785,6.743978,0.568149,75.0,33.333333,0.459197,0.89252,-0.026716,0.129773
1,2022.12.02--15,single_execution,no_advisor,Mission 1,Process Scale,0.230725,0.349627,0.908773,1.393337,0.250403,0.355198,0.96699,1.295832,33.333333,25.0,0.709865,0.953044,0.043269,0.055838
2,2022.12.02--15,single_execution,no_advisor,Mission 1,Satisfaction,0.269862,0.32929,0.884899,0.891286,0.299283,0.354298,0.986971,0.691722,25.0,33.333333,0.805547,0.939094,0.127948,0.086019
3,2022.12.02--15,single_execution,no_advisor,Mission 2,Score,14790.532544,15815.577307,6.289582,0.691112,16059.294832,18688.498627,6.408203,0.452025,57.142857,35.714286,0.896747,0.789615,0.203141,0.073289
4,2022.12.02--15,single_execution,no_advisor,Mission 2,Process Scale,0.171687,0.186821,0.614762,0.761029,0.219984,0.334488,0.598316,0.615313,57.142857,42.857143,0.369003,0.837803,-0.023765,0.178277


In [64]:
df_h1[(df_h1["p_val_mse"] <= 0.1)]# & (df_h1["training_type"] == "single_execution")]
# df_h1[(df_h1["training_type"] == "single_execution_4_features")]

Unnamed: 0,ref_date,training_type,condition,mission,outcome_measure,avg_mse_null,std_mse_null,avg_nll_null,std_nll_null,avg_mse_coord,std_mse_coord,avg_nll_coord,std_nll_coord,p_coord_smaller_mse,p_coord_smaller_nll,p_val_nll,p_val_mse,cohens_d_nll,cohens_d_mse
39,2022.12.07--18,single_execution_4_features,no_advisor,Mission 2,Score,14790.532544,15815.577307,6.289582,0.691112,12891.233577,15480.63166,6.373115,0.233475,57.142857,35.714286,0.709357,0.064576,0.161942,-0.121369
48,2022.12.02--15,single_execution,tomcat_advisor,Mission 2,Score,39701.388889,35192.945095,6.768992,0.577572,15082.99842,19863.949712,6.446779,0.266743,76.923077,53.846154,0.04065,0.016516,-0.716258,-0.861521
55,2022.12.04--12,single_execution_no_self_dep,tomcat_advisor,Mission 1,Process Scale,0.06615,0.041415,0.094541,0.379052,0.033367,0.025399,-0.128016,0.199505,58.333333,58.333333,0.069986,0.03234,-0.734784,-0.954276
59,2022.12.04--12,single_execution_no_self_dep,tomcat_advisor,Mission 2,Satisfaction,0.274753,0.246317,0.827222,0.554916,0.219549,0.191033,0.712766,0.503896,61.538462,61.538462,0.127153,0.088837,-0.215946,-0.250455
62,2022.12.04--12,single_execution_no_self_dep,tomcat_advisor,All Missions,Satisfaction,0.228194,0.284948,0.722708,0.737688,0.175152,0.192422,0.559675,0.563317,60.0,72.0,0.061869,0.07739,-0.248406,-0.218166
78,2022.12.05--09,single_execution_pitch_only,tomcat_advisor,All Missions,Score,26445.486111,28217.213769,6.542284,0.638857,21967.610178,27182.133675,6.507346,0.446651,68.0,44.0,0.295342,0.066463,-0.063385,-0.16163
87,2022.12.07--18,single_execution_4_features,tomcat_advisor,All Missions,Score,26445.486111,28217.213769,6.542284,0.638857,20136.873685,22140.035946,6.507159,0.263473,68.0,28.0,0.366301,0.058247,-0.071881,-0.248749


# Hypothesis II: Increased coordination predicts higher outcome measures

Perform a p-test over samples from the posterior distribution of slopes and check if that is bigger than 0. Moreover, compute effect size using Cohen's d measurement.

In [65]:
def test_positive_slope(X: np.ndarray, y: np.ndarray, num_samples: int):
    reg = BayesianRidge(tol=1e-6, fit_intercept=True)
    reg.fit(X, y)
    
    mean = reg.coef_[0]
    std = np.sqrt(reg.sigma_.flatten()[0])
    slope_posterior = norm(loc=mean, scale=std)
    
    slope_samples = slope_posterior.rvs(num_samples)
    d = cohens_d(slope_samples, np.zeros(num_samples))
    _, p_val = ttest_ind(slope_samples, np.zeros(num_samples), alternative="greater")
    
    results = {
        "cohens_d": d, 
        "p_val": p_val
    }    
    
    return results

In [67]:
outcome_measures_labels = ["Score", "Process Scale", "Satisfaction"]

results_list = []
for condition in ["no_advisor", "human_advisor", "tomcat_advisor", "all_conditions"]:
    for model in models:
        for i, mission in enumerate(["Mission 1", "Mission 2", "All Missions"]):
            dataset = datasets[condition][i]
            inferences = load_model_inferences(condition, model["ref_date"], model["training_type"])            
            X = np.array([AGGR_FN(summary.coordination_mean) for summary in inferences[i]])[:, np.newaxis]
            outcome_measures = [np.array([s.team_score for s in dataset.series]),
                                np.array([team_process_scale_survey_score(s) for s in dataset.series]),
                                np.array([team_satisfaction_survey_score(s) for s in dataset.series])]
            
            for j, outcome_measure in enumerate(outcome_measures):
                results_entry = model.copy()
                results_entry.update({
                    "condition": condition,
                    "mission": mission,
                    "outcome_measure": outcome_measures_labels[j]
                })
                results_entry.update(test_positive_slope(X, outcome_measure, 10000))
                results_list.append(results_entry)

df_h2 = pd.DataFrame.from_dict(results_list)
df_h2.head()

Unnamed: 0,ref_date,training_type,condition,mission,outcome_measure,cohens_d,p_val
0,2022.12.02--15,single_execution,no_advisor,Mission 1,Score,-1.223812,1.0
1,2022.12.02--15,single_execution,no_advisor,Mission 1,Process Scale,-0.524373,1.0
2,2022.12.02--15,single_execution,no_advisor,Mission 1,Satisfaction,-0.886129,1.0
3,2022.12.02--15,single_execution,no_advisor,Mission 2,Score,1.555748,0.0
4,2022.12.02--15,single_execution,no_advisor,Mission 2,Process Scale,0.295013,6.5107900000000005e-96


In [69]:
df_h2[(df_h2["p_val"] <= 0.05)]
# df_h2[(df_h2["training_type"] == "single_execution")]

Unnamed: 0,ref_date,training_type,condition,mission,outcome_measure,cohens_d,p_val
3,2022.12.02--15,single_execution,no_advisor,Mission 2,Score,1.555748,0.0
4,2022.12.02--15,single_execution,no_advisor,Mission 2,Process Scale,0.295013,6.5107900000000005e-96
21,2022.12.04--22,single_execution_intensity_only,no_advisor,Mission 2,Score,2.005155,0.0
22,2022.12.04--22,single_execution_intensity_only,no_advisor,Mission 2,Process Scale,2.494511,0.0
23,2022.12.04--22,single_execution_intensity_only,no_advisor,Mission 2,Satisfaction,0.47944,2.7788449999999997e-245
39,2022.12.07--18,single_execution_4_features,no_advisor,Mission 2,Score,2.206827,0.0
40,2022.12.07--18,single_execution_4_features,no_advisor,Mission 2,Process Scale,2.090382,0.0
41,2022.12.07--18,single_execution_4_features,no_advisor,Mission 2,Satisfaction,1.779357,0.0
49,2022.12.02--15,single_execution,human_advisor,Mission 2,Process Scale,0.036724,0.004710295
64,2022.12.04--22,single_execution_intensity_only,human_advisor,Mission 1,Process Scale,1.69948,0.0


# Hypothesis III: Intervening on team communication predicts higher coordination and outcome measures


Compare mission by mission whether ToMCAT trials had higher coordination and outcome measures than others.

In [70]:
outcome_measures_labels = ["Score", "Process Scale", "Satisfaction"]

# Compare the second group against the first
groups = [
    ("no_advisor", "human_advisor"),
    ("no_advisor", "tomcat_advisor"),
    ("human_advisor", "tomcat_advisor"),
]

results_list = []
for model in models:
    for i, mission in enumerate(["Mission 1", "Mission 2", "All Missions"]):
        for condition1, condition2 in groups:
            dataset1 = datasets[condition1][i]
            inferences1 = load_model_inferences(condition1, model["ref_date"], model["training_type"])            
            coord1 = np.array([AGGR_FN(summary.coordination_mean) for summary in inferences1[i]])

            dataset2 = datasets[condition2][i]
            inferences2 = load_model_inferences(condition2, model["ref_date"], model["training_type"])            
            coord2 = np.array([AGGR_FN(summary.coordination_mean) for summary in inferences2[i]])
            
            # Comparing coordination. Outcome measures were compared in another notebook.
            results_entry = model.copy()
            results_entry.update({
                "condition1": condition1,
                "condition2": condition2,
                "mission": mission
            })
            results_entry.update(compare_conditions(coord2, coord1))
            results_list.append(results_entry)

df_h3 = pd.DataFrame.from_dict(results_list)
df_h3.head()

Unnamed: 0,ref_date,training_type,condition1,condition2,mission,cohens_d,p_val
0,2022.12.02--15,single_execution,no_advisor,human_advisor,Mission 1,0.417236,0.169232
1,2022.12.02--15,single_execution,no_advisor,tomcat_advisor,Mission 1,-0.469895,0.858814
2,2022.12.02--15,single_execution,human_advisor,tomcat_advisor,Mission 1,-0.858316,0.971741
3,2022.12.02--15,single_execution,no_advisor,human_advisor,Mission 2,0.183384,0.322004
4,2022.12.02--15,single_execution,no_advisor,tomcat_advisor,Mission 2,0.631356,0.063705


In [71]:
df_h3[(df_h3["p_val"] <= 0.05)]

Unnamed: 0,ref_date,training_type,condition1,condition2,mission,cohens_d,p_val
9,2022.12.04--12,single_execution_no_self_dep,no_advisor,human_advisor,Mission 1,2.477402,3.801952e-06
12,2022.12.04--12,single_execution_no_self_dep,no_advisor,human_advisor,Mission 2,2.464285,5.960741e-07
15,2022.12.04--12,single_execution_no_self_dep,no_advisor,human_advisor,All Missions,1.993851,2.512024e-09
31,2022.12.05--09,single_execution_pitch_only,no_advisor,tomcat_advisor,Mission 2,1.174353,0.003551
32,2022.12.05--09,single_execution_pitch_only,human_advisor,tomcat_advisor,Mission 2,1.03542,0.007943231
33,2022.12.05--09,single_execution_pitch_only,no_advisor,human_advisor,All Missions,0.484474,0.04646555
34,2022.12.05--09,single_execution_pitch_only,no_advisor,tomcat_advisor,All Missions,0.640605,0.01478351
42,2022.12.07--18,single_execution_4_features,no_advisor,human_advisor,All Missions,0.516442,0.03692018


## Hypothesis IV: Coordination in the second mission is higher than in the first mission

In [72]:
def get_paired_coordination(mission1_dataset: LatentVocalicsDataset, mission2_dataset: LatentVocalicsDataset, 
                            mission1_coordination: np.ndarray, mission2_coordination: np.ndarray):
    """
    Gets coordination for the same teams in mission 1 and 2. Ignores data from
    trials with only one mission.
    """
    
    mission1 = {}
    mission2 = {}
    mission1_trial_numbers = []
    
    for i in range(mission1_dataset.num_trials):
        mission_trial_number = int(mission1_dataset.series[i].uuid[1:])
        mission1[mission_trial_number] = mission1_coordination[i]
        
    for i in range(mission2_dataset.num_trials):
        mission_trial_number = int(mission2_dataset.series[i].uuid[1:])
        if mission_trial_number - 1 in mission1:
            # Only add if exists an entry for mission 1
            mission1_trial_numbers.append(mission_trial_number - 1)
        
            mission2[mission_trial_number] = mission2_coordination[i]
    
    coord1 = []
    coord2 = []
    for mission1_trial_number in mission1_trial_numbers:
        coord1.append(mission1[mission1_trial_number])
        coord2.append(mission2[mission1_trial_number + 1])
    
    return np.array(coord1), np.array(coord2)

In [73]:
results_list = []
for model in models:    
    for condition in ["no_advisor", "human_advisor", "tomcat_advisor", "all_conditions"]:
        inferences = load_model_inferences(condition, model["ref_date"], model["training_type"])            
        
        dataset1 = datasets[condition][0]
        dataset2 = datasets[condition][1]
        coord1 = np.array([AGGR_FN(summary.coordination_mean) for summary in inferences[0]])          
        coord2 = np.array([AGGR_FN(summary.coordination_mean) for summary in inferences[1]])        
        
        coord1, coord2 = get_paired_coordination(dataset1, dataset2, coord1, coord2)
        
        # Comparing coordination. Outcome measures were compared in another notebook.
        results_entry = model.copy()
        results_entry.update({
            "condition": condition
        })
        
        results_entry.update(compare_conditions(coord2, coord1, paired=True))
        results_list.append(results_entry)

df_h4 = pd.DataFrame.from_dict(results_list)
df_h4.head()

Unnamed: 0,ref_date,training_type,condition,cohens_d,p_val
0,2022.12.02--15,single_execution,no_advisor,-0.600152,0.957793
1,2022.12.02--15,single_execution,human_advisor,-0.876517,0.996158
2,2022.12.02--15,single_execution,tomcat_advisor,0.689078,0.051727
3,2022.12.02--15,single_execution,all_conditions,-0.044276,0.57479
4,2022.12.04--12,single_execution_no_self_dep,no_advisor,-0.990994,0.985384


In [74]:
df_h4[(df_h4["p_val"] <= 0.05)]

Unnamed: 0,ref_date,training_type,condition,cohens_d,p_val
6,2022.12.04--12,single_execution_no_self_dep,tomcat_advisor,0.853623,0.037096
14,2022.12.05--09,single_execution_pitch_only,tomcat_advisor,1.026324,0.027777
