In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

In [19]:
# Load the cleaned results CSV; the path is relative, so it resolves against
# the directory the notebook kernel is running in.
data = pd.read_csv('../results/cleaned_data.csv')
# Bare last expression: renders the first rows as a quick check of the columns.
data.head()

Unnamed: 0,timestamp,user_id,day,quiz_id,image_index,illusion_filename,user_answer,response_time,speed_group,display_time,correct_answer,line_length1,line_length2,actual_difference,arrow_length,angle,arrow_color,is_control,is_correct
0,2024-11-04 21:33:40,mattrau1,1,2,0,muller_lyer_day1_illusion25.svg,Left,8.259,Group 2 - Fast,0.5,Same length,200,200,0,20,36,red,True,False
1,2024-11-04 21:33:41,jonah.harmon,1,0,0,muller_lyer_day1_illusion4.svg,Left,4.777,Group 1 - Fast,0.5,Same length,180,180,0,20,36,black,False,False
2,2024-11-04 21:33:44,mattrau1,1,2,1,muller_lyer_day1_illusion30.svg,Left,2.741,Group 2 - Fast,0.5,Same length,200,200,0,20,36,red,True,False
3,2024-11-04 21:33:45,jonah.harmon,1,0,1,muller_lyer_day1_illusion11.svg,Same length,2.907,Group 1 - Fast,0.5,Same length,205,205,0,20,36,red,False,True
4,2024-11-04 21:33:47,mwhite,1,2,0,muller_lyer_day1_illusion25.svg,Same length,3.467,Group 2 - Fast,0.5,Same length,200,200,0,20,36,red,True,True


In [21]:
def analyze_group_performance(df, groups=('Group 1', 'Group 2'), speeds=('Fast', 'Slow')):
    """Compare first-day and last-day performance per group and speed condition.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``is_correct``, ``speed_group`` (strings of
        the form ``'<group> - <speed>'``), ``day`` and ``response_time``.
        NOTE: three helper columns are added to ``df`` in place
        (``is_correct_numeric``, ``main_group``, ``speed_condition``); this
        side effect is kept so code that reads them from ``df`` afterwards
        keeps working.
    groups : iterable of str, optional
        ``main_group`` labels to analyse. Defaults to the two groups that were
        previously hard-coded.
    speeds : iterable of str, optional
        ``speed_condition`` labels to analyse. Defaults to ``'Fast'``/``'Slow'``.

    Returns
    -------
    dict
        ``'metrics'``: DataFrame with one row per (group, speed, day_type)
        combination that has at least one trial. Its columns are always
        ``main_group, speed_condition, day_type, accuracy, response_time,
        n_trials, std_accuracy, std_rt`` - even when no row qualifies.
        ``'stats'``: dict mapping ``'<group>_accuracy'`` / ``'<group>_rt'`` to
        the ``scipy.stats.ttest_ind`` result comparing first-day and last-day
        trials of that group (speed conditions pooled).

    Notes
    -----
    A summary of trial counts per group/speed is printed as a data check.
    If the data covers a single day, ``first_day == last_day`` and the mapping
    below labels every row ``'Last'``; no ``'First'`` rows exist in that case,
    so no t-tests are produced.
    """
    metric_columns = [
        'main_group', 'speed_condition', 'day_type', 'accuracy',
        'response_time', 'n_trials', 'std_accuracy', 'std_rt',
    ]

    # Convert is_correct to numeric (1 for True, 0 for False)
    df['is_correct_numeric'] = df['is_correct'].astype(int)

    # 'Group 1 - Fast' -> main_group 'Group 1', speed_condition 'Fast'.
    # Split once and reuse the pieces for both derived columns.
    group_parts = df['speed_group'].str.split(' - ')
    df['main_group'] = group_parts.str[0]
    df['speed_condition'] = group_parts.str[1]

    # Print data validation
    print("Data distribution across groups:")
    print(df.groupby(['main_group', 'speed_condition']).size())

    # First and last days are taken over the whole dataset, not per group.
    first_day = df['day'].min()
    last_day = df['day'].max()

    # Keep only trials from those two days and label them.
    first_last_data = df[df['day'].isin([first_day, last_day])].copy()
    first_last_data['day_type'] = first_last_data['day'].map({first_day: 'First', last_day: 'Last'})

    # Summary metrics for each group / speed / day_type cell that has trials.
    metrics = []
    for group in groups:
        for speed in speeds:
            for day_type in ('First', 'Last'):
                mask = ((first_last_data['main_group'] == group) &
                        (first_last_data['speed_condition'] == speed) &
                        (first_last_data['day_type'] == day_type))
                cell = first_last_data[mask]
                if len(cell) == 0:
                    continue
                metrics.append({
                    'main_group': group,
                    'speed_condition': speed,
                    'day_type': day_type,
                    'accuracy': cell['is_correct_numeric'].mean(),
                    'response_time': cell['response_time'].mean(),
                    'n_trials': len(cell),
                    'std_accuracy': cell['is_correct_numeric'].std(),
                    'std_rt': cell['response_time'].std(),
                })

    # Explicit columns keep the schema stable when `metrics` is empty.
    metrics_df = pd.DataFrame(metrics, columns=metric_columns)

    # Independent-samples t-tests, first day vs last day, per group.
    stats_results = {}
    measures = (('accuracy', 'is_correct_numeric'), ('rt', 'response_time'))
    for group in groups:
        group_data = first_last_data[first_last_data['main_group'] == group]
        is_first = group_data['day_type'] == 'First'
        is_last = group_data['day_type'] == 'Last'
        for suffix, column in measures:
            # Drop missing values so the test uses the same observations that
            # mean()/std() above use (pandas skips NaN there by default);
            # otherwise a single NaN turns the whole test result into NaN.
            first_values = group_data.loc[is_first, column].dropna()
            last_values = group_data.loc[is_last, column].dropna()
            if len(first_values) > 0 and len(last_values) > 0:
                stats_results[f'{group}_{suffix}'] = stats.ttest_ind(first_values, last_values)

    return {
        'metrics': metrics_df,
        'stats': stats_results,
    }


# Run the first-day vs last-day analysis on the loaded dataset.
results = analyze_group_performance(data)

# Summary table: one row per group / speed condition / day type.
print("\nMetrics Summary:", results['metrics'], sep="\n")

# One t-test report per entry in results['stats'], each preceded by a blank line.
print("\nStatistical Tests:")
for test_name, test_result in results['stats'].items():
    print(
        f"\n{test_name}:",
        f"t-statistic: {test_result.statistic:.3f}",
        f"p-value: {test_result.pvalue:.3f}",
        sep="\n",
    )

Data distribution across groups:
main_group  speed_condition
Group 1     Fast               928
            Slow               880
Group 2     Fast               288
            Slow               288
dtype: int64

Metrics Summary:
  main_group speed_condition day_type  accuracy  response_time  n_trials  \
0    Group 1            Fast    First  0.193750       2.568269       160   
1    Group 1            Fast     Last  0.453125       1.833461       128   
2    Group 1            Slow    First  0.109375       3.483406       128   
3    Group 1            Slow     Last  0.484375       3.085289       128   
4    Group 2            Fast    First  0.312500       2.744271        48   
5    Group 2            Fast     Last  0.468750       2.864781        32   
6    Group 2            Slow    First  0.354167       4.675771        48   
7    Group 2            Slow     Last  0.406250       3.282062        32   

   std_accuracy    std_rt  
0      0.396476  1.324007  
1      0.499754  0.811482  