In [8]:
import PyQt6.QtCore
import os
os.environ["QT_API"] = "pyqt6"
import matplotlib.pyplot as plt

# Use qt backend for matplotlab to use interactive mne plots
%matplotlib qt

import mne 
import analysis.processing
import pandas as pd
import csv 
import os
from config import Config
configObj = Config()
from mne_connectivity import spectral_connectivity_time
import numpy as np
configss = configObj.getConfigSnapshot()
from tqdm import tqdm
import tools.helpers
from scipy import stats

mne.set_log_level(verbose='WARNING', return_old_level=False, add_frames=None)

In [21]:
plt.rcParams['figure.figsize'] = [12, 8]
plt.rcParams['figure.dpi'] = 200 # 200 

In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import os
import pingouin as pg
from scipy.stats import shapiro, wilcoxon
import seaborn as sns

# Ensure 'configss' is defined appropriately
# Example:
# configss = {
#     'root': '/path/to/data',
#     'data_qa': 'qa_data_directory'
# }

# -----------------------------
# 1. Data Loading and Preparation
# -----------------------------

# List of participant IDs excluding certain ones
participant_ids = [el for el in range(1, 32) if el not in [14, 5, 13, 16, 17, 20, 31]]

# Initialize an empty list to store DataFrames
dataframes = []

for pnum in tqdm(participant_ids, desc="Loading Participant Data"):
    participant_data_path = f"{pnum}.csv"
    path_qa = os.path.join(configss['root'], configss['data_qa'], participant_data_path)
    
    # Check if file exists
    if not os.path.exists(path_qa):
        print(f"Warning: File {path_qa} does not exist. Skipping participant {pnum}.")
        continue
    
    # Read the data for the current participant
    participant_df = pd.read_csv(path_qa)
    
    # Add a column for ParticipantID
    participant_df['ParticipantID'] = pnum
    
    # Append to the list
    dataframes.append(participant_df)

# Concatenate all participant DataFrames into one DataFrame
if not dataframes:
    raise ValueError("No participant data loaded. Please check the file paths and existence.")

df = pd.concat(dataframes, axis=0).reset_index(drop=True)


# Function to replace outliers with median
def replace_outliers(df, group_col, value_col):
    df_copy = df.copy()
    medians = df.groupby(group_col)[value_col].transform('median')
    
    Q1 = df[value_col].quantile(0.25)
    Q3 = df[value_col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    # Replace outliers with median
    df_copy.loc[(df[value_col] < lower_bound) | (df[value_col] > upper_bound), value_col] = medians
    return df_copy

# Replace outliers in the combined data
df = replace_outliers(df, 'BlockType', 'Correct')

# -----------------------------
# 2. Data Aggregation
# -----------------------------

# Check for multiple entries per ParticipantID and BlockType
duplicates = df.duplicated(subset=['ParticipantID', 'BlockType'])
if duplicates.any():
    print("Found duplicate entries for some ParticipantID and BlockType combinations. Aggregating by mean.")
    # Aggregate by mean to have one 'Correct' score per ParticipantID per BlockType
    df = df.groupby(['ParticipantID', 'BlockType'], as_index=False)['Correct'].mean()

# Verify that each participant has all BlockTypes
expected_block_types = df['BlockType'].unique()
participants_block_types = df.groupby('ParticipantID')['BlockType'].nunique()

missing_blocks = participants_block_types[participants_block_types != len(expected_block_types)]
if not missing_blocks.empty:
    print("The following ParticipantIDs are missing some BlockTypes and will be excluded:")
    print(missing_blocks[missing_blocks != len(expected_block_types)].index.tolist())
    # Exclude participants with incomplete data
    df = df[~df['ParticipantID'].isin(missing_blocks.index.tolist())]

# -----------------------------
# 3. Normality Testing
# -----------------------------

# Perform Shapiro-Wilk test for normality on 'Correct' scores within each BlockType
normality_results = {}
for block_type in df['BlockType'].unique():
    stat, p = shapiro(df[df['BlockType'] == block_type]['Correct'])
    normality_results[block_type] = p
    print(f"Shapiro-Wilk Test for BlockType '{block_type}': p-value = {p:.4f}")

# Determine overall normality
is_normal = all(p > 0.05 for p in normality_results.values())
print(f"\nOverall Normality Assumption Met: {is_normal}")

# -----------------------------
# 4. Statistical Analysis
# -----------------------------

if is_normal:
    # -----------------------------
    # 4a. Repeated Measures ANOVA
    # -----------------------------
    print("\nData is normally distributed. Performing Repeated Measures ANOVA.")
    
    # Perform Repeated Measures ANOVA using pingouin
    aov = pg.rm_anova(dv='Correct', within='BlockType', subject='ParticipantID', data=df, detailed=True)
    print("\nANOVA Results:")
    print(aov)
    
    # Extract F-statistic and partial eta squared
    f_stat = aov.loc[aov['Source'] == 'BlockType', 'F'].values[0]
    p_value = aov.loc[aov['Source'] == 'BlockType', 'p-unc'].values[0]
    partial_eta_sq = aov.loc[aov['Source'] == 'BlockType', 'np2'].values[0]
    
    print(f"\nF-statistic: {f_stat:.4f}")
    print(f"P-value: {p_value:.4f}")
    print(f"Partial Eta Squared: {partial_eta_sq:.4f}")
    
    # Effect size is already provided as partial_eta_sq
    effect_size = partial_eta_sq
    
else:
    # -----------------------------
    # 4b. Wilcoxon Signed-Rank Test
    # -----------------------------
    print("\nData is not normally distributed. Performing pairwise Wilcoxon signed-rank tests.")
    
    block_types = sorted(df['BlockType'].unique())  # Sort for consistent pairing
    wilcoxon_results = []
    
    # Pivot the data to have one row per ParticipantID and columns for each BlockType
    pivot_df = df.pivot(index='ParticipantID', columns='BlockType', values='Correct')
    
    # Identify all unique pairs of BlockTypes for pairwise comparisons
    from itertools import combinations
    block_type_pairs = list(combinations(block_types, 2))
    
    for pair in block_type_pairs:
        bt1, bt2 = pair
        data1 = pivot_df[bt1]
        data2 = pivot_df[bt2]
        
        # Ensure no missing data
        paired_data = pivot_df[[bt1, bt2]].dropna()
        data1 = paired_data[bt1]
        data2 = paired_data[bt2]
        
        # Perform Wilcoxon signed-rank test
        stat, p = wilcoxon(data1, data2, alternative='two-sided')
        
        # Compute rank-biserial correlation as effect size
        # Using formula: r = Z / sqrt(N)
        # However, scipy's wilcoxon does not return Z, so we use pingouin's compute_effsize
        # Alternatively, use the formula: r = (W - (n*(n+1)/4)) / sqrt(n*(n+1)*(2*n+1)/24)
        # But it's simpler to use pingouin
        
        # Compute effect size using pingouin's compute_effsize
        # Requires 'paired=True' and 'eftype' parameter
        # Since data is paired, we pass data1 and data2 directly
        # 'rank_biserial' is one option, but pingouin may not support it directly
        # Instead, use 'r' which is rank correlation
        effsize = pg.compute_effsize(paired_data[bt1], paired_data[bt2], paired=True, eftype='r')
        
        wilcoxon_results.append({
            'Block Type 1': bt1,
            'Block Type 2': bt2,
            'Statistic (Wilcoxon W)': stat,
            'P-value': p,
            'Effect Size (r)': effsize
        })
    
    # Convert results to DataFrame
    wilcoxon_df = pd.DataFrame(wilcoxon_results)
    
    # Adjust p-values for multiple comparisons using Bonferroni correction
    wilcoxon_df['Adjusted P-value'] = wilcoxon_df['P-value'] * len(block_type_pairs)
    wilcoxon_df['Adjusted P-value'] = wilcoxon_df['Adjusted P-value'].apply(lambda x: min(x, 1.0))
    
    print("\nPairwise Wilcoxon Signed-Rank Test Results (Bonferroni Adjusted):")
    print(wilcoxon_df)

    # For visualization, you might want to annotate significant differences
    # This can be done using statistical annotations libraries like statannot or manually

# -----------------------------
# 5. Visualization
# -----------------------------

# Calculate means, counts, stds for each BlockType
stats_df = df.groupby('BlockType')['Correct'].agg(['mean', 'count', 'std']).reset_index()
stats_df['sem'] = stats_df['std'] / np.sqrt(stats_df['count'])  # Standard Error of Mean
stats_df['ci95'] = 1.96 * stats_df['sem']  # 95% Confidence Interval

# Plotting
plt.figure(figsize=(10, 6))
sns.barplot(x='BlockType', y='Correct', data=df, ci= 95, palette='viridis')

# Adding error bars manually for more control
# plt.errorbar(x=np.arange(len(stats_df)), y=stats_df['mean'], yerr=stats_df['ci95'],
            #  fmt='none', c='black', capsize=10)

# Adding titles and labels
if is_normal:
    plt.title(f'Q&A Accuracy by Block Type')
else:
    plt.title('Q&A Accuracy by Block Type')
plt.xlabel('Block Type')
plt.ylabel('Accuracy (Proportion of Correct Answers)')

# Improve layout
plt.tight_layout()
plt.show()

# -----------------------------
# 6. Descriptive Statistics and Effect Sizes
# -----------------------------

if is_normal:
    # Display descriptive statistics and effect size
    display_df = stats_df[['BlockType', 'mean', 'std']]
    display_df = display_df.rename(columns={'mean': 'Mean Correct', 'std': 'Std Deviation'})
    display_df['Partial Eta Squared'] = partial_eta_sq
    print("\nDescriptive Statistics and Effect Size:")
    print(display_df)
else:
    # Display Wilcoxon test results with descriptive statistics
    print("\nDescriptive Statistics:")
    print(stats_df[['BlockType', 'mean', 'std']].rename(columns={'mean': 'Mean Correct', 'std': 'Std Deviation'}))
    print("\nPairwise Wilcoxon Signed-Rank Test Results:")
    print(wilcoxon_df)


Loading Participant Data: 100%|██████████| 24/24 [00:00<00:00, 786.14it/s]

Found duplicate entries for some ParticipantID and BlockType combinations. Aggregating by mean.
Shapiro-Wilk Test for BlockType 'D': p-value = 0.1389
Shapiro-Wilk Test for BlockType 'ND': p-value = 0.0068

Overall Normality Assumption Met: False

Data is not normally distributed. Performing pairwise Wilcoxon signed-rank tests.

Pairwise Wilcoxon Signed-Rank Test Results (Bonferroni Adjusted):
  Block Type 1 Block Type 2  Statistic (Wilcoxon W)   P-value  \
0            D           ND                    67.5  0.054613   

   Effect Size (r)  Adjusted P-value  
0         0.312952          0.054613  

Descriptive Statistics:
  BlockType  Mean Correct  Std Deviation
0         D      0.687500       0.172454
1        ND      0.774306       0.135666

Pairwise Wilcoxon Signed-Rank Test Results:
  Block Type 1 Block Type 2  Statistic (Wilcoxon W)   P-value  \
0            D           ND                    67.5  0.054613   

   Effect Size (r)  Adjusted P-value  
0         0.312952          0.05



The `ci` parameter is deprecated. Use `errorbar=('ci', 95)` for the same effect.

  sns.barplot(x='BlockType', y='Correct', data=df, ci= 95, palette='viridis')

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='BlockType', y='Correct', data=df, ci= 95, palette='viridis')
