In [18]:
import PyQt6.QtCore
import os
os.environ["QT_API"] = "pyqt6"
import matplotlib.pyplot as plt

# Use qt backend for matplotlab to use interactive mne plots
%matplotlib qt

import mne 
import analysis.processing
import pandas as pd
import csv 
import os
from config import Config
configObj = Config()
from mne_connectivity import spectral_connectivity_time
import numpy as np
configss = configObj.getConfigSnapshot()
from tqdm import tqdm
import tools.helpers
from scipy import stats

mne.set_log_level(verbose='WARNING', return_old_level=False, add_frames=None)

In [19]:
plt.rcParams['figure.figsize'] = [12, 8]
plt.rcParams['figure.dpi'] = 200 # 200 

In [22]:
import pandas as pd
import numpy as np
from scipy.stats import shapiro, wilcoxon, ttest_rel
import pingouin as pg
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import os
from statsmodels.stats.multitest import multipletests

# -----------------------------
# 1. Data Loading and Cleaning
# -----------------------------

# List of participant numbers excluding specified ones
participant_numbers = [el for el in range(1, 32) if el not in [5, 13, 14, 16, 17, 20, 31]]

# Initialize an empty list to store DataFrames
dataframes = []

for pnum in tqdm(participant_numbers, desc="Loading Data"):
    participant_data_path = f"{pnum}.csv"
    path_qa = os.path.join(configss['root'], configss['data_completion'], participant_data_path)
    
    # Check if file exists
    if not os.path.exists(path_qa):
        print(f"Warning: File {path_qa} does not exist. Skipping participant {pnum}.")
        continue
    
    # Read and append the DataFrame
    try:
        df_participant = pd.read_csv(path_qa)
        df_participant['PID'] = pnum  # Ensure 'PID' column exists
        dataframes.append(df_participant)
    except Exception as e:
        print(f"Error reading {path_qa}: {e}")

# Check if any data was loaded
if not dataframes:
    raise ValueError("No participant data loaded. Please check file paths and data.")

# Concatenate all DataFrames into one combined DataFrame
combined_df = pd.concat(dataframes, axis=0).reset_index(drop=True)

# -----------------------------
# 2. Outlier Replacement
# -----------------------------

def replace_outliers(df, group_col, value_col):
    """
    Replace outliers in 'value_col' within each 'group_col' group with the group's median.
    Outliers are defined as values below Q1 - 1.5*IQR or above Q3 + 1.5*IQR.
    """
    df_copy = df.copy()
    medians = df.groupby(group_col)[value_col].transform('median')
    
    Q1 = df[value_col].quantile(0.25)
    Q3 = df[value_col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    # Replace outliers with median
    outliers = (df[value_col] < lower_bound) | (df[value_col] > upper_bound)
    num_outliers = outliers.sum()
    print(f"Replacing {num_outliers} outliers in '{value_col}' within each '{group_col}' group.")
    df_copy.loc[outliers, value_col] = medians[outliers]
    
    return df_copy

# Replace outliers in the combined data
cleaned_df = replace_outliers(combined_df, 'BlockType', 'completion')

# -----------------------------
# 3. Data Aggregation
# -----------------------------

# Aggregate the data by computing mean 'completion' per PID and BlockType
aggregated_df = cleaned_df.groupby(['PID', 'BlockType'])['completion'].mean().reset_index()

# Verify no duplicates
duplicate_counts = aggregated_df.duplicated(subset=['PID', 'BlockType']).sum()
print(f"\nNumber of duplicate PID-BlockType combinations after aggregation: {duplicate_counts}")

if duplicate_counts > 0:
    raise ValueError("Duplicates found after aggregation. Please check your data.")

# Ensure that each PID has data for all BlockTypes
block_types = aggregated_df['BlockType'].unique()
block_type_counts = aggregated_df.groupby('PID')['BlockType'].nunique()

missing_blocktypes = block_type_counts[block_type_counts != len(block_types)].index.tolist()
if missing_blocktypes:
    print(f"\nParticipants missing some BlockTypes: {missing_blocktypes}")
    # Exclude these participants from the analysis
    aggregated_df = aggregated_df[~aggregated_df['PID'].isin(missing_blocktypes)]
    print(f"Excluded participants missing BlockTypes. Remaining participants: {aggregated_df['PID'].nunique()}")

# -----------------------------
# 4. Normality Testing on Each Block Type Separately
# -----------------------------

# Pivot the data to have one row per participant with columns for each BlockType
pivot_df = aggregated_df.pivot(index='PID', columns='BlockType', values='completion')

# Check normality for each BlockType separately
normality_results = {}
for block in block_types:
    shapiro_stat, shapiro_p = shapiro(pivot_df[block])
    normality_results[block] = {'statistic': shapiro_stat, 'p-value': shapiro_p}
    print(f"\nShapiro-Wilk Test for BlockType '{block}':")
    print(f"Statistic = {shapiro_stat:.4f}, p-value = {shapiro_p:.4f}")

# -----------------------------
# 5. Paired Test Logic Based on Normality
# -----------------------------

if len(block_types) == 2:
    # Exactly two BlockTypes: proceed with paired tests
    block1, block2 = block_types
    print(f"\nBlock Types identified for paired comparison: '{block1}' and '{block2}'")
    
    # Calculate differences
    pivot_df['Difference'] = pivot_df[block1] - pivot_df[block2]
    
    # Check if both blocks are normally distributed
    normality = (normality_results[block1]['p-value'] > 0.05) and (normality_results[block2]['p-value'] > 0.05)
    print(f"\nNormality for Both Blocks: {'Pass' if normality else 'Fail'}")

    if normality:
        print("\nData is normally distributed. Performing Paired t-test.")
        # Perform Paired t-test
        t_stat, t_p_value = ttest_rel(pivot_df[block1], pivot_df[block2])
        print(f"\nPaired t-test Results:")
        print(f"t-statistic = {t_stat:.4f}, p-value = {t_p_value:.4f}")
        
        # Calculate Cohen's d for paired samples
        differences = pivot_df['Difference']
        cohen_d = differences.mean() / differences.std(ddof=1)
        print(f"Cohen's d = {cohen_d:.4f}")
        
        effect_size = cohen_d
    else:
        print("\nData is not normally distributed. Performing Wilcoxon signed-rank test.")
        # Perform Wilcoxon signed-rank test
        non_zero = pivot_df['Difference'] != 0
        completion1 = pivot_df.loc[non_zero, block1]
        completion2 = pivot_df.loc[non_zero, block2]

        # stat, p = wilcoxon(data1, data2, alternative='two-sided')

        test_res = pg.wilcoxon(completion1, completion2)
        wilcoxon_test_stat = test_res['W-val'].values[0]
        wilcoxon_pvalue = test_res['p-val'].values[0]
        effect_size = test_res['RBC'].values[0]  # Rank-biserial correlation
        effect_size_name = 'Rank-biserial correlation'
        
        # wilcoxon_test_stat, wilcoxon_pvalue = wilcoxon(completion1, completion2, alternative='two-sided')
        print(f"\nWilcoxon Signed-Rank Test Results:")
        print(f"Statistic = {wilcoxon_test_stat}, p-value = {wilcoxon_pvalue:.4f}")
        
        # Calculate Effect Size (Rank Biserial Correlation) using pingouin
        # effect_size = pg.compute_effsize(completion1, completion2, paired=True, eftype='r')
        print(f"Effect Size (Rank Biserial Correlation) = {effect_size:.4f}")

else:
    # More than two BlockTypes: proceed with ANOVA or Friedman test
    print("\nMore than two BlockTypes detected.")
    normality = all([result['p-value'] > 0.05 for result in normality_results.values()])
    
    if normality:
        print("\nData is normally distributed. Performing Repeated Measures ANOVA.")
        aov = pg.rm_anova(
            data=aggregated_df,
            dv='completion',
            within='BlockType',
            subject='PID',
            detailed=True
        )
        print("\nANOVA Results:")
        print(aov)
        
        partial_eta_squared = aov.loc[aov['Source'] == 'BlockType', 'np2'].values[0]
        print(f"\nPartial Eta Squared: {partial_eta_squared:.4f}")
        effect_size = partial_eta_squared
    else:
        print("\nData is not normally distributed. Performing Friedman test.")
        friedman_data = [pivot_df[bt].dropna() for bt in block_types]
        
        friedman_stat, friedman_pvalue = friedmanchisquare(*friedman_data)
        print(f"\nFriedman Test Results:")
        print(f"Statistic = {friedman_stat:.4f}, p-value = {friedman_pvalue:.4f}")
        
        N = pivot_df.shape[0]
        k = len(block_types)
        partial_eta_squared = friedman_stat / (N * (k - 1))
        print(f"Partial Eta Squared = {partial_eta_squared:.4f}")
        effect_size = partial_eta_squared

# -----------------------------
# 6. Visualization
# -----------------------------

plt.figure(figsize=(10, 6))
sns.barplot(x='BlockType', y='completion', data=aggregated_df, ci=95, palette='viridis', capsize=0.1)
plt.title('Average Completion by Block Type')
plt.xlabel('Block Type')
plt.ylabel('Average Completion')
plt.tight_layout()
plt.show()

# -----------------------------
# 7. Descriptive Statistics and Effect Size Reporting
# -----------------------------

mean_completion = aggregated_df.groupby('BlockType')['completion'].mean()
std_completion = aggregated_df.groupby('BlockType')['completion'].std()

if len(block_types) == 2:
    if normality:
        display_data = {
            'Mean Completion': mean_completion,
            'Std Deviation': std_completion,
            'Effect Size (Cohen\'s d)': [effect_size] * len(mean_completion)
        }
    else:
        display_data = {
            'Mean Completion': mean_completion,
            'Std Deviation': std_completion,
            'Wilcoxon Test Statistic': [wilcoxon_test_stat],
            'Wilcoxon P-Value': [wilcoxon_pvalue],
            'Effect Size (r)': [effect_size]
        }
else:
    if normality:
        display_data = {
            'Mean Completion': mean_completion,
            'Std Deviation': std_completion,
            'Partial Eta Squared': [effect_size] * len(mean_completion)
        }
    else:
        display_data = {
            'Mean Completion': mean_completion,
            'Std Deviation': std_completion,
            'Friedman Test Statistic': [friedman_stat],
            'Friedman P-Value': [friedman_pvalue],
            'Effect Size (Partial Eta Squared)': [effect_size]
        }

display_df = pd.DataFrame(display_data)
print("\nDescriptive Statistics and Effect Size:")
print(display_df)


Loading Data: 100%|██████████| 24/24 [00:00<00:00, 786.40it/s]

Replacing 16 outliers in 'completion' within each 'BlockType' group.

Number of duplicate PID-BlockType combinations after aggregation: 0

Shapiro-Wilk Test for BlockType 'D':
Statistic = 0.9430, p-value = 0.1904

Shapiro-Wilk Test for BlockType 'ND':
Statistic = 0.7588, p-value = 0.0001

Block Types identified for paired comparison: 'D' and 'ND'

Normality for Both Blocks: Fail

Data is not normally distributed. Performing Wilcoxon signed-rank test.

Wilcoxon Signed-Rank Test Results:
Statistic = 29.0, p-value = 0.0016
Effect Size (Rank Biserial Correlation) = -0.7489




The `ci` parameter is deprecated. Use `errorbar=('ci', 95)` for the same effect.

  sns.barplot(x='BlockType', y='completion', data=aggregated_df, ci=95, palette='viridis', capsize=0.1)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='BlockType', y='completion', data=aggregated_df, ci=95, palette='viridis', capsize=0.1)


ValueError: array length 1 does not match index length 2

In [27]:
# Function to calculate median and IQR
def calculate_iqr(df, group_col, value_col):
    iqr_data = df.groupby(group_col)[value_col].agg([np.median, lambda x: np.percentile(x, 75) - np.percentile(x, 25)])
    iqr_data.columns = ['Median', 'IQR']
    iqr_data['Quartile Deviation'] = iqr_data['IQR'] / 2
    return iqr_data

# Calculate IQR for each condition
iqr_df = calculate_iqr(aggregated_df, 'BlockType', 'completion')

# Plot boxplot with viridis color palette
plt.figure(figsize=(10, 6))
sns.set_palette('viridis')
sns.boxplot(x='BlockType', y='completion',data=aggregated_df)


# Display median and IQR
# for i, block_type in enumerate(iqr_df.index):
#     median = iqr_df.loc[block_type, 'Median']
#     iqr_range = iqr_df.loc[block_type, 'IQR']
#     plt.text(i, median, f'Median: {median:.2f}\nIQR: {iqr_range:.2f}', 
#              ha='center', va='center', color='white', fontsize=10, bbox=dict(facecolor='black', alpha=0.5))
plt.xlabel('Block Type')
plt.ylabel('Average Completion')
plt.title("Average Completion by Block Type")
plt.show()

print(iqr_df)


              Median        IQR  Quartile Deviation
BlockType                                          
D          91.458333  10.462240            5.231120
ND         98.789062   6.783854            3.391927


  iqr_data = df.groupby(group_col)[value_col].agg([np.median, lambda x: np.percentile(x, 75) - np.percentile(x, 25)])


In [16]:
display_data

{'Mean Completion': BlockType
 D     91.542969
 ND    96.158312
 Name: completion, dtype: float64,
 'Std Deviation': BlockType
 D     6.417297
 ND    5.210947
 Name: completion, dtype: float64,
 'Wilcoxon Test Statistic': [np.float64(29.0)],
 'Wilcoxon P-Value': [np.float64(0.0016002655029296875)],
 'Effect Size (r)': [np.float64(0.4320223989133355)]}

In [12]:
aggregated_df

Unnamed: 0,PID,BlockType,completion
0,1,D,95.0
1,1,ND,96.614583
2,2,D,88.307292
3,2,ND,98.151042
4,3,D,93.411458
5,3,ND,100.0
6,4,D,90.833333
7,4,ND,99.401042
8,6,D,91.458333
9,6,ND,90.325521
