This notebook reproduces the statistical tests described in the manuscript:

- Normality: Shapiro–Wilk test (or Kolmogorov–Smirnov if n > 5000)  
- Variance equality: Levene’s test (if both groups normal)  
- Group comparison:
  - Student’s t-test (equal variances)  
  - Welch’s t-test (unequal variances)  
  - Mann–Whitney U test (if non-normal)  

Outputs:  
- Sample sizes per group  
- Raw p-values for all pairwise comparisons

Note: the p-values written by this notebook are raw (uncorrected). For multiple comparisons, the Bonferroni adjustment is applied manually afterwards.

In [None]:
# imports
import pandas as pd
import itertools
import scipy.stats as stats
from statsmodels.stats.multitest import multipletests

### Step 1: Load data

In [None]:
# change input_path
input_file = "data/example_input.xlsx"

# change sheetname
df = pd.read_excel(input_file, sheet_name="Sheet1")

# change to the zero-based positions of the group and value columns in the
# sheet (these settings were referenced below but never defined, which raised
# a NameError; the defaults assume the first two columns -- adjust as needed)
group_col_ID = 0
value_col_ID = 1

# Keep only group and value columns, and give them the fixed names
# ("group", "value") that the analysis cell relies on
df = df.iloc[:, [group_col_ID, value_col_ID]]
df.columns = ["group", "value"]

df.head()

### Step 2: Validate, counts, summary

In [None]:
# Output file path
output_file_path = 'file_name.txt'

# Initialize a list to store result lines
result_lines = []

# Work on the two-column frame built in the load step ("group", "value").
# Rows with a missing group or value are dropped up front: the per-group
# counts below already ignore them, and NaNs would otherwise make the
# normality tests return NaN and silently route every pair to Mann-Whitney.
data = df.dropna(subset=['group', 'value'])


def _normality_p_value(values):
    """Return the normality-test p-value for one group's values.

    Shapiro-Wilk is used for up to 5000 observations. For larger samples a
    Kolmogorov-Smirnov test is run against a normal distribution with the
    sample's own mean and standard deviation (a bare 'norm' would compare
    against the standard normal N(0, 1) and reject almost any
    unstandardised data).
    """
    if len(values) <= 5000:
        return stats.shapiro(values).pvalue
    # NOTE: the parameters are estimated from the same sample, so this KS
    # p-value is conservative (a Lilliefors correction is not applied).
    return stats.kstest(values, 'norm', args=(values.mean(), values.std(ddof=1))).pvalue


# Step 1: Calculate and log group counts
group_counts = data.groupby('group')['value'].count()
result_lines.append("Count of observations within each group:\n")
print("Count of observations within each group:")
for group, count in group_counts.items():
    line = f"{group}: {count} observations"
    print(line)
    result_lines.append(line)
result_lines.append("\n")  # Add a blank line for separation

# Step 2: Perform pairwise comparisons between all groups
groups = data['group'].unique()
comparisons = []

for group1, group2 in itertools.combinations(groups, 2):
    group1_data = data.loc[data['group'] == group1, 'value']
    group2_data = data.loc[data['group'] == group2, 'value']

    # Normality check per group (Shapiro-Wilk, or KS for n > 5000)
    shapiro_group1_p = _normality_p_value(group1_data)
    shapiro_group2_p = _normality_p_value(group2_data)

    group1_normal = shapiro_group1_p > 0.05
    group2_normal = shapiro_group2_p > 0.05

    # Perform Levene's test for variance assumption (only if both groups are normal)
    if group1_normal and group2_normal:
        levene_stat, levene_p = stats.levene(group1_data, group2_data)

        # Perform the appropriate t-test
        if levene_p > 0.05:
            t_stat, p_value = stats.ttest_ind(group1_data, group2_data, equal_var=True)  # Student's t-test
            test_type = "Student's t-test (equal variances assumed)"
        else:
            t_stat, p_value = stats.ttest_ind(group1_data, group2_data, equal_var=False)  # Welch's t-test
            test_type = "Welch's t-test (unequal variances)"

    else:
        # If at least one group is not normal, use Mann-Whitney U test
        u_stat, p_value = stats.mannwhitneyu(group1_data, group2_data, alternative='two-sided')
        test_type = "Mann-Whitney U test (non-parametric)"

    # Store results. The raw (uncorrected) p-value is written with 10 decimal
    # places, so values below about 5e-11 are printed as 0.0000000000.
    comparisons.append(f'{group1} vs {group2}')
    result_lines.append(f'{test_type} for {group1} vs {group2}: p-value = {p_value:.10f}')

# Step 3: Save all results to a single text file
# (explicit UTF-8 so non-ASCII group labels are written the same on every platform)
with open(output_file_path, 'w', encoding='utf-8') as file:
    file.writelines(line + '\n' for line in result_lines)

print(f"Results saved to {output_file_path}")