## QC Reviewer Statistics

Author: Marc Biosca

In [1]:
import pandas as pd

def qc_statistics(csv_file, qc_columns=None):
    """
    Reads a CSV file of QC columns and prints out basic stats about
    how many subjects pass/fail each criterion, as well as overall pass/fail.

    Parameters
    ----------
    csv_file : str
        Path to the CSV file to read.
    qc_columns : iterable of str, optional
        Names of the per-criterion pass/fail columns to summarize. When
        omitted, the ten default FD / GM / WM / CSF / Total columns listed
        below are used.

    Raises
    ------
    KeyError
        If any requested QC column is absent from the CSV header.

    Notes
    -----
    The pass count of a column is obtained by summing it, so the columns are
    presumably booleans or 0/1 flags (TODO confirm against the real file).
    Missing values are skipped by the sum and therefore end up counted as
    failures, because the fail count is derived as ``total - pass``.
    """

    def _percentage(count, total):
        # A header-only CSV has zero subjects; report 0% instead of dividing by zero.
        return (count / total) * 100 if total else 0.0

    # 1. Read CSV into a DataFrame
    df = pd.read_csv(csv_file)

    # 2. Identify columns (excluding subject_id, if needed)
    #    Pass `qc_columns` explicitly if your file uses different headers, e.g.
    #    qc_columns=[col for col in df.columns if col != "subject_id"]
    if qc_columns is None:
        qc_columns = [
            "FD_Pass", "FD_max_Pass",
            "GM_lower_Pass", "GM_upper_Pass",
            "WM_lower_Pass", "WM_upper_Pass",
            "CSF_lower_Pass", "CSF_upper_Pass",
            "Total_lower_Pass", "Total_upper_Pass"
        ]
    else:
        # Materialize so one-shot iterables can be traversed more than once.
        qc_columns = list(qc_columns)

    # Fail early, naming every absent column, rather than crashing halfway
    # through the report on the first one that is missing.
    missing_columns = [col for col in qc_columns if col not in df.columns]
    if missing_columns:
        raise KeyError(
            f"QC column(s) not found in {csv_file}: {missing_columns}. "
            f"Available columns: {list(df.columns)}"
        )

    # 3. Total number of subjects
    num_subjects = len(df)
    print(f"Total number of subjects: {num_subjects}\n")

    # 4. Compute pass/fail counts and percentages for each QC column
    stats_dict = {}
    for col in qc_columns:
        # Number of subjects that pass this column (True = 1, False = 0).
        # With no rows there is nothing to sum, so the count is simply 0.
        pass_count = df[col].sum() if num_subjects else 0
        fail_count = num_subjects - pass_count

        stats_dict[col] = {
            'pass_count': pass_count,
            'fail_count': fail_count,
            'pass_percentage': _percentage(pass_count, num_subjects),
            'fail_percentage': _percentage(fail_count, num_subjects)
        }

    # 5. Print out the summary for each criterion
    for col, stats in stats_dict.items():
        print(f"--- {col} ---")
        print(f"  Pass: {stats['pass_count']} ({stats['pass_percentage']:.2f}%)")
        print(f"  Fail: {stats['fail_count']} ({stats['fail_percentage']:.2f}%)\n")

    # 6. If 'Pass' is your final overall pass/fail column,
    #    also report how many subjects pass vs. fail overall.
    if "Pass" in df.columns:
        overall_pass = df["Pass"].sum() if num_subjects else 0
        overall_fail = num_subjects - overall_pass
        print("=== Overall QC ===")
        print(f" Pass: {overall_pass} subjects ({_percentage(overall_pass, num_subjects):.2f}%)")
        print(f" Fail: {overall_fail} subjects ({_percentage(overall_fail, num_subjects):.2f}%)\n")


if __name__ == "__main__":
    # Demo invocation: summarize the automatic-rejections CSV.
    # Point `csv_file` at your own QC export to analyze a different dataset.
    qc_statistics(
        csv_file="/Users/marc/Downloads/automatic_rejections-2.csv",
    )


Total number of subjects: 51

--- FD_Pass ---
  Pass: 34 (66.67%)
  Fail: 17 (33.33%)

--- FD_max_Pass ---
  Pass: 22 (43.14%)
  Fail: 29 (56.86%)

--- GM_lower_Pass ---
  Pass: 51 (100.00%)
  Fail: 0 (0.00%)

--- GM_upper_Pass ---
  Pass: 49 (96.08%)
  Fail: 2 (3.92%)

--- WM_lower_Pass ---
  Pass: 47 (92.16%)
  Fail: 4 (7.84%)

--- WM_upper_Pass ---
  Pass: 50 (98.04%)
  Fail: 1 (1.96%)

--- CSF_lower_Pass ---
  Pass: 50 (98.04%)
  Fail: 1 (1.96%)

--- CSF_upper_Pass ---
  Pass: 47 (92.16%)
  Fail: 4 (7.84%)

--- Total_lower_Pass ---
  Pass: 51 (100.00%)
  Fail: 0 (0.00%)

--- Total_upper_Pass ---
  Pass: 49 (96.08%)
  Fail: 2 (3.92%)

=== Overall QC ===
 Pass: 0 subjects (0.00%)
 Fail: 51 subjects (100.00%)



In [8]:
import json
from collections import defaultdict, Counter

def analyze_multireviewer_qc(json_file):
    """
    Analyzes a JSON file containing QC decisions from 3 reviewers (marc, roser, raul).

    The file is read as a JSON object whose "marc", "roser" and "raul" keys
    each map subject IDs to that reviewer's decision; a reviewer key that is
    absent is treated as having no reviews. Subjects are grouped by how many
    of the three reviewers rated them (1, 2 or 3), and for each group the
    function prints the yes/no/maybe tallies plus agreement statistics
    (including how many 2-reviewer subjects have all yes/no/maybe).

    Parameters
    ----------
    json_file : str
        Path to the JSON file to read.

    Notes
    -----
    Decisions are sorted and joined into pattern keys such as "maybe/no",
    so they are assumed to be strings (TODO confirm against the real data).
    Only "yes", "no" and "maybe" are reported in the per-group tallies.
    """

    def percent(count, total):
        # A group may be empty (e.g. no subject has 3 reviews); report 0.0
        # instead of raising ZeroDivisionError.
        return 100 * count / total if total else 0.0

    # 1) Load JSON data
    with open(json_file, 'r') as f:
        data = json.load(f)

    # Reviewer order (marc, roser, raul) fixes the order of each review list.
    reviewer_dicts = [data.get(name, {}) for name in ("marc", "roser", "raul")]

    # 2) Collect all unique subject IDs across all three reviewers
    all_subjects = set()
    for decisions in reviewer_dicts:
        all_subjects.update(decisions.keys())
    total_subject_count = len(all_subjects)

    # subject_reviews maps subjectID -> list of its reviews (len 1, 2, or 3)
    subject_reviews = {
        subj: [decisions[subj] for decisions in reviewer_dicts if subj in decisions]
        for subj in all_subjects
    }

    # 3) Classify subjects into 3 groups: reviewed by 1, 2, or 3 reviewers.
    #    Every subject comes from at least one reviewer, so len(revs) is 1..3.
    groups = {1: [], 2: [], 3: []}
    for subj, revs in subject_reviews.items():
        groups[len(revs)].append(subj)
    group_1, group_2, group_3 = groups[1], groups[2], groups[3]

    # Helper function to get counts of yes/no/maybe and patterns
    def summarize_reviews(subject_list):
        counter = Counter()
        patterns = Counter()
        for subj in subject_list:
            revs = subject_reviews[subj]
            counter.update(revs)
            # Sort to create a consistent pattern key (e.g., "maybe/no", "yes/yes")
            patterns["/".join(sorted(revs))] += 1
        return counter, patterns

    # 4) Summarize each group
    counter_group1, _ = summarize_reviews(group_1)
    counter_group2, patterns_group2 = summarize_reviews(group_2)
    counter_group3, patterns_group3 = summarize_reviews(group_3)

    # 5) Print overall stats
    print("===== REVIEW STATISTICS =====")
    print(f"Total unique subjects in file: {total_subject_count}\n")

    # --- Group 1 stats ---
    print(f"Subjects reviewed by exactly 1 reviewer: {len(group_1)}")
    print("  Counts of yes/no/maybe (for group 1):")
    for k in ["yes", "no", "maybe"]:
        print(f"    {k}: {counter_group1[k]}")
    print()

    # --- Group 2 stats ---
    total_subj_g2 = len(group_2)
    print(f"Subjects reviewed by exactly 2 reviewers: {total_subj_g2}")
    print("  Counts of yes/no/maybe (for group 2):")
    for k in ["yes", "no", "maybe"]:
        print(f"    {k}: {counter_group2[k]}")

    # Agreement vs. disagreement: a subject agrees when all its reviews are
    # identical. Computed from the review lists directly rather than by
    # splitting the joined pattern key, which would misfire on a "/" in a value.
    agree_count_g2 = sum(
        1 for subj in group_2 if len(set(subject_reviews[subj])) == 1
    )
    disagree_count_g2 = total_subj_g2 - agree_count_g2

    if total_subj_g2 > 0:
        print(f"  # of 2-reviewer subjects with perfect agreement: {agree_count_g2} "
              f"({percent(agree_count_g2, total_subj_g2):.1f}%)")
        print(f"  # of 2-reviewer subjects with disagreement: {disagree_count_g2} "
              f"({percent(disagree_count_g2, total_subj_g2):.1f}%)\n")

    # Count all-yes, all-no, all-maybe in 2-reviewer group
    all_yes_2 = patterns_group2["yes/yes"]
    all_no_2 = patterns_group2["no/no"]
    all_maybe_2 = patterns_group2["maybe/maybe"]
    print(f"  # of 2-reviewer subjects with all YES:   {all_yes_2}")
    print(f"  # of 2-reviewer subjects with all NO:    {all_no_2}")
    print(f"  # of 2-reviewer subjects with all MAYBE: {all_maybe_2}")
    partial_or_mixed_2 = total_subj_g2 - (all_yes_2 + all_no_2 + all_maybe_2)
    print(f"  # of 2-reviewer subjects with partial/mixed decisions: {partial_or_mixed_2}\n")

    # --- Group 3 stats ---
    total_subj_g3 = len(group_3)
    print(f"Subjects reviewed by exactly 3 reviewers: {total_subj_g3}")
    print("  Counts of yes/no/maybe (for group 3):")
    for k in ["yes", "no", "maybe"]:
        print(f"    {k}: {counter_group3[k]}")

    all_yes_3 = patterns_group3["yes/yes/yes"]
    all_no_3 = patterns_group3["no/no/no"]
    all_maybe_3 = patterns_group3["maybe/maybe/maybe"]

    perfect_3 = all_yes_3 + all_no_3 + all_maybe_3
    partial_or_mixed_3 = total_subj_g3 - perfect_3

    # Print 3-reviewer stats on perfect vs. partial agreement.
    # percent() keeps this safe when no subject has three reviews.
    print(f"  # of 3-reviewer subjects with perfect agreement: {perfect_3} "
          f"({percent(perfect_3, total_subj_g3):.1f}%)")
    print(f"    All YES:   {all_yes_3}")
    print(f"    All NO:    {all_no_3}")
    print(f"    All MAYBE: {all_maybe_3}")

    print(f"  # of 3-reviewer subjects with partial or mixed decisions: {partial_or_mixed_3} "
          f"({percent(partial_or_mixed_3, total_subj_g3):.1f}%) ")
    print()

# -----------------------
# Example usage:
# -----------------------
if __name__ == "__main__":
    # Demo invocation: report reviewer agreement for the exported QC decisions.
    # Point `json_file` at your own decisions file to analyze a different review round.
    analyze_multireviewer_qc(
        json_file="/Users/marc/Downloads/qc_decisions-1 copy.json",
    )


===== REVIEW STATISTICS =====
Total unique subjects in file: 1113

Subjects reviewed by exactly 1 reviewer: 471
  Counts of yes/no/maybe (for group 1):
    yes: 341
    no: 22
    maybe: 108

Subjects reviewed by exactly 2 reviewers: 460
  Counts of yes/no/maybe (for group 2):
    yes: 784
    no: 24
    maybe: 112
  # of 2-reviewer subjects with perfect agreement: 363 (78.9%)
  # of 2-reviewer subjects with disagreement: 97 (21.1%)

  # of 2-reviewer subjects with all YES:   346
  # of 2-reviewer subjects with all NO:    3
  # of 2-reviewer subjects with all MAYBE: 14
  # of 2-reviewer subjects with partial/mixed decisions: 97

Subjects reviewed by exactly 3 reviewers: 182
  Counts of yes/no/maybe (for group 3):
    yes: 379
    no: 31
    maybe: 136
  # of 3-reviewer subjects with perfect agreement: 93 (51.1%
    All YES:   84
    All NO:    1
    All MAYBE: 8
  # of 3-reviewer subjects with partial or mixed decisions: 89 (48.9%) 

