# This notebook was used for genotyping the draft coverage-based CNV calls

In [1]:
import os
import pandas as pd
import numpy as np

from concurrent.futures import ProcessPoolExecutor, as_completed
from tqdm.notebook import tqdm

# Initialising a bunch of useful dataframes

# Path to coverage-based pipeline's output directory
PATH_TO_EXECUTION = "../assets_pf8/03_execution"

# Loading in Pf8's public metadata and getting QC-passed samples
sample_meta_data = pd.read_csv("../assets_pf8/Pf_8_samples_20241212.txt", sep = "\t", usecols = ["Sample", "QC pass"])
qc_pass_samples  = sample_meta_data.loc[sample_meta_data["QC pass"] == True, "Sample"].tolist()

samples = [sample for sample in os.listdir(os.path.join(PATH_TO_EXECUTION, "postprocessing")) if sample in qc_pass_samples]

# Loading in the regions for which we want to make CNV calls (i.e., the 6 genes)
call_regions_dict = pd.read_csv("../assets_pf8/04_call_regions.tsv", sep = "\t").set_index("CALL_ID").to_dict("index")

# Reading which samples did not have a contig ploidy of 1 for which contigs/chromosomes
failed_test_ploidy = pd.read_csv(os.path.join(PATH_TO_EXECUTION, "failed_test_ploidy.tsv"), sep = "\t", header = None)
failed_test_ploidy.columns = ["SAMPLE", "CONTIG", "PLOIDY", "_"]

failed_train_ploidy = pd.read_csv(os.path.join(PATH_TO_EXECUTION, "failed_test_ploidy.tsv"), sep = "\t", header = None)
failed_train_ploidy.columns = ["SAMPLE", "CONTIG", "PLOIDY", "_"]

failed_ploidy_df = pd.merge(failed_test_ploidy, failed_train_ploidy)

call_regions_dict

{'MDR1': {'CONTIG': 'Pf3D7_05_v3',
  'START': 955955,
  'END': 963095,
  'STRAND': '+',
  'CALL_TYPE': 'CNV'},
 'CRT': {'CONTIG': 'Pf3D7_07_v3',
  'START': 402385,
  'END': 406341,
  'STRAND': '+',
  'CALL_TYPE': 'CNV'},
 'HRP2': {'CONTIG': 'Pf3D7_08_v3',
  'START': 1373212,
  'END': 1376988,
  'STRAND': '-',
  'CALL_TYPE': 'DEL'},
 'GCH1': {'CONTIG': 'Pf3D7_12_v3',
  'START': 974226,
  'END': 976097,
  'STRAND': '+',
  'CALL_TYPE': 'CNV'},
 'HRP3': {'CONTIG': 'Pf3D7_13_v3',
  'START': 2840236,
  'END': 2842840,
  'STRAND': '-',
  'CALL_TYPE': 'DEL'},
 'PM2_PM3': {'CONTIG': 'Pf3D7_14_v3',
  'START': 292244,
  'END': 299101,
  'STRAND': '+',
  'CALL_TYPE': 'CNV'}}

In [2]:
# Setting up the parameters for thresholding
MAX_N_SEGMENTS_ALLOWED = 5
MINIMUM_CONTIG_OCCUPANCY_CN1 = 0.8
MINIMUM_CALL_REGION_OCCUPANCY_FOR_CNV_CALL = 0.75
SUFFICIENT_QS_FOR_CNV_CALL = 400
SUFFICIENT_QA_FOR_CNV_CALL = 40
MINIMUM_RESCUE_WINDOW_OCCUPANCY_CN1_FOR_CNV_RESCUE = 0.8
MINIMUM_CALL_REGION_OCCUPANCY_CN1_FOR_CNV_RESCUE = 0.75

MAXIMUM_CONTIG_MEAN_ABSOLUTE_RESIDUALS = 0.3
MINIMUM_QS_FOR_DEL_CALL = 400

MINIMUM_QS_IN_CN1_REGION_FOR_RESCUE = 1000
MINIMUM_RESCUE_WINDOW_OCCUPANCY_CN1_FOR_DEL_RESCUE = 0.8
MINIMUM_CALL_REGION_OCCUPANCY_CN1_FOR_DEL_RESCUE = 0.2

def _annotate_with_coverage(s: pd.Series, region_start: int, region_end: int) -> float:
    """
    Helper function which calculates how much of the gene of interest is covered by the input segment
    """
    overlap_start = max([s["START"], region_start])
    overlap_end   = min([s["END"],   region_end])
    
    overlap_length = max([0, overlap_end - overlap_start])
    gene_length = (region_end - region_start)
    
    proportion_overlap = overlap_length / gene_length
    
    return proportion_overlap

def _call_del(df: pd.DataFrame, dCR: pd.DataFrame) -> int:
    seg_call_region_subset = df.loc[df.OCCUPANCY_OF_CALL_REGION > 0.].copy()
    
    segment_qc_pass = (seg_call_region_subset.QS >= MINIMUM_QS_FOR_DEL_CALL)
    seg_call_region_qc_pass_subset = seg_call_region_subset.loc[segment_qc_pass]
    
    no_suitable_qc_pass_segments = (len(seg_call_region_qc_pass_subset) == 0)
    acceptable_seg_residual_mean = (dCR.DCR_SEGMENT_RESIDUAL.mean() < MAXIMUM_CONTIG_MEAN_ABSOLUTE_RESIDUALS)

    if no_suitable_qc_pass_segments or not acceptable_seg_residual_mean:
        rescue_window = df.loc[
            (df.CN == 1) & (df.QS >= MINIMUM_QS_IN_CN1_REGION_FOR_RESCUE)]
        
        if rescue_window.SEGMENT_CONTIG_OCCUPANCY.sum() >= MINIMUM_RESCUE_WINDOW_OCCUPANCY_CN1_FOR_DEL_RESCUE:
            rescued_call_df = df.loc[
                (df.OCCUPANCY_OF_CALL_REGION > MINIMUM_CALL_REGION_OCCUPANCY_CN1_FOR_DEL_RESCUE) & (df.CN != 1)
            ].groupby("CN").OCCUPANCY_OF_CALL_REGION.sum().reset_index()
            
            if len(rescued_call_df) > 0:
                return (1 if (rescued_call_df.CN == 0).any() else 0)
            
            else:
                return -1
        
        else:
            return -1
        
    else:
        return (1 if (seg_call_region_qc_pass_subset.CN == 0).any() else 0)

def _call_cnv(df: pd.DataFrame, dCR: pd.DataFrame) -> int:
    seg_call_region_subset = df.loc[df.OCCUPANCY_OF_CALL_REGION >= MINIMUM_CALL_REGION_OCCUPANCY_FOR_CNV_CALL].copy()
    
    if (seg_call_region_subset[["QS", "QA"]].values == 0).any():
        return -1
    
    segment_qc_pass = (seg_call_region_subset.QS >= SUFFICIENT_QS_FOR_CNV_CALL) | (seg_call_region_subset.QA >= SUFFICIENT_QA_FOR_CNV_CALL)
    seg_call_region_qc_pass_subset = seg_call_region_subset.loc[segment_qc_pass]

    no_suitable_qc_pass_segments = (len(seg_call_region_qc_pass_subset) == 0)
    acceptable_seg_residual_mean = (dCR.DCR_SEGMENT_RESIDUAL.mean() < MAXIMUM_CONTIG_MEAN_ABSOLUTE_RESIDUALS)

    if no_suitable_qc_pass_segments or not acceptable_seg_residual_mean:
        rescue_window = df.loc[
            (df.CN == 1) & (df.QS >= MINIMUM_QS_IN_CN1_REGION_FOR_RESCUE)]
        
        if rescue_window.SEGMENT_CONTIG_OCCUPANCY.sum() >= MINIMUM_RESCUE_WINDOW_OCCUPANCY_CN1_FOR_CNV_RESCUE:
            rescued_call_df = df.loc[
                (df.OCCUPANCY_OF_CALL_REGION > MINIMUM_CALL_REGION_OCCUPANCY_CN1_FOR_CNV_RESCUE) & (df.CN != 1)
            ].groupby("CN").OCCUPANCY_OF_CALL_REGION.sum().reset_index()
            
            if len(rescued_call_df) > 0:
                return (1 if (rescued_call_df.CN > 1).any() else 0)
            
            else:
                return -1
        
        else:
            return -1
    else:
        return (1 if (seg_call_region_qc_pass_subset.CN > 1).any() else 0)

def genotype_sample(SAMPLE: str) -> dict:
    sample_results = {CALL_ID: "" for CALL_ID in call_regions_dict.keys()}
    sample_results["SAMPLE"]  = SAMPLE
    
    dCR = pd.read_csv(os.path.join(PATH_TO_EXECUTION, "postprocessing", SAMPLE, f"{SAMPLE}.dCR.tsv"),
                      sep = "\t", skiprows = 18)
    seg = pd.read_csv(os.path.join(PATH_TO_EXECUTION, "postprocessing", SAMPLE, f"{SAMPLE}.segments.vcf.gz"),
                      sep = "\t", skiprows = 35)
    seg["START"] = seg.POS.astype(int)
    seg["END"]   = seg.INFO.apply(lambda s: s.partition("END=")[2]).astype(int)
    
    split_data = seg[SAMPLE].str.split(":").apply(lambda x: tuple(map(int, x)))
    seg[["GT", "CN", "NP", "QA", "QS", "QSE", "QSS"]] = pd.DataFrame(
        [tuple(x) for x in zip(*split_data)]).T
    
    seg = seg[["#CHROM", "START", "END", "CN", "QA", "QS", "QSE", "QSS"]]

    for CALL_ID, call_meta in call_regions_dict.items():
        CALL_REGION_CHROM     = call_meta["CONTIG"]
        CALL_REGION_START     = call_meta["START"]
        CALL_REGION_END       = call_meta["END"]
        CALL_REGION_CALL_TYPE = call_meta["CALL_TYPE"]

        seg_inspection_subset = seg.loc[seg["#CHROM"] == CALL_REGION_CHROM].copy()

        seg_inspection_subset["SEGMENT_LENGTH"] = seg_inspection_subset.END - seg_inspection_subset.START
        seg_inspection_subset["SEGMENT_CONTIG_OCCUPANCY"] = seg_inspection_subset.SEGMENT_LENGTH / seg_inspection_subset.SEGMENT_LENGTH.sum()
        seg_inspection_subset["OCCUPANCY_OF_CALL_REGION"] = seg_inspection_subset.apply(lambda s: 
            _annotate_with_coverage(s, CALL_REGION_START, CALL_REGION_END),
        axis = 1)

        if CALL_REGION_CALL_TYPE == "DEL":
            seg_inspection_subset = seg_inspection_subset.loc[seg_inspection_subset.START <= CALL_REGION_END].reset_index(drop = True)
        
        if len(seg_inspection_subset) > MAX_N_SEGMENTS_ALLOWED:
            sample_results[CALL_ID] = -1
            continue

        contig_ploidy_fail = failed_ploidy_df.loc[
            (failed_ploidy_df.SAMPLE == SAMPLE) & (failed_ploidy_df.CONTIG == CALL_REGION_CHROM)
        ]
        
        if len(contig_ploidy_fail) > 0:
            sample_results[CALL_ID] = -1
            continue
        
        contig_occupancy_grouped_by_cn = seg_inspection_subset.groupby("CN").SEGMENT_CONTIG_OCCUPANCY.sum().reset_index()
        contig_occupancy_cn1 = contig_occupancy_grouped_by_cn.loc[contig_occupancy_grouped_by_cn.CN == 1, "SEGMENT_CONTIG_OCCUPANCY"].values
        
        if len(contig_occupancy_cn1) == 0:
            sample_results[CALL_ID] = -1
            continue
        elif len(contig_occupancy_cn1) > 0:
            if contig_occupancy_cn1[0] < MINIMUM_CONTIG_OCCUPANCY_CN1:
                sample_results[CALL_ID] = -1
                continue
            else:
                pass # Allow calls to proceed if majority of contig is CN=1
        
        for i, row in seg_inspection_subset.iterrows():
            dCR.loc[
                (dCR.CONTIG == row["#CHROM"]) &
                (dCR.START >= row.START) &
                (dCR.END <= row.END),
                "SEGMENT_CN"
            ] = row.CN
        dCR["DCR_SEGMENT_RESIDUAL"] = np.abs(dCR.LINEAR_COPY_RATIO - dCR.SEGMENT_CN)
        dCR = dCR.dropna().copy()

        if CALL_REGION_CALL_TYPE == "DEL":
            sample_results[CALL_ID] = _call_del(seg_inspection_subset, dCR)
            continue
            
        elif CALL_REGION_CALL_TYPE == "CNV":
            sample_results[CALL_ID] = _call_cnv(seg_inspection_subset, dCR)
            continue
        
    return sample_results

In [None]:
# Highlighting CNV genotyping of a single sample - also helps with bugfixing to be able to just run on a single sample
genotype_sample("SPT00899")

{'MDR1': 0,
 'CRT': 0,
 'HRP2': 0,
 'GCH1': 0,
 'HRP3': 1,
 'PM2_PM3': 0,
 'SAMPLE': 'SPT00899'}

In [None]:
# Multiprocessing for CNV genotyping all samples
with ProcessPoolExecutor(max_workers = 64) as executor:
    list_of_results = []
    
    futures = {executor.submit(genotype_sample, sample): sample for sample in samples}

    with tqdm(total = len(samples)) as pbar:
        for future in as_completed(futures):            
            list_of_results.append(future.result())
            pbar.update(1)

# Export draft calls as `draft_coverage_calls.tsv`
df = pd.DataFrame(list_of_results).sort_values("SAMPLE").reset_index(drop = True)
df[["SAMPLE"] + [CALL_ID for CALL_ID in call_regions_dict.keys()]].to_csv("app_files/draft_coverage_calls.tsv", sep = "\t", index = False)

  0%|          | 0/24409 [00:00<?, ?it/s]

In [3]:
# Loading in `draft_coverage_calls.tsv`
df = pd.read_csv("app_files/draft_coverage_calls.tsv", sep = "\t")

In [None]:
# Combining with Pf7's drug resistance file to check for concordance
pd.set_option("display.max.colwidth", None)

pf7_resistance = pd.read_csv(
    "app_files/Pf7_inferred_resistance_status_classification.tsv", sep = "\t"
).set_index("Sample")[
    ["Mefloquine", "Piperaquine", "HRP2", "HRP3"]
].map(lambda val: {
    "Sensitive"   :  "0",
    "Resistant"   :  "1",
    "Undetermined": "-1",
    "nodel"       :  "0",
    "del"         :  "1",
    "uncallable"  : "-1"
    }[val]
).rename(columns = {
    "Mefloquine" : "Pf7Final:MDR1",
    "Piperaquine": "Pf7Final:PM2_PM3",
    "HRP2"       : "Pf7Final:HRP2",
    "HRP3"       : "Pf7Final:HRP3"
})

meta = pd.read_csv("../assets_pf8/Pf_8_samples_20241212.txt", sep = "\t", usecols = ["Sample", "QC pass", "Sample was in Pf7"]).set_index("Sample")

merged = (
    pf7_resistance.join(meta, how = "outer")
    .join(df.set_index("SAMPLE"), how = "outer")
    .reset_index()
    .rename(columns = {"index": "Sample"})
    .fillna("MISSING")
    .map(lambda x: (int(x) if type(x) == float else x))
)

merged = merged.loc[
    (merged["QC pass"] == True)
    # & (merged["Sample was in Pf7"] == True)
    # & ~(merged["Sample was in Pf7"] == True)
]

n_total_samples = len(merged)
print(n_total_samples)

n_example_samples = 100

24409


---

# Check concordance for one gene at a time.

#### Python lists here can be copy and pasted directly into `app.py` for rapid inspection

#### "MISSING" on the first column (e.g., Pf7Final:MDR1) implies these samples were not in Pf7 and were new in Pf8.

In [10]:
merged.groupby(["Pf7Final:MDR1", "MDR1"]).apply(lambda s: pd.Series({
    "COUNT": len(s.Sample),
    "PERCENTAGE": f"{len(s.Sample) / n_total_samples:.2%}",
    "EXAMPLES": '["' + '", "'.join(s.Sample.sample(min([n_example_samples, len(s.Sample)]))) + '"]'
}), include_groups = False)

Unnamed: 0_level_0,Unnamed: 1_level_0,COUNT,PERCENTAGE,EXAMPLES
Pf7Final:MDR1,MDR1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-1,-1,1380,5.65%,"[""SPT36128"", ""SPT00202"", ""SPT41911"", ""SPT24493"", ""RCN08675"", ""SPT24651"", ""SPT24513"", ""SPT16953"", ""SPT19356"", ""SPT25034"", ""RCN03360"", ""RCN09292"", ""SPT19947"", ""SPT40589"", ""PM0034-C"", ""SPT40816"", ""SPT35133"", ""SPT17972"", ""SPT19872"", ""SPT35519"", ""SPT24628"", ""RCN13907"", ""RCN10308"", ""SPT15621"", ""SPT19521"", ""SPT24826"", ""SPT19681"", ""RCN02656"", ""SPT40623"", ""PH1678-CW2"", ""SPT24630"", ""SPT40847"", ""SPT34453"", ""SPT36356"", ""SPT21295"", ""SPT36104"", ""PA0330-CW"", ""SPT19396"", ""SPT36713"", ""SPT19764"", ""SPT24790"", ""SPT36671"", ""SPT35587"", ""RCN09561"", ""RCN02544"", ""SPT36612"", ""SPT19341"", ""SPT41999"", ""QG0364-C"", ""SPT12505"", ""SPT12492"", ""QG0439-C"", ""RCN02974"", ""SPT17924"", ""PA1062-CW"", ""RCN02767"", ""SPT22438"", ""SPT40676"", ""QG0353-C"", ""SPT36877"", ""PF0046-C"", ""RCN02993"", ""SPT38372"", ""SPT19383"", ""SPT15529"", ""RCN11270"", ""SPT19024"", ""RCN09045"", ""RCN03160"", ""SPT40943"", ""SPT12524"", ""SPT24526"", ""SPT24738"", ""SPT36127"", ""SPT41939"", ""SPT36057"", ""SPT35590"", ""SPT35537"", ""SPT34234"", ""SPT38319"", ""RCN03099"", ""SPT35285"", ""RCN09314"", ""SPT24588"", ""SPT36095"", ""SPT35419"", ""PH1747-C"", ""RCN14119"", ""PF0692-C"", ""RCN09225"", ""QC0551-CW"", ""SPT35511"", ""PH0917-Cx"", ""SPT15671"", ""SPT38325"", ""SPT17064"", ""SPT12559"", ""PH0868-Cx"", ""RCN01106"", ""SPT26961""]"
-1,0,30,0.12%,"[""PM0008-C"", ""PD0040-C"", ""PF0134-C"", ""PD0062-C"", ""PN0041-C"", ""PD1192-C"", ""PF0674-C"", ""PK0026-C"", ""PR0149-C"", ""PF0676-C"", ""PF0026-C"", ""PF0680-C"", ""PF0710-C"", ""PF0081-C"", ""PF0096-C"", ""PH0023-C"", ""PF0711-C"", ""QG0383-C"", ""PM0033-C"", ""PF0131-C"", ""PD0111-C"", ""PF0704-C"", ""PD1036-C"", ""PA0007-C"", ""PF0720-C"", ""PM0007-C"", ""PD0052-C"", ""PN0019-C"", ""PF0040-C"", ""PF0123-C""]"
-1,1,7,0.03%,"[""PD0121-C"", ""PD1211-C"", ""PD0137-C"", ""PD1103-C"", ""PD0032-C"", ""PD0026-C"", ""PD0068-C""]"
0,-1,3933,16.11%,"[""RCN08696"", ""SPT12084"", ""SPT19981"", ""QG0303-C"", ""SPT26526"", ""RCN00895"", ""RCN02824"", ""RCN00898"", ""SPT26518"", ""RCN09745"", ""SPT38311"", ""SPT15565"", ""QW0021-CW"", ""RCN09003"", ""SPT34285"", ""RCN00934"", ""SPT25158"", ""SPT26944"", ""SPT15735"", ""RCN02282"", ""RCN00168"", ""SPT35896"", ""QW0148-CW"", ""PF0707-C"", ""SPT41092"", ""SPT00211"", ""SPT18993"", ""RCN14040"", ""QG0423-C"", ""RCN08744"", ""SPT36696"", ""SPT17826"", ""RCN00229"", ""PD0096-C"", ""SPT12080"", ""SPT36861"", ""RCN13442"", ""SPT41076"", ""SPT16942"", ""RCN13543"", ""RCN12817"", ""PE0232-C"", ""SPT26586"", ""RCN13093"", ""RCN11616"", ""RCN01869"", ""SPT16870"", ""SPT20495"", ""RCN02133"", ""RCN12373"", ""RCN14008"", ""PA0899-CW"", ""SPT16948"", ""SPT23012"", ""SPT34287"", ""RCN03165"", ""RCN13039"", ""RCN13527"", ""RCN11241"", ""RCN00122"", ""RCN14036"", ""RCN09273"", ""SPT40950"", ""RCN08951"", ""RCN08754"", ""SPT18595"", ""SPT26557"", ""RCN12409"", ""SPT41964"", ""SPT19530"", ""SPT34289"", ""RCN12789"", ""SPT12074"", ""SPT40518"", ""SPT42006"", ""RCN08348"", ""SPT18969"", ""RCN11023"", ""PA0645-C"", ""SPT34805"", ""SPT36107"", ""SPT18661"", ""SPT15229"", ""SPT24139"", ""PA0345-CW"", ""RCN13920"", ""SPT24639"", ""SPT41278"", ""SPT34327"", ""RCN14065"", ""SPT12572"", ""SPT22555"", ""SPT24756"", ""RCN00785"", ""SPT36628"", ""SPT15204"", ""SPT19558"", ""RCN00760"", ""RCN12618"", ""PF0691-C""]"
0,0,9934,40.70%,"[""PD0957-C"", ""RCN10159"", ""PE0340-C"", ""QC0138-C"", ""RCN02192"", ""RCN11113"", ""QC0215-C"", ""QP0064-C"", ""RCN12813"", ""SPT14965"", ""QE0375-C"", ""RCN08185"", ""QV0051-C"", ""RCN09481"", ""PF0461-C"", ""PF1085-C"", ""SPT26623"", ""SPT15011"", ""SPT15705"", ""QG0200-C"", ""QP0177-C"", ""SPT00876"", ""SPT36311"", ""PV0258-C"", ""SPT42886"", ""RCN10207"", ""RCN02547"", ""RCN09935"", ""SPT22621"", ""RCN08659"", ""PE0109-C"", ""PF1190-Cx"", ""PF0846-C"", ""PH1085-C"", ""QG0278-C"", ""QP0079-C"", ""SPT34896"", ""SPT15131"", ""SPT35238"", ""SPT40994"", ""QG0160-C"", ""PF0851-C"", ""PW0054-C"", ""PM0205-C"", ""PF0531-C"", ""PF0666-C"", ""RCN02831"", ""RCN09650"", ""PH1203-C"", ""PF0262-C"", ""SPT15058"", ""PA0604-C"", ""QG0468-C"", ""PH0395-C"", ""PM0483-C"", ""SPT42882"", ""PV0120-C"", ""QQ0071-C"", ""SPT22200"", ""SPT43175"", ""PA0542-C"", ""RCN09293"", ""PH0326-C"", ""PH1742-C"", ""PH1737-C"", ""RCN11012"", ""RCN11835"", ""RCN08335"", ""PA0022-C"", ""PH0130-CW"", ""RCN03016"", ""RCN11758"", ""SPT26972"", ""PA0448-C"", ""RCN11300"", ""RCN11677"", ""SPT15704"", ""QP0159-C"", ""SPT00859"", ""PT0109-C"", ""PF1167-Cx"", ""RCN11187"", ""SPT43279"", ""PP0010-C"", ""RCN02357"", ""PM0198-C"", ""PC0051-C"", ""PH0243-C"", ""PE0379-C"", ""SPT15007"", ""SPT42999"", ""RCN02304"", ""QC0206-C"", ""PH1252-C"", ""PF0832-C"", ""RCN09540"", ""PM0124-C"", ""QQ0008-C"", ""RCN00944"", ""PM0281-C""]"
0,1,15,0.06%,"[""PH0467-CW"", ""PD1174-C"", ""SPT40987"", ""SPT41006"", ""RCN09062"", ""SPT36901"", ""RCN11129"", ""SPT26642"", ""RCN02525"", ""PA1321-Cw"", ""RCN13510"", ""RCN10728"", ""PD0015-02"", ""RCN11290"", ""PD1292-C""]"
1,-1,41,0.17%,"[""RCN09565"", ""RCN09501"", ""SPT24419"", ""QC0156-C"", ""RCN03450"", ""RCN09555"", ""PD0469-C"", ""PD0964-C"", ""SPT34233"", ""SPT24413"", ""RCN03181"", ""RCN03521"", ""RCN09675"", ""RCN03169"", ""RCN13530"", ""RCN09401"", ""PD1167-C"", ""RCN09350"", ""RCN03174"", ""RCN09683"", ""RCN03509"", ""RCN13537"", ""SPT24410"", ""SPT34257"", ""PD0760-C"", ""SPT24457"", ""PD0782-C"", ""RCN09649"", ""PD0459-Cx"", ""RCN03194"", ""RCN03183"", ""RCN03178"", ""RCN03195"", ""PH0476-C"", ""PD1368-C"", ""RCN09794"", ""RCN03380"", ""RCN13533"", ""QC0141-C"", ""RCN09309"", ""RCN09710""]"
1,0,17,0.07%,"[""PD1279-C"", ""PD1032-C"", ""PH0683-C"", ""PH0475-C"", ""PD1012-C"", ""PD0592-C"", ""PD0499-C"", ""PD1251-C"", ""PD1202-C"", ""PD0489-C"", ""PH0588-C"", ""RCN09633"", ""PN0008-C"", ""PH0248-C"", ""RCN09400"", ""PD0667-C"", ""PD1322-C""]"
1,1,642,2.63%,"[""PD1236-C"", ""PH0534-C"", ""PH0209-C"", ""PD0473-C"", ""PH0338-C"", ""PD0537-C"", ""PD0800-C"", ""PH1089-C"", ""PD1045-C"", ""PD0106-C"", ""QC0262-C"", ""PH0330-C"", ""PD0563-C"", ""PD0814-C"", ""PD0720-C"", ""PD1507-C"", ""PH0108-CW"", ""PD1243-C"", ""PD1127-C"", ""PD0079-C"", ""PD0476-C"", ""PD1274-C"", ""PD0966-C"", ""PH0413-C"", ""PH0114-C"", ""PD1075-C"", ""PH1195-C"", ""PD0078-C"", ""PD1270-C"", ""PD0511-C"", ""PD1277-C"", ""PD0974-C"", ""PD0123-C"", ""PH0722-C"", ""PD1402-C"", ""PH0703-C"", ""PD1156-C"", ""PD1510-C"", ""PD0908-C"", ""PD0520-C"", ""PD0658-C"", ""PD0526-C"", ""PD1105-C"", ""PV0286-C"", ""PD1514-C"", ""PH0008-C"", ""PD0951-C"", ""PD0918-C"", ""RCN09738"", ""PD1221-C"", ""RCN09825"", ""PD0962-C"", ""PD1373-C"", ""PD0937-C"", ""PD0132-C"", ""PH0001-C"", ""PD0525-C"", ""PD0818-C"", ""PH0959-Cx"", ""PD1353-C"", ""PD0484-C"", ""PD1335-C"", ""PD1405-C"", ""PD1360-C"", ""PD0804-C"", ""RCN09728"", ""PD1271-C"", ""PD1240-C"", ""PD1201-C"", ""PD0967-C"", ""PD1379-C"", ""PD0758-C"", ""PH0832-C"", ""PD1194-C"", ""PD0799-C"", ""PD0777-C"", ""PD0820-C"", ""PD0979-C"", ""QC0229-C"", ""PH0064-C"", ""PD0550-C"", ""PH0700-C"", ""PD1227-C"", ""PH0418-C"", ""PD1466-C"", ""PD0787-C"", ""PD1193-C"", ""PD0928-C"", ""PD0498-C"", ""PD1499-C"", ""PD0663-C"", ""PD0006-01"", ""PD0885-C"", ""PD0497-C"", ""PH0559-C"", ""PD0746-C"", ""PD0516-C"", ""PD1024-C"", ""PD0565-C"", ""PD0828-C""]"
MISSING,-1,5755,23.58%,"[""RCN23162"", ""SPT67521"", ""SPT83405"", ""SPT91105"", ""SPT52491"", ""SPT67471"", ""SPT67184"", ""SPT51520"", ""SPT34580"", ""SPT44662"", ""RCN09226"", ""SPT83935"", ""SPT44686"", ""SPT17866"", ""SPT53775"", ""SPT53880"", ""SPT67169"", ""SPT91361"", ""SPT67197"", ""SPT46326"", ""SPT54470"", ""SPT44934"", ""SPT34583"", ""RCN14948"", ""SPT67133"", ""SPT72128"", ""RCN25279"", ""RCN18495"", ""SPT67077"", ""SPT46172"", ""SPT91765"", ""SPT50699"", ""SPT53780"", ""SPT67185"", ""SPT54633"", ""SPT52507"", ""SPT46077"", ""RCN23212"", ""SPT65241"", ""SPT46207"", ""SPT20524"", ""SPT91516"", ""SPT34600"", ""RCN26596"", ""SPT45081"", ""SPT54416"", ""SPT52807"", ""RCN23120"", ""SPT87755"", ""SPT84049"", ""SPT83770"", ""RCN18743"", ""SPT83606"", ""SPT20559"", ""RCN25888"", ""RCN23178"", ""SPT83517"", ""SPT88285"", ""SPT18219"", ""SPT72397"", ""SPT46242"", ""SPT54700"", ""SPT88722"", ""RCN26594"", ""RCN26061"", ""RCN24868"", ""SPT46445"", ""SPT53681"", ""SPT90742"", ""SPT44050"", ""SPT72416"", ""SPT83566"", ""SPT83648"", ""SPT53867"", ""RCN23223"", ""SPT88730"", ""SPT38413"", ""RCN22946"", ""SPT55584"", ""SPT88329"", ""SPT50704"", ""RCN25350"", ""RCN22654"", ""RCN26666"", ""SPT15581"", ""SPT44086"", ""SPT54656"", ""SPT54825"", ""SPT43830"", ""SPT51323"", ""RCN15170"", ""SPT46480"", ""RCN22561"", ""RCN15000"", ""RCN23098"", ""SPT71947"", ""RCN25181"", ""SPT54507"", ""RCN18615"", ""SPT43958""]"


In [11]:
merged.groupby(["Pf7Final:PM2_PM3", "PM2_PM3"]).apply(lambda s: pd.Series({
    "COUNT": len(s.Sample),
    "PERCENTAGE": f"{len(s.Sample) / n_total_samples:.2%}",
    "EXAMPLES": '["' + '", "'.join(s.Sample.sample(min([n_example_samples, len(s.Sample)]))) + '"]'
}), include_groups = False)

Unnamed: 0_level_0,Unnamed: 1_level_0,COUNT,PERCENTAGE,EXAMPLES
Pf7Final:PM2_PM3,PM2_PM3,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-1,-1,1450,5.94%,"[""RCN14151"", ""SPT15617"", ""SPT38343"", ""SPT34411"", ""RCN11208"", ""SPT19351"", ""SPT18492"", ""SPT36877"", ""SPT42023"", ""SPT26533"", ""SPT34801"", ""SPT36375"", ""SPT40750"", ""SPT36344"", ""SPT24953"", ""SPT25002"", ""SPT26891"", ""SPT42003"", ""SPT19931"", ""RCN02999"", ""SPT40943"", ""SPT35673"", ""RCN11215"", ""SPT19494"", ""RCN00768"", ""SPT24760"", ""RCN11111"", ""RCN02846"", ""SPT15535"", ""SPT40733"", ""RCN12829"", ""SPT18991"", ""RCN03174"", ""SPT24581"", ""RCN02282"", ""SPT20000"", ""SPT12097"", ""SPT35556"", ""RCN02288"", ""SPT41903"", ""SPT40816"", ""SPT36264"", ""SPT22566"", ""SPT12098"", ""PA1456-Cw"", ""SPT16875"", ""SPT36317"", ""RCN01045"", ""SPT24613"", ""RCN09741"", ""SPT12863"", ""SPT15184"", ""SPT00192"", ""SPT00035"", ""SPT36072"", ""SPT24587"", ""SPT17929"", ""SPT21807"", ""SPT41089"", ""RCN14091"", ""SPT20565"", ""SPT35516"", ""RCN02961"", ""SPT17972"", ""PH0713-C"", ""SPT15599"", ""RCN10014"", ""SPT17950"", ""RCN11892"", ""SPT34255"", ""SPT41094"", ""SPT17812"", ""SPT42020"", ""SPT38336"", ""SPT18681"", ""SPT34337"", ""SPT26899"", ""SPT34437"", ""SPT15529"", ""RCN02281"", ""RCN11268"", ""SPT35766"", ""SPT40695"", ""RCN02998"", ""SPT38331"", ""SPT17969"", ""SPT24445"", ""RCN14152"", ""SPT24463"", ""SPT24653"", ""RCN08884"", ""SPT19756"", ""SPT36281"", ""SPT24626"", ""SPT34889"", ""SPT24566"", ""SPT24758"", ""SPT24630"", ""SPT24415"", ""PA0294-CW""]"
-1,0,10,0.04%,"[""PF0675-C"", ""SPT18003"", ""PH0729-C"", ""PH0564-C"", ""PF0711-C"", ""PF0680-C"", ""PD0102-C"", ""RCN07859"", ""PH0741-C"", ""RCN08660""]"
-1,1,2,0.01%,"[""RCN12096"", ""RCN08724""]"
0,-1,3075,12.60%,"[""SPT36875"", ""SPT19542"", ""RCN01069"", ""RCN13982"", ""SPT34242"", ""SPT00055"", ""SPT23069"", ""QG0400-C"", ""RCN02801"", ""SPT35463"", ""SPT19930"", ""SPT00791"", ""SPT35520"", ""SPT16946"", ""SPT16904"", ""SPT41136"", ""SPT36685"", ""SPT12985"", ""SPT19665"", ""SPT34514"", ""SPT41041"", ""QW0064-CW"", ""SPT19876"", ""QC0599-CW"", ""QG0414-C"", ""RCN02837"", ""RCN00945"", ""SPT19549"", ""SPT26928"", ""PH0784-C"", ""SPT18993"", ""SPT36912"", ""SPT19018"", ""PJ0240-C"", ""SPT42948"", ""RCN13910"", ""RCN10349"", ""SPT24733"", ""RCN02636"", ""SPT42941"", ""SPT34498"", ""RCN11805"", ""SPT36607"", ""SPT12283"", ""SPT40878"", ""SPT40857"", ""RCN12107"", ""SPT21837"", ""SPT17031"", ""RCN14066"", ""SPT34248"", ""SPT36914"", ""QW0115-CW"", ""SPT19850"", ""RCN01867"", ""SPT15246"", ""SPT19614"", ""QG0375-C"", ""SPT24456"", ""SPT35575"", ""RCN08369"", ""RCN09600"", ""PN0020-C"", ""RCN13350"", ""SPT34250"", ""RCN13461"", ""SPT20563"", ""PA1376-Cw"", ""RCN08370"", ""PF0298-C"", ""RCN12772"", ""SPT20028"", ""PF1247-CW"", ""SPT42834"", ""RCN10339"", ""RCN13442"", ""RCN09565"", ""SPT19025"", ""SPT34927"", ""SPT19706"", ""RCN14050"", ""RCN03110"", ""SPT21295"", ""SPT19607"", ""SPT24558"", ""SPT36245"", ""SPT41263"", ""SPT23029"", ""SPT34295"", ""RCN03452"", ""SPT19656"", ""RCN03183"", ""SPT41250"", ""SPT34934"", ""QG0486-C"", ""SPT12507"", ""SPT40939"", ""SPT40503"", ""SPT26547"", ""PH0775-Cx""]"
0,0,10181,41.71%,"[""PC0276-C"", ""FP0059-C"", ""RCN00294"", ""PH0498-C"", ""SPT42967"", ""SPT12348"", ""RCN02362"", ""QC0349-C"", ""RCN02529"", ""RCN11689"", ""SPT42885"", ""QG0235-C"", ""PM0083-C"", ""PD0466-C"", ""RCN11800"", ""RCN03122"", ""RCN02685"", ""PN0090-C"", ""PR0281-C"", ""RCN09166"", ""SPT35188"", ""PM0460-C"", ""PD0808-C"", ""SPT43029"", ""PJ0130-Cx"", ""SPT43352"", ""PA0230-C"", ""QC0366-C"", ""SPT35017"", ""PT0010-CW"", ""QP0235-C"", ""QE0375-C"", ""RCN02628"", ""RCN08827"", ""RCN15730"", ""PR0130-C"", ""PA0532-C"", ""RCN10790"", ""PT0134-C"", ""QP0192-C"", ""QV0029-C"", ""PC0064-C"", ""PE0269-C"", ""SPT35352"", ""SPT24669"", ""PH0392-C"", ""PM0290-C"", ""QP0061-C"", ""PM0585-C"", ""QQ0059-C"", ""RCN11774"", ""SPT15706"", ""PF0392-C"", ""SPT36903"", ""PV0080-C"", ""QP0218-C"", ""PH0411-C"", ""PR0285-C"", ""PF0985-C"", ""SPT43254"", ""SPT35240"", ""SPT43124"", ""PM0532-CW"", ""SPT43054"", ""RCN03021"", ""PA0438-C"", ""PF0272-C"", ""PC0197-C"", ""PV0046-C"", ""SPT25132"", ""SPT34415"", ""FP0090-C"", ""RCN02356"", ""RCN13429"", ""RCN03335"", ""PH0109-C"", ""PT0015-CW"", ""SPT16601"", ""SPT17012"", ""QC0153-C"", ""SPT26361"", ""RCN00993"", ""QP0096-C"", ""PD1004-C"", ""PF1200-Cx"", ""QG0203-C"", ""PD0524-C"", ""PA0490-C"", ""RCN03534"", ""QC0311-C"", ""PC4045-C"", ""PT0208-C"", ""SPT19882"", ""PF0621-C"", ""SPT41042"", ""PD0529-C"", ""SPT43156"", ""PF0294-C"", ""PE0231-C"", ""QC0205-C""]"
0,1,14,0.06%,"[""SPT21294"", ""PH0431-CW"", ""QG0383-C"", ""QG0398-C"", ""RCN02742"", ""QG0357-C"", ""QG0425-C"", ""PH0779-Cx"", ""RCN10789"", ""RCN08736"", ""QG0388-C"", ""RCN11605"", ""PA0102-C"", ""RCN02530""]"
1,-1,369,1.51%,"[""RCN01777"", ""RCN08754"", ""RCN08021"", ""RCN13459"", ""RCN10350"", ""PH1571-C"", ""RCN11333"", ""RCN01781"", ""RCN11128"", ""RCN08730"", ""RCN12890"", ""RCN08726"", ""RCN12793"", ""RCN08642"", ""RCN13551"", ""PH0726-Cx"", ""RCN08810"", ""RCN12562"", ""RCN11178"", ""RCN08989"", ""RCN08976"", ""PH1480-C"", ""RCN08993"", ""PH1719-C"", ""RCN13495"", ""RCN12790"", ""RCN08723"", ""RCN09016"", ""RCN11013"", ""RCN11919"", ""RCN13503"", ""RCN12584"", ""RCN12115"", ""RCN14312"", ""RCN11906"", ""RCN10324"", ""RCN13576"", ""RCN01864"", ""RCN08819"", ""RCN08784"", ""RCN11257"", ""RCN10398"", ""RCN10256"", ""RCN12832"", ""RCN11144"", ""RCN12853"", ""RCN08995"", ""RCN08817"", ""PH0467-CW"", ""RCN13521"", ""RCN10316"", ""RCN13557"", ""RCN10727"", ""RCN11285"", ""RCN13477"", ""RCN12794"", ""RCN08699"", ""RCN11134"", ""RCN08775"", ""RCN12867"", ""RCN11891"", ""RCN13512"", ""RCN01783"", ""RCN13562"", ""RCN11930"", ""RCN08645"", ""RCN12791"", ""RCN14268"", ""RCN08664"", ""RCN08748"", ""RCN08727"", ""RCN10311"", ""RCN11895"", ""RCN12830"", ""PH0874-Cx"", ""PH1679-CW2"", ""RCN09268"", ""RCN12822"", ""RCN07839"", ""PV0372-C"", ""RCN08820"", ""RCN14272"", ""RCN11862"", ""RCN12796"", ""RCN10317"", ""RCN11872"", ""RCN08695"", ""RCN11289"", ""RCN12895"", ""RCN11924"", ""RCN08666"", ""RCN12785"", ""RCN13411"", ""RCN08742"", ""RCN12776"", ""RCN13466"", ""RCN09012"", ""RCN08959"", ""PH1752-C"", ""RCN08701""]"
1,0,417,1.71%,"[""PH0835-C"", ""RCN08813"", ""RCN08023"", ""RCN10309"", ""RCN12461"", ""RCN08966"", ""RCN07860"", ""RCN11167"", ""RCN11162"", ""RCN10762"", ""RCN10761"", ""RCN10699"", ""RCN10319"", ""RCN11313"", ""RCN11184"", ""RCN13476"", ""RCN11155"", ""RCN13574"", ""RCN11021"", ""RCN11154"", ""RCN13580"", ""RCN11856"", ""RCN11831"", ""RCN13452"", ""RCN11260"", ""RCN11187"", ""RCN12028"", ""RCN08139"", ""RCN11080"", ""RCN11294"", ""RCN12100"", ""RCN12482"", ""RCN07951"", ""RCN14279"", ""RCN11298"", ""RCN08082"", ""RCN08011"", ""RCN11014"", ""RCN08137"", ""RCN12810"", ""RCN12774"", ""RCN13488"", ""RCN13582"", ""RCN11286"", ""RCN11029"", ""RCN11169"", ""RCN11030"", ""RCN11164"", ""RCN09025"", ""RCN12857"", ""RCN09002"", ""RCN12454"", ""RCN12504"", ""PH0143-CW"", ""RCN14287"", ""RCN11256"", ""RCN10151"", ""RCN08814"", ""RCN11327"", ""RCN14324"", ""RCN11153"", ""RCN13409"", ""RCN11866"", ""RCN09006"", ""RCN10706"", ""RCN11173"", ""RCN12548"", ""RCN08087"", ""RCN14303"", ""RCN08120"", ""RCN11104"", ""RCN11075"", ""RCN12469"", ""RCN12854"", ""RCN10290"", ""RCN10381"", ""RCN14310"", ""RCN10728"", ""RCN10391"", ""RCN10346"", ""RCN08129"", ""RCN12889"", ""RCN08045"", ""RCN13479"", ""RCN14284"", ""RCN11077"", ""PH0245-C"", ""RCN12846"", ""RCN12103"", ""RCN11042"", ""RCN10703"", ""RCN08027"", ""RCN11151"", ""RCN11024"", ""RCN11136"", ""RCN11147"", ""RCN12817"", ""RCN08816"", ""RCN08141"", ""RCN08013""]"
1,1,481,1.97%,"[""PH1732-C"", ""RCN13096"", ""RCN07854"", ""PH0472-C"", ""PH1025-C"", ""PH0905-C"", ""PV0368-C"", ""PH1711-C"", ""RCN13032"", ""PH0393-C"", ""RCN12386"", ""RCN13012"", ""RCN12613"", ""RCN12607"", ""RCN10168"", ""PH1712-C"", ""PH1744-C"", ""RCN08692"", ""RCN12622"", ""PH0682-C"", ""PV0366-C"", ""PH0561-C"", ""PH1220-C"", ""PH0328-C"", ""RCN13015"", ""PV0381-C"", ""PV0370-C"", ""RCN13009"", ""PH0541-C"", ""PH1453-C"", ""RCN12600"", ""RCN12365"", ""RCN07858"", ""RCN12375"", ""PH0909-C"", ""PH0390-C"", ""PH1399-C"", ""RCN10202"", ""RCN12361"", ""PH1519-C"", ""RCN08992"", ""PH0533-C"", ""RCN12033"", ""PH0916-C"", ""PD0729-C"", ""PH1391-C"", ""RCN13010"", ""PH1745-C"", ""PH0572-C"", ""PH1173-C"", ""PH1378-C"", ""PV0384-C"", ""PH1190-C"", ""PH0584-C"", ""PH1189-C"", ""PH0573-C"", ""PH1759-C"", ""RCN13016"", ""PH0241-C"", ""PH0227-C"", ""PH0391-C"", ""PH1443-C"", ""PV0385-C"", ""PH1216-C"", ""RCN13527"", ""PH1328-C"", ""RCN12389"", ""PH1760-C"", ""RCN12592"", ""PH1137-C"", ""PH1735-C"", ""RCN13066"", ""PH0478-C"", ""PH0562-C"", ""RCN12366"", ""RCN08017"", ""PH0966-Cx"", ""PV0398-C"", ""PH1343-C"", ""RCN12058"", ""RCN13079"", ""RCN07825"", ""RCN08728"", ""PH0251-C"", ""RCN12364"", ""PH0997-C"", ""PH0676-C"", ""PH1377-C"", ""PH1336-C"", ""RCN12593"", ""PH1229-C"", ""RCN12987"", ""RCN12378"", ""PH1373-C"", ""RCN11086"", ""RCN12602"", ""RCN13075"", ""PH1715-C"", ""RCN08138"", ""PV0396-C""]"
MISSING,-1,5518,22.61%,"[""SPT54612"", ""SPT54832"", ""RCN11379"", ""SPT52962"", ""SPT34589"", ""SPT87762"", ""SPT34737"", ""SPT66966"", ""SPT72191"", ""SPT43777"", ""RCN25944"", ""SPT64871"", ""SPT46367"", ""SPT46491"", ""SPT19536"", ""SPT18017"", ""SPT88289"", ""SPT91207"", ""SPT46420"", ""SPT52963"", ""SPT64853"", ""RCN25849"", ""RCN14953"", ""SPT83672"", ""RCN18496"", ""SPT43821"", ""RCN12803"", ""RCN18623"", ""SPT70622"", ""SPT67066"", ""RCN22852"", ""RCN22448"", ""SPT18303"", ""SPT46234"", ""SPT52543"", ""RCN23221"", ""SPT53799"", ""SPT36559"", ""RCN23018"", ""SPT88293"", ""RCN14643"", ""SPT50973"", ""RCN22604"", ""SPT67118"", ""SPT72322"", ""RCN25053"", ""RCN14950"", ""SPT54685"", ""RCN26028"", ""SPT72288"", ""SPT64985"", ""SPT87702"", ""SPT72405"", ""SPT72256"", ""SPT67184"", ""RCN11380"", ""SPT53125"", ""SPT46272"", ""SPT34688"", ""SPT46326"", ""SPT53142"", ""SPT52455"", ""SPT64862"", ""SPT46208"", ""SPT83174"", ""SPT90978"", ""SPT43962"", ""SPT87882"", ""SPT88419"", ""SPT46227"", ""SPT20539"", ""SPT43831"", ""SPT54518"", ""SPT67691"", ""SPT72368"", ""SPT52456"", ""SPT36937"", ""RCN15107"", ""SPT53147"", ""SPT84092"", ""SPT44910"", ""SPT38460"", ""RCN15936"", ""SPT91512"", ""SPT46418"", ""SPT83552"", ""RCN22715"", ""SPT88095"", ""SPT44717"", ""SPT18283"", ""SPT51088"", ""SPT83213"", ""SPT18112"", ""SPT44982"", ""SPT88451"", ""SPT91788"", ""SPT87918"", ""SPT83833"", ""SPT83435"", ""RCN22640""]"


In [12]:
merged.groupby(["Pf7Final:HRP2", "HRP2"]).apply(lambda s: pd.Series({
    "COUNT": len(s.Sample),
    "PERCENTAGE": f"{len(s.Sample) / n_total_samples:.2%}",
    "EXAMPLES": '["' + '", "'.join(s.Sample.sample(min([n_example_samples, len(s.Sample)]))) + '"]'
}), include_groups = False)

Unnamed: 0_level_0,Unnamed: 1_level_0,COUNT,PERCENTAGE,EXAMPLES
Pf7Final:HRP2,HRP2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-1,-1,4346,17.80%,"[""RCN13950"", ""SPT25116"", ""SPT18549"", ""SPT34281"", ""SPT12513"", ""SPT34273"", ""SPT35780"", ""RCN12113"", ""RCN02894"", ""RCN09678"", ""SPT34904"", ""RCN14055"", ""SPT36233"", ""PA0313-CW"", ""PA0916-CW"", ""SPT40861"", ""RCN09960"", ""SPT15586"", ""SPT36190"", ""SPT00203"", ""SPT24826"", ""SPT20030"", ""SPT42042"", ""SPT40988"", ""SPT36285"", ""RCN10727"", ""SPT13073"", ""RCN02281"", ""RCN00879"", ""SPT34469"", ""SPT35245"", ""RCN08955"", ""RCN03433"", ""SPT26864"", ""RCN03178"", ""RCN14030"", ""SPT20499"", ""SPT19824"", ""SPT35116"", ""SPT19385"", ""SPT34451"", ""RCN14017"", ""PM0752-CW"", ""PF1066-Cx"", ""SPT17050"", ""SPT34261"", ""PC0077-C"", ""RCN00871"", ""SPT35241"", ""RCN11232"", ""RCN14137"", ""SPT34866"", ""SPT24724"", ""RCN03165"", ""RCN02272"", ""SPT17918"", ""SPT35437"", ""SPT35571"", ""SPT12074"", ""SPT24743"", ""RCN12809"", ""QC0420-CW"", ""RCN13375"", ""SPT12367"", ""RCN02843"", ""SPT41280"", ""SPT35036"", ""RCN12063"", ""SPT35580"", ""SPT41036"", ""SPT12990"", ""RCN08722"", ""SPT19875"", ""SPT19382"", ""RCN09010"", ""SPT12292"", ""SPT36682"", ""RCN09940"", ""SPT24141"", ""SPT17057"", ""SPT26304"", ""QG0366-C"", ""SPT36055"", ""SPT35563"", ""SPT26874"", ""RCN14072"", ""SPT20012"", ""RCN09403"", ""SPT40868"", ""QC0409-CW"", ""SPT24392"", ""SPT38311"", ""SPT34349"", ""SPT15233"", ""SPT17995"", ""SPT34272"", ""SPT00889"", ""SPT15653"", ""SPT43146"", ""RCN03169""]"
-1,0,766,3.14%,"[""RCN11025"", ""SPT41076"", ""RCN03082"", ""SPT36293"", ""RCN08992"", ""SPT12576"", ""RCN02910"", ""RCN13264"", ""RCN02091"", ""PK0073-C"", ""QG0291-C"", ""SPT34318"", ""RCN14264"", ""PM0303-C"", ""RCN11257"", ""RCN11764"", ""PC0104-C"", ""RCN11024"", ""SPT40815"", ""SPT22415"", ""PD0755-C"", ""RCN09419"", ""RCN08751"", ""RCN08739"", ""SPT22230"", ""RCN11334"", ""PE0327-C"", ""RCN12032"", ""SPT12523"", ""PK0075-C"", ""PK0052-C"", ""SPT34465"", ""SPT12348"", ""SPT19939"", ""SPT18474"", ""PA0074-C"", ""PH0805-C"", ""SPT34410"", ""SPT16686"", ""SPT40532"", ""QC0404-CW"", ""RCN02525"", ""SPT36694"", ""RCN10392"", ""SPT24375"", ""SPT34357"", ""PF0011-C"", ""SPT40852"", ""RCN12817"", ""PD1103-C"", ""QG0225-C"", ""PF0743-C"", ""RCN11274"", ""PF0710-C"", ""PD0009-01"", ""RCN12857"", ""RCN12854"", ""SPT15257"", ""PF0335-CW"", ""RCN11269"", ""RCN09058"", ""RCN00282"", ""RCN13564"", ""PM0014-C"", ""PD0837-C"", ""RCN09382"", ""SPT34900"", ""PH0942-Cx"", ""RCN10325"", ""RCN13392"", ""RCN09059"", ""RCN00853"", ""SPT15208"", ""QG0383-C"", ""SPT34980"", ""PC0078-C"", ""PF0122-C"", ""PV0170-C"", ""SPT12321"", ""SPT40521"", ""RCN11326"", ""RCN09170"", ""RCN09636"", ""SPT34930"", ""RCN11638"", ""RCN11713"", ""RCN10299"", ""RCN11243"", ""RCN01022"", ""SPT41108"", ""SPT35093"", ""RCN11685"", ""RCN10002"", ""RCN11709"", ""SPT36477"", ""RCN12842"", ""SPT14974"", ""SPT34471"", ""SPT40824"", ""PF0010-C""]"
-1,1,8,0.03%,"[""RCN08051"", ""RCN11874"", ""RCN12815"", ""RCN09021"", ""RCN12848"", ""RCN12878"", ""RCN09023"", ""RCN12853""]"
0,-1,891,3.65%,"[""RCN02774"", ""SPT26616"", ""QW0010-CxW"", ""RCN13477"", ""PH1577-C"", ""RCN13080"", ""PH1159-C"", ""RCN00740"", ""SPT18639"", ""RCN11531"", ""SPT26895"", ""RCN13414"", ""SPT26945"", ""RCN03095"", ""QW0043-CW"", ""SPT00830"", ""SPT12138"", ""RCN00831"", ""SPT17049"", ""SPT22567"", ""RCN11740"", ""SPT12372"", ""SPT18620"", ""RCN02184"", ""RCN00895"", ""RCN12993"", ""SPT26555"", ""RCN13077"", ""SPT43287"", ""QW0072-CW"", ""PM0551-CW"", ""SPT35201"", ""RCN03322"", ""SPT19597"", ""SPT12119"", ""SPT34512"", ""SPT35066"", ""RCN01776"", ""SPT18680"", ""RCN12385"", ""RCN13282"", ""RCN11133"", ""SPT34433"", ""SPT18674"", ""RCN09704"", ""RCN00123"", ""SPT41102"", ""RCN13056"", ""RCN09675"", ""RCN08369"", ""RCN09718"", ""SPT24193"", ""RCN03195"", ""QW0115-CW"", ""SPT35363"", ""RCN03194"", ""RCN03401"", ""PF0706-C"", ""SPT16737"", ""RCN00986"", ""RCN03043"", ""SPT18619"", ""SPT17051"", ""RCN12366"", ""SPT26484"", ""RCN09338"", ""SPT26904"", ""SPT35340"", ""RCN13369"", ""SPT17018"", ""RCN09394"", ""RCN08053"", ""RCN03351"", ""RCN02238"", ""SPT15191"", ""SPT35040"", ""RCN02557"", ""RCN13076"", ""RCN00250"", ""RCN08383"", ""RCN00758"", ""SPT19955"", ""RCN13031"", ""SPT24687"", ""RCN03177"", ""SPT25128"", ""SPT18585"", ""SPT18552"", ""RCN09597"", ""SPT18649"", ""RCN09464"", ""SPT19973"", ""SPT40802"", ""SPT24561"", ""SPT34435"", ""RCN03073"", ""RCN08693"", ""RCN08816"", ""PK0044-C"", ""RCN09071""]"
0,0,9951,40.77%,"[""PT0095-C"", ""PD1072-C"", ""RCN13408"", ""PH0324-C"", ""SPT41001"", ""SPT43001"", ""RCN12369"", ""PH0603-C"", ""PF0363-C"", ""PM0438-C"", ""PF0232-C"", ""PH0865-C"", ""PD0569-C"", ""PH1453-C"", ""RCN09088"", ""PF0845-C"", ""PD0910-C"", ""PC0030-C"", ""SPT14954"", ""QP0237-C"", ""PD0556-C"", ""PH0801-C"", ""RCN13556"", ""SPT42971"", ""SPT26907"", ""RCN10217"", ""RCN03344"", ""PF0326-C"", ""RCN08332"", ""PT0041-C"", ""PM0561-CW"", ""PH0915-C"", ""PM0357-C"", ""PC0189-C"", ""RCN11669"", ""QE0454-C"", ""PT0085-C"", ""PT0044-C"", ""QV0037-C"", ""PE0187-C"", ""PD0816-C"", ""PM0467-C"", ""PA0461-C"", ""PM0399-C"", ""SPT42963"", ""SPT35332"", ""QQ0004-C"", ""RCN08296"", ""RCN10738"", ""RCN13500"", ""PD0106-C"", ""QQ0002-C"", ""RCN13078"", ""QP0035-C"", ""RCN00776"", ""RCN00197"", ""QT0015-CW2"", ""RCN12880"", ""QP0211-C"", ""QG0142-C"", ""RCN02203"", ""PF0128-C"", ""PF0116-C"", ""RCN11075"", ""PD0093-C"", ""RCN08124"", ""PD1284-C"", ""PW0056-C"", ""PH1175-C"", ""SPT43261"", ""RCN12810"", ""PN0171-C"", ""PJ0259-C"", ""PH1677-CW2"", ""SPT15767"", ""QE0486-C"", ""PE0405-C"", ""RCN08776"", ""QG0266-C"", ""SPT15026"", ""PH0369-C"", ""SPT15330"", ""QP0255-C"", ""SPT24683"", ""QG0009-C"", ""RCN03069"", ""SPT00784"", ""PM0263-C"", ""PD0571-C"", ""PH0367-C"", ""PD1311-C"", ""QC0162-C"", ""PH0522-C"", ""PT0169-C"", ""PF0620-C"", ""PD1413-C"", ""PH0279-C"", ""SPT22621"", ""RCN11803"", ""PH0702-C""]"
0,1,22,0.09%,"[""RCN11933"", ""RCN12481"", ""RCN11862"", ""RCN09271"", ""RCN09025"", ""RCN09019"", ""RCN08069"", ""RCN11294"", ""RCN10692"", ""RCN08958"", ""RCN12480"", ""RCN13559"", ""RCN07860"", ""RCN11886"", ""RCN13449"", ""RCN09270"", ""RCN13475"", ""RCN12456"", ""RCN11854"", ""RCN08931"", ""RCN11332"", ""RCN13093""]"
1,-1,3,0.01%,"[""RCN12999"", ""SPT24677"", ""PP0026-C""]"
1,0,1,0.00%,"[""SPT00899""]"
1,1,11,0.05%,"[""PP0011-C"", ""SPT24670"", ""PJ0233-C"", ""PJ0135-Cx"", ""PJ0258-C"", ""PP0028-C"", ""PP0002-C"", ""PP0029-C"", ""PP0025-C"", ""PP0024-C"", ""PP0017-C""]"
MISSING,-1,5545,22.72%,"[""SPT83232"", ""SPT65195"", ""SPT88117"", ""SPT64979"", ""RCN18553"", ""SPT44073"", ""RCN25318"", ""SPT43958"", ""SPT52473"", ""SPT67832"", ""RCN22715"", ""RCN22614"", ""SPT88401"", ""RCN22831"", ""SPT88061"", ""SPT44877"", ""SPT52899"", ""RCN12685"", ""RCN25994"", ""SPT50307"", ""SPT46099"", ""RCN18520"", ""SPT67197"", ""SPT87770"", ""SPT44989"", ""RCN25871"", ""RCN25969"", ""RCN25015"", ""SPT46175"", ""SPT70451"", ""SPT67622"", ""RCN22791"", ""SPT41119"", ""SPT54812"", ""SPT51544"", ""SPT34700"", ""SPT34554"", ""RCN22581"", ""SPT51356"", ""SPT87950"", ""SPT67776"", ""SPT54462"", ""SPT65151"", ""SPT44638"", ""SPT88703"", ""SPT51401"", ""RCN14936"", ""RCN25941"", ""SPT88571"", ""SPT18265"", ""SPT36594"", ""SPT34837"", ""SPT88400"", ""SPT72087"", ""SPT88050"", ""RCN25928"", ""SPT52516"", ""SPT87720"", ""SPT44116"", ""SPT46079"", ""SPT50504"", ""SPT44611"", ""SPT53166"", ""SPT43735"", ""SPT34590"", ""SPT83749"", ""SPT65223"", ""SPT54673"", ""RCN22465"", ""SPT45057"", ""SPT94772"", ""RCN22527"", ""SPT84074"", ""SPT18325"", ""RCN24879"", ""SPT51412"", ""SPT67724"", ""SPT83851"", ""RCN23139"", ""SPT84092"", ""RCN24949"", ""RCN22695"", ""SPT53661"", ""SPT43792"", ""SPT90672"", ""SPT50793"", ""SPT43996"", ""RCN24981"", ""RCN13314"", ""SPT67658"", ""SPT44703"", ""SPT67713"", ""SPT50986"", ""RCN22720"", ""RCN26613"", ""RCN18578"", ""SPT90877"", ""SPT45085"", ""SPT43870"", ""RCN16020""]"


In [13]:
merged.groupby(["Pf7Final:HRP3", "HRP3"]).apply(lambda s: pd.Series({
    "COUNT": len(s.Sample),
    "PERCENTAGE": f"{len(s.Sample) / n_total_samples:.2%}",
    "EXAMPLES": '["' + '", "'.join(s.Sample.sample(min([n_example_samples, len(s.Sample)]))) + '"]'
}), include_groups = False)

Unnamed: 0_level_0,Unnamed: 1_level_0,COUNT,PERCENTAGE,EXAMPLES
Pf7Final:HRP3,HRP3,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-1,-1,4654,19.07%,"[""SPT12299"", ""RCN08753"", ""SPT40959"", ""SPT26446"", ""SPT35092"", ""RCN13918"", ""SPT19238"", ""SPT36185"", ""SPT34471"", ""SPT17956"", ""SPT19600"", ""QC0402-CW"", ""SPT35592"", ""SPT15635"", ""SPT24529"", ""PF1106-Cx"", ""SPT34321"", ""SPT40541"", ""SPT35540"", ""RCN08889"", ""RCN11315"", ""SPT40956"", ""PF0720-C"", ""SPT36106"", ""PW0090-C"", ""SPT38380"", ""SPT12556"", ""SPT18609"", ""SPT34631"", ""SPT17830"", ""RCN02366"", ""SPT36703"", ""PH0747-Cx"", ""SPT36613"", ""SPT24740"", ""RCN14066"", ""SPT42003"", ""SPT35005"", ""RCN09990"", ""SPT36600"", ""SPT20013"", ""RCN12550"", ""SPT19851"", ""SPT36886"", ""SPT23019"", ""SPT18428"", ""SPT20563"", ""RCN08961"", ""SPT38353"", ""RCN03094"", ""SPT36875"", ""SPT40804"", ""SPT19251"", ""SPT19766"", ""RCN14122"", ""SPT35573"", ""SPT17046"", ""SPT00192"", ""SPT22407"", ""PM0774-CW"", ""RCN14042"", ""SPT12076"", ""SPT22480"", ""SPT19415"", ""SPT17078"", ""SPT18932"", ""RCN09283"", ""RCN11621"", ""RCN09812"", ""RCN13480"", ""SPT18431"", ""SPT22608"", ""RCN13950"", ""SPT34629"", ""SPT19419"", ""RCN03337"", ""SPT25103"", ""SPT17993"", ""SPT15640"", ""SPT36126"", ""RCN02030"", ""SPT34295"", ""SPT25012"", ""SPT19496"", ""RCN10387"", ""SPT21817"", ""RCN03032"", ""SPT35080"", ""SPT15252"", ""SPT15647"", ""SPT17064"", ""SPT40892"", ""RCN02172"", ""RCN01012"", ""RCN14164"", ""RCN09649"", ""SPT19493"", ""PA0305-CW"", ""SPT17793"", ""SPT34314""]"
-1,0,879,3.60%,"[""RCN12871"", ""SPT36667"", ""RCN09820"", ""RCN11834"", ""RCN11501"", ""RCN09533"", ""SPT36266"", ""RCN10788"", ""RCN13462"", ""RCN08681"", ""SPT34435"", ""PW0095-C"", ""RCN12774"", ""RCN13555"", ""RCN11688"", ""SPT35150"", ""SPT15679"", ""RCN09704"", ""SPT41025"", ""PK0045-C"", ""RCN09019"", ""PA0238-C"", ""RCN09654"", ""RCN02139"", ""PD0843-C"", ""RCN12562"", ""SPT34379"", ""PH0767-C"", ""RCN00254"", ""SPT16799"", ""RCN01062"", ""SPT17025"", ""RCN13511"", ""SPT34906"", ""SPT34412"", ""RCN11813"", ""RCN11887"", ""PA0634-C"", ""RCN08974"", ""PH0684-C"", ""RCN11530"", ""SPT40877"", ""SPT40596"", ""PH1636-C"", ""SPT34493"", ""RCN11223"", ""RCN11049"", ""PA0064-C"", ""RCN11038"", ""RCN13424"", ""PD0740-C"", ""PF0096-C"", ""PH0431-CW"", ""SPT34503"", ""RCN11139"", ""RCN11715"", ""PD0030-C"", ""QG0432-C"", ""RCN13422"", ""RCN02284"", ""PD0045-C"", ""RCN13539"", ""PD0886-C"", ""SPT34504"", ""SPT34933"", ""RCN00833"", ""RCN09507"", ""RCN09992"", ""RCN08977"", ""SPT35902"", ""RCN11693"", ""RCN00780"", ""PA0007-C"", ""RCN03107"", ""PA0240-C"", ""SPT40759"", ""RCN13029"", ""RCN01131"", ""PD0056-C"", ""PA1320-Cw"", ""QG0476-C"", ""RCN11869"", ""RCN03514"", ""PD0776-C"", ""RCN13508"", ""QG0295-C"", ""RCN08975"", ""RCN11871"", ""PM0283-C"", ""SPT24773"", ""RCN08639"", ""RCN09120"", ""RCN08950"", ""RCN02024"", ""RCN09069"", ""RCN09675"", ""SPT19859"", ""SPT34417"", ""SPT41002"", ""RCN09360""]"
-1,1,18,0.07%,"[""RCN02146"", ""RCN02816"", ""RCN02365"", ""RCN02907"", ""RCN13351"", ""SPT34382"", ""RCN02953"", ""SPT35260"", ""SPT18468"", ""SPT35228"", ""SPT16994"", ""PT0023-CW"", ""SPT38537"", ""SPT35322"", ""RCN02894"", ""RCN13370"", ""SPT35354"", ""PV0132-C""]"
0,-1,674,2.76%,"[""SPT22577"", ""RCN08782"", ""RCN08378"", ""SPT35153"", ""RCN00747"", ""SPT20507"", ""SPT12560"", ""SPT26936"", ""RCN09273"", ""RCN00845"", ""RCN12380"", ""RCN00762"", ""SPT26502"", ""SPT36874"", ""RCN12992"", ""SPT16654"", ""PR0267-C"", ""RCN01001"", ""RCN03283"", ""RCN00846"", ""RCN02624"", ""RCN12619"", ""SPT22605"", ""SPT16737"", ""QG0332-CW"", ""RCN14278"", ""RCN02590"", ""SPT18586"", ""SPT19606"", ""SPT40632"", ""RCN03195"", ""SPT12329"", ""SPT16890"", ""RCN00876"", ""RCN03020"", ""SPT19582"", ""RCN01073"", ""SPT35134"", ""RCN00751"", ""PR0022-CW"", ""RCN00915"", ""RCN03088"", ""SPT18674"", ""RCN08348"", ""QW0129-CW"", ""SPT18626"", ""SPT43255"", ""SPT12293"", ""SPT26546"", ""RCN00895"", ""QW0046-CW"", ""SPT40979"", ""QG0512-CW"", ""RCN12599"", ""SPT18639"", ""RCN00147"", ""SPT41027"", ""RCN08763"", ""RCN11723"", ""RCN12059"", ""SPT35295"", ""FP0019-CW"", ""SPT19855"", ""SPT36883"", ""SPT35052"", ""SPT41123"", ""PD1178-C"", ""RCN02638"", ""RCN11114"", ""SPT26484"", ""SPT35050"", ""RCN13039"", ""RCN02959"", ""RCN12483"", ""SPT26935"", ""RCN12058"", ""RCN08629"", ""SPT12128"", ""PH1577-C"", ""QC0629-C"", ""SPT22278"", ""RCN11783"", ""RCN09278"", ""SPT12121"", ""RCN11805"", ""SPT26541"", ""PG0607-C"", ""RCN03347"", ""RCN14010"", ""SPT17018"", ""RCN11233"", ""SPT00830"", ""RCN08369"", ""RCN13037"", ""RCN10770"", ""RCN11196"", ""SPT36669"", ""SPT19973"", ""RCN07849"", ""RCN08732""]"
0,0,9563,39.18%,"[""PH1037-C"", ""QC0128-C"", ""RCN11936"", ""PE0221-C"", ""PD1041-C"", ""QE0470-C"", ""SPT42840"", ""PF1043-C"", ""RCN11167"", ""PE0462-C"", ""RCN02312"", ""PM0411-C"", ""PM0127-C"", ""PA0482-C"", ""PH0475-C"", ""PF0613-C"", ""PT0143-C"", ""QC0328-C"", ""SPT15880"", ""RCN08641"", ""QQ0054-C"", ""PA0204-C"", ""RCN09462"", ""PD1159-C"", ""QP0066-C"", ""SPT16618"", ""PD0817-C"", ""PH0553-C"", ""RCN09912"", ""RCN02337"", ""PH1587-C"", ""PK0002-C"", ""PF0410-CW"", ""SPT34937"", ""SPT16828"", ""RCN11565"", ""RCN03424"", ""QQ0061-C"", ""RCN08648"", ""PC0067-C"", ""PN0006-C"", ""QC0211-C"", ""PD1103-C"", ""PF0757-C"", ""PE0278-C"", ""PF0211-C"", ""PE0234-C"", ""PC0078-C"", ""SPT40647"", ""PF0154-C"", ""SPT26290"", ""PM0488-C"", ""RCN02732"", ""SPT43306"", ""QP0034-C"", ""PE0125-C"", ""PF0422-C"", ""RCN09266"", ""PH0609-C"", ""QM0134-C"", ""PF0980-C"", ""PD0987-C"", ""PE0223-C"", ""SPT42875"", ""SPT15027"", ""PD1117-C"", ""RCN02449"", ""SPT19924"", ""PE0453-C"", ""PV0369-C"", ""QP0025-C"", ""PF0747-C"", ""RCN02245"", ""RCN00215"", ""SPT15099"", ""RCN02157"", ""PF0512-C"", ""SPT00808"", ""PH0122-CW"", ""SPT34964"", ""SPT19914"", ""PM0232-C"", ""FP0042-C"", ""PH1297-C"", ""PR0119-C"", ""PR0160-C"", ""RCN09077"", ""PR0291-C"", ""PE0299-C"", ""SPT43289"", ""PK0020-C"", ""PM0598-C"", ""PF0310-C"", ""PD1021-C"", ""QP0258-C"", ""PT0169-C"", ""PH0110-C"", ""SPT43147"", ""RCN00184"", ""PH1677-CW2""]"
0,1,13,0.05%,"[""RCN09111"", ""PM0595-C"", ""RCN11599"", ""RCN11601"", ""RCN11554"", ""RCN11534"", ""RCN09122"", ""SPT15761"", ""RCN12032"", ""RCN09139"", ""PH0558-C"", ""RCN11573"", ""PH0548-C""]"
1,-1,11,0.05%,"[""SPT26343"", ""PW0047-C"", ""PW0076-C"", ""SPT26289"", ""PW0048-C"", ""PW0085-C"", ""RCN09059"", ""SPT26228"", ""PW0084-C"", ""SPT26248"", ""PW0105-C""]"
1,0,1,0.00%,"[""SPT24689""]"
1,1,186,0.76%,"[""PH0425-C"", ""PJ0197-C"", ""SPT26293"", ""PJ0209-C"", ""PW0056-C"", ""PH0474-CW"", ""PH1310-C"", ""SPT26258"", ""PJ0146-Cx"", ""SPT24677"", ""PJ0206-C"", ""PH0534-C"", ""SPT24681"", ""SPT35264"", ""PW0053-C"", ""PD1430-C"", ""RCN09027"", ""RCN13388"", ""SPT24660"", ""QS0154-C"", ""QS0110-C"", ""QS0157-C"", ""PJ0136-Cx"", ""PP0025-C"", ""PH0397-C"", ""PV0098-C"", ""SPT24678"", ""PC0019-C"", ""PH0959-Cx"", ""SPT00901"", ""PV0035-C"", ""PH0156-C"", ""PH0229-C"", ""RCN09062"", ""PV0112-C"", ""PH0413-C"", ""QS0156-C"", ""QV0021-C"", ""PW0050-C"", ""PJ0198-C"", ""SPT24668"", ""SPT24664"", ""QV0020-C"", ""RCN13059"", ""PP0013-C"", ""SPT00902"", ""RCN08119"", ""SPT00899"", ""RCN11766"", ""PW0054-C"", ""PH0149-CW"", ""PH0274-C"", ""SPT00898"", ""RCN10188"", ""SPT35384"", ""PH0278-CW"", ""PH0817-C"", ""SPT26295"", ""PP0026-C"", ""RCN13416"", ""PP0029-C"", ""PH0714-C"", ""QV0072-C"", ""PW0042-C"", ""SPT35290"", ""PW0057-C"", ""PW0060-C"", ""PJ0183-C"", ""SPT24670"", ""PW0063-C"", ""PV0274-C"", ""RCN11770"", ""QS0104-C"", ""PH0308-CW"", ""PH0697-C"", ""SPT24690"", ""RCN09058"", ""PW0049-C"", ""PJ0170-C"", ""QV0094-C"", ""PP0004-C"", ""SPT00892"", ""PV0194-C"", ""RCN13051"", ""RCN11771"", ""PP0021-C"", ""RCN13490"", ""PP0027-C"", ""SPT26254"", ""QS0168-C"", ""PH0544-C"", ""QS0155-C"", ""QV0071-C"", ""SPT24675"", ""SPT24695"", ""PP0020-C"", ""PH0480-C"", ""SPT24688"", ""PW0044-C"", ""PW0097-C""]"
MISSING,-1,5717,23.42%,"[""SPT54780"", ""SPT52741"", ""SPT54605"", ""SPT41202"", ""SPT54706"", ""RCN24865"", ""SPT44610"", ""SPT44965"", ""RCN18521"", ""SPT51258"", ""SPT64725"", ""SPT91788"", ""SPT43725"", ""SPT52773"", ""RCN25889"", ""SPT65189"", ""SPT41196"", ""RCN22650"", ""SPT72446"", ""SPT87989"", ""SPT51549"", ""SPT90743"", ""RCN14851"", ""SPT38428"", ""SPT54547"", ""SPT44934"", ""SPT88131"", ""SPT52698"", ""SPT51214"", ""SPT67818"", ""SPT46218"", ""SPT34722"", ""SPT51555"", ""SPT83709"", ""SPT34762"", ""SPT84042"", ""SPT88355"", ""RCN24892"", ""SPT51092"", ""RCN23207"", ""SPT83683"", ""RCN24951"", ""SPT67764"", ""SPT43999"", ""SPT91525"", ""RCN18681"", ""SPT88132"", ""SPT67199"", ""SPT70780"", ""RCN22575"", ""RCN18567"", ""RCN18496"", ""SPT46155"", ""SPT83794"", ""SPT34670"", ""SPT70928"", ""SPT67787"", ""SPT18253"", ""RCN11387"", ""SPT50970"", ""SPT84075"", ""SPT52404"", ""SPT72223"", ""SPT83915"", ""SPT51558"", ""RCN22590"", ""SPT64808"", ""SPT52455"", ""SPT54666"", ""SPT44071"", ""RCN26937"", ""SPT54729"", ""RCN18534"", ""SPT84089"", ""RCN22989"", ""SPT18290"", ""SPT51422"", ""RCN18477"", ""SPT83807"", ""SPT88155"", ""SPT45078"", ""SPT51369"", ""RCN25116"", ""SPT46382"", ""SPT66988"", ""RCN15147"", ""SPT72008"", ""RCN22817"", ""SPT72101"", ""SPT87882"", ""SPT51327"", ""SPT83770"", ""RCN23098"", ""SPT43765"", ""SPT46170"", ""SPT70462"", ""SPT54882"", ""SPT54718"", ""SPT53235"", ""SPT54631""]"


In [14]:
merged.groupby(["GCH1"]).apply(lambda s: pd.Series({
    "COUNT": len(s.Sample),
    "PERCENTAGE": f"{len(s.Sample) / n_total_samples:.2%}",
    "EXAMPLES": '["' + '", "'.join(s.Sample.sample(min([n_example_samples, len(s.Sample)]))) + '"]'
}), include_groups = False)

Unnamed: 0_level_0,COUNT,PERCENTAGE,EXAMPLES
GCH1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,11561,47.36%,"[""RCN13959"", ""RCN00223"", ""SPT15260"", ""SPT21321"", ""PM0190-C"", ""RCN13377"", ""SPT12519"", ""SPT34359"", ""RCN12613"", ""RCN23115"", ""SPT40538"", ""SPT15545"", ""RCN11434"", ""SPT67042"", ""SPT83123"", ""PC0122-C"", ""SPT46503"", ""SPT23059"", ""RCN03290"", ""SPT34771"", ""SPT88322"", ""RCN22931"", ""SPT50431"", ""RCN09667"", ""SPT91676"", ""SPT72018"", ""SPT24885"", ""RCN02142"", ""PA0342-CW"", ""SPT16959"", ""RCN22990"", ""RCN12820"", ""RCN03528"", ""SPT35538"", ""RCN02242"", ""SPT54654"", ""RCN02702"", ""SPT20567"", ""SPT46285"", ""SPT34471"", ""RCN22790"", ""RCN18536"", ""SPT38752"", ""SPT19503"", ""PA0295-CW"", ""RCN11425"", ""SPT19521"", ""RCN00972"", ""SPT83807"", ""SPT16871"", ""RCN01064"", ""SPT70686"", ""SPT34691"", ""RCN11438"", ""RCN09654"", ""RCN25059"", ""RCN09914"", ""SPT83142"", ""SPT34492"", ""SPT66979"", ""SPT26859"", ""SPT65173"", ""SPT18177"", ""RCN02538"", ""SPT26935"", ""SPT84123"", ""SPT41017"", ""RCN13931"", ""SPT52971"", ""RCN11221"", ""SPT83790"", ""SPT83628"", ""SPT67118"", ""SPT34235"", ""SPT54435"", ""RCN09016"", ""SPT00203"", ""SPT18434"", ""RCN22737"", ""SPT44885"", ""QP0166-C"", ""SPT41055"", ""RCN08692"", ""SPT65141"", ""SPT44018"", ""SPT19746"", ""QC0536-CW"", ""SPT40776"", ""SPT65241"", ""SPT88095"", ""SPT43785"", ""SPT34892"", ""SPT45024"", ""SPT53093"", ""RCN03311"", ""PW0045-C"", ""SPT53731"", ""RCN26058"", ""SPT43733"", ""RCN08831""]"
0,11095,45.45%,"[""PK0068-C"", ""RCN11927"", ""RCN13069"", ""RCN01074"", ""RCN00771"", ""PJ0270-C"", ""PD0577-C"", ""RCN10364"", ""RCN10377"", ""SPT50169"", ""PF0156-C"", ""PH1713-C"", ""RCN15167"", ""SPT34445"", ""SPT50823"", ""PM0249-C"", ""SPT44045"", ""PM0541-CW"", ""RCN18675"", ""RCN08922"", ""PF0311-C"", ""QQ0054-C"", ""RCN15243"", ""SPT14951"", ""SPT50325"", ""RCN10203"", ""SPT53903"", ""PM0132-C"", ""SPT53821"", ""SPT16755"", ""RCN00290"", ""PC0013-C"", ""PA0544-C"", ""QG0094-C"", ""SPT50241"", ""RCN08330"", ""SPT53849"", ""RCN13499"", ""PE0148-C"", ""RCN18745"", ""SPT67180"", ""PH0933-C"", ""SPT43032"", ""RCN02589"", ""FP0039-C"", ""PC0206-C"", ""SPT16995"", ""RCN00215"", ""PM0758-CW"", ""RCN24933"", ""PW0054-C"", ""QG0069-C"", ""SPT16605"", ""SPT51160"", ""PT0294-C"", ""PD0540-C"", ""PH0475-C"", ""PA0208-C"", ""RCN09007"", ""PM0320-C"", ""PH0962-C"", ""PD0774-C"", ""PF0117-C"", ""SPT54666"", ""RCN12797"", ""SPT50273"", ""PV0176-C"", ""SPT16788"", ""SPT42905"", ""PF1030-C"", ""SPT67571"", ""PA0182-C"", ""PH1339-C"", ""QQ0088-C"", ""RCN09116"", ""PH0528-C"", ""PM0809-C"", ""SPT43284"", ""RCN11196"", ""RCN08019"", ""RCN10199"", ""QG0192-C"", ""SPT46151"", ""QC0326-C"", ""RCN03125"", ""SPT51218"", ""PH1377-C"", ""PM0332-C"", ""PE0127-C"", ""QP0156-C"", ""SPT53784"", ""RCN15005"", ""SPT24153"", ""PT0006-CW"", ""SPT67032"", ""PM0413-C"", ""RCN12461"", ""PF1097-C"", ""PF0557-C"", ""QP0093-C""]"
1,1753,7.18%,"[""QC0138-C"", ""SPT15137"", ""RCN00209"", ""PH0549-C"", ""PD0515-C"", ""PD1093-C"", ""RCN09360"", ""PF0925-C"", ""PD1028-C"", ""PF0476-C"", ""PD0626-C"", ""PR0264-C"", ""QC0115-C"", ""RCN00253"", ""QC0324-C"", ""PD1087-C"", ""PD0915-C"", ""PF0955-C"", ""PH0327-C"", ""PD1135-C"", ""RCN09826"", ""RCN09889"", ""QP0110-C"", ""RCN03019"", ""PD1017-C"", ""RCN09317"", ""PH0346-C"", ""SPT15127"", ""QC0330-C"", ""PD0717-C"", ""PD0925-C"", ""SPT15052"", ""PH0014-C"", ""QE0464-C"", ""QP0026-C"", ""PD1185-C"", ""RCN00968"", ""RCN09863"", ""QP0041-C"", ""PD0862-C"", ""PD1086-C"", ""QC0129-C"", ""RCN02049"", ""RCN02315"", ""RCN08388"", ""PD0094-C"", ""QP0254-C"", ""SPT43918"", ""SPT36913"", ""PH0900-C"", ""PD0056-C"", ""PD0771-C"", ""RCN08207"", ""PR0125-C"", ""PD1099-C"", ""RCN08191"", ""SPT41047"", ""PH0382-C"", ""RCN08371"", ""QP0230-C"", ""PD0835-C"", ""RCN00291"", ""PR0016-C"", ""RCN00280"", ""SPT41193"", ""PD1007-C"", ""PH0569-C"", ""PF1086-C"", ""RCN02369"", ""PD1143-C"", ""RCN09929"", ""RCN09868"", ""RCN02558"", ""PD1019-C"", ""SPT15077"", ""RCN09316"", ""SPT67645"", ""PF0262-C"", ""PJ0247-C"", ""RCN00181"", ""PD1041-C"", ""RCN11885"", ""QC0160-C"", ""PD0889-C"", ""QP0168-C"", ""RCN09670"", ""QT0036-CW"", ""PD0919-C"", ""RCN15249"", ""QC0141-C"", ""RCN02532"", ""PD1494-C"", ""PD0587-C"", ""PD1482-C"", ""RCN03013"", ""QP0171-C"", ""PJ0162-C"", ""PD0535-C"", ""PD1211-C"", ""SPT40971""]"


In [15]:
merged.groupby(["CRT"]).apply(lambda s: pd.Series({
    "COUNT": len(s.Sample),
    "PERCENTAGE": f"{len(s.Sample) / n_total_samples:.2%}",
    "EXAMPLES": '["' + '", "'.join(s.Sample.sample(min([n_example_samples, len(s.Sample)]))) + '"]'
}), include_groups = False)

Unnamed: 0_level_0,COUNT,PERCENTAGE,EXAMPLES
CRT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,10671,43.72%,"[""SPT43489"", ""SPT44504"", ""PA1061-CW"", ""RCN02254"", ""RCN16528"", ""SPT44869"", ""RCN25349"", ""SPT84014"", ""RCN09401"", ""SPT40576"", ""RCN11229"", ""SPT18986"", ""RCN11613"", ""SPT72368"", ""QC0572-CW"", ""SPT34526"", ""RCN14682"", ""SPT12304"", ""SPT44655"", ""SPT54523"", ""SPT18194"", ""RCN08626"", ""PA0284-CW"", ""SPT44832"", ""SPT26522"", ""SPT40804"", ""PF1236-CW"", ""RCN09854"", ""SPT19955"", ""RCN01874"", ""RCN25948"", ""RCN02308"", ""SPT24778"", ""RCN23112"", ""RCN12359"", ""RCN07837"", ""SPT54418"", ""SPT26558"", ""RCN12370"", ""SPT18585"", ""SPT54614"", ""SPT52649"", ""SPT51180"", ""SPT38422"", ""SPT44676"", ""SPT72087"", ""SPT91140"", ""SPT88346"", ""SPT64700"", ""SPT55525"", ""SPT40838"", ""SPT67816"", ""SPT72019"", ""QC0557-CW"", ""RCN23084"", ""PV0297-C"", ""SPT67846"", ""SPT22248"", ""SPT38302"", ""SPT18604"", ""RCN15082"", ""SPT19753"", ""SPT38429"", ""SPT18955"", ""SPT88596"", ""SPT26950"", ""SPT18971"", ""RCN13086"", ""SPT19601"", ""SPT51378"", ""SPT91254"", ""RCN02925"", ""SPT51216"", ""RCN03353"", ""SPT15253"", ""SPT54466"", ""SPT53644"", ""SPT71983"", ""SPT91004"", ""SPT90752"", ""SPT26940"", ""SPT44692"", ""SPT70570"", ""SPT34791"", ""SPT24604"", ""RCN22612"", ""RCN02902"", ""SPT36593"", ""SPT24449"", ""SPT52437"", ""SPT67754"", ""SPT36384"", ""SPT44308"", ""SPT20030"", ""RCN25320"", ""SPT70964"", ""SPT54727"", ""SPT67491"", ""PF0670-C"", ""SPT42039""]"
0,13677,56.03%,"[""RCN08017"", ""PF0529-C"", ""PF0202-C"", ""PK0007-C"", ""SPT43192"", ""SPT43357"", ""SPT50352"", ""RCN11670"", ""RCN24810"", ""SPT46563"", ""RCN14667"", ""QP0202-C"", ""PF0583-C"", ""PD1464-C"", ""PD0726-C"", ""PM0206-C"", ""RCN11192"", ""RCN11775"", ""PD1399-C"", ""PW0002-C"", ""PD0772-C"", ""RCN25018"", ""RCN14721"", ""QG0143-C"", ""RCN00225"", ""SPT15133"", ""RCN08042"", ""SPT43330"", ""SPT67618"", ""PC0013-C"", ""SPT66905"", ""SPT35207"", ""SPT40677"", ""PM0204-C"", ""SPT43077"", ""PA0488-C"", ""RCN00261"", ""SPT67661"", ""PH0626-C"", ""SPT46152"", ""SPT43016"", ""PE0393-C"", ""SPT34896"", ""RCN14592"", ""QG0380-C"", ""QT0028-CW5"", ""PF0430-C"", ""RCN09062"", ""PF0010-C"", ""SPT36179"", ""RCN00211"", ""PM0565-C"", ""RCN08289"", ""PE0468-C"", ""QG0192-C"", ""PA0217-C"", ""QP0054-C"", ""PE0180-C"", ""SPT50856"", ""RCN10400"", ""QG0011-C"", ""RCN14937"", ""QT0033-CW"", ""SPT43085"", ""RCN08270"", ""QC0364-C"", ""PF0651-C"", ""SPT41106"", ""PV0257-C"", ""FP0073-C"", ""RCN08261"", ""QQ0049-C"", ""PF0874-C"", ""RCN08326"", ""PM0144-C"", ""SPT50887"", ""SPT15071"", ""RCN11625"", ""SPT15695"", ""SPT50258"", ""PT0061-C"", ""SPT34416"", ""PW0090-C"", ""PF0026-C"", ""PF0835-C"", ""QC0231-C"", ""PM0093-C"", ""PD1361-C"", ""SPT41870"", ""PJ0162-C"", ""PR0145-C"", ""PE0253-C"", ""PH1742-C"", ""RCN02744"", ""RCN24926"", ""PD0992-C"", ""RCN15041"", ""SPT22449"", ""SPT67545"", ""SPT83112""]"
1,61,0.25%,"[""PK0063-C"", ""SPT42947"", ""PF0905-C"", ""RCN13280"", ""RCN09132"", ""PF0219-C"", ""SPT22565"", ""PM0496-C"", ""SPT46144"", ""PK0051-C"", ""PF0729-C"", ""PF0195-C"", ""RCN15335"", ""PK0068-C"", ""PF0392-C"", ""PM0455-C"", ""PF0356-C"", ""PF0934-C"", ""PM0345-C"", ""PF0118-C"", ""PM0243-C"", ""PF0190-C"", ""SPT41222"", ""SPT26638"", ""PF0796-C"", ""PF0987-C"", ""PK0069-C"", ""SPT00802"", ""PK0019-C"", ""PF0149-C"", ""PF1103-C"", ""PF0054-C"", ""PK0023-C"", ""PM0336-C"", ""PM0124-C"", ""PM0580-C"", ""PF0960-C"", ""PM0548-CW"", ""PF0449-C"", ""PK0046-C"", ""RCN08145"", ""PF0231-C"", ""RCN11806"", ""RCN09082"", ""PM0476-C"", ""RCN08130"", ""SPT16920"", ""QV0096-C"", ""SPT26595"", ""RCN15176"", ""RCN15085"", ""PM0313-C"", ""PF0852-C"", ""PF0250-C"", ""RCN15231"", ""PK0066-C"", ""PF0590-C"", ""PA0193-C"", ""PF0961-C"", ""PK0075-C"", ""QQ0097-C""]"


---

# Discordance mainly in PM2_PM3