In [1]:
import os 

import pandas as pd
import numpy as np

import pandas as pd
from dnsmex.dxsm_data import localify
from dnsmex.dxsm_data import dataset_dict, pcp_df_of_nickname

In [2]:
def prep_data(path, export_filtered=False, export_germline=False, downsample=None, split_by_v=False):
    """
    Given a path to a .csv.gz file, load it, filter it, and save it to a new file.

    downsample is expressed in terms of thousands. 
    split_by_v will split the data by V family, and save each split to a separate file.
    """
    pcp_df = pd.read_csv(path, index_col=0)
    original_pcp_count = len(pcp_df)

    # drop rows where either parent or child contains an N
    pcp_df = pcp_df[~pcp_df["parent"].str.contains("N")]
    pcp_df = pcp_df[~pcp_df["child"].str.contains("N")]
    
    # make sure that the parent and child sequences differ
    pcp_df = pcp_df[pcp_df["parent"] != pcp_df["child"]]

    # drow rows where pcp_df["parent_name"] contains "naive"
    pcp_df = pcp_df[~pcp_df["parent_is_naive"]]

    filtered_pcp_count = len(pcp_df)

    # let path_without_suffix be path without .csv.gz
    path_without_suffix = path[:-7]
    if export_filtered:
        pcp_df.to_csv(path_without_suffix + "_noN_no-naive.csv.gz")

    germline_df = pcp_df[pcp_df["v_gene"].str.contains("IGHV[34]")]
    germline_pcp_count = len(germline_df)

    if export_germline:
        germline_df.to_csv(path_without_suffix + "_IGHV34_noN_no-naive.csv.gz")
        
    # return a little dataframe with the counts
    out_dict = pd.DataFrame({
        "original_pcp_count": [original_pcp_count],
        "filtered_pcp_count": [filtered_pcp_count],
        "germline_pcp_count": [germline_pcp_count]
    }, index=[os.path.basename(path)])

    def downsample_df(df, downsample):
        downsample_count = int(downsample * 1000)
        if downsample_count > len(df):
            return None
        # take evenly spaced samples from the dataframe
        return df.iloc[::len(df)//downsample_count].copy()

    if downsample is not None:
        downsampled_df = downsample_df(pcp_df, downsample)
        downsampled_df.to_csv(path_without_suffix + f"_downsample_{downsample}k.csv.gz")
    
    if split_by_v:
        pcp_df["v_family"] = pcp_df["v_gene"].str.split("-").str[0]
        for v_family in pcp_df["v_family"].unique():
            v_family_df = pcp_df[pcp_df["v_family"] == v_family]
            v_family_df.to_csv(path_without_suffix + f"_{v_family}_noN_no-naive.csv.gz") 
            if downsample is not None:
                downsampled_v_family_df = downsample_df(v_family_df, downsample)
                if downsampled_v_family_df is not None:
                    downsampled_v_family_df.to_csv(path_without_suffix + f"_{v_family}_downsample_{downsample}k.csv.gz")
    
    return out_dict

In [3]:
prep_data(localify("~/data/v1/rodriguez-airr-seq-race-prod-InclMutInv_pcp_2024-11-12_MASKED_NI_noN_no-naive.csv.gz"), export_filtered=True, split_by_v=True)

Unnamed: 0,original_pcp_count,filtered_pcp_count,germline_pcp_count
rodriguez-airr-seq-race-prod-InclMutInv_pcp_2024-11-12_MASKED_NI_noN_no-naive.csv.gz,21754,21754,15231


In [4]:
prep_data(localify("~/data/v1/wyatt-10x-1p5m_fs-all_InclMutInv_pcp_2024-10-29_NI_noN_no-naive.csv.gz"), downsample=50)
prep_data(localify("~/data/v1/tang-deepshm-prod-InclMutInv_pcp_2024-10-29_MASKED_NI_noN_no-naive.csv.gz"), downsample=50)

# prep_data(localify("data/v1/wyatt-10x-1p5m_fs-all_pcp_2024-04-29_NI.csv.gz"), split_by_v=True, downsample=5)
# prep_data(localify("data/v1/tang-deepshm-prod_pcp_2024-04-01_MASKED_NI_noN_no-naive.csv.gz"), downsample=50)
# prep_data(localify("data/v0/wyatt-10x-1p5m_pcp_2023-11-30_NI.csv.gz"), export_filtered=True)

Unnamed: 0,original_pcp_count,filtered_pcp_count,germline_pcp_count
tang-deepshm-prod-InclMutInv_pcp_2024-10-29_MASKED_NI_noN_no-naive.csv.gz,522586,522586,397587
