In [None]:
import pandas as pd 
import numpy as np
import readfcs
import os


In [None]:

# Full marker panel kept from every FCS file, in panel order.
internal_and_external = [
    '89Y_CD45',
    '111Cd_CD3',
    '112Cd_CD34',
    '113Cd_CD123',
    '114Cd_CD66b',
    '116Cd_HLA-DR',
    '141Pr_CD38',
    '142Nd_cCaspase3',
    '143Nd_pCRKL Y207',
    '144Nd_pTyr',
    '145Nd_CD4',
    '146Nd_CD49d',
    '147Sm_CD20',
    '148Nd_CD16',
    '149Sm_CD25',
    '150Nd_pSTAT5 Y694',
    '151Eu_pSTAT3 S727',
    '152Sm_CD13',
    '153Eu_pSTAT1 Y701',
    '154Sm_CD45RA',
    '155Gd_CD27',
    '156Gd_pp38 T180Y182',
    '157Gd_CD8a',
    '158Gd_pSTAT3 Y705',
    '159Tb_pMAPKAPK T334',
    '160Gd_CD14',
    '161Dy_CD26',
    '162Dy_FoxP3',
    '163Dy_CD56',
    '164Dy_CD15',
    '165Ho_pCREB S133',
    '166Er_MPO',
    '167Er_IL1-RAP',
    '168Er_CD117',
    '169Tm_CD33',
    '170Er_pSRC Y418',
    '171Yb_pERK T202Y204',
    '172Yb_pS6 S235S236',
    '173Yb_STAT3tot',
    '174Yb_CD11c',
    '175Lu_CXCR4',
    '176Yb_pS6 S240244',
    '195Pt_mBC2',
    '209Bi_CD11b',
]

# Subset of the panel labelled "external".
external = [
    '89Y_CD45',
    '111Cd_CD3',
    '112Cd_CD34',
    '113Cd_CD123',
    '114Cd_CD66b',
    '116Cd_HLA-DR',
    '141Pr_CD38',
    '145Nd_CD4',
    '146Nd_CD49d',
    '147Sm_CD20',
    '148Nd_CD16',
    '149Sm_CD25',
    '152Sm_CD13',
    '154Sm_CD45RA',
    '155Gd_CD27',
    '157Gd_CD8a',
    '160Gd_CD14',
    '161Dy_CD26',
    '162Dy_FoxP3',
    '163Dy_CD56',
    '164Dy_CD15',
    '166Er_MPO',
    '167Er_IL1-RAP',
    '168Er_CD117',
    '169Tm_CD33',
    '174Yb_CD11c',
    '175Lu_CXCR4',
    '195Pt_mBC2',
    '209Bi_CD11b',
]

# Every panel marker that is not in `external`, kept in panel order.
# A plain set difference would give a different ordering on each kernel
# restart (string hashes are randomised per process), which makes any
# column order derived from `internal` irreproducible.
_external_lookup = set(external)
internal = [marker for marker in internal_and_external if marker not in _external_lookup]

In [None]:

# Load the headerless metadata table and index it by (patient_id, time) so
# that parse_patient_filename can look up a batch with a single .loc call.
#
# * patient_id is read as text so it compares equal to the string ids parsed
#   out of the file names (an all-digit id would otherwise be inferred as int
#   and never match).
# * `time` is deliberately NOT cast to str: parse_patient_filename looks rows
#   up with an integer time point, so the inferred numeric dtype must be kept.
metadata_df = pd.read_csv(
    "metadata.txt",
    header=None,
    names=['filename', 'time', 'batch', 'patient_id'],
    dtype={'patient_id': str},
).set_index(['patient_id', 'time'])


def parse_patient_filename(filename):
    """Look up the batch of a patient FCS file in ``metadata_df``.

    The file name is expected to look like ``<patient_id>_<time_point>.fcs``.
    The last character of ``<time_point>`` is read as the integer time point.

    Parameters
    ----------
    filename : str
        File name (not a full path) of the patient FCS file.

    Returns
    -------
    The ``batch`` entry of ``metadata_df`` for ``(patient_id, time_point)``.

    Raises
    ------
    ValueError
        If the name without its extension does not split into exactly two
        ``_``-separated parts, or the last character of the time point is
        not a digit.
    KeyError
        If ``(patient_id, time_point)`` is not present in ``metadata_df``.
    """
    # Remove the extension as a suffix. str.strip('.fcs') would instead strip
    # any of the characters '.', 'f', 'c', 's' from BOTH ends of the name and
    # corrupt ids that start or end with one of those letters.
    suffix = '.fcs'
    stem = filename[:-len(suffix)] if filename.endswith(suffix) else filename

    patient_id, time_point = stem.split('_')
    # Only the final character is used, i.e. time points are single-digit.
    time_point = int(time_point[-1])

    return metadata_df.loc[(patient_id, time_point), 'batch']


In [None]:
uncorrected_cytof_data_file = 'cytofdata/uncorrected_all.pkl'

if os.path.exists(uncorrected_cytof_data_file):
    print('Loading cytof data from file')

    data = pd.read_pickle(uncorrected_cytof_data_file)

else: 
    data_list = []

    dir_patients = 'cytofdata/patients'
    for filename in os.listdir(dir_patients):
        if filename.endswith('.fcs'):
            file_path = os.path.join(dir_patients, filename)
                
            fcs_data = readfcs.read(file_path)
                
            df = pd.DataFrame(fcs_data.X, index=fcs_data.obs_names, columns=fcs_data.var_names)

            df = df[internal_and_external]

            batch = parse_patient_filename(filename)

            df['file_id'] = filename.strip('.fcs')
            df['batch'] = str(batch)
                
            data_list.append(df)


    dir_donors = 'cytofdata/donors'
    for filename in os.listdir(dir_donors):   
        if filename.endswith('.fcs'):
            file_path = os.path.join(dir_donors, filename)
                
            fcs_data = readfcs.read(file_path)
                
            df = pd.DataFrame(fcs_data.X, index=fcs_data.obs_names, columns=fcs_data.var_names)

            df = df[internal_and_external]

            test_type, donor_id, batch = filename.strip('.fcs').split('_')

            df['file_id'] = '_'.join([test_type, donor_id])
            df['batch'] = str(batch)
                
            data_list.append(df)


    dir_rj_rt = 'batch_RT_RJ_1_4'
    for filename in os.listdir(dir_rj_rt):    
        if filename.endswith('.fcs'):
            file_path = os.path.join(dir_rj_rt, filename)
                
            fcs_data = readfcs.read(file_path)
                
            df = pd.DataFrame(fcs_data.X, index=fcs_data.obs_names, columns=fcs_data.var_names)

            df = df[internal_and_external]
            
            batch, test = filename.strip('.fcs').split('_')

            df['file_id'] = test
            df['batch'] = str(batch[-1])
                
            data_list.append(df)
    

    dir_rj_rt = 'batch_RT_RJ_5_6'
    for filename in os.listdir(dir_rj_rt):    
        if filename.endswith('.fcs'):
            file_path = os.path.join(dir_rj_rt, filename)
                
            fcs_data = readfcs.read(file_path)
                
            df = pd.DataFrame(fcs_data.X, index=fcs_data.obs_names, columns=fcs_data.var_names)

            df = df[internal_and_external]
            
            batch, test = filename.strip('.fcs').split('_')

            df['file_id'] = test
            df['batch'] = str(batch[-1])
                
            data_list.append(df)

    data = pd.concat(data_list, ignore_index=True)

    print('ArcSinh transforming data')
    for column in internal_and_external:
        data[column] = np.arcsinh(data[column]/ 5)

    data.to_pickle(uncorrected_cytof_data_file)

In [None]:
# Show the distinct batch labels present in the combined table.
data['batch'].unique()

In [None]:
# Split the combined table by batch label (labels are stored as strings).
is_batch_1_4 = data['batch'].isin(['1', '2', '3', '4'])
is_batch_5_6 = data['batch'].isin(['5', '6'])

data_batch_1_4 = data.loc[is_batch_1_4]
data_batch_5_6 = data.loc[is_batch_5_6]


In [None]:
# Display the rows whose batch label is '1', '2', '3' or '4'.
data_batch_1_4

In [None]:
# Display the rows whose batch label is '5' or '6'.
data_batch_5_6

In [None]:
# Write each batch subset to its own pickle file.
for pickle_path, batch_frame in [
    ('cytofdata/uncorrected_batch_1_4.pkl', data_batch_1_4),
    ('cytofdata/uncorrected_batch_5_6.pkl', data_batch_5_6),
]:
    batch_frame.to_pickle(pickle_path)