In [21]:
import pandas as pd 
import numpy as np
import readfcs
import os


In [22]:

# Every marker channel that is kept from each FCS file (used later to select
# and transform columns).
internal_and_external = [
    '89Y_CD45',
    '111Cd_CD3',
    '112Cd_CD34',
    '113Cd_CD123',
    '114Cd_CD66b',
    '116Cd_HLA-DR',
    '141Pr_CD38',
    '142Nd_cCaspase3',
    '143Nd_pCRKL Y207',
    '144Nd_pTyr',
    '145Nd_CD4',
    '146Nd_CD49d',
    '147Sm_CD20',
    '148Nd_CD16',
    '149Sm_CD25',
    '150Nd_pSTAT5 Y694',
    '151Eu_pSTAT3 S727',
    '152Sm_CD13',
    '153Eu_pSTAT1 Y701',
    '154Sm_CD45RA',
    '155Gd_CD27',
    '156Gd_pp38 T180Y182',
    '157Gd_CD8a',
    '158Gd_pSTAT3 Y705',
    '159Tb_pMAPKAPK T334',
    '160Gd_CD14',
    '161Dy_CD26',
    '162Dy_FoxP3',
    '163Dy_CD56',
    '164Dy_CD15',
    '165Ho_pCREB S133',
    '166Er_MPO',
    '167Er_IL1-RAP',
    '168Er_CD117',
    '169Tm_CD33',
    '170Er_pSRC Y418',
    '171Yb_pERK T202Y204',
    '172Yb_pS6 S235S236',
    '173Yb_STAT3tot',
    '174Yb_CD11c',
    '175Lu_CXCR4',
    '176Yb_pS6 S240244',
    '195Pt_mBC2',
    '209Bi_CD11b',
]

# Subset of the channels above.
# NOTE(review): presumably the surface ("external") markers - confirm.
external = [
    '89Y_CD45',
    '111Cd_CD3',
    '112Cd_CD34',
    '113Cd_CD123',
    '114Cd_CD66b',
    '116Cd_HLA-DR',
    '141Pr_CD38',
    '145Nd_CD4',
    '146Nd_CD49d',
    '147Sm_CD20',
    '148Nd_CD16',
    '149Sm_CD25',
    '152Sm_CD13',
    '154Sm_CD45RA',
    '155Gd_CD27',
    '157Gd_CD8a',
    '160Gd_CD14',
    '161Dy_CD26',
    '162Dy_FoxP3',
    '163Dy_CD56',
    '164Dy_CD15',
    '166Er_MPO',
    '167Er_IL1-RAP',
    '168Er_CD117',
    '169Tm_CD33',
    '174Yb_CD11c',
    '175Lu_CXCR4',
    '195Pt_mBC2',
    '209Bi_CD11b',
]

# Channels in `internal_and_external` that are not listed in `external`.
# A plain set difference would return them in arbitrary (hash-dependent) order,
# which changes between kernel restarts; filtering the full list keeps the
# panel order and makes the result reproducible.
_external_lookup = set(external)
internal = [marker for marker in internal_and_external if marker not in _external_lookup]

In [23]:

# Sample sheet: one header-less row per file, indexed by (patient_id, time) so
# that a file's batch can be looked up from the two parts of its filename.
#
# * patient_id is read as text so identifiers keep any leading zeros and match
#   the string key that parse_patient_filename extracts from a filename.
# * 'time' is left exactly as read_csv parses it: parse_patient_filename looks
#   rows up with an integer time point. (A previous
#   `metadata_df['time'].astype(str)` statement discarded its result and has
#   been dropped; assigning it would have broken that integer lookup.)
metadata_df = (
    pd.read_csv(
        "metadata.txt",
        header=None,
        names=['filename', 'time', 'batch', 'patient_id'],
        dtype={'patient_id': str},
    )
    .set_index(['patient_id', 'time'])
)


def parse_patient_filename(filename, metadata=None):
    """Look up the batch of a patient FCS file from its filename.

    The filename is split on '_' into a patient id and a time label after the
    '.fcs' extension is removed (e.g. '01_t0.fcs' -> '01', 't0'); the last
    character of the time label is converted to an integer time point.

    Parameters
    ----------
    filename : str
        Name of the patient file, with or without the '.fcs' extension.
    metadata : pandas.DataFrame, optional
        Table indexed by (patient_id, time) with a 'batch' column. Defaults to
        the module-level ``metadata_df``.

    Returns
    -------
    The 'batch' entry of the matching metadata row.

    Raises
    ------
    ValueError
        If the name does not split into exactly two '_'-separated parts, or
        the last character of the time label is not a digit.
    KeyError
        If the metadata has no row for (patient_id, time_point).
    """
    if metadata is None:
        metadata = metadata_df

    # str.strip('.fcs') removes any of the characters '.', 'f', 'c', 's' from
    # BOTH ends (so 's01_t0.fcs' would become '01_t0'); drop the suffix only.
    suffix = '.fcs'
    stem = filename[:-len(suffix)] if filename.endswith(suffix) else filename

    patient_id, time_label = stem.split('_')

    # NOTE(review): only the final character is used, so multi-digit time
    # points (e.g. 't10') would be read as their last digit.
    time_point = int(time_label[-1])

    return metadata.loc[(patient_id, time_point), 'batch']


In [24]:
uncorrected_cytof_data_file = 'cytofdata/uncorrected_all.pkl'

if os.path.exists(uncorrected_cytof_data_file):
    print('Loading cytof data from file')

    data = pd.read_pickle(uncorrected_cytof_data_file)

else: 
    data_list = []

    dir_patients = 'cytofdata/patients'
    for filename in os.listdir(dir_patients):
        if filename.endswith('.fcs'):
            file_path = os.path.join(dir_patients, filename)
                
            fcs_data = readfcs.read(file_path)
                
            df = pd.DataFrame(fcs_data.X, index=fcs_data.obs_names, columns=fcs_data.var_names)

            df = df[internal_and_external]

            batch = parse_patient_filename(filename)

            df['file_id'] = filename.strip('.fcs')
            df['batch'] = str(batch)
                
            data_list.append(df)


    dir_donors = 'cytofdata/donors'
    for filename in os.listdir(dir_donors):   
        if filename.endswith('.fcs'):
            file_path = os.path.join(dir_donors, filename)
                
            fcs_data = readfcs.read(file_path)
                
            df = pd.DataFrame(fcs_data.X, index=fcs_data.obs_names, columns=fcs_data.var_names)

            df = df[internal_and_external]

            test_type, donor_id, batch = filename.strip('.fcs').split('_')

            df['file_id'] = '_'.join([test_type, donor_id])
            df['batch'] = str(batch)
                
            data_list.append(df)


    dir_rj_rt = 'batch_RT_RJ_1_4'
    for filename in os.listdir(dir_rj_rt):    
        if filename.endswith('.fcs'):
            file_path = os.path.join(dir_rj_rt, filename)
                
            fcs_data = readfcs.read(file_path)
                
            df = pd.DataFrame(fcs_data.X, index=fcs_data.obs_names, columns=fcs_data.var_names)

            df = df[internal_and_external]
            
            batch, test = filename.strip('.fcs').split('_')

            df['file_id'] = test
            df['batch'] = str(batch[-1])
                
            data_list.append(df)
    

    dir_rj_rt = 'batch_RT_RJ_5_6'
    for filename in os.listdir(dir_rj_rt):    
        if filename.endswith('.fcs'):
            file_path = os.path.join(dir_rj_rt, filename)
                
            fcs_data = readfcs.read(file_path)
                
            df = pd.DataFrame(fcs_data.X, index=fcs_data.obs_names, columns=fcs_data.var_names)

            df = df[internal_and_external]
            
            batch, test = filename.strip('.fcs').split('_')

            df['file_id'] = test
            df['batch'] = str(batch[-1])
                
            data_list.append(df)

    data = pd.concat(data_list, ignore_index=True)

    print('ArcSinh transforming data')
    for column in internal_and_external:
        data[column] = np.arcsinh(data[column]/ 5)

    data.to_pickle(uncorrected_cytof_data_file)

ArcSinh transforming data


In [25]:
# Show the distinct values of the 'batch' column of the combined table.
data['batch'].unique()

array(['4', '1', '2', '3', '5', '6'], dtype=object)

In [26]:
# Split the combined table into two groups of batches by their string label.
in_batches_1_to_4 = data['batch'].isin(['1', '2', '3', '4'])
in_batches_5_to_6 = data['batch'].isin(['5', '6'])

data_batch_1_4 = data.loc[in_batches_1_to_4]
data_batch_5_6 = data.loc[in_batches_5_to_6]


In [30]:
# Display the subset of rows whose batch label is '1', '2', '3' or '4'.
data_batch_1_4

Unnamed: 0,89Y_CD45,111Cd_CD3,112Cd_CD34,113Cd_CD123,114Cd_CD66b,116Cd_HLA-DR,141Pr_CD38,142Nd_cCaspase3,143Nd_pCRKL Y207,144Nd_pTyr,...,171Yb_pERK T202Y204,172Yb_pS6 S235S236,173Yb_STAT3tot,174Yb_CD11c,175Lu_CXCR4,176Yb_pS6 S240244,195Pt_mBC2,209Bi_CD11b,file_id,batch
0,2.217051,1.135451e-01,3.317439,3.812791e-01,1.590146,2.885046,3.629630,0.186837,0.327056,0.842817,...,0.361276,0.000000,2.558300,0.000000,0.953744,0.000000,3.512650,0.531857,01_t0,4
1,3.963797,0.000000e+00,0.311732,0.000000e+00,4.716617,0.714334,0.808533,0.152932,0.676452,0.183566,...,0.012663,0.460877,2.994078,1.007841,1.161596,0.040417,3.752279,3.391960,01_t0,4
2,2.961751,2.314977e-01,0.041203,2.750469e-01,4.226237,0.250043,0.766132,0.000000,0.075735,1.489841,...,0.583008,0.000000,3.148486,0.135580,0.000000,0.749384,3.559952,2.233353,01_t0,4
3,3.951971,3.915445e-01,0.030235,1.911737e+00,1.914248,0.196068,4.125344,0.238149,0.496188,0.466622,...,0.176374,0.000000,2.985532,0.909775,2.843206,1.296746,3.968642,0.490613,01_t0,4
4,3.204973,0.000000e+00,0.002475,4.006683e-01,4.130337,0.454735,0.448444,0.079543,0.328764,1.208815,...,0.000000,0.044884,3.489844,0.758197,0.813311,0.877981,2.142285,2.659254,01_t0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28835435,1.933876,0.000000e+00,0.000000,0.000000e+00,2.877097,0.088291,0.626438,0.196330,0.867764,0.685698,...,0.000000,0.000000,2.777611,0.000000,0.653948,1.072680,4.157113,0.078366,RT,4
28835436,2.650131,2.531160e-01,0.242885,0.000000e+00,4.288952,0.284044,0.558895,0.476855,0.970094,0.951070,...,0.271328,0.000000,4.164376,0.000000,1.000658,0.732168,4.470614,2.084281,RT,4
28835437,2.920556,5.748914e-10,0.000000,1.403368e-09,3.732402,0.000000,0.144446,0.297298,0.601309,0.565104,...,0.000000,0.000000,2.975235,0.027468,0.723384,0.000000,3.403662,1.492469,RT,4
28835438,1.975376,4.264121e-01,0.206369,7.725366e-01,3.395772,0.000000,1.693578,0.438377,1.236621,0.315330,...,0.130432,0.562467,4.013903,0.023892,1.468089,1.237285,4.417926,0.591822,RT,4


In [28]:
# Display the subset of rows whose batch label is '5' or '6'.
data_batch_5_6

Unnamed: 0,89Y_CD45,111Cd_CD3,112Cd_CD34,113Cd_CD123,114Cd_CD66b,116Cd_HLA-DR,141Pr_CD38,142Nd_cCaspase3,143Nd_pCRKL Y207,144Nd_pTyr,...,171Yb_pERK T202Y204,172Yb_pS6 S235S236,173Yb_STAT3tot,174Yb_CD11c,175Lu_CXCR4,176Yb_pS6 S240244,195Pt_mBC2,209Bi_CD11b,file_id,batch
8677684,1.322364,6.906810e-01,0.234809,0.000000e+00,2.769354,0.000000,1.444162,0.524762,0.794063,1.478515,...,0.758985,0.000000,3.347928,0.000000,0.289001,0.578966,4.352483,1.666835,42_t0,5
8677685,2.281195,0.000000e+00,0.000000,0.000000e+00,2.413961,0.368302,1.221557,0.585226,1.727114,0.454146,...,0.062885,0.000000,3.456754,0.000000,0.557239,0.785236,3.896820,1.982088,42_t0,5
8677686,1.769071,5.062488e-01,0.448229,0.000000e+00,3.020145,0.213722,1.356596,0.505471,0.311342,0.817396,...,0.156867,0.704822,2.859325,0.689864,0.477739,1.084606,3.719148,2.037591,42_t0,5
8677687,1.941955,3.401837e-10,0.000000,8.304195e-10,3.208765,0.000000,0.268287,0.480642,0.833470,1.158779,...,0.042892,0.362126,3.359798,0.697510,0.999510,0.374486,4.122007,0.738507,42_t0,5
8677688,1.052949,0.000000e+00,0.113614,1.226857e-01,2.890899,0.000000,1.574144,0.000000,0.867476,0.654612,...,0.597031,0.000000,3.408004,0.000000,2.043363,1.635563,3.932680,0.514583,42_t0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29425802,4.522675,0.000000e+00,0.080039,7.059654e-02,0.815707,2.757855,2.649409,0.263992,0.655001,1.367720,...,0.100442,0.000000,3.335423,2.760931,0.410043,1.372749,3.947045,2.588893,RT,6
29425803,2.304324,1.351632e-01,0.140411,0.000000e+00,3.907543,0.000000,1.299624,0.419498,1.518388,1.015800,...,0.172662,0.293180,3.237144,0.000000,0.660367,0.581367,4.326999,1.178895,RT,6
29425804,2.196853,5.941239e-01,0.219775,0.000000e+00,3.776895,0.000000,0.006998,0.225117,0.703771,1.300601,...,0.000000,0.000000,3.879221,0.000000,1.646059,1.029499,4.167752,2.295873,RT,6
29425805,1.541709,0.000000e+00,0.000000,0.000000e+00,2.794125,0.369857,1.074207,0.039364,1.249691,1.200093,...,0.058179,0.000000,3.502955,0.000000,0.349563,0.621298,4.488726,0.590534,RT,6


In [29]:
# Persist each batch subset to its own pickle file.
for batch_subset, pickle_path in (
    (data_batch_1_4, 'cytofdata/uncorrected_batch_1_4.pkl'),
    (data_batch_5_6, 'cytofdata/uncorrected_batch_5_6.pkl'),
):
    batch_subset.to_pickle(pickle_path)