# Function: Extract CODEX signals from Xenium cell boundaries column by column

In [34]:
def integrate_codex_xenium(slide_id, column_id, base_paths, sdata_xenium):
    import os
    import numpy as np
    import pandas as pd
    import skimage as ski
    import tifffile
    import zarr
    import skimage.transform
    from ome_types.model import OME, Image, Pixels, Channel

    # Construct paths
    pool = f"{slide_id}_column_{column_id}"
    
    region_name = f"Region_{column_id}"
    slide_short = slide_id.replace("ID_", "")  # e.g. "ID_0022110" → "0022110"

    # Find the matching segmentation directory - FILTER OUT HIDDEN FILES
    all_dirs = os.listdir(base_paths['segmentation_base'])
    # Filter out macOS hidden files that start with ._
    visible_dirs = [d for d in all_dirs if not d.startswith('._')]
    
    segmentation_dir = [
        d for d in visible_dirs
        if f"__{slide_short}__{region_name}__" in d
    ]
    if not segmentation_dir:
        print(f"Available directories: {visible_dirs}")
        raise FileNotFoundError(f"No segmentation directory found for slide {slide_id}, column {column_id}")
    segmentation_path = os.path.join(base_paths['segmentation_base'], segmentation_dir[0], "cells.zarr.zip")
    
    def fix_macos_hidden_path(path):
        if not isinstance(path, str):
            # If it's not a string (e.g., int), just return it unchanged
            return path
        dirname = os.path.dirname(path)
        filename = os.path.basename(path)
        if filename.startswith("._"):
            real_filename = filename[2:]
            real_path = os.path.join(dirname, real_filename)
            if os.path.exists(real_path):
                return real_path
            else:
                raise FileNotFoundError(f"Expected real file not found: {real_path}")
        return path

    def fix_all_opt_paths(opt_dict):
        for k, v in opt_dict.items():
            opt_dict[k] = fix_macos_hidden_path(v)
        return opt_dict
    
    pool = f"{slide_id}_column_{column_id}"
    opt = {
        "pool": pool,
        "segmentation": segmentation_path,
        "phenocycler": f"{base_paths['phenocycler']}/{pool}.ome.ome.tif",
        "transmat": f"{base_paths['transmat']}/{pool}.ome_alignment_files/matrix.csv",
        "transsegmentation": f"{base_paths['out']}/{pool}/segmentation.csv",
        "adtcsv": f"{base_paths['out']}/{pool}/adt_countmat.csv",
        "adtcsv_cell_ids": f"{base_paths['out']}/{pool}/adt_countmat_with_cell_ids.csv",
        "plotlowres": f"{base_paths['out']}/{pool}/{pool}_new_cell_dapi_low.ome.tif",
        "plothighres": f"{base_paths['out']}/{pool}/{pool}_new_cell_dapi_high.ome.tif",
        "plotsegmentation": f"{base_paths['out']}/{pool}/{pool}_new_trans_segmentation.tiff",
        "tilesize": 1024,
        "compression": "zlib",
        "subresolution": 4,
        "interpreter": "rgb",
        "scale": 2,
    }

    opt = fix_all_opt_paths(opt)
    
    os.makedirs(os.path.dirname(opt["plotlowres"]), exist_ok=True)

    def open_zarr(path):
        # If the filename starts with '._', switch to the real file
        dirname = os.path.dirname(path)
        filename = os.path.basename(path)
    
        if filename.startswith("._"):
            real_filename = filename[2:]
            real_path = os.path.join(dirname, real_filename)
            if os.path.exists(real_path):
                path = real_path
            else:
                raise FileNotFoundError(f"Expected real file not found: {real_path}")
    
        # Additionally: if path is a directory store, make sure it exists and is a directory
        if not os.path.exists(path):
            raise FileNotFoundError(f"File or directory not found: {path}")
        
        if path.endswith(".zip"):
            store = zarr.ZipStore(path, mode="r")
        else:
            # Confirm it's a directory (for DirectoryStore)
            if not os.path.isdir(path):
                raise NotADirectoryError(f"Expected directory for DirectoryStore but got a file: {path}")
            store = zarr.DirectoryStore(path)
    
        return zarr.group(store=store)

    root = open_zarr(opt["segmentation"])
    cellseg_mask = np.array(root["masks"][1])
    transmat = np.loadtxt(opt["transmat"], delimiter=",")
    tform = ski.transform.AffineTransform(matrix=transmat)

    tiff = tifffile.imread(opt["phenocycler"], is_ome=False, level=0)
    tiff_dapi = tiff[0, :, :]

    trans_mask = ski.transform.warp(cellseg_mask, inverse_map=tform, output_shape=tiff_dapi.shape, order=0, preserve_range=True).astype(int)
    cell_borders = ski.segmentation.find_boundaries(trans_mask, mode="thin")

    def normalize(img):
        return ski.exposure.rescale_intensity(img, in_range="image", out_range=(0, 1))

    gray_norm = np.clip(normalize(tiff_dapi) * 1.75, 0, 1)
    rgb_image = np.stack([gray_norm]*3, axis=-1)
    rgb_image[cell_borders] = [1.0, 0.0, 0.0]

    image_down = ski.transform.resize(rgb_image, (int(rgb_image.shape[0] * 0.25), int(rgb_image.shape[1] * 0.25)), anti_aliasing=True)

    def save_as_ome(image, filename, subresolution=3, scale=2, tilesize=1024, interpreter="rgb", compression="zlib"):
        if image.ndim == 3 and image.shape[2] == 3:
            image = np.moveaxis(image, -1, 0)
        pyramid = [image]
        for _ in range(1, subresolution):
            downsampled = pyramid[-1][:, ::scale, ::scale]
            pyramid.append(downsampled)

        pixels = Pixels(
            dimension_order="XYZCT",
            size_x=image.shape[2],
            size_y=image.shape[1],
            size_z=1,
            size_c=image.shape[0],
            size_t=1,
            type=str(image.dtype) if str(image.dtype) != "float64" else "float",
            physical_size_x=0.125,
            physical_size_y=0.125,
            physical_size_x_unit="µm",
            physical_size_y_unit="µm",
            channels=[
                Channel(id="Channel:0:0", name="Red", samples_per_pixel=1),
                Channel(id="Channel:0:1", name="Green", samples_per_pixel=1),
                Channel(id="Channel:0:2", name="Blue", samples_per_pixel=1),
            ],
        )
        ome = OME(images=[Image(id="Image:0", name="Pyramidal OME", pixels=pixels)])

        with tifffile.TiffWriter(filename, bigtiff=True) as tif:
            tif.write(pyramid[0], photometric=interpreter, tile=(tilesize, tilesize),
                      compression=compression, subifds=len(pyramid)-1,
                      metadata={"axes": "CYX", "ome": ome})
            for level in pyramid[1:]:
                tif.write(level, photometric=interpreter, tile=(tilesize, tilesize), compression=compression)

    save_as_ome(image_down, opt["plotlowres"], opt["subresolution"], opt["scale"], opt["tilesize"], opt["interpreter"], opt["compression"])
    save_as_ome(rgb_image, opt["plothighres"], opt["subresolution"], opt["scale"], opt["tilesize"], opt["interpreter"], opt["compression"])
    ski.io.imsave(opt["plotsegmentation"], trans_mask)

    # Quantification
    unique_cell_ids = np.unique(trans_mask)
    unique_cell_ids = unique_cell_ids[unique_cell_ids > 0]
    df_adt = pd.DataFrame(index=unique_cell_ids)
    df_adt.index.name = 'cell_id'

    for i in range(tiff.shape[0]):
        props = ski.measure.regionprops(trans_mask, intensity_image=tiff[i])
        mean_signals = {prop.label: prop.mean_intensity for prop in props}
        df_adt[f'channel_{i}'] = df_adt.index.map(mean_signals)

    df_adt.to_csv(opt["adtcsv"])

    # Rename cell IDs
    cell_names_list = sdata_xenium[slide_id][f'column_{column_id}'].table.obs['cell_id'].tolist()
    cell_id_to_name = {i + 1: name for i, name in enumerate(cell_names_list)}
    df_adt_reset = df_adt.reset_index()
    df_adt_reset['cell_name'] = df_adt_reset['cell_id'].map(cell_id_to_name)
    df_adt_reset = df_adt_reset.rename(columns={'cell_id': 'index', 'cell_name': 'cell_id'})
    df_adt_reset = df_adt_reset[['index', 'cell_id'] + [col for col in df_adt_reset.columns if col not in ['index', 'cell_id']]]
    df_adt_reset.to_csv(opt["adtcsv_cell_ids"], index=False)

    print(f"Completed processing for {pool}")

# Run 2

In [23]:
import sys
import os
sys.path.append(os.path.abspath('../src'))
import load_sdata

# Load each Xenium dataset into a SpatialData object
sdata_xenium_first_slide = load_sdata.get_xenium_slide_data('0022110')
sdata_xenium_second_slide = load_sdata.get_xenium_slide_data('0022111')

sdata_xenium = {
    'ID_0022110': sdata_xenium_first_slide,
    'ID_0022111': sdata_xenium_second_slide,
}

In [35]:
base_paths = {
    "segmentation_base": "/media/Lynn/data/Lisa_raw_data/Xenium",  # now the true base
    "phenocycler": "/media/Lynn/data/CODEX_cropped",
    "transmat": "/media/Lynn/alignment/codex_columns",
    "out": "/media/Lynn/data/Integrated_data"
}
slides = ["ID_0022110", "ID_0022111"]
columns = [1, 2, 3, 4]

# Assuming sdata_xenium is a nested dict like: sdata_xenium['ID_0022110']['column_1']
for slide in slides:
    for col in columns:
        integrate_codex_xenium(slide, col, base_paths, sdata_xenium)

Completed processing for ID_0022110_column_3
Completed processing for ID_0022110_column_4
Completed processing for ID_0022111_column_1
Completed processing for ID_0022111_column_2
Completed processing for ID_0022111_column_3
Completed processing for ID_0022111_column_4


## Combine all obtained tables (all columns & all slides)

In [9]:
import pandas as pd

# Set path to output directory
out_base = "/media/Lynn/data/Integrated_data"

slides_columns = {
    "ID_0022110": [1, 2, 3, 4],
    "ID_0022111": [1, 2, 3, 4]
}

# Load and concatenate all CSVs
df_list = []
for slide, columns in slides_columns.items():
    for col in columns:
        pool = f"{slide}_column_{col}"
        csv_path = os.path.join(out_base, pool, "adt_countmat_with_cell_ids.csv")
        df = pd.read_csv(csv_path)
        df['slide'] = slide
        df['column'] = col
        df_list.append(df)

# Concatenate all into one DataFrame
df_combined = pd.concat(df_list, ignore_index=True)

In [10]:
df_combined.drop('index', axis = 1)

Unnamed: 0,cell_id,channel_0,channel_1,channel_2,channel_3,channel_4,channel_5,channel_6,channel_7,channel_8,...,channel_19,channel_20,channel_21,channel_22,channel_23,channel_24,channel_25,channel_26,slide,column
0,aaabfeja-1,2298.041667,1.145833,6.166667,1272.468750,0.875000,74.020833,64.156250,23.708333,846.437500,...,69.614583,6.291667,15.812500,19.666667,156.187500,6.541667,2.062500,23.083333,ID_0022110,1
1,aaabincm-1,9660.401575,2.763780,9.582677,1513.377953,2.921260,75.472441,109.692913,54.763780,804.440945,...,136.007874,8.015748,69.385827,32.795276,158.921260,14.141732,4.779528,34.354331,ID_0022110,1
2,aaabjmea-1,3565.448980,4.428571,10.948980,1596.010204,2.857143,88.418367,105.469388,59.040816,1087.081633,...,108.734694,7.295918,20.806122,22.571429,176.102041,7.265306,4.867347,29.653061,ID_0022110,1
3,aaabjnmd-1,3629.280374,2.252336,12.663551,1531.308411,19.271028,85.551402,114.102804,68.448598,1254.785047,...,267.560748,9.280374,38.130841,43.364486,192.934579,11.439252,4.654206,26.925234,ID_0022110,1
4,aaabnjhk-1,10472.338235,6.441176,10.735294,1618.669118,8.757353,91.816176,120.352941,55.294118,861.073529,...,213.845588,8.154412,80.382353,41.345588,198.448529,15.830882,6.647059,38.169118,ID_0022110,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1190221,oikdcfle-1,15366.360000,3.240000,192.020000,900.340000,8.160000,127.340000,73.040000,18.260000,10.240000,...,321.580000,3.860000,128.980000,35.220000,290.600000,20.160000,9.760000,40.880000,ID_0022111,4
1190222,oikddnjh-1,15569.080000,1.440000,94.080000,1337.040000,1.560000,378.040000,60.800000,15.120000,9.200000,...,445.840000,8.640000,231.880000,34.800000,558.560000,5.360000,0.360000,18.200000,ID_0022111,4
1190223,oikdkbdj-1,17177.684211,2.894737,62.631579,1597.789474,0.000000,95.368421,65.000000,4.789474,2.000000,...,341.894737,5.842105,164.210526,34.894737,274.526316,15.105263,0.578947,21.421053,ID_0022111,4
1190224,oikdpkpb-1,15659.100000,10.133333,33.122222,1606.788889,7.800000,94.944444,137.044444,32.922222,12.188889,...,462.488889,11.144444,170.900000,48.011111,457.477778,14.188889,13.555556,49.677778,ID_0022111,4


In [11]:
channel_names = ['DAPI', 'FoxP3', 'aSMA', 'CD4', 'CD8', 'CD31', 'CD11c', 'IFNG',
                 'Pan-Cytokeratin', 'CD68', 'CD20', 'CD66b', 'TNFa', 'CD45RO', 'CD14',
                 'CD11b', 'Vimentin', 'CD163', 'IL10', 'CD45', 'CCR7', 'CD38', 'CD69',
                 'Podoplanin', 'PNAd', 'CD16', 'CXCL13']

# Build mapping dictionary from old to new names
rename_dict = {f'channel_{i}': name for i, name in enumerate(channel_names)}

# Rename columns in your DataFrame
df_combined = df_combined.rename(columns=rename_dict)

In [12]:
output_path = "/media/Lynn/data/Integrated_data/run2_codex_intensities_per_cell.csv"
df_combined.to_csv(output_path, index=False)

## Merge by cell_id with Xenium adata

In [None]:
import scanpy as sc

# Load Xenium metadata table
adata_xenium_both_slides = sc.read_h5ad("/media/Lynn/data/Xenium_table_with_metadata/adata_both_slides.h5ad")

### Check common cell_ids

In [44]:
# Extract cell IDs from AnnData and DataFrame
xenium_cell_ids = np.unique(adata_xenium_both_slides.obs['cell_id'])
codex_cell_ids = df_combined['cell_id'].unique()

# Check intersection
shared_ids = np.intersect1d(xenium_cell_ids, codex_cell_ids)

print(f"Shared cell_ids: {len(shared_ids)}")
print(f"In Xenium only: {len(xenium_cell_ids) - len(shared_ids)}")
print(f"In CODEX only: {len(codex_cell_ids) - len(shared_ids)}")

Shared cell_ids: 1190067
In Xenium only: 3267
In CODEX only: 0


In [49]:
df_xenium = adata_xenium_both_slides.obs.copy()

# Make sure both are clean and lower-case string
df_combined['cell_id'] = df_combined['cell_id'].astype(str).str.strip().str.lower()
df_xenium['cell_id'] = df_xenium['cell_id'].astype(str).str.strip().str.lower()

df_xenium['cell_id'] = df_xenium['cell_id']

# Now merge
df_merged = pd.merge(df_combined, df_xenium, on='cell_id', how='inner')
print(f"Final merged rows: {df_merged.shape[0]}")

Final merged rows: 1190546


### Check for duplicates

In [51]:
dupes = df_combined['cell_id'].duplicated(keep=False)
print(f"Duplicated cell_ids in df_combined: {df_combined[dupes].shape[0]}")

Duplicated cell_ids in df_combined: 318


In [64]:
dupes = df_xenium['cell_id'].duplicated(keep=False)
print(f"Duplicated cell_ids in df_xenium: {df_xenium[dupes].shape[0]}")

Duplicated cell_ids in df_xenium: 322


#### After merging, we get 4 duplicates insted of 2 for each cell_id. 
2 of them are erroneous because they merge incorrectly (we can check by comparing 'index' coming from codex and 'cell_labels' coming from xenium)  

In [59]:
duplicates = df_merged[df_merged.duplicated(subset='cell_id', keep=False)]
duplicates_sorted = duplicates.sort_values('cell_id')

print(f"Number of duplicated rows: {duplicates_sorted.shape[0]}")
display(duplicates_sorted)

Number of duplicated rows: 640


Unnamed: 0,index,cell_id,channel_0,channel_1,channel_2,channel_3,channel_4,channel_5,channel_6,channel_7,...,cell_labels,column_y,batch,core_ID,slide_ID,tissue,time_point,patient_ID,year,block_ID
515,519,aanniafc-1,5190.038043,2.293478,9.527174,1577.206522,0.956522,55.233696,84.296196,10.524457,...,519,column_0,dataset_0,X1Y1,22110.0,ileum,before_treatment,5.0,2017.0,B2017_27346_1
516,519,aanniafc-1,5190.038043,2.293478,9.527174,1577.206522,0.956522,55.233696,84.296196,10.524457,...,412,column_1,dataset_1,X2Y1,22110.0,ileum,before_treatment,5.0,2017.0,B2017_27346_2
144071,412,aanniafc-1,3566.369427,1.076433,388.063694,1570.305732,0.961783,55.318471,86.299363,6.929936,...,412,column_1,dataset_1,X2Y1,22110.0,ileum,before_treatment,5.0,2017.0,B2017_27346_2
144070,412,aanniafc-1,3566.369427,1.076433,388.063694,1570.305732,0.961783,55.318471,86.299363,6.929936,...,519,column_0,dataset_0,X1Y1,22110.0,ileum,before_treatment,5.0,2017.0,B2017_27346_1
522388,927,abilllki-1,20350.315789,3.215789,141.721053,3724.200000,242.152632,79.415789,202.163158,15.342105,...,927,column_0,dataset_0,X1Y1,22111.0,ileum,during_treatment,6.0,2020.0,B2020_28194_2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
656097,134605,ofibcoed-1,4426.944828,0.965517,10.358621,887.572414,0.089655,14.331034,86.379310,12.124138,...,134605,column_0,dataset_0,X1Y8,22111.0,colon,before_treatment,1.0,2021.0,B2021_14289_3
378686,121496,ogkfggkb-1,55.026506,1.973494,0.612048,11.144578,1.978313,1.055422,5.190361,2.012048,...,121496,column_2,dataset_2,Unknown,,,,,,
378685,121496,ogkfggkb-1,55.026506,1.973494,0.612048,11.144578,1.978313,1.055422,5.190361,2.012048,...,142938,column_0,dataset_0,Unknown,,,,,,
142561,142938,ogkfggkb-1,87.277251,2.350711,0.073460,58.722749,5.158768,1.414692,15.587678,4.033175,...,121496,column_2,dataset_2,Unknown,,,,,,


#### -> keep only the rows where index==cell_labels

In [60]:
df_merged_cleaned = df_merged[df_merged['index'] == df_merged['cell_labels']]

In [61]:
df_merged_cleaned

Unnamed: 0,index,cell_id,channel_0,channel_1,channel_2,channel_3,channel_4,channel_5,channel_6,channel_7,...,cell_labels,column_y,batch,core_ID,slide_ID,tissue,time_point,patient_ID,year,block_ID
0,4,aaabfeja-1,2298.041667,1.145833,6.166667,1272.468750,0.875000,74.020833,64.156250,23.708333,...,4,column_0,dataset_0,X1Y1,22110.0,ileum,before_treatment,5.0,2017.0,B2017_27346_1
1,5,aaabincm-1,9660.401575,2.763780,9.582677,1513.377953,2.921260,75.472441,109.692913,54.763780,...,5,column_0,dataset_0,X1Y1,22110.0,ileum,before_treatment,5.0,2017.0,B2017_27346_1
2,6,aaabjmea-1,3565.448980,4.428571,10.948980,1596.010204,2.857143,88.418367,105.469388,59.040816,...,6,column_0,dataset_0,X1Y1,22110.0,ileum,before_treatment,5.0,2017.0,B2017_27346_1
3,7,aaabjnmd-1,3629.280374,2.252336,12.663551,1531.308411,19.271028,85.551402,114.102804,68.448598,...,7,column_0,dataset_0,X1Y1,22110.0,ileum,before_treatment,5.0,2017.0,B2017_27346_1
4,8,aaabnjhk-1,10472.338235,6.441176,10.735294,1618.669118,8.757353,91.816176,120.352941,55.294118,...,8,column_0,dataset_0,X1Y1,22110.0,ileum,before_treatment,5.0,2017.0,B2017_27346_1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1190541,199985,oikdcfle-1,15366.360000,3.240000,192.020000,900.340000,8.160000,127.340000,73.040000,18.260000,...,199985,column_3,dataset_3,X4Y6,22111.0,ileum,before_treatment,4.0,2016.0,B2016_31839_2
1190542,199986,oikddnjh-1,15569.080000,1.440000,94.080000,1337.040000,1.560000,378.040000,60.800000,15.120000,...,199986,column_3,dataset_3,X4Y6,22111.0,ileum,before_treatment,4.0,2016.0,B2016_31839_2
1190543,199987,oikdkbdj-1,17177.684211,2.894737,62.631579,1597.789474,0.000000,95.368421,65.000000,4.789474,...,199987,column_3,dataset_3,X4Y6,22111.0,ileum,before_treatment,4.0,2016.0,B2016_31839_2
1190544,199988,oikdpkpb-1,15659.100000,10.133333,33.122222,1606.788889,7.800000,94.944444,137.044444,32.922222,...,199988,column_3,dataset_3,X4Y6,22111.0,ileum,before_treatment,4.0,2016.0,B2016_31839_2


#### Remaining duplicate cell_ids come from different columns

In [62]:
duplicates = df_merged_cleaned[df_merged_cleaned.duplicated(subset='cell_id', keep=False)]
duplicates_sorted = duplicates.sort_values('cell_id')

print(f"Number of duplicated rows: {duplicates_sorted.shape[0]}")
display(duplicates_sorted)

Number of duplicated rows: 318


Unnamed: 0,index,cell_id,channel_0,channel_1,channel_2,channel_3,channel_4,channel_5,channel_6,channel_7,...,cell_labels,column_y,batch,core_ID,slide_ID,tissue,time_point,patient_ID,year,block_ID
515,519,aanniafc-1,5190.038043,2.293478,9.527174,1577.206522,0.956522,55.233696,84.296196,10.524457,...,519,column_0,dataset_0,X1Y1,22110.0,ileum,before_treatment,5.0,2017.0,B2017_27346_1
144071,412,aanniafc-1,3566.369427,1.076433,388.063694,1570.305732,0.961783,55.318471,86.299363,6.929936,...,412,column_1,dataset_1,X2Y1,22110.0,ileum,before_treatment,5.0,2017.0,B2017_27346_2
522388,927,abilllki-1,20350.315789,3.215789,141.721053,3724.200000,242.152632,79.415789,202.163158,15.342105,...,927,column_0,dataset_0,X1Y1,22111.0,ileum,during_treatment,6.0,2020.0,B2020_28194_2
823548,1113,abilllki-1,12600.529851,3.492537,657.537313,1823.507463,14.425373,148.776119,108.902985,204.462687,...,1113,column_2,dataset_2,X3Y1,22111.0,ileum,before_treatment,6.0,2013.0,B2013_6777_4
525353,3890,agkhkida-1,13379.197531,7.771605,60.265432,2978.925926,2.672840,134.827160,174.290123,119.635802,...,3890,column_0,dataset_0,X1Y1,22111.0,ileum,during_treatment,6.0,2020.0,B2020_28194_2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141796,142149,offebmkd-1,16099.354515,8.856187,1942.461538,2191.180602,2.160535,263.866221,192.424749,27.214047,...,142149,column_0,dataset_0,X1Y8,22110.0,colon,during_treatment,3.0,2020.0,B2020_41872_5
820104,162105,ofibcoed-1,80.120155,1.511628,0.259690,18.374031,6.172481,1.974806,13.606589,6.664729,...,162105,column_1,dataset_1,Unknown,,,,,,
656097,134605,ofibcoed-1,4426.944828,0.965517,10.358621,887.572414,0.089655,14.331034,86.379310,12.124138,...,134605,column_0,dataset_0,X1Y8,22111.0,colon,before_treatment,1.0,2021.0,B2021_14289_3
142560,142938,ogkfggkb-1,87.277251,2.350711,0.073460,58.722749,5.158768,1.414692,15.587678,4.033175,...,142938,column_0,dataset_0,Unknown,,,,,,


### Rename CODEX channels

In [65]:
channel_names = ['DAPI', 'FoxP3', 'aSMA', 'CD4', 'CD8', 'CD31', 'CD11c', 'IFNG',
                 'Pan-Cytokeratin', 'CD68', 'CD20', 'CD66b', 'TNFa', 'CD45RO', 'CD14',
                 'CD11b', 'Vimentin', 'CD163', 'IL10', 'CD45', 'CCR7', 'CD38', 'CD69',
                 'Podoplanin', 'PNAd', 'CD16', 'CXCL13']

# Build mapping dictionary from old to new names
rename_dict = {f'channel_{i}': name for i, name in enumerate(channel_names)}

# Rename columns in your DataFrame
df_merged_cleaned = df_merged_cleaned.rename(columns=rename_dict)

In [66]:
df_merged_cleaned

Unnamed: 0,index,cell_id,DAPI,FoxP3,aSMA,CD4,CD8,CD31,CD11c,IFNG,...,cell_labels,column_y,batch,core_ID,slide_ID,tissue,time_point,patient_ID,year,block_ID
0,4,aaabfeja-1,2298.041667,1.145833,6.166667,1272.468750,0.875000,74.020833,64.156250,23.708333,...,4,column_0,dataset_0,X1Y1,22110.0,ileum,before_treatment,5.0,2017.0,B2017_27346_1
1,5,aaabincm-1,9660.401575,2.763780,9.582677,1513.377953,2.921260,75.472441,109.692913,54.763780,...,5,column_0,dataset_0,X1Y1,22110.0,ileum,before_treatment,5.0,2017.0,B2017_27346_1
2,6,aaabjmea-1,3565.448980,4.428571,10.948980,1596.010204,2.857143,88.418367,105.469388,59.040816,...,6,column_0,dataset_0,X1Y1,22110.0,ileum,before_treatment,5.0,2017.0,B2017_27346_1
3,7,aaabjnmd-1,3629.280374,2.252336,12.663551,1531.308411,19.271028,85.551402,114.102804,68.448598,...,7,column_0,dataset_0,X1Y1,22110.0,ileum,before_treatment,5.0,2017.0,B2017_27346_1
4,8,aaabnjhk-1,10472.338235,6.441176,10.735294,1618.669118,8.757353,91.816176,120.352941,55.294118,...,8,column_0,dataset_0,X1Y1,22110.0,ileum,before_treatment,5.0,2017.0,B2017_27346_1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1190541,199985,oikdcfle-1,15366.360000,3.240000,192.020000,900.340000,8.160000,127.340000,73.040000,18.260000,...,199985,column_3,dataset_3,X4Y6,22111.0,ileum,before_treatment,4.0,2016.0,B2016_31839_2
1190542,199986,oikddnjh-1,15569.080000,1.440000,94.080000,1337.040000,1.560000,378.040000,60.800000,15.120000,...,199986,column_3,dataset_3,X4Y6,22111.0,ileum,before_treatment,4.0,2016.0,B2016_31839_2
1190543,199987,oikdkbdj-1,17177.684211,2.894737,62.631579,1597.789474,0.000000,95.368421,65.000000,4.789474,...,199987,column_3,dataset_3,X4Y6,22111.0,ileum,before_treatment,4.0,2016.0,B2016_31839_2
1190544,199988,oikdpkpb-1,15659.100000,10.133333,33.122222,1606.788889,7.800000,94.944444,137.044444,32.922222,...,199988,column_3,dataset_3,X4Y6,22111.0,ileum,before_treatment,4.0,2016.0,B2016_31839_2


### Check that slide in codex and slide_ID in Xenium match for all rows

In [132]:
df_codex_xenium = pd.read_csv("/media/Lynn/data/Integrated_data/df_merged_both_slides.csv")

In [134]:
# Ensure slide_ID is a string and remove decimal if present
slide_id_clean = df_codex_xenium['slide_ID'].astype(str).str.replace('.0', '', regex=False)

# Extract numeric part from 'slide' (e.g., 'ID_0022110' → '22110')
slide_clean = df_codex_xenium['slide'].str.extract(r'(\d+)$')[0].str.lstrip('0')

# Strip leading zeros from slide_ID for comparison
slide_id_clean = slide_id_clean.str.lstrip('0')

# Find mismatches
unmatched = df_codex_xenium[slide_clean != slide_id_clean]

# Show unmatched rows
if not unmatched.empty:
    print("Unmatched rows:")
    print(unmatched)

Unmatched rows:
          index     cell_id         DAPI     FoxP3       aSMA         CD4  \
143         147  aadmilee-1    59.777778  7.027778   1.342593   58.064815   
3879       3883  agejjkcl-1   126.516667  7.000000   1.655556   58.816667   
3899       3903  agfbkjke-1    91.528634  8.312775   2.352423   57.903084   
3901       3905  agfcalnm-1   274.816176  7.323529   3.867647   98.632353   
4069       4073  agjnfphb-1    96.000000  5.412500   1.787500   63.837500   
...         ...         ...          ...       ...        ...         ...   
1190194  199958  oijkenho-1   133.478261  0.956522   0.069170   19.122530   
1190197  199961  oijmbacn-1  5639.815789  0.000000  22.828947  243.473684   
1190198  199962  oijmdfmj-1   118.115385  0.998168   0.047619   19.945055   
1190201  199965  oijnepfo-1   106.338061  0.869976   0.028369   24.973995   
1190208  199972  oijpfake-1   109.002257  1.878104   0.033860   22.498871   

              CD8       CD31      CD11c      IFNG  ...  cel

In [135]:
# Exclude rows with NaN slide_ID from the unmatched DataFrame
unmatched_non_nan = unmatched[unmatched['slide_ID'].notna()]

# Display the result
print("Unmatched rows with non-NaN slide_ID:")
print(unmatched_non_nan)

Unmatched rows with non-NaN slide_ID:
Empty DataFrame
Columns: [index, cell_id, DAPI, FoxP3, aSMA, CD4, CD8, CD31, CD11c, IFNG, Pan-Cytokeratin, CD68, CD20, CD66b, TNFa, CD45RO, CD14, CD11b, Vimentin, CD163, IL10, CD45, CCR7, CD38, CD69, Podoplanin, PNAd, CD16, CXCL13, slide, column_x, transcript_counts, control_probe_counts, genomic_control_counts, control_codeword_counts, unassigned_codeword_counts, deprecated_codeword_counts, total_counts, cell_area, nucleus_area, nucleus_count, segmentation_method, region, z_level, cell_labels, column_y, batch, core_ID, slide_ID, tissue, time_point, patient_ID, year, block_ID]
Index: []

[0 rows x 54 columns]


In [154]:
df_codex_xenium = df_codex_xenium[df_codex_xenium['slide_ID'].notna()]

In [155]:
df_codex_xenium

Unnamed: 0,index,cell_id,DAPI,FoxP3,aSMA,CD4,CD8,CD31,CD11c,IFNG,...,cell_labels,column_y,batch,core_ID,slide_ID,tissue,time_point,patient_ID,year,block_ID
0,4,aaabfeja-1,2298.041667,1.145833,6.166667,1272.468750,0.875000,74.020833,64.156250,23.708333,...,4,column_0,dataset_0,X1Y1,22110.0,ileum,before_treatment,5.0,2017.0,B2017_27346_1
1,5,aaabincm-1,9660.401575,2.763780,9.582677,1513.377953,2.921260,75.472441,109.692913,54.763780,...,5,column_0,dataset_0,X1Y1,22110.0,ileum,before_treatment,5.0,2017.0,B2017_27346_1
2,6,aaabjmea-1,3565.448980,4.428571,10.948980,1596.010204,2.857143,88.418367,105.469388,59.040816,...,6,column_0,dataset_0,X1Y1,22110.0,ileum,before_treatment,5.0,2017.0,B2017_27346_1
3,7,aaabjnmd-1,3629.280374,2.252336,12.663551,1531.308411,19.271028,85.551402,114.102804,68.448598,...,7,column_0,dataset_0,X1Y1,22110.0,ileum,before_treatment,5.0,2017.0,B2017_27346_1
4,8,aaabnjhk-1,10472.338235,6.441176,10.735294,1618.669118,8.757353,91.816176,120.352941,55.294118,...,8,column_0,dataset_0,X1Y1,22110.0,ileum,before_treatment,5.0,2017.0,B2017_27346_1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1190221,199985,oikdcfle-1,15366.360000,3.240000,192.020000,900.340000,8.160000,127.340000,73.040000,18.260000,...,199985,column_3,dataset_3,X4Y6,22111.0,ileum,before_treatment,4.0,2016.0,B2016_31839_2
1190222,199986,oikddnjh-1,15569.080000,1.440000,94.080000,1337.040000,1.560000,378.040000,60.800000,15.120000,...,199986,column_3,dataset_3,X4Y6,22111.0,ileum,before_treatment,4.0,2016.0,B2016_31839_2
1190223,199987,oikdkbdj-1,17177.684211,2.894737,62.631579,1597.789474,0.000000,95.368421,65.000000,4.789474,...,199987,column_3,dataset_3,X4Y6,22111.0,ileum,before_treatment,4.0,2016.0,B2016_31839_2
1190224,199988,oikdpkpb-1,15659.100000,10.133333,33.122222,1606.788889,7.800000,94.944444,137.044444,32.922222,...,199988,column_3,dataset_3,X4Y6,22111.0,ileum,before_treatment,4.0,2016.0,B2016_31839_2


## Save Integrated Df

### As pandas df

In [156]:
output_path = "/media/Lynn/data/Integrated_data/df_merged_both_slides.csv"
df_codex_xenium.to_csv(output_path, index=False)

### As anndata

In [136]:
df = pd.read_csv("/media/Lynn/data/Integrated_data/df_codex_intensities_per_cell.csv")

In [137]:
combined_adata = adata_xenium_both_slides.copy()

#### Rename cell_ids with slide and column number to avoid dublicate names

In [138]:
column_prefix_map = {
    'column_0': 'col1',
    'column_1': 'col2',
    'column_2': 'col3',
    'column_3': 'col4'
}

# Apply the prefix to each cell_id based on its column
combined_adata.obs['cell_id'] = combined_adata.obs.apply(
    lambda row: f"{column_prefix_map[row['column']]}_{row['cell_id']}",
    axis=1
)

In [139]:
df['cell_id'] = df.apply(
    lambda row: f"col{row['column']}_{row['cell_id']}",
    axis=1
)

In [140]:
# For combined_adata: convert float-like string to int string without decimals
combined_adata.obs['slide_str'] = combined_adata.obs['slide_ID'].astype(str).str.replace('.0$', '', regex=True)

# For df: extract numeric part after 'ID_00' prefix
df['slide_str'] = df['slide'].str.extract(r'ID_00(\d+)', expand=False)

print(combined_adata.obs['slide_str'].unique())
print(df['slide_str'].unique())

['nan' '22110' '22111']
['22110' '22111']


In [141]:
# Prepend slide_str to cell_id
combined_adata.obs['cell_id'] = combined_adata.obs['slide_str'] + '_' + combined_adata.obs['cell_id'].astype(str)
df['cell_id'] = df['slide_str'] + '_' + df['cell_id'].astype(str)

In [142]:
df = df.set_index('cell_id')
combined_adata.obs = combined_adata.obs.set_index('cell_id')

In [144]:
# Check for duplicates in df
print("Duplicates in df index:", df.index.duplicated().sum())

# Check for duplicates in combined_adata.obs
print("Duplicates in combined_adata.obs index:", combined_adata.obs.index.duplicated().sum())


Duplicates in df index: 0
Duplicates in combined_adata.obs index: 1


In [145]:
# Get duplicated cell_ids
dupes_df = combined_adata.obs.index[combined_adata.obs.index.duplicated(keep=False)]

combined_adata.obs.loc[dupes_df]

Unnamed: 0_level_0,transcript_counts,control_probe_counts,genomic_control_counts,control_codeword_counts,unassigned_codeword_counts,deprecated_codeword_counts,total_counts,cell_area,nucleus_area,nucleus_count,...,column,batch,core_ID,slide_ID,tissue,time_point,patient_ID,year,block_ID,slide_str
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
nan_col3_npmajkia-1,0,0,0,0,0,0,0,127.927661,9.3925,1.0,...,column_2,dataset_2,Unknown,,,,,,,
nan_col3_npmajkia-1,0,0,0,0,0,0,0,157.821099,18.243126,1.0,...,column_2,dataset_2,Unknown,,,,,,,
nan_col3_npmajkia-1,0,0,0,0,0,0,0,127.927661,9.3925,1.0,...,column_2,dataset_2,Unknown,,,,,,,
nan_col3_npmajkia-1,0,0,0,0,0,0,0,157.821099,18.243126,1.0,...,column_2,dataset_2,Unknown,,,,,,,


In [146]:
# Drop the duplicates
combined_adata = combined_adata[combined_adata.obs.index != 'nan_col3_npmajkia-1'].copy()

In [147]:
# Keep only the cell_ids in df
valid_cell_ids = df.index.intersection(combined_adata.obs.index)

# Subset the entire AnnData object (not just .obs!)
combined_adata = combined_adata[valid_cell_ids].copy()

In [148]:
# Now join 
combined_adata.obs = combined_adata.obs.join(df, how='left', rsuffix='_merged')

In [150]:
cols_to_drop = [col for col in combined_adata.obs.columns if col.endswith('_merged')]
combined_adata.obs.drop(columns=cols_to_drop, inplace=True)

In [152]:
# Optionally, reset index if you want 'cell_id' back as a column
combined_adata.obs = combined_adata.obs.reset_index()

In [153]:
combined_adata.obs

Unnamed: 0,cell_id,transcript_counts,control_probe_counts,genomic_control_counts,control_codeword_counts,unassigned_codeword_counts,deprecated_codeword_counts,total_counts,cell_area,nucleus_area,...,IL10,CD45,CCR7,CD38,CD69,Podoplanin,PNAd,CD16,CXCL13,slide
0,22110_col1_aaabfeja-1,41,0,0,0,0,0,41,24.926251,,...,93.510417,69.614583,6.291667,15.812500,19.666667,156.187500,6.541667,2.062500,23.083333,ID_0022110
1,22110_col1_aaabincm-1,80,0,0,0,0,0,80,32.873751,21.313751,...,136.590551,136.007874,8.015748,69.385827,32.795276,158.921260,14.141732,4.779528,34.354331,ID_0022110
2,22110_col1_aaabjmea-1,58,0,0,0,0,0,58,25.106876,11.695469,...,140.581633,108.734694,7.295918,20.806122,22.571429,176.102041,7.265306,4.867347,29.653061,ID_0022110
3,22110_col1_aaabjnmd-1,58,0,0,0,0,0,58,27.680782,12.372813,...,160.429907,267.560748,9.280374,38.130841,43.364486,192.934579,11.439252,4.654206,26.925234,ID_0022110
4,22110_col1_aaabnjhk-1,98,0,0,0,0,0,98,35.357345,20.546094,...,134.338235,213.845588,8.154412,80.382353,41.345588,198.448529,15.830882,6.647059,38.169118,ID_0022110
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1118438,22111_col4_oikdcfle-1,0,0,0,0,0,0,0,13.095313,13.095313,...,75.860000,321.580000,3.860000,128.980000,35.220000,290.600000,20.160000,9.760000,40.880000,ID_0022111
1118439,22111_col4_oikddnjh-1,3,0,0,0,0,0,3,6.096094,6.096094,...,84.840000,445.840000,8.640000,231.880000,34.800000,558.560000,5.360000,0.360000,18.200000,ID_0022111
1118440,22111_col4_oikdkbdj-1,0,0,0,0,0,0,0,5.418750,5.418750,...,35.526316,341.894737,5.842105,164.210526,34.894737,274.526316,15.105263,0.578947,21.421053,ID_0022111
1118441,22111_col4_oikdpkpb-1,0,0,0,0,0,0,0,22.713595,11.469688,...,84.100000,462.488889,11.144444,170.900000,48.011111,457.477778,14.188889,13.555556,49.677778,ID_0022111


#### Save the merged adata

In [157]:
output_path = "/media/Lynn/data/Integrated_data/adata_merged_both_slides.h5ad"
combined_adata.write(output_path)

## Add transformed centroids and polygon coordinates (FAILED)

In [162]:
import os
import numpy as np
import pandas as pd
from skimage.io import imread
from skimage.measure import regionprops, find_contours

def extract_cell_geometry_from_mask(trans_mask, slide_short, column_id):
    props = regionprops(trans_mask)
    records = []
    for prop in props:
        cell_id = prop.label
        centroid = (float(prop.centroid[1]), float(prop.centroid[0]))  # (x, y)

        binary_mask = (trans_mask == cell_id).astype(np.uint8)
        contours = find_contours(binary_mask, level=0.5)
        polygon = [(float(x), float(y)) for y, x in max(contours, key=len)] if contours else []

        full_id = f"{slide_short}_col{column_id}_cell{cell_id}"
        records.append({
            "cell_id": full_id,
            "centroid": centroid,
            "polygon_coords": polygon
        })
    return records

In [163]:
# ---- Paths & Parameters ----
base_dir = "/media/Lynn/data/Integrated_data"
slides = ["ID_0022110", "ID_0022111"]
columns = [1, 2, 3, 4]

# ---- Main Loop ----
all_records = []
for slide in slides:
    slide_short = slide.replace("ID_", "")
    for col in columns:
        pool = f"{slide}_column_{col}"
        mask_path = os.path.join(base_dir, pool, f"{pool}_new_trans_segmentation.tiff")
        if not os.path.exists(mask_path):
            print(f"Missing file: {mask_path}")
            continue
        print(f"Loading: {mask_path}")
        trans_mask = imread(mask_path)
        records = extract_cell_geometry_from_mask(trans_mask, slide_short, col)
        all_records.extend(records)

# ---- Create DataFrame ----
df_geometry = pd.DataFrame(all_records)
print("Final DataFrame shape:", df_geometry.shape)
print(df_geometry.head())

# Optional: save
# df_geometry.to_parquet("cell_geometry_transformed.parquet")

Loading: /media/Lynn/data/Integrated_data/ID_0022110_column_1/ID_0022110_column_1_new_trans_segmentation.tiff


KeyboardInterrupt: 

In [164]:
import os
import numpy as np
import pandas as pd
from skimage.io import imread
from skimage.measure import regionprops, find_contours

def extract_cell_geometry_from_mask(trans_mask, slide_short, column_id):
    """
    Extract cell geometry (centroid and polygon) from segmentation mask.
    
    Args:
        trans_mask: 2D numpy array with labeled cells
        slide_short: Short slide identifier
        column_id: Column identifier
    
    Returns:
        List of dictionaries with cell geometry data
    """
    # Validate mask
    if trans_mask.max() == 0:
        print(f"Warning: Empty mask for slide {slide_short}, column {column_id}")
        return []
    
    props = regionprops(trans_mask)
    records = []
    
    for prop in props:
        cell_id = prop.label
        centroid = (float(prop.centroid[1]), float(prop.centroid[0]))  # (x, y)
        
        # Create binary mask for this specific cell
        binary_mask = (trans_mask == cell_id).astype(np.uint8)
        
        # Find contours
        contours = find_contours(binary_mask, level=0.5)
        
        # Extract polygon coordinates
        if contours:
            # Get the longest contour (main cell boundary)
            main_contour = max(contours, key=len)
            polygon = [(float(x), float(y)) for y, x in main_contour]
        else:
            polygon = []
            print(f"Warning: No contours found for cell {cell_id}")
        
        # Create unique cell identifier
        full_id = f"{slide_short}_col{column_id}_cell{cell_id}"
        
        records.append({
            "cell_id": full_id,
            "centroid": centroid,
            "polygon_coords": polygon,
            "area": float(prop.area),
            "perimeter": float(prop.perimeter) if hasattr(prop, 'perimeter') else None
        })
    
    return records

# ---- Paths & Parameters ----
base_dir = "/media/Lynn/data/Integrated_data"
slides = ["ID_0022110", "ID_0022111"]
columns = [1, 2, 3, 4]

# ---- Main Loop ----
all_records = []
total_files = len(slides) * len(columns)
processed_files = 0

for slide in slides:
    slide_short = slide.replace("ID_", "")
    
    for col in columns:
        pool = f"{slide}_column_{col}"
        mask_path = os.path.join(base_dir, pool, f"{pool}_new_trans_segmentation.tiff")
        
        if not os.path.exists(mask_path):
            print(f"Missing file: {mask_path}")
            processed_files += 1
            continue
        
        print(f"Loading ({processed_files+1}/{total_files}): {mask_path}")
        
        try:
            trans_mask = imread(mask_path)
            records = extract_cell_geometry_from_mask(trans_mask, slide_short, col)
            all_records.extend(records)
            print(f"  Extracted {len(records)} cells")
            
        except Exception as e:
            print(f"Error loading {mask_path}: {e}")
            
        processed_files += 1
        
        # Optional: Memory management for very large datasets
        if len(all_records) >= 10000:  # Save every 10k records
            temp_df = pd.DataFrame(all_records)
            temp_df.to_parquet(f"temp_geometry_{processed_files}.parquet")
            all_records = []

# ---- Create DataFrame ----
if all_records:
    df_geometry = pd.DataFrame(all_records)
    print(f"\nFinal DataFrame shape: {df_geometry.shape}")
    print(f"Total cells extracted: {len(df_geometry)}")
    print("\nDataFrame preview:")
    print(df_geometry.head())
    
    # Display summary statistics
    if 'area' in df_geometry.columns:
        print(f"\nArea statistics:")
        print(f"  Mean: {df_geometry['area'].mean():.2f}")
        print(f"  Median: {df_geometry['area'].median():.2f}")
        print(f"  Range: {df_geometry['area'].min():.2f} - {df_geometry['area'].max():.2f}")
    
    # Save to file
    output_path = "cell_geometry_extracted.parquet"
    df_geometry.to_parquet(output_path, index=False)
    print(f"\nData saved to: {output_path}")
    
else:
    print("No data extracted. Check file paths and mask contents.")

Loading (1/8): /media/Lynn/data/Integrated_data/ID_0022110_column_1/ID_0022110_column_1_new_trans_segmentation.tiff


KeyboardInterrupt: 

In [165]:
def integrate_codex_xenium(slide_id, column_id, base_paths, sdata_xenium):
    import os
    import numpy as np
    import pandas as pd
    import skimage as ski
    import tifffile
    import zarr
    import skimage.transform
    from ome_types.model import OME, Image, Pixels, Channel

    # Construct paths
    pool = f"{slide_id}_column_{column_id}"
    
    region_name = f"Region_{column_id}"
    slide_short = slide_id.replace("ID_", "")  # e.g. "ID_0022110" → "0022110"

    # Find the matching segmentation directory - FILTER OUT HIDDEN FILES
    all_dirs = os.listdir(base_paths['segmentation_base'])
    # Filter out macOS hidden files that start with ._
    visible_dirs = [d for d in all_dirs if not d.startswith('._')]
    
    segmentation_dir = [
        d for d in visible_dirs
        if f"__{slide_short}__{region_name}__" in d
    ]
    if not segmentation_dir:
        print(f"Available directories: {visible_dirs}")
        raise FileNotFoundError(f"No segmentation directory found for slide {slide_id}, column {column_id}")
    segmentation_path = os.path.join(base_paths['segmentation_base'], segmentation_dir[0], "cells.zarr.zip")
    
    def fix_macos_hidden_path(path):
        if not isinstance(path, str):
            # If it's not a string (e.g., int), just return it unchanged
            return path
        dirname = os.path.dirname(path)
        filename = os.path.basename(path)
        if filename.startswith("._"):
            real_filename = filename[2:]
            real_path = os.path.join(dirname, real_filename)
            if os.path.exists(real_path):
                return real_path
            else:
                raise FileNotFoundError(f"Expected real file not found: {real_path}")
        return path

    def fix_all_opt_paths(opt_dict):
        for k, v in opt_dict.items():
            opt_dict[k] = fix_macos_hidden_path(v)
        return opt_dict
    
    pool = f"{slide_id}_column_{column_id}"
    opt = {
        "pool": pool,
        "segmentation": segmentation_path,
        "phenocycler": f"{base_paths['phenocycler']}/{pool}.ome.ome.tif",
        "transmat": f"{base_paths['transmat']}/{pool}.ome_alignment_files/matrix.csv",
        "transsegmentation": f"{base_paths['out']}/{pool}/segmentation.csv",
        "adtcsv": f"{base_paths['out']}/{pool}/adt_countmat.csv",
        "adtcsv_cell_ids": f"{base_paths['out']}/{pool}/adt_countmat_with_cell_ids.csv",
        "plotlowres": f"{base_paths['out']}/{pool}/{pool}_new_cell_dapi_low.ome.tif",
        "plothighres": f"{base_paths['out']}/{pool}/{pool}_new_cell_dapi_high.ome.tif",
        "plotsegmentation": f"{base_paths['out']}/{pool}/{pool}_new_trans_segmentation.tiff",
        "tilesize": 1024,
        "compression": "zlib",
        "subresolution": 4,
        "interpreter": "rgb",
        "scale": 2,
    }

    opt = fix_all_opt_paths(opt)
    
    os.makedirs(os.path.dirname(opt["plotlowres"]), exist_ok=True)

    def open_zarr(path):
        # If the filename starts with '._', switch to the real file
        dirname = os.path.dirname(path)
        filename = os.path.basename(path)
    
        if filename.startswith("._"):
            real_filename = filename[2:]
            real_path = os.path.join(dirname, real_filename)
            if os.path.exists(real_path):
                path = real_path
            else:
                raise FileNotFoundError(f"Expected real file not found: {real_path}")
    
        # Additionally: if path is a directory store, make sure it exists and is a directory
        if not os.path.exists(path):
            raise FileNotFoundError(f"File or directory not found: {path}")
        
        if path.endswith(".zip"):
            store = zarr.ZipStore(path, mode="r")
        else:
            # Confirm it's a directory (for DirectoryStore)
            if not os.path.isdir(path):
                raise NotADirectoryError(f"Expected directory for DirectoryStore but got a file: {path}")
            store = zarr.DirectoryStore(path)
    
        return zarr.group(store=store)

    root = open_zarr(opt["segmentation"])
    cellseg_mask = np.array(root["masks"][1])
    transmat = np.loadtxt(opt["transmat"], delimiter=",")
    tform = ski.transform.AffineTransform(matrix=transmat)

    tiff = tifffile.imread(opt["phenocycler"], is_ome=False, level=0)
    tiff_dapi = tiff[0, :, :]

    trans_mask = ski.transform.warp(cellseg_mask, inverse_map=tform, output_shape=tiff_dapi.shape, order=0, preserve_range=True).astype(int)
    cell_borders = ski.segmentation.find_boundaries(trans_mask, mode="thin")

    def normalize(img):
        return ski.exposure.rescale_intensity(img, in_range="image", out_range=(0, 1))

    gray_norm = np.clip(normalize(tiff_dapi) * 1.75, 0, 1)
    rgb_image = np.stack([gray_norm]*3, axis=-1)
    rgb_image[cell_borders] = [1.0, 0.0, 0.0]

    image_down = ski.transform.resize(rgb_image, (int(rgb_image.shape[0] * 0.25), int(rgb_image.shape[1] * 0.25)), anti_aliasing=True)

    def save_as_ome(image, filename, subresolution=3, scale=2, tilesize=1024, interpreter="rgb", compression="zlib"):
        if image.ndim == 3 and image.shape[2] == 3:
            image = np.moveaxis(image, -1, 0)
        pyramid = [image]
        for _ in range(1, subresolution):
            downsampled = pyramid[-1][:, ::scale, ::scale]
            pyramid.append(downsampled)

        pixels = Pixels(
            dimension_order="XYZCT",
            size_x=image.shape[2],
            size_y=image.shape[1],
            size_z=1,
            size_c=image.shape[0],
            size_t=1,
            type=str(image.dtype) if str(image.dtype) != "float64" else "float",
            physical_size_x=0.125,
            physical_size_y=0.125,
            physical_size_x_unit="µm",
            physical_size_y_unit="µm",
            channels=[
                Channel(id="Channel:0:0", name="Red", samples_per_pixel=1),
                Channel(id="Channel:0:1", name="Green", samples_per_pixel=1),
                Channel(id="Channel:0:2", name="Blue", samples_per_pixel=1),
            ],
        )
        ome = OME(images=[Image(id="Image:0", name="Pyramidal OME", pixels=pixels)])

        with tifffile.TiffWriter(filename, bigtiff=True) as tif:
            tif.write(pyramid[0], photometric=interpreter, tile=(tilesize, tilesize),
                      compression=compression, subifds=len(pyramid)-1,
                      metadata={"axes": "CYX", "ome": ome})
            for level in pyramid[1:]:
                tif.write(level, photometric=interpreter, tile=(tilesize, tilesize), compression=compression)

    save_as_ome(image_down, opt["plotlowres"], opt["subresolution"], opt["scale"], opt["tilesize"], opt["interpreter"], opt["compression"])
    save_as_ome(rgb_image, opt["plothighres"], opt["subresolution"], opt["scale"], opt["tilesize"], opt["interpreter"], opt["compression"])
    ski.io.imsave(opt["plotsegmentation"], trans_mask)

    # Quantification
    unique_cell_ids = np.unique(trans_mask)
    unique_cell_ids = unique_cell_ids[unique_cell_ids > 0]
    df_adt = pd.DataFrame(index=unique_cell_ids)
    df_adt.index.name = 'cell_id'

    # Extract transformed mask coordinates for each cell
    # Create a dictionary to store the mask coordinates for each cell
    cell_mask_coords = {}
    
    for cell_id in unique_cell_ids:
        # Get all pixel coordinates where this cell exists in transformed mask
        cell_pixels = np.where(trans_mask == cell_id)
        # cell_pixels[0] = y coordinates (rows), cell_pixels[1] = x coordinates (cols)
        y_coords = cell_pixels[0].tolist()
        x_coords = cell_pixels[1].tolist()
        
        # Store as lists of coordinates
        cell_mask_coords[cell_id] = {
            'x_coords': x_coords,
            'y_coords': y_coords
        }
    
    # Add mask coordinate columns to the dataframe
    df_adt['mask_x_coords'] = df_adt.index.map(lambda x: cell_mask_coords.get(x, {}).get('x_coords', []))
    df_adt['mask_y_coords'] = df_adt.index.map(lambda x: cell_mask_coords.get(x, {}).get('y_coords', []))

    # Continue with intensity quantification
    for i in range(tiff.shape[0]):
        props = ski.measure.regionprops(trans_mask, intensity_image=tiff[i])
        mean_signals = {prop.label: prop.mean_intensity for prop in props}
        df_adt[f'channel_{i}'] = df_adt.index.map(mean_signals)

    df_adt.to_csv(opt["adtcsv"])

    # Rename cell IDs
    cell_names_list = sdata_xenium[slide_id][f'column_{column_id}'].table.obs['cell_id'].tolist()
    cell_id_to_name = {i + 1: name for i, name in enumerate(cell_names_list)}
    df_adt_reset = df_adt.reset_index()
    df_adt_reset['cell_name'] = df_adt_reset['cell_id'].map(cell_id_to_name)
    df_adt_reset = df_adt_reset.rename(columns={'cell_id': 'index', 'cell_name': 'cell_id'})
    
    # Reorder columns to put mask coordinates after cell_id
    mask_cols = ['mask_x_coords', 'mask_y_coords']
    other_cols = [col for col in df_adt_reset.columns if col not in ['index', 'cell_id'] + mask_cols]
    df_adt_reset = df_adt_reset[['index', 'cell_id'] + mask_cols + other_cols]
    
    df_adt_reset.to_csv(opt["adtcsv_cell_ids"], index=False)

    print(f"Completed processing for {pool}")
    print(f"Added transformed mask coordinates for {len(cell_mask_coords)} cells")

In [166]:
import sys
import os
sys.path.append(os.path.abspath('../src'))
import load_sdata

# Load each Xenium dataset into a SpatialData object
sdata_xenium_first_slide = load_sdata.get_xenium_slide_data('0022110')
sdata_xenium_second_slide = load_sdata.get_xenium_slide_data('0022111')

sdata_xenium = {
    'ID_0022110': sdata_xenium_first_slide,
    'ID_0022111': sdata_xenium_second_slide,
}

In [167]:
base_paths = {
    "segmentation_base": "/media/Lynn/data/Lisa_raw_data/Xenium",  # now the true base
    "phenocycler": "/media/Lynn/data/CODEX_cropped",
    "transmat": "/media/Lynn/alignment/codex_columns",
    "out": "/media/Lynn/data/Integrated_data_with_transformed_polygons"
}
slides = ["ID_0022110", "ID_0022111"]
columns = [1, 2, 3, 4]

# Assuming sdata_xenium is a nested dict like: sdata_xenium['ID_0022110']['column_1']
for slide in slides:
    for col in columns:
        integrate_codex_xenium(slide, col, base_paths, sdata_xenium)

KeyboardInterrupt: 

# Run3

In [1]:
def integrate_codex_xenium(slide_id, codex_image_path, base_paths, sdata_xenium, codex_label):
    import os
    import numpy as np
    import pandas as pd
    import skimage as ski
    import tifffile
    import zarr
    from ome_types.model import OME, Image, Pixels, Channel

    # Construct paths
    slide_short = slide_id.replace("ID_", "")
    region_name = "Region_1"  # Since Xenium data is no longer split by column

    # Find matching segmentation directory
    all_dirs = os.listdir(base_paths['segmentation_base'])
    visible_dirs = [d for d in all_dirs if not d.startswith('._')]
    # Find segmentation directory
    segmentation_dir = [
        d for d in visible_dirs
        if f"__{slide_short}__" in d
    ]
    if not segmentation_dir:
        print(f"Available directories: {visible_dirs}")
        raise FileNotFoundError(f"No segmentation directory found for slide {slide_id}")
    
    segmentation_path = os.path.join(base_paths['segmentation_base'], segmentation_dir[0], "cells.zarr.zip")

    def fix_macos_hidden_path(path):
        if not isinstance(path, str): return path
        dirname = os.path.dirname(path)
        filename = os.path.basename(path)
        if filename.startswith("._"):
            real_path = os.path.join(dirname, filename[2:])
            if os.path.exists(real_path):
                return real_path
            else:
                raise FileNotFoundError(f"Expected real file not found: {real_path}")
        return path

    def fix_all_opt_paths(opt_dict):
        for k, v in opt_dict.items():
            opt_dict[k] = fix_macos_hidden_path(v)
        return opt_dict

    output_dir = os.path.join(base_paths["out"], slide_id)
    os.makedirs(output_dir, exist_ok=True)

    transmat_map = {
    "ID_56764": "ID_56764_alignment_files",
    "ID_56777_1_1_core_1": "ID_56777_1_1_core_1_alignment_files",
    "ID_56777_1_1_core_2": "ID_56777_1_1_core_2_alignment_files",
    "ID_56777_1_1_core_3": "ID_56777_1_1_core_3_alignment_files",
    "ID_56777_1_1_core_4": "ID_56777_1_1_core_4_alignment_files",
    "ID_56777_1_2": "ID_56777_1_2_alignment_files",
    "ID_56777_2": "ID_56777_2_alignment_files",
    }

    alignment_folder = transmat_map[codex_label]

    opt = {
        "pool": slide_id,
        "segmentation": segmentation_path,
        "phenocycler": codex_image_path,
        "transmat": f"/media/Lynn/alignment/codex_whole_slides/{alignment_folder}/matrix.csv",
        "transsegmentation": f"{output_dir}/{codex_label}_segmentation.csv",
        "adtcsv": f"{output_dir}/{codex_label}_adt_countmat.csv",
        "adtcsv_cell_ids": f"{output_dir}/{codex_label}_adt_countmat_with_cell_ids.csv",
        "plotlowres": f"{output_dir}/{codex_label}_lowres.ome.tif",
        "plothighres": f"{output_dir}/{codex_label}_highres.ome.tif",
        "plotsegmentation": f"{output_dir}/{codex_label}_trans_segmentation.tiff",
        "tilesize": 1024,
        "compression": "zlib",
        "subresolution": 4,
        "interpreter": "rgb",
        "scale": 2,
    }

    opt = fix_all_opt_paths(opt)

    def open_zarr(path):
        if path.endswith(".zip"):
            store = zarr.ZipStore(path, mode="r")
        elif os.path.isdir(path):
            store = zarr.DirectoryStore(path)
        else:
            raise FileNotFoundError(f"Expected a valid zarr store at: {path}")
        return zarr.group(store=store)

    root = open_zarr(opt["segmentation"])
    cellseg_mask = np.array(root["masks"][1])

    transmat = np.loadtxt(opt["transmat"], delimiter=",")
    tform = ski.transform.AffineTransform(matrix=transmat)

    tiff = tifffile.imread(opt["phenocycler"], is_ome=False, level=0)
    tiff_dapi = tiff[0]

    trans_mask = ski.transform.warp(cellseg_mask, inverse_map=tform, output_shape=tiff_dapi.shape, order=0, preserve_range=True).astype(int)
    cell_borders = ski.segmentation.find_boundaries(trans_mask, mode="thin")

    def normalize(img):
        return ski.exposure.rescale_intensity(img, in_range="image", out_range=(0, 1))

    gray_norm = np.clip(normalize(tiff_dapi) * 1.75, 0, 1)
    rgb_image = np.stack([gray_norm]*3, axis=-1)
    rgb_image[cell_borders] = [1.0, 0.0, 0.0]

    image_down = ski.transform.resize(rgb_image, (rgb_image.shape[0] // 4, rgb_image.shape[1] // 4), anti_aliasing=True)

    def save_as_ome(image, filename, subresolution=3, scale=2, tilesize=1024, interpreter="rgb", compression="zlib"):
        if image.ndim == 3 and image.shape[2] == 3:
            image = np.moveaxis(image, -1, 0)
        pyramid = [image]
        for _ in range(1, subresolution):
            downsampled = pyramid[-1][:, ::scale, ::scale]
            pyramid.append(downsampled)

        pixels = Pixels(
            dimension_order="XYZCT",
            size_x=image.shape[2],
            size_y=image.shape[1],
            size_z=1,
            size_c=image.shape[0],
            size_t=1,
            type=str(image.dtype) if str(image.dtype) != "float64" else "float",
            physical_size_x=0.125,
            physical_size_y=0.125,
            physical_size_x_unit="µm",
            physical_size_y_unit="µm",
            channels=[
                Channel(id="Channel:0:0", name="Red", samples_per_pixel=1),
                Channel(id="Channel:0:1", name="Green", samples_per_pixel=1),
                Channel(id="Channel:0:2", name="Blue", samples_per_pixel=1),
            ],
        )
        ome = OME(images=[Image(id="Image:0", name="Pyramidal OME", pixels=pixels)])
        with tifffile.TiffWriter(filename, bigtiff=True) as tif:
            tif.write(pyramid[0], photometric=interpreter, tile=(tilesize, tilesize),
                      compression=compression, subifds=len(pyramid)-1,
                      metadata={"axes": "CYX", "ome": ome})
            for level in pyramid[1:]:
                tif.write(level, photometric=interpreter, tile=(tilesize, tilesize), compression=compression)

    save_as_ome(image_down, opt["plotlowres"], opt["subresolution"], opt["scale"], opt["tilesize"], opt["interpreter"], opt["compression"])
    save_as_ome(rgb_image, opt["plothighres"], opt["subresolution"], opt["scale"], opt["tilesize"], opt["interpreter"], opt["compression"])
    ski.io.imsave(opt["plotsegmentation"], trans_mask)

    # Quantification
    unique_cell_ids = np.unique(trans_mask)
    unique_cell_ids = unique_cell_ids[unique_cell_ids > 0]
    df_adt = pd.DataFrame(index=unique_cell_ids)
    df_adt.index.name = 'cell_id'

    for i in range(tiff.shape[0]):
        props = ski.measure.regionprops(trans_mask, intensity_image=tiff[i])
        mean_signals = {prop.label: prop.mean_intensity for prop in props}
        df_adt[f'channel_{i}'] = df_adt.index.map(mean_signals)

    df_adt.to_csv(opt["adtcsv"])

    # Rename cell IDs
    cell_names_list = sdata_xenium[slide_id]['column_1'].table.obs['cell_id'].tolist()
    cell_id_to_name = {i + 1: name for i, name in enumerate(cell_names_list)}
    df_adt_reset = df_adt.reset_index()
    df_adt_reset['cell_name'] = df_adt_reset['cell_id'].map(cell_id_to_name)
    df_adt_reset = df_adt_reset.rename(columns={'cell_id': 'index', 'cell_name': 'cell_id'})
    df_adt_reset = df_adt_reset[['index', 'cell_id'] + [col for col in df_adt_reset.columns if col not in ['index', 'cell_id']]]
    df_adt_reset.to_csv(opt["adtcsv_cell_ids"], index=False)

    print(f"Completed processing for {slide_id} - {codex_label}")

In [2]:
import sys
import os
sys.path.append(os.path.abspath('../src'))
import load_sdata

# Load each Xenium dataset into a SpatialData object
sdata_xenium_first_slide = load_sdata.get_xenium_slide_data('0056764')
sdata_xenium_second_slide = load_sdata.get_xenium_slide_data('0056777')

sdata_xenium = {
    'ID_0056764': sdata_xenium_first_slide,
    'ID_0056777': sdata_xenium_second_slide
}

version mismatch: detected: RasterFormatV02, requested: FormatV04
  compressor, fill_value = _kwargs_compat(compressor, fill_value, kwargs)
  compressor, fill_value = _kwargs_compat(compressor, fill_value, kwargs)
  compressor, fill_value = _kwargs_compat(compressor, fill_value, kwargs)
  compressor, fill_value = _kwargs_compat(compressor, fill_value, kwargs)
  compressor, fill_value = _kwargs_compat(compressor, fill_value, kwargs)
  compressor, fill_value = _kwargs_compat(compressor, fill_value, kwargs)
version mismatch: detected: RasterFormatV02, requested: FormatV04
  compressor, fill_value = _kwargs_compat(compressor, fill_value, kwargs)
  compressor, fill_value = _kwargs_compat(compressor, fill_value, kwargs)
  compressor, fill_value = _kwargs_compat(compressor, fill_value, kwargs)
  compressor, fill_value = _kwargs_compat(compressor, fill_value, kwargs)
  compressor, fill_value = _kwargs_compat(compressor, fill_value, kwargs)
  compressor, fill_value = _kwargs_compat(compressor, 

In [4]:
base_paths = {
    "segmentation_base": "/scratch/lyarab/Xenium/Run3",
    "phenocycler": {
        "ID_0056764": ["/media/Lynn/data/Lisa_raw_data/CODEX/ID_56764.ome.ome.tif"],
        "ID_0056777": [
            "/media/Lynn/data/Lisa_raw_data/CODEX/ID_56777_1_1_core_1.ome.ome.tif",
            "/media/Lynn/data/Lisa_raw_data/CODEX/ID_56777_1_1_core_2.ome.ome.tif",
            "/media/Lynn/data/Lisa_raw_data/CODEX/ID_56777_1_1_core_3.ome.ome.tif",
            "/media/Lynn/data/Lisa_raw_data/CODEX/ID_56777_1_1_core_4.ome.ome.tif",
            "/media/Lynn/data/Lisa_raw_data/CODEX/ID_56777_1_2.ome.ome.tif",
            "/media/Lynn/data/Lisa_raw_data/CODEX/ID_56777_2.ome.ome.tif"
        ]
    },
    "transmat": "/media/Lynn/alignment/codex_whole_slides",
    "out": "/media/Lynn/data/Integrated_data"
}

codex_label_to_alignment = {
    # For ID_56764
    "codex_1": "ID_56764",

    # For ID_56777
    "codex_1_56777": "ID_56777_1_1_core_1",
    "codex_2_56777": "ID_56777_1_1_core_2",
    "codex_3_56777": "ID_56777_1_1_core_3",
    "codex_4_56777": "ID_56777_1_1_core_4",
    "codex_5_56777": "ID_56777_1_2",
    "codex_6_56777": "ID_56777_2",
}

#if slide_id == "ID_0056764":
    #codex_label = "ID_56764"
    #integrate_codex_xenium(slide_id, codex_path, base_paths, sdata_xenium, codex_label)

slide_id = "ID_0056777" 
codex_list = base_paths["phenocycler"][slide_id]

for idx, codex_path in enumerate(codex_list, start=1):
    index = f"codex_{idx}_56777"
    codex_label = codex_label_to_alignment[index]
    integrate_codex_xenium(slide_id, codex_path, base_paths, sdata_xenium, codex_label)


  ski.io.imsave(opt["plotsegmentation"], trans_mask)
  cell_names_list = sdata_xenium[slide_id]['column_1'].table.obs['cell_id'].tolist()


Completed processing for ID_0056777 - ID_56777_1_1_core_1


  ski.io.imsave(opt["plotsegmentation"], trans_mask)
  cell_names_list = sdata_xenium[slide_id]['column_1'].table.obs['cell_id'].tolist()


Completed processing for ID_0056777 - ID_56777_1_1_core_2


  ski.io.imsave(opt["plotsegmentation"], trans_mask)
  cell_names_list = sdata_xenium[slide_id]['column_1'].table.obs['cell_id'].tolist()


Completed processing for ID_0056777 - ID_56777_1_1_core_3


  ski.io.imsave(opt["plotsegmentation"], trans_mask)
  cell_names_list = sdata_xenium[slide_id]['column_1'].table.obs['cell_id'].tolist()


Completed processing for ID_0056777 - ID_56777_1_1_core_4


  ski.io.imsave(opt["plotsegmentation"], trans_mask)
  cell_names_list = sdata_xenium[slide_id]['column_1'].table.obs['cell_id'].tolist()


Completed processing for ID_0056777 - ID_56777_1_2


  ski.io.imsave(opt["plotsegmentation"], trans_mask)
  cell_names_list = sdata_xenium[slide_id]['column_1'].table.obs['cell_id'].tolist()


Completed processing for ID_0056777 - ID_56777_2


In [181]:
df_slide_1=pd.read_csv('/media/Lynn/data/Integrated_data/ID_0056764/ID_56764_adt_countmat_with_cell_ids.csv')

In [182]:
df_slide_1

Unnamed: 0,index,cell_id,channel_0,channel_1,channel_2,channel_3,channel_4,channel_5,channel_6,channel_7,...,channel_19,channel_20,channel_21,channel_22,channel_23,channel_24,channel_25,channel_26,channel_27,channel_28
0,1,aaaaapoi-1,3301.540000,1.486667,9.170000,1019.833333,15.130000,26.853333,16.176667,9.300000,...,10.440000,0.600000,36.483333,5.140000,48.146667,18.266667,3.243333,353.530000,7.073333,155.796667
1,2,aaaadfjn-1,5866.028571,0.485714,26.500000,1061.485714,36.585714,30.857143,17.400000,3.371429,...,92.042857,1.628571,56.042857,11.300000,108.185714,66.057143,2.385714,200.700000,6.971429,193.385714
2,3,aaaagadk-1,4794.887023,1.587786,81.404580,1066.697710,14.187786,32.538931,29.490076,8.290076,...,25.429008,0.512977,48.346565,8.085496,74.442748,47.563359,3.221374,184.001527,8.540458,224.044275
3,4,aaaagmge-1,3549.487936,0.683646,49.219839,860.010724,6.260054,19.782842,12.504021,3.418231,...,9.804290,1.332440,26.849866,3.809651,54.809651,34.461126,5.332440,463.474531,7.040214,159.941019
4,5,aaaahchk-1,5209.069686,1.463415,2.456446,929.735192,7.132404,19.212544,13.745645,2.519164,...,4.097561,0.435540,36.097561,4.285714,22.588850,16.623693,3.529617,164.905923,6.285714,152.721254
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
737497,737500,oinjdnej-1,8511.494737,1.315789,4.136842,665.315789,3.915789,23.368421,7.873684,12.621053,...,184.852632,4.389474,55.273684,73.736842,49.273684,17.368421,5.600000,586.105263,60.210526,341.347368
737498,737501,oinjdnmp-1,9686.445783,1.698795,4.168675,744.614458,3.096386,22.843373,37.228916,6.542169,...,139.253012,3.349398,49.638554,22.891566,19.481928,12.180723,3.578313,1027.228916,127.903614,311.686747
737499,737502,oinjeidc-1,7863.773585,0.509434,3.773585,536.018868,4.867925,16.320755,40.471698,12.528302,...,258.358491,2.867925,34.679245,19.471698,30.867925,59.603774,2.641509,939.339623,148.301887,260.735849
737500,737503,oinjejim-1,8403.611111,2.166667,3.611111,580.722222,5.555556,18.055556,31.000000,10.111111,...,134.888889,3.277778,49.111111,34.361111,34.861111,24.638889,4.555556,638.138889,63.638889,282.972222


# Merge runs 2 and 3

In [26]:
import os
import pandas as pd

# Base path
out_base = "/media/Lynn/data/Integrated_data"

# Define single-CODEX and multi-CODEX slide IDs
single_codex_slides = ["ID_0056764"]
multi_codex_slide = "ID_0056777"

df_list = []

# 1. Load single-CODEX slides (e.g., ID_0056764)
for slide in single_codex_slides:
    csv_path = os.path.join(out_base, slide, "adt_countmat_with_cell_ids.csv")
    
    if os.path.exists(csv_path):
        df = pd.read_csv(csv_path)
        df['slide'] = slide
        df['region'] = None  # No region distinction
        df_list.append(df)
    else:
        print(f"Warning: File not found for slide {slide}: {csv_path}")

# 2. Load all CSVs from multi-CODEX slide (e.g., ID_0056777)
multi_codex_dir = os.path.join(out_base, multi_codex_slide)
if os.path.exists(multi_codex_dir):
    for subfolder in os.listdir(multi_codex_dir):
        subdir_path = os.path.join(multi_codex_dir, subfolder)
        csv_path = os.path.join(subdir_path, "adt_countmat_with_cell_ids.csv")
        
        if os.path.exists(csv_path):
            df = pd.read_csv(csv_path)
            df['slide'] = multi_codex_slide
            df['region'] = subfolder  # Track which CODEX image it came from
            df_list.append(df)
        else:
            print(f"Warning: Missing CSV in {subfolder}")
else:
    print(f"Warning: Multi-CODEX slide folder not found: {multi_codex_dir}")

# 3. Combine all into one DataFrame
if df_list:
    df_combined = pd.concat(df_list, ignore_index=True)
    print("All data combined successfully.")
else:
    print("No data was loaded. Please check paths.")


All data combined successfully.


In [28]:
channel_names = ["DAPI", "FoxP3", "aSMA", "CD4", "CD8", "CD31", "CD11c", "IFNG", 
                 "Pan-Cytokeratin", "CD68", "CD20", "CD66b", "TNFa", "CD45RO", "CD14", 
                 "CD11b", "Vimentin", "CD163", "PDGFRA", "CD45", "CCR7", "IL10", "CD38", 
                 "CD69", "Podoplanin", "PNAd", "ECP", "MPO", "MIP-3"]

# Build mapping dictionary from old to new names
rename_dict = {f'channel_{i}': name for i, name in enumerate(channel_names)}

# Rename columns in your DataFrame
df_combined = df_combined.rename(columns=rename_dict)

In [29]:
df_combined

Unnamed: 0,index,cell_id,DAPI,FoxP3,aSMA,CD4,CD8,CD31,CD11c,IFNG,...,IL10,CD38,CD69,Podoplanin,PNAd,ECP,MPO,MIP-3,slide,region
0,1,aaaaapoi-1,3301.540000,1.486667,9.170000,1019.833333,15.130000,26.853333,16.176667,9.300000,...,36.483333,5.140000,48.146667,18.266667,3.243333,353.530000,7.073333,155.796667,ID_0056764,
1,2,aaaadfjn-1,5866.028571,0.485714,26.500000,1061.485714,36.585714,30.857143,17.400000,3.371429,...,56.042857,11.300000,108.185714,66.057143,2.385714,200.700000,6.971429,193.385714,ID_0056764,
2,3,aaaagadk-1,4794.887023,1.587786,81.404580,1066.697710,14.187786,32.538931,29.490076,8.290076,...,48.346565,8.085496,74.442748,47.563359,3.221374,184.001527,8.540458,224.044275,ID_0056764,
3,4,aaaagmge-1,3549.487936,0.683646,49.219839,860.010724,6.260054,19.782842,12.504021,3.418231,...,26.849866,3.809651,54.809651,34.461126,5.332440,463.474531,7.040214,159.941019,ID_0056764,
4,5,aaaahchk-1,5209.069686,1.463415,2.456446,929.735192,7.132404,19.212544,13.745645,2.519164,...,36.097561,4.285714,22.588850,16.623693,3.529617,164.905923,6.285714,152.721254,ID_0056764,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1468744,728867,ohoedhjo-1,1578.833333,0.000000,2.500000,225.222222,28.222222,7.222222,5.500000,0.000000,...,22.833333,7.777778,10.277778,2.500000,1.277778,33.611111,1.222222,22.555556,ID_0056777,ID_0056777_1_1_core_1
1468745,728868,ohoedlpo-1,5691.866667,2.655556,6.744444,782.133333,3.811111,15.988889,20.811111,0.033333,...,32.933333,27.077778,21.411111,8.600000,4.033333,188.300000,37.733333,164.911111,ID_0056777,ID_0056777_1_1_core_1
1468746,728869,ohoeedfm-1,1737.081594,1.517078,3.193548,110.422201,0.164137,3.277989,2.176471,1.913662,...,4.832068,1.258065,7.378558,1.216319,0.994307,275.277989,76.793169,92.388046,ID_0056777,ID_0056777_1_1_core_1
1468747,728870,ohoekdhf-1,2421.100000,3.100000,477.280000,776.800000,25.780000,201.340000,60.000000,8.260000,...,183.280000,7.660000,42.700000,36.860000,1.800000,54.180000,7.260000,103.640000,ID_0056777,ID_0056777_1_1_core_1


In [30]:
df_run2 = pd.read_csv("/media/Lynn/data/Integrated_data/run2_codex_intensities_per_cell.csv")

In [24]:
df_run2

Unnamed: 0,index,cell_id,DAPI,FoxP3,aSMA,CD4,CD8,CD31,CD11c,IFNG,...,CCR7,CD38,CD69,Podoplanin,PNAd,CD16,CXCL13,slide,column,region
0,4,aaabfeja-1,2298.041667,1.145833,6.166667,1272.468750,0.875000,74.020833,64.156250,23.708333,...,6.291667,15.812500,19.666667,156.187500,6.541667,2.062500,23.083333,ID_0022110,1,
1,5,aaabincm-1,9660.401575,2.763780,9.582677,1513.377953,2.921260,75.472441,109.692913,54.763780,...,8.015748,69.385827,32.795276,158.921260,14.141732,4.779528,34.354331,ID_0022110,1,
2,6,aaabjmea-1,3565.448980,4.428571,10.948980,1596.010204,2.857143,88.418367,105.469388,59.040816,...,7.295918,20.806122,22.571429,176.102041,7.265306,4.867347,29.653061,ID_0022110,1,
3,7,aaabjnmd-1,3629.280374,2.252336,12.663551,1531.308411,19.271028,85.551402,114.102804,68.448598,...,9.280374,38.130841,43.364486,192.934579,11.439252,4.654206,26.925234,ID_0022110,1,
4,8,aaabnjhk-1,10472.338235,6.441176,10.735294,1618.669118,8.757353,91.816176,120.352941,55.294118,...,8.154412,80.382353,41.345588,198.448529,15.830882,6.647059,38.169118,ID_0022110,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1190221,199985,oikdcfle-1,15366.360000,3.240000,192.020000,900.340000,8.160000,127.340000,73.040000,18.260000,...,3.860000,128.980000,35.220000,290.600000,20.160000,9.760000,40.880000,ID_0022111,4,
1190222,199986,oikddnjh-1,15569.080000,1.440000,94.080000,1337.040000,1.560000,378.040000,60.800000,15.120000,...,8.640000,231.880000,34.800000,558.560000,5.360000,0.360000,18.200000,ID_0022111,4,
1190223,199987,oikdkbdj-1,17177.684211,2.894737,62.631579,1597.789474,0.000000,95.368421,65.000000,4.789474,...,5.842105,164.210526,34.894737,274.526316,15.105263,0.578947,21.421053,ID_0022111,4,
1190224,199988,oikdpkpb-1,15659.100000,10.133333,33.122222,1606.788889,7.800000,94.944444,137.044444,32.922222,...,11.144444,170.900000,48.011111,457.477778,14.188889,13.555556,49.677778,ID_0022111,4,


In [31]:
df_combined['column'] = pd.NA
df_run2['region'] = pd.NA

In [32]:
# Combine row-wise
df_combined_all = pd.concat([df_combined, df_run2], ignore_index=True)

In [33]:
df_combined_all

Unnamed: 0,index,cell_id,DAPI,FoxP3,aSMA,CD4,CD8,CD31,CD11c,IFNG,...,Podoplanin,PNAd,ECP,MPO,MIP-3,slide,region,column,CD16,CXCL13
0,1,aaaaapoi-1,3301.540000,1.486667,9.170000,1019.833333,15.130000,26.853333,16.176667,9.300000,...,18.266667,3.243333,353.530000,7.073333,155.796667,ID_0056764,,,,
1,2,aaaadfjn-1,5866.028571,0.485714,26.500000,1061.485714,36.585714,30.857143,17.400000,3.371429,...,66.057143,2.385714,200.700000,6.971429,193.385714,ID_0056764,,,,
2,3,aaaagadk-1,4794.887023,1.587786,81.404580,1066.697710,14.187786,32.538931,29.490076,8.290076,...,47.563359,3.221374,184.001527,8.540458,224.044275,ID_0056764,,,,
3,4,aaaagmge-1,3549.487936,0.683646,49.219839,860.010724,6.260054,19.782842,12.504021,3.418231,...,34.461126,5.332440,463.474531,7.040214,159.941019,ID_0056764,,,,
4,5,aaaahchk-1,5209.069686,1.463415,2.456446,929.735192,7.132404,19.212544,13.745645,2.519164,...,16.623693,3.529617,164.905923,6.285714,152.721254,ID_0056764,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2658970,199985,oikdcfle-1,15366.360000,3.240000,192.020000,900.340000,8.160000,127.340000,73.040000,18.260000,...,290.600000,20.160000,,,,ID_0022111,,4,9.760000,40.880000
2658971,199986,oikddnjh-1,15569.080000,1.440000,94.080000,1337.040000,1.560000,378.040000,60.800000,15.120000,...,558.560000,5.360000,,,,ID_0022111,,4,0.360000,18.200000
2658972,199987,oikdkbdj-1,17177.684211,2.894737,62.631579,1597.789474,0.000000,95.368421,65.000000,4.789474,...,274.526316,15.105263,,,,ID_0022111,,4,0.578947,21.421053
2658973,199988,oikdpkpb-1,15659.100000,10.133333,33.122222,1606.788889,7.800000,94.944444,137.044444,32.922222,...,457.477778,14.188889,,,,ID_0022111,,4,13.555556,49.677778


In [34]:
def add_suffix(row):
    # Start with the slide ID
    suffix = row['slide']
    # Add column info if present
    if 'column' in row and pd.notna(row['column']):
        suffix += f"_col{row['column']}"
    return f"{row['cell_id']}_{suffix}"

# Create new cell_id values with the suffix
df_combined_all['cell_id'] = df_combined_all.apply(add_suffix, axis=1)

In [35]:
df_combined_all

Unnamed: 0,index,cell_id,DAPI,FoxP3,aSMA,CD4,CD8,CD31,CD11c,IFNG,...,Podoplanin,PNAd,ECP,MPO,MIP-3,slide,region,column,CD16,CXCL13
0,1,aaaaapoi-1_ID_0056764,3301.540000,1.486667,9.170000,1019.833333,15.130000,26.853333,16.176667,9.300000,...,18.266667,3.243333,353.530000,7.073333,155.796667,ID_0056764,,,,
1,2,aaaadfjn-1_ID_0056764,5866.028571,0.485714,26.500000,1061.485714,36.585714,30.857143,17.400000,3.371429,...,66.057143,2.385714,200.700000,6.971429,193.385714,ID_0056764,,,,
2,3,aaaagadk-1_ID_0056764,4794.887023,1.587786,81.404580,1066.697710,14.187786,32.538931,29.490076,8.290076,...,47.563359,3.221374,184.001527,8.540458,224.044275,ID_0056764,,,,
3,4,aaaagmge-1_ID_0056764,3549.487936,0.683646,49.219839,860.010724,6.260054,19.782842,12.504021,3.418231,...,34.461126,5.332440,463.474531,7.040214,159.941019,ID_0056764,,,,
4,5,aaaahchk-1_ID_0056764,5209.069686,1.463415,2.456446,929.735192,7.132404,19.212544,13.745645,2.519164,...,16.623693,3.529617,164.905923,6.285714,152.721254,ID_0056764,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2658970,199985,oikdcfle-1_ID_0022111_col4,15366.360000,3.240000,192.020000,900.340000,8.160000,127.340000,73.040000,18.260000,...,290.600000,20.160000,,,,ID_0022111,,4,9.760000,40.880000
2658971,199986,oikddnjh-1_ID_0022111_col4,15569.080000,1.440000,94.080000,1337.040000,1.560000,378.040000,60.800000,15.120000,...,558.560000,5.360000,,,,ID_0022111,,4,0.360000,18.200000
2658972,199987,oikdkbdj-1_ID_0022111_col4,17177.684211,2.894737,62.631579,1597.789474,0.000000,95.368421,65.000000,4.789474,...,274.526316,15.105263,,,,ID_0022111,,4,0.578947,21.421053
2658973,199988,oikdpkpb-1_ID_0022111_col4,15659.100000,10.133333,33.122222,1606.788889,7.800000,94.944444,137.044444,32.922222,...,457.477778,14.188889,,,,ID_0022111,,4,13.555556,49.677778


# Merge Xenium and CODEX (all runs)

In [61]:
import anndata as ad

adata_lisa_annotation = ad.read_h5ad('/media/Lynn/data/Xenium_table_with_metadata/run2_3_norm100_log_scale_16_umap_leiden_0_7.h5ad')

In [52]:
adata_lisa_annotation

AnnData object with n_obs × n_vars = 2205208 × 422
    obs: 'cell_id', 'transcript_counts', 'control_probe_counts', 'genomic_control_counts', 'control_codeword_counts', 'unassigned_codeword_counts', 'deprecated_codeword_counts', 'total_counts', 'cell_area', 'nucleus_area', 'nucleus_count', 'segmentation_method', 'region', 'z_level', 'cell_labels', 'core_ID', 'slide_ID', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'pct_counts_in_top_1_genes', 'pct_counts_in_top_5_genes', 'pct_counts_in_top_10_genes', 'n_counts', 'tissue', 'time_point', 'patient_ID', 'year', 'block_ID', 'cohort', 'run_ID', 'leiden_0.7', 'leiden_log', 'slide_str'
    var: 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'mean', 'std'
    uns: 'leiden_0.7', 'leiden_log_colors', 'log1p', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_umap', 'spatial'
    varm: 'PCs'
    layers: 'counts'
    obsp: 'connectivities', 'distances'

In [62]:
# Step 1: Create 'slide_str' column by formatting slide_ID as a string
adata_lisa_annotation.obs['slide_str'] = adata_lisa_annotation.obs['slide_ID'].astype(str).apply(lambda x: f"ID_00{x}")

In [63]:
# Step 2: Function to generate new cell ID with conditional column info
def add_suffix_adata(row):
    slide_str = row['slide_str']
    suffix = slide_str

    # Only add _colX if the slide_str matches one of the column-containing slides
    if slide_str in ["ID_0022110", "ID_0022111"] and pd.notna(row['core_ID']):
        try:
            col_num = row['core_ID'].split('X')[1].split('Y')[0]
            suffix += f"_col{col_num}"
        except IndexError:
            pass
    
    return f"{row['cell_id']}_{suffix}"

# Step 3: Apply new cell ID and update .obs_names
adata_lisa_annotation.obs['cell_id'] = adata_lisa_annotation.obs.apply(add_suffix_adata, axis=1)
adata_lisa_annotation.obs_names = adata_lisa_annotation.obs['cell_id']

In [64]:
adata_lisa_annotation.obs

Unnamed: 0_level_0,cell_id,transcript_counts,control_probe_counts,genomic_control_counts,control_codeword_counts,unassigned_codeword_counts,deprecated_codeword_counts,total_counts,cell_area,nucleus_area,...,tissue,time_point,patient_ID,year,block_ID,cohort,run_ID,leiden_0.7,leiden_log,slide_str
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
aaabfeja-1_ID_0022110_col1,aaabfeja-1_ID_0022110_col1,41,0,0,0,0,0,41.0,24.926251,,...,ileum,before_treatment,5_NR,2017,B2017_27346_1,non-responder,o37013,16,14,ID_0022110
aaabincm-1_ID_0022110_col1,aaabincm-1_ID_0022110_col1,80,0,0,0,0,0,80.0,32.873751,21.313751,...,ileum,before_treatment,5_NR,2017,B2017_27346_1,non-responder,o37013,16,14,ID_0022110
aaabjmea-1_ID_0022110_col1,aaabjmea-1_ID_0022110_col1,58,0,0,0,0,0,58.0,25.106876,11.695469,...,ileum,before_treatment,5_NR,2017,B2017_27346_1,non-responder,o37013,16,14,ID_0022110
aaabjnmd-1_ID_0022110_col1,aaabjnmd-1_ID_0022110_col1,58,0,0,0,0,0,58.0,27.680782,12.372813,...,ileum,before_treatment,5_NR,2017,B2017_27346_1,non-responder,o37013,16,14,ID_0022110
aaabnjhk-1_ID_0022110_col1,aaabnjhk-1_ID_0022110_col1,98,0,0,0,0,0,98.0,35.357345,20.546094,...,ileum,before_treatment,5_NR,2017,B2017_27346_1,non-responder,o37013,16,14,ID_0022110
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
oinipijh-1_ID_0056764,oinipijh-1_ID_0056764,9,0,0,0,0,0,9.0,34.092970,14.991876,...,ileum,before_treatment,05_R,2020,B2020.6796,responder,o38447,17,17,ID_0056764
oinjdnej-1_ID_0056764,oinjdnej-1_ID_0056764,28,0,0,0,0,0,28.0,25.152032,25.152032,...,ileum,before_treatment,05_R,2020,B2020.6796,responder,o38447,0,0,ID_0056764
oinjdnmp-1_ID_0056764,oinjdnmp-1_ID_0056764,15,0,0,0,0,0,15.0,20.455782,20.455782,...,ileum,before_treatment,05_R,2020,B2020.6796,responder,o38447,27,5,ID_0056764
oinjejim-1_ID_0056764,oinjejim-1_ID_0056764,6,0,0,0,0,0,6.0,9.753750,9.753750,...,ileum,before_treatment,05_R,2020,B2020.6796,responder,o38447,5,14,ID_0056764


## Check common cells

In [65]:
import numpy as np

xenium_ids = set(adata_lisa_annotation.obs['cell_id'])
codex_ids = set(df_combined_all['cell_id'])

shared_ids = xenium_ids & codex_ids
only_in_xenium = xenium_ids - codex_ids
only_in_codex = codex_ids - xenium_ids

print(f"Shared cell_ids: {len(shared_ids)}")
print(f"In Xenium only: {len(only_in_xenium)}")
print(f"In CODEX only: {len(only_in_codex)}")

Shared cell_ids: 2205205
In Xenium only: 3
In CODEX only: 453770


In [66]:
print("\nExamples only in Xenium:")
print(list(only_in_xenium)[:10])

print("\nExamples only in CODEX:")
print(list(only_in_codex)[:10])


Examples only in Xenium:
['ohnkecao-1_ID_0056777', 'niljbjca-1_ID_0022110_col2', 'ohnkdnko-1_ID_0056777']

Examples only in CODEX:
['dgilaeop-1_ID_0022111_col3', 'ejagbkpp-1_ID_0056764', 'jmlofnnl-1_ID_0022111_col1', 'obcdklln-1_ID_0022110_col3', 'nknghbec-1_ID_0056777', 'ngkmpcbl-1_ID_0056777', 'dmblobpo-1_ID_0022110_col4', 'njpkbhaf-1_ID_0022111_col4', 'bkijjlpp-1_ID_0022111_col4', 'lnfijhlk-1_ID_0056764']


In [71]:
df_combined_all = df_combined_all.rename(columns={'region': 'cropped_region_for_alignment'})

In [None]:
# Step 1: Set 'cell_id' as index in both DataFrames
df_combined_all = df_combined_all.set_index('cell_id')
df_xenium = adata_lisa_annotation.obs.copy()
df_xenium.index.name = 'cell_id'  # ensure index name matches

In [72]:
# Step 2: Perform the merge on the index
df_merged = df_combined_all.join(df_xenium, how='inner')

# Step 3: Print the result
print(f"Final merged rows: {df_merged.shape[0]}")

Final merged rows: 2205205


## Save as df

In [73]:
output_path = "/media/Lynn/data/Integrated_data/run2_run3_merged_xenium_codex.csv"
df_merged.to_csv(output_path, index=False)

## Save as adata

In [77]:
# Keep only the cell_ids in df
valid_cell_ids = df_combined_all.index.intersection(adata_lisa_annotation.obs.index)

In [79]:
# Subset the entire AnnData object (not just .obs!)
adata_lisa_annotation = adata_lisa_annotation[valid_cell_ids].copy()

In [80]:
# Now join 
adata_lisa_annotation.obs = adata_lisa_annotation.obs.join(df_combined_all, how='inner', rsuffix='_merged')

In [85]:
adata_lisa_annotation

AnnData object with n_obs × n_vars = 2205205 × 422
    obs: 'cell_id', 'transcript_counts', 'control_probe_counts', 'genomic_control_counts', 'control_codeword_counts', 'unassigned_codeword_counts', 'deprecated_codeword_counts', 'total_counts', 'cell_area', 'nucleus_area', 'nucleus_count', 'segmentation_method', 'region', 'z_level', 'cell_labels', 'core_ID', 'slide_ID', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'pct_counts_in_top_1_genes', 'pct_counts_in_top_5_genes', 'pct_counts_in_top_10_genes', 'n_counts', 'tissue', 'time_point', 'patient_ID', 'year', 'block_ID', 'cohort', 'run_ID', 'leiden_0.7', 'leiden_log', 'slide_str', 'index', 'DAPI', 'FoxP3', 'aSMA', 'CD4', 'CD8', 'CD31', 'CD11c', 'IFNG', 'Pan-Cytokeratin', 'CD68', 'CD20', 'CD66b', 'TNFa', 'CD45RO', 'CD14', 'CD11b', 'Vimentin', 'CD163', 'PDGFRA', 'CD45', 'CCR7', 'IL10', 'CD38', 'CD69', 'Podoplanin', 'PNAd', 'ECP', 'MPO', 'MIP-3', 'slide', 'cropped_region_for_alignment', 'column', 'CD16', 'CXCL13'
  

### Do some checks first

In [82]:
# Get .obs as DataFrame
df_obs = adata_lisa_annotation.obs.copy()

# Check where the values differ
diff_rows = df_obs[df_obs['slide'] != df_obs['slide_str']]

# Display number of differing rows and the rows themselves if any
print(f"Number of rows where 'slide' and 'slide_str' differ: {diff_rows.shape[0]}")
if not diff_rows.empty:
    display(diff_rows[['slide', 'slide_str']])

Number of rows where 'slide' and 'slide_str' differ: 0


In [83]:
# Get .obs as DataFrame
df_obs = adata_lisa_annotation.obs.copy()

# Check where the values differ
diff_rows = df_obs[df_obs['index'] != df_obs['cell_labels']]

# Display number of differing rows and the rows themselves if any
print(f"Number of rows where 'index' (codex) and 'cell_labels' (xenium) differ: {diff_rows.shape[0]}")
if not diff_rows.empty:
    display(diff_rows[['index', 'cell_labels']])

Number of rows where 'index' (codex) and 'cell_labels' (xenium) differ: 0


### Add Xenium Annotation

In [86]:
adata_lisa_annotation.obs['leiden_0.7'].unique()

['11', '21', '10', '25', '0', ..., '28', '32', '34', '35', '36']
Length: 37
Categories (37, object): ['0', '1', '2', '3', ..., '33', '34', '35', '36']

In [87]:
annotation_dict = {
    "0": "T cells",
    "1": "Infl. Fibroblasts",
    "2": "Plasma Cells",
    "3": "Crypt cells (colon)",
    "4": "Marophages",
    "5": "Neutrophils (M2?)",
    "6": "Enterocytes (Ileum)",
    "7": "B cells",
    "8": "Endothelial Cells/ Myofibroblasts",
    "9": "CD8+ T cells",
    "10": "Colon Goblet Cells",
    "11": "Crypt Cells (Ileum)",
    "12": "Smooth muscle cells",
    "13": "Colonocytes",
    "14": "Mast Cells",
    "15": "Eos? Enteroendocrine?",
    "16": "LND cells",
    "17": "Proliferating cells",
    "18": "Lymphatic Endothelial Cells (LECs)",
    "19": "Endothelial Cells",
    "20": "Enteroendocrine Cells",
    "21": "Th1/Th17 cells",
    "22": "Monocytes",
    "23": "activated Dendritic cells",
    "24": "Neuroendocrine cell",
    "25": "Enteroendocrine L-cells",
    "26": "Tuft cells",
    "27": "Best4+ epithelial cells + Immune cells",
    "28": "pro-inflammatory Myeloid Cells",
    "29": "cDC1",
    "30": "Infl. Macrophages",
    "31": "??",
    "32": "??",
    "33": "??",
    "34": "??",
    "35": "Smooth Muscle Cells/Myofibroblasts",
    "36": "Smooth Muscle Cells/Myofibroblasts"
}

# Map string-based 'leiden' clusters to annotations
adata_lisa_annotation.obs['xenium_annotation'] = adata_lisa_annotation.obs['leiden_0.7'].map(annotation_dict)

### Save

In [90]:
adata_lisa_annotation.obs['column'] = adata_lisa_annotation.obs['column'].apply(
    lambda x: f"column_{x}" if pd.notna(x) else "NA"
)

In [93]:
adata_lisa_annotation.obs.rename(columns={
    "leiden_0.7": "xenium_leiden_0.7",
    "leiden_log": "xenium_leiden_log"
}, inplace=True)

In [94]:
adata_lisa_annotation.var.rename(columns={col: f"xenium_{col}" for col in adata_lisa_annotation.var.columns}, inplace=True)
adata_lisa_annotation.uns = {f"xenium_{k}": v for k, v in adata_lisa_annotation.uns.items()}
adata_lisa_annotation.obsm = {
    (f"xenium_{k}" if k != "spatial" else k): v 
    for k, v in adata_lisa_annotation.obsm.items()
}
adata_lisa_annotation.varm = {f"xenium_{k}": v for k, v in adata_lisa_annotation.varm.items()}
adata_lisa_annotation.layers = {f"xenium_{k}": v for k, v in adata_lisa_annotation.layers.items()}
adata_lisa_annotation.obsp = {f"xenium_{k}": v for k, v in adata_lisa_annotation.obsp.items()}

In [98]:
adata_lisa_annotation.obs.drop(columns='block_ID', inplace=True)

In [100]:
adata_lisa_annotation.obs['response_group'] = np.where(
    adata_lisa_annotation.obs['patient_ID'].str.endswith('_NR'), 'Non-Responder',
    np.where(
        adata_lisa_annotation.obs['patient_ID'].str.endswith('_R'), 'Responder', 'Unknown'
    )
)

In [102]:
output_path = "/media/Lynn/data/Integrated_data/adata/run2_3_codex_raw_xenium_norm100_log_scale_16_umap_leiden_0_7.h5ad"
adata_lisa_annotation.write(output_path)

# Nimbus (FAILED)

## Run3

In [1]:
def integrate_codex_xenium(slide_id, codex_label, base_paths, sdata_xenium):
    import os
    import numpy as np
    import pandas as pd
    import skimage as ski
    import tifffile

    output_dir = os.path.join(base_paths["out"], slide_id)
    os.makedirs(output_dir, exist_ok=True)

    # Segmentation file
    seg_path = os.path.join(
        base_paths["segmentation_base"],
        f"{codex_label}_whole_cell.tiff"
    )
    if not os.path.exists(seg_path):
        raise FileNotFoundError(f"Segmentation not found: {seg_path}")
    trans_mask = tifffile.imread(seg_path)

    # CODEX marker folder
    codex_folder = os.path.join(base_paths["phenocycler"], codex_label)
    if not os.path.isdir(codex_folder):
        raise FileNotFoundError(f"CODEX folder not found: {codex_folder}")

    # --- Load marker images ---
    marker_files = [f for f in os.listdir(codex_folder) if f.endswith(".tiff")]
    marker_files.sort()
    markers = []
    marker_names = []
    for f in marker_files:
        img = tifffile.imread(os.path.join(codex_folder, f))
        markers.append(img)
        marker_names.append(os.path.splitext(f)[0])  # e.g. "DAPI"

    markers = np.array(markers)

    # --- Quantification ---
    unique_cell_ids = np.unique(trans_mask)
    unique_cell_ids = unique_cell_ids[unique_cell_ids > 0]
    df_adt = pd.DataFrame(index=unique_cell_ids)
    df_adt.index.name = 'cell_id'

    for i, marker_name in enumerate(marker_names):
        props = ski.measure.regionprops(trans_mask, intensity_image=markers[i])
        mean_signals = {prop.label: prop.mean_intensity for prop in props}
        df_adt[marker_name] = df_adt.index.map(mean_signals)

    # Save tables
    adtcsv = os.path.join(output_dir, f"{codex_label}_adt_countmat.csv")
    df_adt.to_csv(adtcsv)

    # Map back to Xenium cell IDs
    cell_names_list = sdata_xenium[slide_id]['column_1'].table.obs['cell_id'].tolist()
    cell_id_to_name = {i + 1: name for i, name in enumerate(cell_names_list)}
    df_adt_reset = df_adt.reset_index()
    df_adt_reset['cell_name'] = df_adt_reset['cell_id'].map(cell_id_to_name)
    df_adt_reset = df_adt_reset.rename(columns={'cell_id': 'index', 'cell_name': 'cell_id'})
    df_adt_reset = df_adt_reset[['index', 'cell_id'] + [c for c in df_adt_reset.columns if c not in ['index', 'cell_id']]]
    adtcsv_cell_ids = os.path.join(output_dir, f"{codex_label}_adt_countmat_with_cell_ids.csv")
    df_adt_reset.to_csv(adtcsv_cell_ids, index=False)

    print(f"Completed processing for {slide_id} - {codex_label}")


In [2]:
import sys
import os
sys.path.append(os.path.abspath('../src'))
import load_sdata

# Load each Xenium dataset into a SpatialData object
sdata_xenium_first_slide = load_sdata.get_xenium_slide_data('0056764')
sdata_xenium_second_slide = load_sdata.get_xenium_slide_data('0056777')

sdata_xenium = {
    'ID_0056764': sdata_xenium_first_slide,
    'ID_0056777': sdata_xenium_second_slide
}

version mismatch: detected: RasterFormatV02, requested: FormatV04
  compressor, fill_value = _kwargs_compat(compressor, fill_value, kwargs)
  compressor, fill_value = _kwargs_compat(compressor, fill_value, kwargs)
  compressor, fill_value = _kwargs_compat(compressor, fill_value, kwargs)
  compressor, fill_value = _kwargs_compat(compressor, fill_value, kwargs)
  compressor, fill_value = _kwargs_compat(compressor, fill_value, kwargs)
  compressor, fill_value = _kwargs_compat(compressor, fill_value, kwargs)
version mismatch: detected: RasterFormatV02, requested: FormatV04
  compressor, fill_value = _kwargs_compat(compressor, fill_value, kwargs)
  compressor, fill_value = _kwargs_compat(compressor, fill_value, kwargs)
  compressor, fill_value = _kwargs_compat(compressor, fill_value, kwargs)
  compressor, fill_value = _kwargs_compat(compressor, fill_value, kwargs)
  compressor, fill_value = _kwargs_compat(compressor, fill_value, kwargs)
  compressor, fill_value = _kwargs_compat(compressor, 

In [8]:
base_paths = {
    "segmentation_base": "/media/Lynn/data/Nimbus/segmentation",
    "phenocycler": "/media/Lynn/data/Nimbus/nimbus_output/run3",
    "transmat": "/media/Lynn/alignment/codex_whole_slides",  # keep if you still need it for other workflows
    "out": "/media/Lynn/data/Integrated_data/Nimbus"
}


slide_id = "ID_0056777"

# your codex folders should be named ID_56777_1_1_core_1, ID_56777_1_1_core_2, ...
codex_labels = [
    "ID_56777_1_1_core_1",
    "ID_56777_1_1_core_2",
    "ID_56777_1_1_core_3",
    "ID_56777_1_1_core_4",
    "ID_56777_1_2",
    "ID_56777_2"
]

for codex_label in codex_labels:
    integrate_codex_xenium(slide_id, codex_label, base_paths, sdata_xenium)


  cell_names_list = sdata_xenium[slide_id]['column_1'].table.obs['cell_id'].tolist()


Completed processing for ID_0056777 - ID_56777_1_1_core_1


  cell_names_list = sdata_xenium[slide_id]['column_1'].table.obs['cell_id'].tolist()


Completed processing for ID_0056777 - ID_56777_1_1_core_2


  cell_names_list = sdata_xenium[slide_id]['column_1'].table.obs['cell_id'].tolist()


Completed processing for ID_0056777 - ID_56777_1_1_core_3


  cell_names_list = sdata_xenium[slide_id]['column_1'].table.obs['cell_id'].tolist()


Completed processing for ID_0056777 - ID_56777_1_1_core_4


  cell_names_list = sdata_xenium[slide_id]['column_1'].table.obs['cell_id'].tolist()


Completed processing for ID_0056777 - ID_56777_1_2


  cell_names_list = sdata_xenium[slide_id]['column_1'].table.obs['cell_id'].tolist()


Completed processing for ID_0056777 - ID_56777_2


In [10]:
base_paths = {
    "segmentation_base": "/media/Lynn/data/Nimbus/segmentation",
    "phenocycler": "/media/Lynn/data/Nimbus/nimbus_output/run3",
    "transmat": "/media/Lynn/alignment/codex_whole_slides",  # keep if you still need it for other workflows
    "out": "/media/Lynn/data/Integrated_data/Nimbus"
}
slide_id = "ID_0056764"
codex_label = "ID_56764"

integrate_codex_xenium(slide_id, codex_label, base_paths, sdata_xenium)

  cell_names_list = sdata_xenium[slide_id]['column_1'].table.obs['cell_id'].tolist()


Completed processing for ID_0056764 - ID_56764


## Run2

In [2]:
import sys
import os
sys.path.append(os.path.abspath('../src'))
import load_sdata

# Load each Xenium dataset into a SpatialData object
sdata_xenium_first_slide = load_sdata.get_xenium_slide_data('0022110')
sdata_xenium_second_slide = load_sdata.get_xenium_slide_data('0022111')

sdata_xenium = {
    'ID_0022110': sdata_xenium_first_slide,
    'ID_0022111': sdata_xenium_second_slide
}

  compressor, fill_value = _kwargs_compat(compressor, fill_value, kwargs)
  compressor, fill_value = _kwargs_compat(compressor, fill_value, kwargs)
  compressor, fill_value = _kwargs_compat(compressor, fill_value, kwargs)
  compressor, fill_value = _kwargs_compat(compressor, fill_value, kwargs)
  compressor, fill_value = _kwargs_compat(compressor, fill_value, kwargs)
  compressor, fill_value = _kwargs_compat(compressor, fill_value, kwargs)
  compressor, fill_value = _kwargs_compat(compressor, fill_value, kwargs)
  compressor, fill_value = _kwargs_compat(compressor, fill_value, kwargs)
  compressor, fill_value = _kwargs_compat(compressor, fill_value, kwargs)
  compressor, fill_value = _kwargs_compat(compressor, fill_value, kwargs)
  compressor, fill_value = _kwargs_compat(compressor, fill_value, kwargs)
  compressor, fill_value = _kwargs_compat(compressor, fill_value, kwargs)
  compressor, fill_value = _kwargs_compat(compressor, fill_value, kwargs)
  compressor, fill_value = _kwargs_com

In [4]:
base_paths = {
    "segmentation_base": "/media/Lynn/data/Nimbus/segmentation",
    "phenocycler": "/media/Lynn/data/Nimbus/nimbus_output/run2",
    "transmat": "/media/Lynn/alignment/codex_whole_slides",  # keep if you still need it for other workflows
    "out": "/media/Lynn/data/Integrated_data/Nimbus"
}


slide_id = "ID_0022110"

# your codex folders should be named ID_56777_1_1_core_1, ID_56777_1_1_core_2, ...
codex_labels = [
    "ID_0022110_column_1",
    "ID_0022110_column_2",
    "ID_0022110_column_3",
    "ID_0022110_column_4"
]

for codex_label in codex_labels:
    integrate_codex_xenium(slide_id, codex_label, base_paths, sdata_xenium)


  cell_names_list = sdata_xenium[slide_id]['column_1'].table.obs['cell_id'].tolist()


Completed processing for ID_0022110 - ID_0022110_column_1


  cell_names_list = sdata_xenium[slide_id]['column_1'].table.obs['cell_id'].tolist()


Completed processing for ID_0022110 - ID_0022110_column_2


  cell_names_list = sdata_xenium[slide_id]['column_1'].table.obs['cell_id'].tolist()


Completed processing for ID_0022110 - ID_0022110_column_3


  cell_names_list = sdata_xenium[slide_id]['column_1'].table.obs['cell_id'].tolist()


Completed processing for ID_0022110 - ID_0022110_column_4


In [5]:
slide_id = "ID_0022111"

# your codex folders should be named ID_56777_1_1_core_1, ID_56777_1_1_core_2, ...
codex_labels = [
    "ID_0022111_column_1",
    "ID_0022111_column_2",
    "ID_0022111_column_3",
    "ID_0022111_column_4"
]

for codex_label in codex_labels:
    integrate_codex_xenium(slide_id, codex_label, base_paths, sdata_xenium)


  cell_names_list = sdata_xenium[slide_id]['column_1'].table.obs['cell_id'].tolist()


Completed processing for ID_0022111 - ID_0022111_column_1


  cell_names_list = sdata_xenium[slide_id]['column_1'].table.obs['cell_id'].tolist()


Completed processing for ID_0022111 - ID_0022111_column_2


  cell_names_list = sdata_xenium[slide_id]['column_1'].table.obs['cell_id'].tolist()


Completed processing for ID_0022111 - ID_0022111_column_3


  cell_names_list = sdata_xenium[slide_id]['column_1'].table.obs['cell_id'].tolist()


Completed processing for ID_0022111 - ID_0022111_column_4


### Combine all dfs

In [9]:
import os
import pandas as pd

# Base path
out_base = "/media/Lynn/data/Integrated_data/Nimbus"

# Define single-CODEX and multi-CODEX slide IDs
single_codex_slides = ["ID_0056764"]
multi_codex_slide = "ID_0056777"

df_list = []

# 1. Load single-CODEX slides (e.g., ID_0056764)
for slide in single_codex_slides:
    csv_path = os.path.join(out_base, slide, "ID_56764_adt_countmat_with_cell_ids.csv")
    
    if os.path.exists(csv_path):
        df = pd.read_csv(csv_path)
        df['slide'] = slide
        df['region'] = None  # No region distinction
        df_list.append(df)
    else:
        print(f"Warning: File not found for slide {slide}: {csv_path}")

# 2. Load all CSVs from multi-CODEX slide (e.g., ID_0056777)
multi_codex_dir = os.path.join(out_base, multi_codex_slide)
if os.path.exists(multi_codex_dir):
    for subfolder in os.listdir(multi_codex_dir):
        subdir_path = os.path.join(multi_codex_dir, subfolder)
        csv_path = os.path.join(subdir_path, f"{subfolder}_adt_countmat_with_cell_ids.csv")
        
        if os.path.exists(csv_path):
            df = pd.read_csv(csv_path)
            df['slide'] = multi_codex_slide
            df['region'] = subfolder  # Track which CODEX image it came from
            df_list.append(df)
        else:
            print(f"Warning: Missing CSV in {subfolder}")
else:
    print(f"Warning: Multi-CODEX slide folder not found: {multi_codex_dir}")

# 3. Combine all into one DataFrame
if df_list:
    df_combined = pd.concat(df_list, ignore_index=True)
    print("All data combined successfully.")
else:
    print("No data was loaded. Please check paths.")


All data combined successfully.


In [11]:
import pandas as pd

# Set path to output directory
out_base = "/media/Lynn/data/Integrated_data/Nimbus"

slides_columns = {
    "ID_0022110": [1, 2, 3, 4],
    "ID_0022111": [1, 2, 3, 4]
}

# Load and concatenate all CSVs
df_list = []
for slide, columns in slides_columns.items():
    for col in columns:
        pool = f"{slide}_column_{col}"
        csv_path = os.path.join(out_base, slide, pool, f"{pool}_adt_countmat_with_cell_ids.csv")
        df = pd.read_csv(csv_path)
        df['slide'] = slide
        df['column'] = col
        df_list.append(df)

# Concatenate all into one DataFrame
df_combined_run2 = pd.concat(df_list, ignore_index=True)

  df = pd.read_csv(csv_path)
  df = pd.read_csv(csv_path)
  df = pd.read_csv(csv_path)


In [13]:
df_all_runs = pd.concat([df_combined, df_combined_run2], ignore_index=True)

In [14]:
df_all_runs

Unnamed: 0,index,cell_id,CCR7,CD11b,CD11c,CD14,CD163,CD20,CD31,CD38,...,PDGFRA,PNAd,Pan-Cytokeratin,Podoplanin,TNFa,Vimentin,aSMA,slide,region,column
0,1,aaaaapoi-1,7.060000,97.553333,8.313333,183.616667,4.976667,5.756667,4.993333,6.460000,...,7.013333,7.296667,186.236667,5.733333,15.006667,5.596667,7.376667,ID_0056764,,
1,2,aaaadfjn-1,5.485714,14.114286,5.614286,142.414286,4.814286,4.914286,4.842857,5.942857,...,7.942857,5.857143,110.600000,7.328571,18.928571,4.857143,5.728571,ID_0056764,,
2,3,aaaagadk-1,7.546565,134.326718,15.227481,204.210687,4.880916,6.297710,6.071756,8.981679,...,21.529771,13.059542,203.140458,11.480916,16.345038,6.447328,21.215267,ID_0056764,,
3,4,aaaagmge-1,7.592493,51.973190,8.273458,205.260054,5.262735,5.782842,4.833780,7.260054,...,11.246649,8.018767,206.243968,10.823056,7.329759,5.490617,20.710456,ID_0056764,,
4,5,aaaahchk-1,6.421603,38.836237,6.073171,179.923345,4.979094,5.567944,4.742160,5.581882,...,5.721254,8.533101,188.937282,6.055749,6.240418,5.418118,5.686411,ID_0056764,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2658970,199985,,5.080000,16.780000,5.900000,10.120000,69.980000,5.300000,7.000000,71.020000,...,,5.140000,5.300000,5.860000,6.360000,6.660000,19.820000,ID_0022111,,4.0
2658971,199986,,10.120000,42.720000,6.800000,98.040000,77.440000,6.360000,49.720000,108.320000,...,,5.440000,6.680000,42.680000,56.680000,88.640000,9.400000,ID_0022111,,4.0
2658972,199987,,6.684211,16.526316,6.631579,75.526316,68.052632,6.578947,7.210526,33.736842,...,,8.368421,6.631579,12.263158,65.421053,13.210526,11.947368,ID_0022111,,4.0
2658973,199988,,6.822222,101.844444,8.011111,145.400000,139.066667,6.211111,5.522222,64.777778,...,,11.633333,6.188889,35.155556,144.177778,36.444444,6.744444,ID_0022111,,4.0


In [15]:
def add_suffix(row):
    # Start with the slide ID
    suffix = row['slide']
    # Add column info if present
    if 'column' in row and pd.notna(row['column']):
        suffix += f"_col{int(row['column'])}"
    return f"{row['cell_id']}_{suffix}"

# Create new cell_id values with the suffix
df_all_runs['cell_id'] = df_all_runs.apply(add_suffix, axis=1)

### Combine with xenium and raw codex adata

In [6]:
import anndata as ad

adata_lisa_annotation = ad.read_h5ad('/media/Lynn/data/Integrated_data/adata/run2_3_codex_raw_xenium_norm100_log_scale_16_umap_leiden_0_7.h5ad')

In [16]:
import numpy as np

xenium_ids = set(adata_lisa_annotation.obs['cell_id'])
codex_ids = set(df_all_runs['cell_id'])

shared_ids = xenium_ids & codex_ids
only_in_xenium = xenium_ids - codex_ids
only_in_codex = codex_ids - xenium_ids

print(f"Shared cell_ids: {len(shared_ids)}")
print(f"In Xenium only: {len(only_in_xenium)}")
print(f"In CODEX only: {len(only_in_codex)}")

Shared cell_ids: 1518421
In Xenium only: 686784
In CODEX only: 1017333
