In [1]:
import argparse
import gzip
import requests
import pandas as pd
import sys
from pathlib import Path
import os
import csv
import glob
from collections import defaultdict
from scipy.sparse import coo_matrix
from scipy.io import mmwrite
from shapely.geometry import MultiPoint, Polygon, Point
from rtree import index
import read_roi
from tqdm import tqdm

In [2]:
###  1.
###  to 10x matrix
###

def assign_indices(df):
    genes = defaultdict(lambda: len(genes))
    df['cell_idx'] = df['cell'].astype('category').cat.codes
    df['gene_idx'] = df['gene'].apply(lambda x: genes[x])
    return df, genes


def group_data(df):
    grouped_data = df.groupby(['cell_idx', 'gene_idx']).size().reset_index(name='counts')
    return grouped_data


def write_matrix(grouped_data, output_dir):
    rows, cols, data = grouped_data['cell_idx'].values, grouped_data['gene_idx'].values, grouped_data['counts'].values
    sparse_matrix = coo_matrix((data, (rows, cols)), shape=(max(rows) + 1, max(cols) + 1)).T
    with gzip.open(f"{output_dir}/cell_feature_matrix/matrix.mtx.gz", 'wb') as f:
        mmwrite(f, sparse_matrix, field='integer', precision=None, symmetry='general')


def query_api(genes):
    unique_genes = list(genes.keys())
    r = requests.post(
        url='https://biit.cs.ut.ee/gprofiler/api/convert/convert/',
        json={
            'organism': 'mmusculus',
            'target': 'ENSG',
            'query': unique_genes,
        }
    )
    converted_genes = {gene: result['converted'] for gene, result in zip(unique_genes, r.json()['result'])}
    return converted_genes


def write_features(genes, converted_genes, output_dir):
    with gzip.open(f"{output_dir}/cell_feature_matrix/features.tsv.gz", 'wt') as f:
        for gene, idx in sorted(genes.items(), key=lambda x: x[1]):
            ensembl_id = converted_genes[gene]
            f.write(f"{ensembl_id}\t{gene}\tGene Expression\n")


def write_barcodes(df, output_dir):
    with gzip.open(f"{output_dir}/cell_feature_matrix/barcodes.tsv.gz", 'wt') as f:
        for cell in df['cell'].astype('category').cat.categories:
            f.write(f"{cell}\n")

In [3]:
#### 2.
#### takes in baysor_results.csv and generates convex hull boundaries for each cell
####


def make_roi_coordinates_df(roi_coordinates):
    print('Converting convex hulls to ROI table format.')
    coords_list = []
    for roi_name, coordinates in tqdm(roi_coordinates.items()):
        for x, y in coordinates:
            coords_list.append({'ROI_Name': roi_name, 'X': x, 'Y': y})
    coords_df = pd.DataFrame(coords_list)
    return coords_df


def baysor_results_df2coords_df(baysor_results_df):
    cell_groups = baysor_results_df.groupby('cell')
    convex_hulls = {}
    print('Calculating convex hulls for baysor segmentation.')
    for cell_name, group in tqdm(cell_groups):
        if cell_name != '0':  # Skip points not assigned to any cell
            points = group[['x', 'y']].values
            multi_point = MultiPoint(points)
            convex_hull = multi_point.convex_hull
            if convex_hull.geom_type == 'Polygon':
                coords = list(convex_hull.exterior.coords)
                convex_hulls[cell_name] = coords
    coords_df = make_roi_coordinates_df(convex_hulls)
    return coords_df

In [4]:
###
### 3
### CellPose roi_to_csv.py

def imagejzip2coords_df(zip_file_path):
    rois = read_roi.read_roi_zip(zip_file_path)
    rois_converted = []
    for roi_name, roi in rois.items():
        if roi['type'] == 'rectangle':
            x, y, w, h = roi['left'], roi['top'], roi['width'], roi['height']
            rois_converted.append({'ROI_Name': roi_name, 'X': x, 'Y': y})
        elif roi['type'] == 'freehand':
            x, y = roi['x'], roi['y']
            for i in range(len(x)):
                rois_converted.append({'ROI_Name': roi_name, 'X': x[i], 'Y': y[i]})
        else:
            raise ValueError(f'Unsupported ROI type: {roi["type"]}')
    coords_df = pd.DataFrame(rois_converted)
    return coords_df


def vertecies_to_shapely_polygon(roi_name=None, coordinates=None):
    if len(coordinates) < 3:
        print(f'Warning: Not enough coordinates for {roi_name} to form a valid polygon.')
        pg = Polygon()
    else:
        if coordinates[0] != coordinates[-1]:
            coordinates.append(coordinates[0])  # Add the first coordinate to the end if necessary
        pg = Polygon(coordinates)
    return pg


def make_polygon_df(coords_df):
    polygons = []
    for name, group in coords_df.groupby('ROI_Name'):
        vertices = [(x, y) for x, y in zip(group['X'], group['Y'])]
        # Add the last vertex identical to the first to make the polygon closed
        polygons.append({'ROI_Name': name, 'polygon': vertecies_to_shapely_polygon(name, vertices)})
    polygon_df = pd.DataFrame(polygons)
    polygon_df['Area'] = polygon_df['polygon'].apply(lambda x: x.area)
    polygon_df['valid'] = polygon_df['polygon'].apply(lambda x: x.is_valid)
    polygon_df['empty'] = polygon_df['polygon'].apply(lambda x: x.is_empty)
    polygon_df = polygon_df[polygon_df['valid']]
    polygon_df = polygon_df[~polygon_df['empty']]
    polygon_df = polygon_df.drop(['valid', 'empty'], axis=1)
    return polygon_df

In [5]:
###
###roi_containment.py
###


def map_nucleus2cell(nucleus_coords_df, cells_coords_df):
    print('Calculating nucleus to cell mappings.')
    nuc_cell_intersections = []

    idx = index.Index()
    cells_polygons = {}
    cells_names = []

    polygon_df_nuclei = make_polygon_df(nucleus_coords_df)
    polygon_df_cells = make_polygon_df(cells_coords_df)

    print('Building up rtree index of cells.')
    for cell_name, group in tqdm(polygon_df_cells.groupby('ROI_Name')):
#         print(cell_name, group)
        cell_polygon = group['polygon'].reset_index(drop=True)[0]
#         import pdb; pdb.set_trace()
        cells_polygons[cell_name] = cell_polygon
        cells_names.append(cell_name)
        idx.insert(len(cells_names) - 1, cell_polygon.bounds)

    print('Checking if nuclei are contained in cells.')
    for nucleus_name, group in tqdm(polygon_df_nuclei.groupby('ROI_Name')):
        nucleus_polygon = group['polygon'].reset_index(drop=True)[0]
        for i in idx.intersection(nucleus_polygon.bounds):
            cell_name = cells_names[i]
            cell_polygon = cells_polygons[cell_name]
            if cell_polygon.intersects(nucleus_polygon):
                nuc_cell_intersection = {'ROI_Name_nucleus': nucleus_name, 'ROI_Name_cell': cell_name}
                nuc_cell_intersections.append(nuc_cell_intersection)
    nuc_cell_df = pd.DataFrame(nuc_cell_intersections)
    return nuc_cell_df

In [6]:
###
###
### transcripts_in_nucleus.py


def get_transcripts_points(baysor_results_df):
    transcripts_points = [Point(x, y) for x, y in zip(baysor_results_df['x'], baysor_results_df['y'])]
    return transcripts_points



def create_rtree(polygon_df_nuclei):  
    """Create an R-tree spatial index for the nucleus boundaries."""
    idx = index.Index()
    for i, row in enumerate(polygon_df_nuclei.iterrows()):
        nucleus = row[1]['polygon']
        idx.insert(i, nucleus.bounds, obj=nucleus)
    return idx




def transcripts_in_nucleus(baysor_results_df, nucleus_coords_df):
    print('Loading nucleus boundaries...')
    nuclei = {}
    for nucleus_name, group in make_polygon_df(nucleus_coords_df).groupby('ROI_Name'):
        nuclei[nucleus_name] = group['polygon']
    print(f'Loaded {len(nuclei)} nuclei.')
    print('Loading transcripts...')
    transcripts = get_transcripts_points(baysor_results_df)
    print(f'Loaded {len(transcripts)} transcripts.')
    # Create R-tree spatial index for nuclei
    print('Creating R-tree index...')
    nucleus_polygon_df = make_polygon_df(nucleus_coords_df)
    idx = create_rtree(nucleus_polygon_df)
    # Check if each transcript is within a nucleus
    print('Checking transcripts...')
    results = []
    for i, transcript in tqdm(enumerate(transcripts), total=len(transcripts)):
        # Check if transcript is within any nucleus
        within_nucleus = False
        for jj in idx.intersection((transcript.x, transcript.y)):
            if transcript.within(nuclei[list(nuclei.keys())[jj]]).reset_index(drop=True)[0]:
                within_nucleus = True
                break
        results.append(within_nucleus)
    # Write results to output file
    baysor_results_df['within_nucleus'] = results
    return baysor_results_df

In [36]:
def make_cells_df(baysor_cell_stats, nucleus2cell_mapping, nucleus_coords_df):
    nucleus_polygon_df = make_polygon_df(nucleus_coords_df)
    # Apply transformations to cells DataFrame
    cells_df = (
        baysor_cell_stats.assign(
            control_probe_counts=0,
            control_codeword_counts=0,
            total_counts=lambda df: df['n_transcripts'],
            cell_area=lambda df: df['area'],
            x_centroid=lambda df: df['x'],
            y_centroid=lambda df: df['y'],
            cell_id=lambda df: df['cell']
        )
        .rename(columns={'n_transcripts': 'transcript_counts'})
    )
    cells_df = cells_df[['cell_id', 'x_centroid', 'y_centroid', 'transcript_counts', 'control_probe_counts', 'control_codeword_counts', 'total_counts', 'cell_area']]
    cells_df = pd.merge(cells_df, nucleus2cell_mapping, left_on='cell_id', right_on='ROI_Name_cell', how='left')
    cells_df = pd.merge(cells_df, nucleus_polygon_df, left_on='ROI_Name_nucleus', right_on='ROI_Name', how='left')
    cells_df = cells_df.rename(columns={"ROI_Name_nucleus": "nucleus_id", "Area": "nucleus_area"})
    cells_df = cells_df.drop(columns=['ROI_Name_cell', 'ROI_Name', "polygon"])
    # find cells that have more than one nucleus intersection
    subset_cols = ['cell_id', 'x_centroid', 'y_centroid', 'transcript_counts',
                   'control_probe_counts', 'control_codeword_counts', 'total_counts',
                   'cell_area']
    duplicates = cells_df[cells_df.duplicated(subset=subset_cols, keep=False)]
    cells_df = cells_df[~cells_df.duplicated(subset=subset_cols, keep='first')]
    return cells_df, duplicates

In [8]:
def make_transcripts_df(baysor_results_df):
    transcripts_df = (
        baysor_results_df.assign(
            transcript_id = lambda df: df['molecule_id'],
            cell_id = lambda df: df['cell'],
            overlaps_nucleus = lambda df: df['within_nucleus'],
            feature_name = lambda df: df['gene'],
            x_location = lambda df: df['x'],
            y_location = lambda df: df['y'],
            z_location = 0,
            qv = 42.0
        )
    )
    transcripts_df = transcripts_df[['transcript_id', 'cell_id', 'overlaps_nucleus', 'feature_name', 'x_location', 'y_location', 'z_location', 'qv']]
    return transcripts_df

In [9]:
def make_cell_boundaries(coords_df):
    cell_boundaries_df = (
        coords_df.assign(
        cell_id = lambda df: df['ROI_Name'],
        vertex_x = lambda df: df['X'],
        vertex_y = lambda df: df['Y'],
        )
    )
    cell_boundaries_df = cell_boundaries_df[['cell_id', 'vertex_x', 'vertex_y']]
    return cell_boundaries_df

In [10]:
argparse.Namespace(baysor_results_dir = '/data/petar/fgf1/resolve/baysor_segmentation/32810-1377-slide3_A1-1_results_baysor_results/',
                   cellpose_roi_path = '/data/petar/fgf1/resolve/rois/32810-1377-slide3_A1-1_DAPI_dapi_ROIs.zip',
                   output_dir = './test/')

Namespace(baysor_results_dir='/data/petar/fgf1/resolve/baysor_segmentation/32810-1377-slide3_A1-1_results_baysor_results/', cellpose_roi_path='/data/petar/fgf1/resolve/rois/32810-1377-slide3_A1-1_DAPI_dapi_ROIs.zip', output_dir='./test/')

In [37]:
def convert_resolve_to_xenium(args):
    print('processing: ')
    for key, value in args.__dict__.items():
        print(key + ': ' + value)
    print('')
    # 1
    #create cell_feature_matrix outputs
    print("STAGE 1: Converting baysor results to 10x cell_feature_matrix.")  
    output_dir = args.output_dir
    cell_feature_matrix_dirpath = Path(f"{output_dir}/cell_feature_matrix/")
    cell_feature_matrix_dirpath.mkdir(exist_ok=True, parents=True)
    baysor_results_dir = args.baysor_results_dir
    baysor_results_path = f"{baysor_results_dir}/baysor_results.csv"
    baysor_results_df = pd.read_csv(baysor_results_path)
    baysor_results_df, genes = assign_indices(baysor_results_df)
    grouped_data = group_data(baysor_results_df)
    write_matrix(grouped_data, output_dir)
    converted_genes = query_api(genes)
    write_features(genes, converted_genes, args.output_dir)
    write_barcodes(baysor_results_df, args.output_dir)
    print(f"Matrix dimensions: {grouped_data.shape}")
    print(f"Number of genes: {len(genes)}")
    print(f"Number of cells: {len(baysor_results_df['cell'].astype('category').cat.categories)}")
    print("done.")
    print("")

    # 2
    # generate convex hulls for baysor segmentations
    print('STAGE 2: Generating cell segmentations from Baysor transcript assignment.')
    cells_coords_df = baysor_results_df2coords_df(baysor_results_df)
    cell_boundaries_df = make_cell_boundaries(cells_coords_df)
    print('writing cell_boundaries.csv.gz')
    cell_boundaries_df.to_csv(f"{output_dir}/cell_boundaries.csv.gz", index=False, compression='gzip')
    print("done.")
    print("")
    
    # 3
    #
    print('STAGE 3: Converting ImageJ nucleus ROIs.')
    cellpose_roi_path = args.cellpose_roi_path
    nucleus_coords_df = imagejzip2coords_df(cellpose_roi_path)
    print("done.")
    print("")
    
    # 4
    #
    print('STAGE 4: Mapping nuclei to cells.')
    nucleus2cell_mapping = map_nucleus2cell(nucleus_coords_df, cells_coords_df)
    print("done.")
    print("")
    
    # 5
    #
    print("STAGE 5: Checking for transcripts in nuclei.")
    baysor_results_df = transcripts_in_nucleus(baysor_results_df, nucleus_coords_df)
    transcripts_df = make_transcripts_df(baysor_results_df)
    print("writing transcripts.csv.gz")
    transcripts_df.to_csv(f"{output_dir}/transcripts.csv.gz", index=False, compression='gzip')
    print("done.")
    print("")
    
    # 6
    #
    print("STAGE 6: Writing cells.csv.gz")
    baysor_cell_stats_path = f"{baysor_results_dir}/baysor_cell_stats.csv"
    baysor_cell_stats = pd.read_csv(baysor_cell_stats_path)
    cells_df, duplicates = make_cells_df(baysor_cell_stats, nucleus2cell_mapping, nucleus_coords_df)
    cells_df.to_csv(f"{output_dir}/cells.csv.gz", index=False, compression='gzip')
    duplicates.to_csv(f"{output_dir}/multinuc_cells.csv.gz", index=False, compression='gzip')
    print("done.")
    print("")

In [12]:
# test args
args = argparse.Namespace(baysor_results_dir = '/data/petar/fgf1/resolve/baysor_segmentation/32810-1377-slide3_A1-1_results_baysor_results/',
                          cellpose_roi_path = '/data/petar/fgf1/resolve/rois/32810-1377-slide3_A1-1_DAPI_dapi_ROIs.zip',
                          output_dir = './test/')

In [13]:
sample_names = [x.replace('_results_baysor_results', '') for x in os.listdir('/data/petar/fgf1/resolve/baysor_segmentation/') if '_results_baysor_results' in x]

In [14]:
namespace_list = []
for sample_name in sample_names:
    args = argparse.Namespace(baysor_results_dir = f"/data/petar/fgf1/resolve/baysor_segmentation/{sample_name}_results_baysor_results/",
                          cellpose_roi_path = f"/data/petar/fgf1/resolve/rois/{sample_name}_DAPI_dapi_ROIs.zip",
                          output_dir = f"/data/petar/fgf1/resolve/xe/{sample_name}/")
    namespace_list.append(args)

In [15]:
namespace_list

[Namespace(baysor_results_dir='/data/petar/fgf1/resolve/baysor_segmentation/32810-1377-slide3_A2-1_results_baysor_results/', cellpose_roi_path='/data/petar/fgf1/resolve/rois/32810-1377-slide3_A2-1_DAPI_dapi_ROIs.zip', output_dir='/data/petar/fgf1/resolve/xe/32810-1377-slide3_A2-1/'),
 Namespace(baysor_results_dir='/data/petar/fgf1/resolve/baysor_segmentation/32810-1377-slide3_B2-1_results_baysor_results/', cellpose_roi_path='/data/petar/fgf1/resolve/rois/32810-1377-slide3_B2-1_DAPI_dapi_ROIs.zip', output_dir='/data/petar/fgf1/resolve/xe/32810-1377-slide3_B2-1/'),
 Namespace(baysor_results_dir='/data/petar/fgf1/resolve/baysor_segmentation/32810-1377-slide3_D2-1_results_baysor_results/', cellpose_roi_path='/data/petar/fgf1/resolve/rois/32810-1377-slide3_D2-1_DAPI_dapi_ROIs.zip', output_dir='/data/petar/fgf1/resolve/xe/32810-1377-slide3_D2-1/'),
 Namespace(baysor_results_dir='/data/petar/fgf1/resolve/baysor_segmentation/32810-1377-slide3_C1-1_results_baysor_results/', cellpose_roi_path='/

In [None]:
for ns in namespace_list:
    convert_resolve_to_xenium(ns)
    print('')
    print('')
    print('')
    print('=============================================================')

processing: 
baysor_results_dir: /data/petar/fgf1/resolve/baysor_segmentation/32810-1377-slide3_A2-1_results_baysor_results/
cellpose_roi_path: /data/petar/fgf1/resolve/rois/32810-1377-slide3_A2-1_DAPI_dapi_ROIs.zip
output_dir: /data/petar/fgf1/resolve/xe/32810-1377-slide3_A2-1/

STAGE 1: Converting baysor results to 10x cell_feature_matrix.
Matrix dimensions: (424214, 3)
Number of genes: 100
Number of cells: 15042
done.

STAGE 2: Generating cell segmentations from Baysor transcript assignment.
Calculating convex hulls for baysor segmentation.


100%|██████████| 15042/15042 [00:30<00:00, 501.05it/s]


Converting convex hulls to ROI table format.


100%|██████████| 15041/15041 [00:00<00:00, 274365.59it/s]


writing cell_boundaries.csv.gz
done.

STAGE 3: Converting ImageJ nucleus ROIs.
done.

STAGE 4: Mapping nuclei to cells.
Calculating nucleus to cell mappings.
Building up rtree index of cells.


100%|██████████| 15041/15041 [00:02<00:00, 6084.27it/s]


Checking if nuclei are contained in cells.


100%|██████████| 2431/2431 [00:00<00:00, 4957.35it/s]


done.

STAGE 5: Checking for transcripts in nuclei.
Loading nucleus boundaries...
Loaded 2431 nuclei.
Loading transcripts...
Loaded 2500624 transcripts.
Creating R-tree index...
Checking transcripts...


100%|██████████| 2500624/2500624 [03:26<00:00, 12123.67it/s]


writing transcripts.csv.gz
done.

STAGE 6: Writing cells.csv.gz
done.




processing: 
baysor_results_dir: /data/petar/fgf1/resolve/baysor_segmentation/32810-1377-slide3_B2-1_results_baysor_results/
cellpose_roi_path: /data/petar/fgf1/resolve/rois/32810-1377-slide3_B2-1_DAPI_dapi_ROIs.zip
output_dir: /data/petar/fgf1/resolve/xe/32810-1377-slide3_B2-1/

STAGE 1: Converting baysor results to 10x cell_feature_matrix.
Matrix dimensions: (444026, 3)
Number of genes: 100
Number of cells: 14025
done.

STAGE 2: Generating cell segmentations from Baysor transcript assignment.
Calculating convex hulls for baysor segmentation.


100%|██████████| 14025/14025 [00:33<00:00, 416.17it/s]


Converting convex hulls to ROI table format.


100%|██████████| 14024/14024 [00:00<00:00, 230408.24it/s]


writing cell_boundaries.csv.gz
done.

STAGE 3: Converting ImageJ nucleus ROIs.
done.

STAGE 4: Mapping nuclei to cells.
Calculating nucleus to cell mappings.
Building up rtree index of cells.


100%|██████████| 14024/14024 [00:02<00:00, 5945.01it/s]


Checking if nuclei are contained in cells.


100%|██████████| 4252/4252 [00:00<00:00, 4651.17it/s]


done.

STAGE 5: Checking for transcripts in nuclei.
Loading nucleus boundaries...
Loaded 4252 nuclei.
Loading transcripts...
Loaded 3021030 transcripts.
Creating R-tree index...
Checking transcripts...


100%|██████████| 3021030/3021030 [05:39<00:00, 8889.10it/s] 


writing transcripts.csv.gz
done.

STAGE 6: Writing cells.csv.gz
done.




processing: 
baysor_results_dir: /data/petar/fgf1/resolve/baysor_segmentation/32810-1377-slide3_D2-1_results_baysor_results/
cellpose_roi_path: /data/petar/fgf1/resolve/rois/32810-1377-slide3_D2-1_DAPI_dapi_ROIs.zip
output_dir: /data/petar/fgf1/resolve/xe/32810-1377-slide3_D2-1/

STAGE 1: Converting baysor results to 10x cell_feature_matrix.
Matrix dimensions: (441382, 3)
Number of genes: 100
Number of cells: 16461
done.

STAGE 2: Generating cell segmentations from Baysor transcript assignment.
Calculating convex hulls for baysor segmentation.


100%|██████████| 16461/16461 [00:31<00:00, 518.82it/s]


Converting convex hulls to ROI table format.


100%|██████████| 16460/16460 [00:00<00:00, 236643.05it/s]


writing cell_boundaries.csv.gz
done.

STAGE 3: Converting ImageJ nucleus ROIs.
done.

STAGE 4: Mapping nuclei to cells.
Calculating nucleus to cell mappings.
Building up rtree index of cells.


100%|██████████| 16460/16460 [00:03<00:00, 4192.04it/s]


Checking if nuclei are contained in cells.


100%|██████████| 3625/3625 [00:01<00:00, 3379.78it/s]


done.

STAGE 5: Checking for transcripts in nuclei.
Loading nucleus boundaries...
Loaded 3625 nuclei.
Loading transcripts...
Loaded 2343003 transcripts.
Creating R-tree index...
Checking transcripts...


100%|██████████| 2343003/2343003 [04:11<00:00, 9331.26it/s] 


writing transcripts.csv.gz
done.

STAGE 6: Writing cells.csv.gz
done.




processing: 
baysor_results_dir: /data/petar/fgf1/resolve/baysor_segmentation/32810-1377-slide3_C1-1_results_baysor_results/
cellpose_roi_path: /data/petar/fgf1/resolve/rois/32810-1377-slide3_C1-1_DAPI_dapi_ROIs.zip
output_dir: /data/petar/fgf1/resolve/xe/32810-1377-slide3_C1-1/

STAGE 1: Converting baysor results to 10x cell_feature_matrix.
Matrix dimensions: (424438, 3)
Number of genes: 100
Number of cells: 14355
done.

STAGE 2: Generating cell segmentations from Baysor transcript assignment.
Calculating convex hulls for baysor segmentation.


100%|██████████| 14355/14355 [00:34<00:00, 417.76it/s]


Converting convex hulls to ROI table format.


100%|██████████| 14354/14354 [00:00<00:00, 160313.78it/s]


writing cell_boundaries.csv.gz
done.

STAGE 3: Converting ImageJ nucleus ROIs.
done.

STAGE 4: Mapping nuclei to cells.
Calculating nucleus to cell mappings.
Building up rtree index of cells.


100%|██████████| 14354/14354 [00:02<00:00, 5189.91it/s]


Checking if nuclei are contained in cells.


100%|██████████| 3574/3574 [00:00<00:00, 4585.69it/s]


done.

STAGE 5: Checking for transcripts in nuclei.
Loading nucleus boundaries...
Loaded 3574 nuclei.
Loading transcripts...
Loaded 2637268 transcripts.
Creating R-tree index...
Checking transcripts...


100%|██████████| 2637268/2637268 [04:27<00:00, 9871.36it/s] 


writing transcripts.csv.gz
done.

STAGE 6: Writing cells.csv.gz
done.




processing: 
baysor_results_dir: /data/petar/fgf1/resolve/baysor_segmentation/32810-1377-slide3_A1-1_results_baysor_results/
cellpose_roi_path: /data/petar/fgf1/resolve/rois/32810-1377-slide3_A1-1_DAPI_dapi_ROIs.zip
output_dir: /data/petar/fgf1/resolve/xe/32810-1377-slide3_A1-1/

STAGE 1: Converting baysor results to 10x cell_feature_matrix.
Matrix dimensions: (540813, 3)
Number of genes: 100
Number of cells: 18987
done.

STAGE 2: Generating cell segmentations from Baysor transcript assignment.
Calculating convex hulls for baysor segmentation.


100%|██████████| 18987/18987 [00:46<00:00, 409.51it/s]


Converting convex hulls to ROI table format.


100%|██████████| 18986/18986 [00:00<00:00, 173787.82it/s]


writing cell_boundaries.csv.gz
done.

STAGE 3: Converting ImageJ nucleus ROIs.
done.

STAGE 4: Mapping nuclei to cells.
Calculating nucleus to cell mappings.
Building up rtree index of cells.


100%|██████████| 18986/18986 [00:04<00:00, 4574.83it/s]


Checking if nuclei are contained in cells.


100%|██████████| 3275/3275 [00:00<00:00, 3985.43it/s]


done.

STAGE 5: Checking for transcripts in nuclei.
Loading nucleus boundaries...
Loaded 3275 nuclei.
Loading transcripts...
Loaded 3282829 transcripts.
Creating R-tree index...
Checking transcripts...


100%|██████████| 3282829/3282829 [04:43<00:00, 11588.37it/s]


writing transcripts.csv.gz
done.

STAGE 6: Writing cells.csv.gz
done.




processing: 
baysor_results_dir: /data/petar/fgf1/resolve/baysor_segmentation/32810-1377-slide3_D1-1_results_baysor_results/
cellpose_roi_path: /data/petar/fgf1/resolve/rois/32810-1377-slide3_D1-1_DAPI_dapi_ROIs.zip
output_dir: /data/petar/fgf1/resolve/xe/32810-1377-slide3_D1-1/

STAGE 1: Converting baysor results to 10x cell_feature_matrix.
Matrix dimensions: (448311, 3)
Number of genes: 100
Number of cells: 15509
done.

STAGE 2: Generating cell segmentations from Baysor transcript assignment.
Calculating convex hulls for baysor segmentation.


100%|██████████| 15509/15509 [00:33<00:00, 459.02it/s]


Converting convex hulls to ROI table format.


100%|██████████| 15508/15508 [00:00<00:00, 267291.01it/s]


writing cell_boundaries.csv.gz
done.

STAGE 3: Converting ImageJ nucleus ROIs.
done.

STAGE 4: Mapping nuclei to cells.
Calculating nucleus to cell mappings.
Building up rtree index of cells.


100%|██████████| 15508/15508 [00:02<00:00, 5913.79it/s]


Checking if nuclei are contained in cells.


100%|██████████| 3754/3754 [00:00<00:00, 4554.11it/s]


done.

STAGE 5: Checking for transcripts in nuclei.
Loading nucleus boundaries...
Loaded 3754 nuclei.
Loading transcripts...
Loaded 2646613 transcripts.
Creating R-tree index...
Checking transcripts...


100%|██████████| 2646613/2646613 [04:16<00:00, 10307.59it/s]


writing transcripts.csv.gz
done.

STAGE 6: Writing cells.csv.gz
done.




processing: 
baysor_results_dir: /data/petar/fgf1/resolve/baysor_segmentation/32810-1377-slide3_B1-1_results_baysor_results/
cellpose_roi_path: /data/petar/fgf1/resolve/rois/32810-1377-slide3_B1-1_DAPI_dapi_ROIs.zip
output_dir: /data/petar/fgf1/resolve/xe/32810-1377-slide3_B1-1/

STAGE 1: Converting baysor results to 10x cell_feature_matrix.
Matrix dimensions: (390807, 3)
Number of genes: 100
Number of cells: 13408
done.

STAGE 2: Generating cell segmentations from Baysor transcript assignment.
Calculating convex hulls for baysor segmentation.


100%|██████████| 13408/13408 [00:29<00:00, 447.25it/s]


Converting convex hulls to ROI table format.


100%|██████████| 13407/13407 [00:00<00:00, 186498.52it/s]


writing cell_boundaries.csv.gz
done.

STAGE 3: Converting ImageJ nucleus ROIs.
done.

STAGE 4: Mapping nuclei to cells.
Calculating nucleus to cell mappings.
Building up rtree index of cells.


100%|██████████| 13407/13407 [00:02<00:00, 5828.23it/s]


Checking if nuclei are contained in cells.


100%|██████████| 3520/3520 [00:00<00:00, 4664.70it/s]


done.

STAGE 5: Checking for transcripts in nuclei.
Loading nucleus boundaries...
Loaded 3520 nuclei.
Loading transcripts...
Loaded 2417289 transcripts.
Creating R-tree index...
Checking transcripts...


100%|██████████| 2417289/2417289 [03:54<00:00, 10325.54it/s]


writing transcripts.csv.gz
done.

STAGE 6: Writing cells.csv.gz
done.




processing: 
baysor_results_dir: /data/petar/fgf1/resolve/baysor_segmentation/32810-1377-slide3_C2-1_results_baysor_results/
cellpose_roi_path: /data/petar/fgf1/resolve/rois/32810-1377-slide3_C2-1_DAPI_dapi_ROIs.zip
output_dir: /data/petar/fgf1/resolve/xe/32810-1377-slide3_C2-1/

STAGE 1: Converting baysor results to 10x cell_feature_matrix.
Matrix dimensions: (327476, 3)
Number of genes: 100
Number of cells: 11988
done.

STAGE 2: Generating cell segmentations from Baysor transcript assignment.
Calculating convex hulls for baysor segmentation.


100%|██████████| 11988/11988 [00:23<00:00, 504.22it/s] 


Converting convex hulls to ROI table format.


100%|██████████| 11987/11987 [00:00<00:00, 188957.04it/s]


writing cell_boundaries.csv.gz
done.

STAGE 3: Converting ImageJ nucleus ROIs.
done.

STAGE 4: Mapping nuclei to cells.
Calculating nucleus to cell mappings.
Building up rtree index of cells.


100%|██████████| 11987/11987 [00:02<00:00, 4589.14it/s]


Checking if nuclei are contained in cells.


100%|██████████| 3512/3512 [00:01<00:00, 2905.16it/s]


done.

STAGE 5: Checking for transcripts in nuclei.
Loading nucleus boundaries...
Loaded 3512 nuclei.
Loading transcripts...
Loaded 2001422 transcripts.
Creating R-tree index...
Checking transcripts...


 15%|█▌        | 308810/2001422 [00:34<02:15, 12467.92it/s]

In [39]:
sample_names

['32810-1377-slide3_A2-1',
 '32810-1377-slide3_B2-1',
 '32810-1377-slide3_D2-1',
 '32810-1377-slide3_C1-1',
 '32810-1377-slide3_A1-1',
 '32810-1377-slide3_D1-1',
 '32810-1377-slide3_B1-1',
 '32810-1377-slide3_C2-1']

In [17]:
####
###
##
#
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Process input file to create a cell feature matrix.')
    parser.add_argument('baysor_results_dir', type=str, help='path to baysor results directory')
    parser.add_argument('cellpose_roi_path', type=str, help='path to cellpose ROI zip')
    parser.add_argument('output_dir', type=str, help='path to output folder')
    args = parser.parse_args()
    convert_resolve_to_xenium(args)

usage: ipykernel_launcher.py [-h]
                             baysor_results_dir cellpose_roi_path output_dir
ipykernel_launcher.py: error: the following arguments are required: cellpose_roi_path, output_dir


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [33]:
cells_df = pd.read_csv('/data/petar/fgf1/resolve/xe/32810-1377-slide3_A2-1/cells.csv.gz')
cells_df

Unnamed: 0,cell_id,x_centroid,y_centroid,transcript_counts,control_probe_counts,control_codeword_counts,total_counts,cell_area,nucleus_id,nucleus_area
0,Cell106_3665_1929,3665.324324,1928.750000,148,0,0,148,2949.0,Cell159_3638_1964,1656.5
1,Cell109_3635_1962,3635.379487,1961.594872,195,0,0,195,3644.0,Cell159_3638_1964,1656.5
2,Cell263_4658_2233,4657.504545,2233.079545,440,0,0,440,6012.0,Cell190_4705_2266,1438.5
3,Cell342_1627_5709,1627.155102,5708.538776,245,0,0,245,5162.0,Cell602_1653_5775,1575.0
4,Cell354_1772_5621,1772.475177,5620.907801,282,0,0,282,4098.0,Cell580_1794_5657,911.0
...,...,...,...,...,...,...,...,...,...,...
73,Cell6731_10336_15868,10336.296296,15868.500000,54,0,0,54,1634.0,Cell2312_10329_15878,2513.0
74,Cell6807_1622_5808,1622.159664,5807.882353,119,0,0,119,2460.0,Cell621_1627_5854,3360.5
75,Cell8430_10242_12537,10242.000000,12536.666667,9,0,0,9,110.0,Cell1918_10257_12567,2888.5
76,Cell8500_5892_7033,5891.888889,7033.027778,36,0,0,36,1506.0,Cell808_5877_7029,2249.5


In [34]:
cells_df.columns

Index(['cell_id', 'x_centroid', 'y_centroid', 'transcript_counts',
       'control_probe_counts', 'control_codeword_counts', 'total_counts',
       'cell_area', 'nucleus_id', 'nucleus_area'],
      dtype='object')

In [35]:
subset_cols = ['cell_id']
duplicates = cells_df[~cells_df.duplicated(subset=subset_cols, keep=False)]
duplicates

Unnamed: 0,cell_id,x_centroid,y_centroid,transcript_counts,control_probe_counts,control_codeword_counts,total_counts,cell_area,nucleus_id,nucleus_area
0,Cell106_3665_1929,3665.324324,1928.750000,148,0,0,148,2949.0,Cell159_3638_1964,1656.5
1,Cell109_3635_1962,3635.379487,1961.594872,195,0,0,195,3644.0,Cell159_3638_1964,1656.5
2,Cell263_4658_2233,4657.504545,2233.079545,440,0,0,440,6012.0,Cell190_4705_2266,1438.5
3,Cell342_1627_5709,1627.155102,5708.538776,245,0,0,245,5162.0,Cell602_1653_5775,1575.0
4,Cell354_1772_5621,1772.475177,5620.907801,282,0,0,282,4098.0,Cell580_1794_5657,911.0
...,...,...,...,...,...,...,...,...,...,...
73,Cell6731_10336_15868,10336.296296,15868.500000,54,0,0,54,1634.0,Cell2312_10329_15878,2513.0
74,Cell6807_1622_5808,1622.159664,5807.882353,119,0,0,119,2460.0,Cell621_1627_5854,3360.5
75,Cell8430_10242_12537,10242.000000,12536.666667,9,0,0,9,110.0,Cell1918_10257_12567,2888.5
76,Cell8500_5892_7033,5891.888889,7033.027778,36,0,0,36,1506.0,Cell808_5877_7029,2249.5


In [22]:
len(duplicates['cell_id'].unique().tolist())

197

In [23]:
pd.read_csv('/data/petar/fgf1/resolve/xe/32810-1377-slide3_B1-1/transcripts.csv.gz')

Unnamed: 0,transcript_id,cell_id,overlaps_nucleus,feature_name,x_location,y_location,z_location,qv
0,1,0,False,Sorcs1,4033.0,800.0,0,42.0
1,2,0,False,Sorcs1,2836.0,1866.0,0,42.0
2,3,Cell10803_2921_1834,False,Sorcs1,2929.0,1832.0,0,42.0
3,4,Cell68_3306_1405,False,Sorcs1,3286.0,1391.0,0,42.0
4,5,Cell68_3306_1405,True,Sorcs1,3337.0,1390.0,0,42.0
...,...,...,...,...,...,...,...,...
2417284,2417285,Cell7164_10081_16865,False,Immp2l,10091.0,16900.0,0,42.0
2417285,2417286,0,False,Immp2l,10387.0,15973.0,0,42.0
2417286,2417287,0,False,Immp2l,10540.0,16426.0,0,42.0
2417287,2417288,Cell7164_10081_16865,False,Immp2l,10107.0,16889.0,0,42.0


In [41]:
gzip.__spec__

ModuleSpec(name='gzip', loader=<_frozen_importlib_external.SourceFileLoader object at 0x7f697eed9820>, origin='/tools/anaconda/envs/nmq407/fgf1/lib/python3.8/gzip.py')