In [None]:
# Install extra packages with the pip that belongs to the interpreter running
# this kernel (`sys.executable`), so they end up in the kernel's environment.
# Presumably pyarrow backs `pd.read_parquet` and rtree the spatial index used
# by the spatial join below -- TODO confirm against the installed versions.
import sys
!{sys.executable} -m pip install pyarrow rtree

# Assign transcripts

In [None]:
import pandas as pd
import numpy as np
from shapely import wkb
import geopandas as gpd
from shapely.geometry import Point


## Load the transcripts and the boundaries

In [None]:
# Read the transcript table from disk.
transcripts = pd.read_csv("transcripts.csv")

# Build one point geometry per transcript from its global x/y coordinates.
points = gpd.GeoSeries.from_xy(transcripts["global_x"], transcripts["global_y"])

In [None]:
# Each cell is stored 7 times (once per layer), so identical geometries are
# collapsed first. Deduplicating the raw WKB values is faster than doing it
# after they have been converted into shapely polygons. The surviving values
# are then decoded from WKB and wrapped in a GeoSeries.
boundaries = gpd.GeoSeries(
    pd.read_parquet("boundaries.parquet")["Geometry"]
    .drop_duplicates(ignore_index=True)
    .apply(wkb.loads)
)

In [None]:
# Plot every 50th cell as a quick QC check
# boundaries[::50].plot()

## Assign the transcripts to cells

Here, we use a spatial join to get the cell id for each transcript

In [None]:
# sjoin operates on GeoDataFrames, so wrap each series in a one-column frame.
transcript_points = gpd.GeoDataFrame({"XY": points}, geometry="XY")
cell_polygons = gpd.GeoDataFrame({"cells": boundaries}, geometry="cells")

# A left join keeps every transcript point, including those that match no polygon.
cell = gpd.sjoin(left_df=transcript_points, right_df=cell_polygons, how="left")


In [None]:
# add it to the transcripts
# `index_right` holds the index label in `boundaries` of the polygon each point
# was joined to (NaN when the point matched no polygon).
# A point that matches several overlapping polygons appears once per match in
# `cell`, with all of those rows sharing the same index label. Keep only the
# first match so the result aligns one-to-one with `transcripts`.
first_match = cell.loc[~cell.index.duplicated(keep="first"), "index_right"]
# Reserve 0 for unassigned transcripts: real cells are shifted to 1..N so the
# polygon at index 0 cannot be confused with "no cell". `cell_id - 1` is the
# corresponding index in `boundaries`.
transcripts['cell_id'] = (first_match + 1).fillna(0).astype('int')

In [None]:
# Column names expected by tile-xenium, keyed by the names used in the input table.
xenium_column_names = {
    "transcript_id":        "ensembl_id",
    "unique_transcript_id": "transcript_id",
    "global_x":             "x_location",
    "global_y":             "y_location",
    "global_z":             "z_location",
    "gene":                 "feature_name",
}

# Apply the renames, then add two constant columns for tile-xenium compatibility.
transcripts = transcripts.rename(columns=xenium_column_names)
transcripts = transcripts.assign(qv=20, overlaps_nucleus=0)
transcripts

## Save the results to transcripts_assigned.csv

In [None]:
# Write the annotated transcript table next to the notebook.
# Note: with pandas' defaults the DataFrame index is written as an unnamed first column.
output_csv = "transcripts_assigned.csv"
transcripts.to_csv(output_csv)