In [2]:
import csv
import numpy as np
import ast
import re
import matplotlib.pyplot as plt

# **Read the files**

In [9]:
with open('tissue_cell_pairs.tsv', newline='') as file:
    reader = csv.reader(file, delimiter='\t')
    tissue_cells = list(reader)
print("number of cleaned cells:", len(tissue_cells))

number of cleaned cells: 219


In [18]:
with open('common_cells_across_tissues.csv', newline='') as file:
    reader = csv.reader(file, delimiter='\t')
    common_cells = list(reader)
common_cells.append('serous glandular cells')
print("number of common cells:", len(common_cells))

number of common cells: 17


In [41]:
with open('gene_list.csv', newline='') as file:
    reader = csv.reader(file, delimiter='\t')
    genes = list(reader)
print("number of cleaned genes:", len(genes))

number of cleaned genes: 964


In [15]:
# Write the cell-gene expression nTPM matrix
nTPM_matrix = []
with open('gene_expression_matrix.csv', newline='') as file:
    reader = csv.reader(file, delimiter='\t')
    for row in reader:
      matrix_row = []
      for nTPM in row:
        matrix_row.append(float(nTPM))
      nTPM_matrix.append(matrix_row)
nTPM_matrix = np.array(nTPM_matrix)
m, n= nTPM_matrix.shape

print("The cleaned cell-gene expression matrix is size of " + str(m) + "x" + str(n))

The cleaned cell-gene expression matrix is size of 219x964


# **Form positive and negative labels**

Find the top 50 high expressed genes

In [24]:
positives = {}
selected_cells1 = []
count1 = 0
file_path = 'positive_labels.csv'
with open(file_path , 'r') as file:
    reader = csv.reader(file, delimiter='\t')
    next(reader, None)
    for row in reader:
      formatted = row[0].split(',')
      markers = []
      for marker in formatted[1:]:
        markers.append(int(marker))
      positives[int(formatted[0])] = markers
      count1 += len(markers)
      selected_cells1.append(int(formatted[0]))

print("number of cells in positive labels:", len(selected_cells1))
print("number of total positive lables:", count1)

number of cells in positive labels: 64
number of total positive lables: 701


In [25]:
nTPM_medians = np.median(nTPM_matrix, axis=0).tolist()
genes_sorted = [index for index, value in sorted(enumerate(nTPM_medians), key=lambda x: x[1], reverse=True)]
negatives1 = genes_sorted[:50]

In [26]:
negatives2 = [
    ["PECAM1", "lung", 'endothelial cells', "LNP", "PECAM-1 directed re-targeting of exogenous mRNA providing two orders of magnitude enhancement of vascular delivery and expression in lungs independent of apolipoprotein E-mediated uptake"],
    ["VCAM1", "vascular", 'endothelial cells', "LNP", "Selective targeting of nanomedicine to inflamed cerebral vasculature to enhance the blood–brain barrier"],
    ["CD4", "pbmc", "t-cells", "LNP", "Highly efficient CD4+ T cell targeting and genetic recombination using engineered CD4+ cell-homing mRNA-LNPs"],
    ["CD5", "pbmc", "t-cells", "LNP", "CAR T cells produced in vivo to treat cardiac injury"],
    ["CD19", "pbmc", "b-cells", "LNP", ""],
    ["CD3", "pbmc", "t-cells", "LNP", "conference; doudna paper"],
    ["NCR1", "pbmc", "nk-cells", "LNP", "conference"],
    ["CD14", "pbmc", "macrophages", "LNP", "conference"],
    ["MRC1", "pbmc", "macrophages", "LNP", "conference"],
    ["ITGAM", "pbmc", "macrophages", "", "bacteria injector"],
    ["CD28", "pbmc", "t-cells", "EDV", "doudna EDV"],
    ["CD40", "pbmc", "b-cells", "lenti", "fengzhang new lenti papr"],
    ["ENG", "vascular", "endothelial cells", "LNP", "Targeting of immunoliposomes to endothelial cells using a single-chain Fv fragment directed against human endoglin (CD105)"],
    ["MRC1", "pbmc", "dendritic cells", "LNP", "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9322927/#B8-pharmaceuticals-15-00897"],
    ["CD8", "pbmc", "t-cells", "LNP", ""],
    ["PDPN", "skin", "endothelial cells", "LNP", "Targeted delivery of lipid nanoparticle to lymphatic endothelial cells via anti-podoplanin antibody"],
    ["PLVAP", "lung", "endothelial cells", "LNP?", "https://pubs.acs.org/doi/full/10.1021/acschembio.0c00003"],
    ["FCER2", "pbmc", "b-cells", "", "WildDISCO whole body imaging"]
]

In [28]:
negatives = {}
selected_cells2 = []
count2 = 0
file_path = 'negative_labels.csv'
with open(file_path , 'r') as file:
    reader = csv.reader(file, delimiter='\t')
    next(reader, None)
    for row in reader:
      formatted = row[0].split(',')
      markers = []
      for marker in formatted[1:]:
        markers.append(int(marker))
      negatives[int(formatted[0])] = markers
      count2 += len(markers)
      selected_cells2.append(int(formatted[0]))

print("number of cells in negative labels:", len(selected_cells2))
print("number of total negative lables from lit:", count2)

number of cells in negative labels: 179
number of total negative lables from lit: 3992


# **Train for parameters p, q, r**

In [32]:
m = len(tissue_cells)
dic = {}
min_score = 0
for p in np.linspace(-500, -20, 20):
  for q in np.linspace(-500, -20, 20):
    for r in np.linspace(-500, -20, 20):
      penalty_matrix = [] #penalty for jth tissue_cell based on ith tissue_cell
      for i in range(m):
        row = []
        for j in range(m):
          if j == i:
            row.append(1000)
          elif tissue_cells[j][1] == tissue_cells[i][1]:
            if tissue_cells[j][1] in common_cells:
              row.append(0) # same cell (common) and diff tissue
            else:
              row.append(p) # same cell (non common) and diff tissue
          elif tissue_cells[j][0] == tissue_cells[i][0]:
            row.append(q) # diff cell and same tissue
          else:
            row.append(r) # diff cell and diff tissue
        penalty_matrix.append(row)
      penalty_matrix = np.array(penalty_matrix)
      objective_matrix = np.dot(penalty_matrix, nTPM_matrix)
      objective_matrix = objective_matrix.tolist()

      count1 = 0
      for obj_row in objective_matrix:
        markers_suggested = sorted(list(enumerate(obj_row)), key = lambda x: x[1], reverse=True)[: 10]
        markers_suggested = [index for index, value in markers_suggested]
      for i in markers_suggested:
        if i in negatives1:
          count1 += 1

      count2 = 0
      for row in negatives2:
        if [row[0]] in genes:
          marker = genes.index([row[0]])
          cell = tissue_cells.index([row[1], row[2]])
          obj_row = objective_matrix[cell]
          markers_suggested = sorted(list(enumerate(obj_row)), key = lambda x: x[1], reverse=True)[:10]
          markers_suggested = [index for index, value in markers_suggested]
          if marker in markers:
            count2 += 1

      count3 = 0
      for cell in positives:
        markers = positives[cell]
        obj_row = objective_matrix[cell]
        markers_suggested = sorted(list(enumerate(obj_row)), key = lambda x: x[1], reverse=True)[:10]
        markers_suggested = [index for index, value in markers_suggested]
        count3 += len(set(markers).intersection(set(markers_suggested)))

      if count2 + count3 - count1 > min_score:
        min_score = count2 + count3 - count1
        max_set = (p, q, r)

      dic[(p, q, r)] = (count1, count2, count3)

print("The (nearly-) optimal parameters p, q, r:", max_set)
print("The number of hitted top 50 highly expression genes:", count1)
print("The number of hitted functional genes:", count2)
print("The number of previous markers:", count3)

The (nearly-) optimal parameters p, q, r: (-20.0, -424.2105263157895, -20.0)
The number of hitted top 50 highly expression genes: 1
The number of hitted functional genes: 0
The number of previous markers: 103


# **Form the score table**

In [33]:
p = max_set[0]
q = max_set[1]
r = max_set[2]
m = len(tissue_cells)
t = 1
penalty_matrix = [] #penalty for jth tissue_cell based on ith tissue_cell
for i in range(m):
  row = []
  for j in range(m):
    if j == i:
      row.append(1000)
    elif tissue_cells[j][1] == tissue_cells[i][1]:
      if tissue_cells[j][1] in common_cells:
        row.append(0) # same cell (common) and diff tissue
      else:
        row.append(p) # same cell (non common) and diff tissue
    elif tissue_cells[j][0] == tissue_cells[i][0]:
      row.append(q) # diff cell and same tissue
    else:
      row.append(r) # diff cell and diff tissue
  penalty_matrix.append(row)
penalty_matrix = np.array(penalty_matrix)
objective_matrix = np.dot(penalty_matrix, nTPM_matrix)

In [46]:
# Return the top 10 recommeded markers for all cells
whole_body_markers = {}
for i in range(m):
  obj_row = objective_matrix[i]
  markers = sorted(list(enumerate(obj_row)), key = lambda x: x[1], reverse=True)[: 10]
  markers_id = [x[0] for x in markers]
  print()
  whole_body_markers[tissue_cells[i][0] + " " + tissue_cells[i][1]] = [genes[x][0] for x in markers_id]

filename = 'recommended_whole_body_markers.csv'
with open(filename, 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(["cell", "markers"])
    for key, value in whole_body_markers.items():
        row = [key, value]
        csvwriter.writerow(row)






























































































































































































































In [35]:
# Screen the cells and associted recommended markers meet the condition1: 1.5x
cells1 = set()
for i in range(m):
  top_3_indices = objective_matrix[i,:].argsort()[-3:][::-1]
  for j in top_3_indices:
    cell = np.argmax(nTPM_matrix[:, j])
    if cell == i:
      column_data = nTPM_matrix[:, j].copy()
      column_data[i] = -np.inf
      off_target = np.argmax(column_data)
      if nTPM_matrix[i, j] > 1.5 * nTPM_matrix[i, off_target]:
        cells1.add(i)

# Screen the cells and associted recommended markers meet the condition2: 1.5x
cells2 = set()
for i in range(m):
  top_3_indices = objective_matrix[i,:].argsort()[-3:][::-1]
  for j in top_3_indices:
    column_data = nTPM_matrix[:, j].copy()
    column_data = np.delete(column_data, i)
    average_value = np.mean(column_data)
    if nTPM_matrix[i, j] > 10 * average_value:
      cells2.add(i)

# Combine
union = cells1.union(cells2)

print("The number of cells satisfies condition1:", len(cells1))
print("The number of cells satisfies condition2:", len(cells2))
print("The number of cells in union set:", len(union))

The number of cells satisfies condition1: 102
The number of cells satisfies condition2: 193
The number of cells in union set: 194


In [45]:
# Return the top 10 recommeded markers for selected cells
whole_body_markers = {}
for i in union:
  obj_row = objective_matrix[i]
  markers = sorted(list(enumerate(obj_row)), key = lambda x: x[1], reverse=True)[: 10]
  markers_id = [x[0] for x in markers]
  print()
  whole_body_markers[tissue_cells[i][0] + " " + tissue_cells[i][1]] = [genes[x][0] for x in markers_id]

filename = 'selected_cells_recommended_whole_body_markers.csv'
with open(filename, 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(["cell", "markers"])
    for key, value in whole_body_markers.items():
        row = [key, value]
        csvwriter.writerow(row)



































































































































































































