### This notebook loads in the markers found by Seurat and Conos, and finds the overlapping markers. The results are stored as a dictionary, and saved in the file "overlap_markers.pkl".

In [1]:
import pandas as pd

### 1. Create a list containing markers for each cluster from the result of Seurat
#### Tried top 5, 20 markers: too few markers; too few overlaps, also hard to break ties when matching clusters

In [2]:
seurat_markers = pd.read_csv("/Users/Miko/Downloads/6.10.top50.all.markers.txt", sep="\t")

In [3]:
seurat_markers.head()

Unnamed: 0,p_val,avg_logFC,pct.1,pct.2,p_val_adj,cluster,gene
1,3.921717e-219,0.716535,0.501,0.236,8.308157999999999e-215,0,ADAM19
2,9.852066000000001e-218,0.432682,0.666,0.399,2.08716e-213,0,CD69
3,1.749252e-207,0.535057,0.551,0.311,3.7057909999999997e-203,0,STOM
4,1.0227519999999999e-200,0.600771,0.524,0.265,2.1667000000000003e-196,0,CXCR6
5,3.090406e-188,1.053567,0.303,0.083,6.547023999999999e-184,0,ITGA1


In [4]:
seurat_markers.shape

(645, 7)

In [5]:
# test
seurat_cluster1_markers = list(seurat_markers[seurat_markers["cluster"] == 0]["gene"])

In [6]:
# a list, each element is the markers in one cluster
list_seurat_markers = [ list(seurat_markers[seurat_markers["cluster"] == i]["gene"]) 
                       for i in range(14)] 

### 2. Create a list containing markers for each cluster from the result of Conos

In [7]:
cluster_num = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", "13", "14"]


In [8]:
# a list, each element is the markers in one cluster
list_conos_markers = []

for i in range(14):
    conos_cluster = pd.read_csv("/Users/Miko/Downloads/3_Sample_Cluster_Markers/Cluster."+cluster_num[i], 
                                 sep="\t", 
                                 skiprows=1) # skip the first row: indicating cluster 1
   
    # take the top 5 markers (ranked by Z score)
    conos_cluster_markers = list(conos_cluster["Gene"])[:50] 
    list_conos_markers.append(conos_cluster_markers)
    

In [10]:
len(list_conos_markers[0])

50

### 3. Find intersection of markers
#### Iterate through Seurat clusters, find the clusters in Conos that has the largest number of overlapping markers with each Seurat cluster.
#### Then split the ties by matching the clusters that share the most number of overlapping markers.

In [11]:
def intersection(lst1, lst2): 
    return list(set(lst1) & set(lst2)) 

In [12]:
# IF, we find overlapping markers just by comparing the same cluster index across two results
for i in range(14):
    print( len(intersection(list_conos_markers[i], list_seurat_markers[i])) )

0
19
8
0
0
1
0
14
0
2
1
0
0
4


In [13]:
# input: markers for in one particular cluster, generated by Seurat
# output: the matching cluster index (in conos), and the overlap markers
def find_largest_overlaps(seurat_cluster_markers):
    largest_overlaps = 0
    matching_conos_clust_index = "NA"
    overlapping_markers = []
    for i in range(14):
        if len(intersection(list_conos_markers[i], seurat_cluster_markers)) > largest_overlaps:
            largest_overlaps = len(intersection(list_conos_markers[i], seurat_cluster_markers))
            matching_conos_clust_index = i
            overlapping_markers = intersection(list_conos_markers[i], seurat_cluster_markers)
    return matching_conos_clust_index, overlapping_markers

# ATTENTION: returning the cluster INDEX (one smaller), not the actual cluster number   

In [14]:
def find_all_overlaps(list_markers):
    # key/value -- Seurat cluster number: [matching conos cluster index, list of overlap markers]
    overlap_dict = {}
    
    for i in range(14):
        conos_clust_index, overlap_markers = find_largest_overlaps(list_seurat_markers[i])
        overlap_dict[i] = [conos_clust_index, overlap_markers]
    
    return overlap_dict

In [15]:
overlap_dict = find_all_overlaps(list_seurat_markers)

In [16]:
overlap_dict.keys()

dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13])

In [17]:
[overlap_dict[i][0] for i in range(14)]

[4, 1, 3, 4, 0, 1, 8, 7, 3, 6, 12, 12, 13, 9]

In [18]:
[len(overlap_dict[i][1]) for i in range(14)]

[14, 19, 27, 2, 19, 3, 15, 14, 10, 8, 4, 3, 3, 6]

### Found overlap markers between the results from Seurat vs. Conos, and assign the matching clusters based on the largest overlaps.

In [19]:
# input: markers for in one particular cluster, generated by Seurat
# output: the matching cluster index (in conos), and the overlap markers
def remaining_clusters_find_largest_overlaps(seurat_cluster_markers):
    largest_overlaps = 0
    matching_conos_clust_index = "NA"
    overlapping_markers = []
    for i in [2,5,10,11]:
        if len(intersection(list_conos_markers[i], seurat_cluster_markers)) > largest_overlaps:
            largest_overlaps = len(intersection(list_conos_markers[i], seurat_cluster_markers))
            matching_conos_clust_index = i
            overlapping_markers = intersection(list_conos_markers[i], seurat_cluster_markers)
    return matching_conos_clust_index, overlapping_markers

# ATTENTION: returning the cluster INDEX (one smaller), not the actual cluster number   

In [20]:
def remaining_clusters_find_all_overlaps(list_markers):
    # key/value -- Seurat cluster number: [matching conos cluster index, list of overlap markers]
    overlap_dict = {}
    
    for i in [3,5,8,11]:
        conos_clust_index, overlap_markers = remaining_clusters_find_largest_overlaps(list_seurat_markers[i])
        overlap_dict[i] = [conos_clust_index, overlap_markers]
    
    return overlap_dict

In [21]:
remaining_overlap_dict = remaining_clusters_find_all_overlaps(list_seurat_markers)

In [22]:
remaining_overlap_dict.keys()

dict_keys([3, 5, 8, 11])

In [23]:
[remaining_overlap_dict[i][0] for i in [3,5,8,11]]

[11, 11, 2, 'NA']

In [24]:
[len(remaining_overlap_dict[i][1]) for i in [3,5,8,11]]

[1, 3, 4, 0]

In [25]:
# input: markers for in one particular cluster, generated by Seurat
# output: the matching cluster index (in conos), and the overlap markers
def last_remaining_clusters_find_largest_overlaps(seurat_cluster_markers):
    largest_overlaps = 0
    matching_conos_clust_index = "NA"
    overlapping_markers = []
    for i in [5,10]:
        if len(intersection(list_conos_markers[i], seurat_cluster_markers)) > largest_overlaps:
            largest_overlaps = len(intersection(list_conos_markers[i], seurat_cluster_markers))
            matching_conos_clust_index = i
            overlapping_markers = intersection(list_conos_markers[i], seurat_cluster_markers)
    return matching_conos_clust_index, overlapping_markers

# ATTENTION: returning the cluster INDEX (one smaller), not the actual cluster number   

In [26]:
def last_remaining_clusters_find_all_overlaps(list_markers):
    # key/value -- Seurat cluster number: [matching conos cluster index, list of overlap markers]
    overlap_dict = {}
    
    for i in [3,11]:
        conos_clust_index, overlap_markers = last_remaining_clusters_find_largest_overlaps(list_seurat_markers[i])
        overlap_dict[i] = [conos_clust_index, overlap_markers]
    
    return overlap_dict

In [27]:
last_remaining_overlap_dict = last_remaining_clusters_find_all_overlaps(list_seurat_markers)

In [28]:
last_remaining_overlap_dict.keys()

dict_keys([3, 11])

In [30]:
[last_remaining_overlap_dict[i][0] for i in [3,11]]

['NA', 'NA']

In [31]:
[len(last_remaining_overlap_dict[i][1]) for i in [3,11]]

[0, 0]

### Showing number of overlapping markers between the matching clusters

#### Note: the key indicates Seurat cluster number, and the first element in each value indicates Conos cluster INDEX. For example, you see {0: [4, [] }, meaning Seurat cluster 0 corresponds to Conos cluster INDEX 4 (Cluster 5). 

In [33]:
for key in remaining_overlap_dict:
    overlap_dict[key] = remaining_overlap_dict[key]

In [34]:
for key in last_remaining_overlap_dict:
    overlap_dict[key] = last_remaining_overlap_dict[key]

In [35]:
[overlap_dict[i][0] for i in range(14)]

[4, 1, 3, 'NA', 0, 11, 8, 7, 2, 6, 12, 'NA', 13, 9]

In [40]:
overlap_dict

{0: [4,
  ['BHLHE40',
   'PTGER4',
   'CD69',
   'STOM',
   'ARHGAP18',
   'CXCR3',
   'CXCR6',
   'IFI44',
   'GPR171',
   'CD40LG',
   'ADAM19',
   'JAML',
   'CAPG',
   'LINC00892']],
 1: [1,
  ['LINC00861',
   'SERINC5',
   'PRKCQ-AS1',
   'MAL',
   'CCR7',
   'SATB1',
   'LDLRAP1',
   'RASGRP2',
   'TXK',
   'LEF1',
   'NOSIP',
   'ABLIM1',
   'TRABD2A',
   'LEF1-AS1',
   'ACTN1',
   'MYC',
   'SELL',
   'TCF7',
   'EEF1G']],
 2: [3,
  ['KLRG1',
   'C1orf21',
   'PRSS23',
   'TBX21',
   'ADGRG1',
   'CX3CR1',
   'FCGR3A',
   'FCRL6',
   'FGR',
   'A2M',
   'ZEB2-AS1',
   'PLEK',
   'PATL2',
   'GZMB',
   'TGFBR3',
   'S1PR5',
   'EFHD2',
   'ADRB2',
   'PRF1',
   'FCGR3B',
   'GZMH',
   'FGFBP2',
   'KLRD1',
   'SLAMF7',
   'ZEB2',
   'GNLY',
   'NKG7']],
 3: ['NA', []],
 4: [0,
  ['CCR8',
   'TNFRSF18',
   'MIR4632',
   'ICOS',
   'LAYN',
   'TNFRSF4',
   'DUSP4',
   'LAIR2',
   'TNFRSF9',
   'VDR',
   'CTLA4',
   'IL2RA',
   'BATF',
   'FOXP3',
   'ENTPD1',
   'SDC4',
   'IL1R2'

In [46]:
import pickle

In [47]:
def save_obj(obj, name):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

In [48]:
save_obj(overlap_dict, 'overlap_markers')

In [49]:
def load_obj(name):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)

In [51]:
list_seurat_markers[0]

['ADAM19',
 'CD69',
 'STOM',
 'CXCR6',
 'ITGA1',
 'GZMK',
 'CAPG',
 'GPR171',
 'CXCR3',
 'EMB',
 'SKIL',
 'F2R',
 'JAML',
 'ADGRE5',
 'ITGAE',
 'MIR4680',
 'TBCD',
 'AOAH',
 'SYTL2',
 'PDE4B',
 'PELO',
 'SLF1',
 'PTGER4',
 'LINC00892',
 'BHLHE40',
 'ARHGAP18',
 'ITM2C',
 'GLUL',
 'DPP4',
 'IFNGR1',
 'IFI44',
 'TNFSF14',
 'GAB3',
 'ARHGAP35',
 'RUNX2',
 'FOSB',
 'CD40LG',
 'PDE4DIP',
 'GYG1',
 'ANKRD28',
 'FMNL3',
 'PPP1R16B',
 'PDE4D',
 'LRRC75A',
 'ANAPC1P1',
 'IL4I1',
 'ANKRD11',
 'GAS5-AS1',
 'PPDPF',
 'IFITM3']

In [50]:
load_obj('overlap_markers')

{0: [4,
  ['BHLHE40',
   'PTGER4',
   'CD69',
   'STOM',
   'ARHGAP18',
   'CXCR3',
   'CXCR6',
   'IFI44',
   'GPR171',
   'CD40LG',
   'ADAM19',
   'JAML',
   'CAPG',
   'LINC00892']],
 1: [1,
  ['LINC00861',
   'SERINC5',
   'PRKCQ-AS1',
   'MAL',
   'CCR7',
   'SATB1',
   'LDLRAP1',
   'RASGRP2',
   'TXK',
   'LEF1',
   'NOSIP',
   'ABLIM1',
   'TRABD2A',
   'LEF1-AS1',
   'ACTN1',
   'MYC',
   'SELL',
   'TCF7',
   'EEF1G']],
 2: [3,
  ['KLRG1',
   'C1orf21',
   'PRSS23',
   'TBX21',
   'ADGRG1',
   'CX3CR1',
   'FCGR3A',
   'FCRL6',
   'FGR',
   'A2M',
   'ZEB2-AS1',
   'PLEK',
   'PATL2',
   'GZMB',
   'TGFBR3',
   'S1PR5',
   'EFHD2',
   'ADRB2',
   'PRF1',
   'FCGR3B',
   'GZMH',
   'FGFBP2',
   'KLRD1',
   'SLAMF7',
   'ZEB2',
   'GNLY',
   'NKG7']],
 3: ['NA', []],
 4: [0,
  ['CCR8',
   'TNFRSF18',
   'MIR4632',
   'ICOS',
   'LAYN',
   'TNFRSF4',
   'DUSP4',
   'LAIR2',
   'TNFRSF9',
   'VDR',
   'CTLA4',
   'IL2RA',
   'BATF',
   'FOXP3',
   'ENTPD1',
   'SDC4',
   'IL1R2'