In [1]:
import SinglePython
import pandas as pd
import numpy as np
import utils
import scipy

In [2]:
# Reference expression matrix: ';'-separated CSV with ',' as the decimal mark
# and row labels in the first column; every value is cast to float.
refDataset = pd.read_csv(
    "./Data/Reference/hpca_data.csv", sep=";", decimal=",", index_col=0
).astype('float')

# Reference annotation table; its only data column is renamed to "cellType".
annot = pd.read_csv("./Data/Reference/hpca_annot.csv", sep=";", index_col=0)
annot.columns = ["cellType"]

In [3]:
# Load the single-cell dataset from the filtered_feature_bc_matrix directory via the
# project helper. NOTE(review): the meaning of the second argument (250) is defined
# inside utils.readData_SingleR and is not visible here -- presumably a filtering
# threshold; confirm and consider naming it as a constant.
data = utils.readData_SingleR("./Data/Datasets/filtered_feature_bc_matrix/", 250)

Variable names are not unique. To make them unique, call `.var_names_make_unique`.


In [4]:
def _FineTuneRoundByN(sc_data,refDataset,annot,top_labels,de,cols):
    """ Runs one fine-tuning round for a group of cells sharing the same candidate labels.

    The reference is restricted to the samples annotated with one of
    ``top_labels``, differentially expressed genes are recomputed on that
    subset, the selected cells are Spearman-correlated against the remaining
    reference samples, and the lowest-scoring candidate label is dropped for
    every cell.

    Parameters
    ----------
    sc_data : DataFrame
        Sc-RNAseq data (genes in rows, cells in columns).

    refDataset : DataFrame
        The reference dataset gene expression matrix (genes in rows,
        reference samples in columns).

    annot : DataFrame
        Annotations for each column in refDataset, in the same order, stored
        in a "cellType" column.

    top_labels: List
        Most correlated cell types from the previous round.

    de : Dict
        Accepted for interface compatibility only; it is not read. The DE
        genes are recomputed here on the reduced reference.

    cols : List
        Cell names to calculate correlations.

    Returns
    -------
    top_annotations : List
        One array per entry of ``cols`` (same order) holding the
        ``len(top_labels) - 1`` best-scoring cell types, best first.

    """
    # Keep only reference samples whose label is still a candidate. annot rows
    # are aligned positionally with refDataset columns, so one boolean mask
    # filters both (replaces the removed Series.get_values() round-trip).
    keep=annot["cellType"].isin(top_labels).values
    refDataset=refDataset.loc[:,keep]
    annot=annot[keep]

    # The number of DE genes requested per label pair shrinks as more labels remain.
    n_types=len(np.unique(annot.cellType))
    n=int(500*np.power(2/3,np.log2(n_types)))
    de_round=utils.getDEgenes(refDataset,annot=annot,n=n)
    de_merged=np.unique([gene for genes in de_round.values() for gene in genes])

    # spearmanr stacks the columns of both inputs into one correlation matrix;
    # keep the block with reference samples as rows and the selected cells as columns.
    cor=scipy.stats.spearmanr(sc_data.loc[de_merged,cols],refDataset.loc[de_merged])
    cor=pd.DataFrame(cor[0]).iloc[:,0:len(cols)][-len(refDataset.columns):]
    cor.columns=cols
    cor.index=refDataset.columns
    cor["cellType"]=annot["cellType"].values

    # Score of a label for a cell = 80th percentile of its samples' correlations.
    scores=cor.groupby("cellType").quantile(q=0.8)

    # sort_values replaces the removed DataFrame.sort_index(by=...).
    n_keep=len(top_labels)-1
    return [scores.sort_values(by=cell,ascending=False).index.values[0:n_keep] for cell in scores.columns]

In [5]:
# Convert the loaded single-cell object with the project helper
# (presumably to a genes x cells DataFrame -- confirm in utils).
sc_data = utils.convertAnnDataToDf(data)

# Restrict both matrices to the row labels they have in common, in the same order.
intersect = np.intersect1d(refDataset.index.values, sc_data.index)
sc_data = sc_data.loc[intersect]
refDataset = refDataset.loc[intersect]

# Union of all gene lists returned by getDEgenes.
de = utils.getDEgenes(refDataset, annot)
de_merged = np.unique([gene for genes in de.values() for gene in genes])

# spearmanr stacks the columns of both inputs into one correlation matrix;
# keep the block with reference samples as rows and cells as columns.
cor = scipy.stats.spearmanr(sc_data.loc[de_merged], refDataset.loc[de_merged])
cor = pd.DataFrame(cor[0]).iloc[-len(refDataset.columns):, 0:len(sc_data.columns)]
cor.columns = sc_data.columns
cor.index = refDataset.columns
cor["cellType"] = annot["cellType"].values

# Per-label score for each cell: the 0.8 quantile of that label's sample correlations.
scores = cor.groupby("cellType").quantile(q=0.8)

In [6]:
# Number of candidate labels each cell starts with.
n = 7

# Seed every cell with its n best-scoring labels. The labels are sorted so that
# cells sharing the same candidate set produce identical keys when grouped.
# sort_values replaces the removed DataFrame.sort_index(by=...).
d = {
    cell: np.sort(scores.sort_values(by=cell, ascending=False).index.values[0:n])
    for cell in scores.columns
}

while n > 1:
    # Group the cells by their current candidate labels in a single pass.
    cells_by_labels = {}
    for cell, labels in d.items():
        cells_by_labels.setdefault(tuple(labels), []).append(cell)

    # Each group is fine-tuned together; every call returns one fewer label per cell.
    for label_key, cols in cells_by_labels.items():
        top_labels = list(label_key)
        res = _FineTuneRoundByN(sc_data, refDataset, annot, top_labels, de, cols)
        d.update(zip(cols, res))
    n = n - 1

pd.DataFrame(d, index=["final_annotations"]).sort_index(axis=1)

  """


Unnamed: 0,AAACCCAAGGAGAGTA-1,AAACGCTTCAGCCCAG-1,AAAGAACAGACGACTG-1,AAAGAACCAATGGCAG-1,AAAGAACGTCTGCAAT-1,AAAGGATAGTAGACAT-1,AAAGGATCACCGGCTA-1,AAAGGATTCAGCTTGA-1,AAAGGATTCCGTTTCG-1,AAAGGGCTCATGCCCT-1,...,TTTCACATCTCAGGCG-1,TTTCATGGTGCCTAAT-1,TTTCATGTCACTCACC-1,TTTCCTCCACAGAGCA-1,TTTCCTCTCCTACACC-1,TTTCCTCTCTCTTGCG-1,TTTGATCTCTTTGGAG-1,TTTGGTTAGTAACCTC-1,TTTGGTTGTAGAATAC-1,TTTGTTGCAATTAGGA-1
final_annotations,Monocyte:CD16-,B_cell:Naive,T_cell:CD4+_effector_memory,T_cell:CD8+,T_cell:CD4+_central_memory,B_cell:Memory,B_cell:Naive,Monocyte:CD16-,Monocyte:CD16-,T_cell:CD4+_Naive,...,B_cell:immature,B_cell:Memory,Monocyte:CD16-,B_cell:immature,T_cell:CD4+_effector_memory,Monocyte:CD16+,T_cell:CD4+_central_memory,T_cell:CD4+_central_memory,Monocyte:CD16-,B_cell:immature


In [7]:
from multiprocessing import Pool
from functools import partial
import temp
import os
def _FineTuneMP(unique_types,sc_data,refDataset,annot,de,d):
    """ Maps temp._FineTunePerCluster over the label groups using a process pool.

    Parameters
    ----------
    unique_types : List
        Label groups; each element is passed as the positional argument of
        temp._FineTunePerCluster.

    sc_data, refDataset, annot, de, d
        Forwarded unchanged as keyword arguments to every call.

    Returns
    -------
    result : List
        The return values of temp._FineTunePerCluster, in the order of
        ``unique_types``.

    """
    # Leave one core free, but never request zero workers: os.cpu_count() can
    # return None or 1, and Pool(processes=0) raises ValueError.
    n_workers=max(1,(os.cpu_count() or 1)-1)
    prod=partial(temp._FineTunePerCluster,sc_data=sc_data,refDataset=refDataset,annot=annot,de=de,d=d)
    # The context manager shuts the workers down once the blocking map() returns;
    # without it every call leaked a pool of processes.
    with Pool(processes=n_workers) as p:
        return p.map(prod,unique_types)

In [8]:
# Number of candidate labels each cell starts with.
n = 7

# Seed every cell with its n best-scoring labels, sorted so equal candidate
# sets compare equal. sort_values replaces the removed sort_index(by=...).
d = {
    cell: np.sort(scores.sort_values(by=cell, ascending=False).index.values[0:n])
    for cell in scores.columns
}

while n > 1:
    # Distinct candidate-label combinations currently assigned to cells.
    unique_types = [list(x) for x in set(tuple(x) for x in d.values())]
    if __name__ == '__main__':
        # Each worker result is a dict; merge them into a fresh mapping for the next round.
        dicts = _FineTuneMP(unique_types, sc_data, refDataset, annot, de, d)
        d = {}
        for partial_result in dicts:
            d.update(partial_result)
    n = n - 1

pd.DataFrame(d, index=["final_annotations"]).sort_index(axis=1)

  after removing the cwd from sys.path.


Unnamed: 0,AAACCCAAGGAGAGTA-1,AAACGCTTCAGCCCAG-1,AAAGAACAGACGACTG-1,AAAGAACCAATGGCAG-1,AAAGAACGTCTGCAAT-1,AAAGGATAGTAGACAT-1,AAAGGATCACCGGCTA-1,AAAGGATTCAGCTTGA-1,AAAGGATTCCGTTTCG-1,AAAGGGCTCATGCCCT-1,...,TTTCACATCTCAGGCG-1,TTTCATGGTGCCTAAT-1,TTTCATGTCACTCACC-1,TTTCCTCCACAGAGCA-1,TTTCCTCTCCTACACC-1,TTTCCTCTCTCTTGCG-1,TTTGATCTCTTTGGAG-1,TTTGGTTAGTAACCTC-1,TTTGGTTGTAGAATAC-1,TTTGTTGCAATTAGGA-1
final_annotations,Monocyte:CD16-,B_cell:Naive,T_cell:CD4+_effector_memory,T_cell:CD8+,T_cell:CD4+_central_memory,B_cell:Memory,B_cell:Naive,Monocyte:CD16-,Monocyte:CD16-,T_cell:CD4+_effector_memory,...,B_cell:Naive,B_cell:Memory,Monocyte:CD16-,B_cell:Naive,T_cell:CD4+_central_memory,Monocyte:CD16+,T_cell:CD4+_central_memory,T_cell:CD4+_central_memory,Monocyte:CD16-,B_cell:Naive


In [3]:
# Re-read the HPCA reference matrix from disk (';' separator, ',' decimal mark,
# first column as row labels) and cast every value to float.
refDataset = pd.read_csv(
    "./Data/Reference/hpca_data.csv", sep=";", decimal=",", index_col=0
).astype('float')

# Re-read the annotation table and name its single data column "cellType".
annot = pd.read_csv("./Data/Reference/hpca_annot.csv", sep=";", index_col=0)
annot.columns = ["cellType"]

In [13]:
# Display the HPCA reference expression matrix (row labels are gene symbols,
# column labels are GSM sample ids, as shown in the output below).
refDataset

Unnamed: 0,GSM112490,GSM112491,GSM112540,GSM112541,GSM112661,GSM112664,GSM112665,GSM112666,GSM112667,GSM112668,...,GSM549583,GSM549584,GSM551183,GSM556647,GSM556663,GSM556665,GSM92231,GSM92232,GSM92233,GSM92234
A1BG,7.3651,6.9673,6.9751,7.0509,6.7073,6.5456,6.6415,6.8703,5.9347,7.2949,...,6.9851,6.6189,6.7427,7.4748,7.3222,6.1694,8.0555,7.7379,8.1694,8.0626
A1BG-AS1,6.3622,6.0945,6.3750,6.3942,5.8412,5.6421,6.2235,6.0272,6.0702,5.5895,...,6.5512,6.7696,5.4957,5.8311,5.6336,6.0093,5.9021,6.2551,6.2666,6.0233
A1CF,3.7219,3.6959,3.6518,3.6352,3.6508,3.4571,3.5403,3.6702,3.5236,3.5015,...,3.7974,3.6747,3.6636,3.7834,4.0732,3.9941,3.5665,3.7924,3.5257,3.5859
A2M,10.1296,11.6344,9.6626,9.9476,11.4929,9.3166,10.0752,11.3532,9.2001,10.1117,...,7.6072,6.7428,6.8858,4.5151,5.3012,3.9309,5.8201,8.5346,5.7163,6.3192
A2M-AS1,4.0154,3.7543,4.2488,4.1022,3.7049,3.8815,4.0189,3.7729,3.5946,4.2302,...,4.8949,5.4156,3.9843,4.3075,4.0557,3.8644,4.0683,4.0862,3.8441,4.3098
A2ML1,3.7658,3.5687,3.8155,3.9452,3.8574,3.7654,3.6557,3.8338,3.7383,3.5643,...,4.4720,3.9993,3.7865,3.7809,3.8999,4.4188,3.5851,3.6395,3.5721,3.6966
A4GALT,7.1449,7.4489,6.9887,7.6153,7.2981,7.3749,7.0656,7.5384,7.4477,6.4415,...,7.7508,7.8547,7.0449,6.1508,7.3171,7.1709,6.7046,7.3496,6.9740,6.8367
A4GNT,4.2586,4.4644,4.3037,4.1319,4.1468,4.0905,4.2572,4.2749,3.8570,4.0975,...,4.2363,4.1444,4.0985,4.1498,4.7986,4.9863,3.6864,3.9690,3.6444,3.9591
AAAS,7.9879,8.0856,7.8999,7.8630,7.6496,7.6513,7.9661,7.3333,7.6049,7.6721,...,7.8495,5.1762,7.2490,7.7051,6.9889,7.6226,7.8267,8.4833,8.1095,7.7710
AACS,6.6757,6.9312,6.8099,7.0222,6.9142,7.5282,6.9381,7.4129,7.2124,7.1166,...,5.1280,5.6459,7.7559,7.2843,5.8630,5.9639,8.6030,8.4610,8.4136,8.2808


In [14]:
# Display the second reference matrix.
# NOTE(review): refDataset1 is only assigned in a cell further down this notebook,
# so on a fresh top-to-bottom run this line raises NameError -- reorder the cells.
refDataset1

Unnamed: 0,GSM1136119_EA07068_260297_MOGENE-1_0-ST-V1_MF.11C-11B+.LU_1.CEL,GSM1136120_EA07068_260298_MOGENE-1_0-ST-V1_MF.11C-11B+.LU_2.CEL,GSM1136121_EA07068_260299_MOGENE-1_0-ST-V1_MF.11C-11B+.LU_3.CEL,GSM1136122_EA07068_260300_MOGENE-1_0-ST-V1_MF.ALV.LU_1.CEL,GSM1136123_EA07068_260301_MOGENE-1_0-ST-V1_MF.ALV.LU_2.CEL,GSM1136124_EA07068_260302_MOGENE-1_0-ST-V1_MF.ALV.LU_3.CEL,GSM1136125_EA07068_260307_MOGENE-1_0-ST-V1_MO.6+I-.BL_1.CEL,GSM1136126_EA07068_260303_MOGENE-1_0-ST-V1_MO.6+2+.MLN_1.CEL,GSM1136127_EA07068_260304_MOGENE-1_0-ST-V1_MO.6+2+.SLN_1.CEL,GSM1136128_EA07068_260305_MOGENE-1_0-ST-V1_MO.6+2+.SLN_2.CEL,...,GSM920646_EA07068_108148_MoGene_B614WLNTREG_3.CEL,GSM920647_EA07068_116089_MOGENE-1_0-ST-V1_B6.14W.LN.TR_3.CEL,GSM920648_EA07068_201208_MOGENE-1_0-ST-V1_TGD.VG2+24AHI.E17.TH_1.CEL,GSM920649_EA07068_201209_MOGENE-1_0-ST-V1_TGD.VG2+24AHI.E17.TH_2.CEL,GSM920650_EA07068_201210_MOGENE-1_0-ST-V1_TGD.VG2+24AHI.E17.TH_3.CEL,GSM920651_EA07068_201205_MOGENE-1_0-ST-V1_TGD.VG4+24AHI.E17.TH_1.CEL,GSM920652_EA07068_201206_MOGENE-1_0-ST-V1_TGD.VG4+24AHI.E17.TH_2.CEL,GSM920653_EA07068_201207_MOGENE-1_0-ST-V1_TGD.VG4+24AHI.E17.TH_3.CEL,GSM920654_EA07068_201214_MOGENE-1_0-ST-V1_TGD.VG4+24ALO.E17.TH_1.CEL,GSM920655_EA07068_201215_MOGENE-1_0-ST-V1_TGD.VG4+24ALO.E17.TH_2.CEL
ZGLP1,5.2546,5.4619,4.9024,4.9337,4.8395,4.9925,5.0888,5.1562,5.2792,5.0802,...,4.7924,5.0867,4.7019,4.7563,5.0881,4.8878,5.0211,4.9543,5.4267,4.9282
VMN2R65,3.4000,3.4583,3.4317,3.5467,3.6162,3.6530,3.6005,3.5638,3.4511,3.6553,...,3.4972,3.7492,3.6095,3.5774,3.6388,3.4501,3.4208,3.5419,3.5182,3.5225
GM10024,6.4472,6.2102,6.3699,6.0181,6.6313,6.1632,6.2243,6.5226,6.4867,6.3582,...,6.6101,6.4444,6.0979,5.9790,5.6942,5.8580,6.0394,6.0920,5.9506,5.9533
OOG3,4.1594,4.1552,3.8835,4.1059,4.2113,4.1586,4.3084,4.1380,4.3137,4.1807,...,4.4702,4.2219,3.9697,4.1242,4.1314,4.0212,3.9209,4.0634,4.2289,4.1949
LDLRAP1,7.4409,7.6464,7.3759,8.2166,8.5026,8.2886,8.5939,8.4179,8.0947,8.2941,...,9.1930,9.3165,6.9430,7.1915,7.0439,7.0183,6.7374,6.8707,6.8657,6.8706
MDN1,6.9584,7.1735,7.4628,8.0434,7.8499,7.9434,7.2568,6.9892,7.3286,7.5322,...,9.8108,9.3001,8.5087,8.8542,8.7214,8.5522,8.9807,8.7976,8.6267,8.9005
IFI208,6.4913,7.1217,6.8166,3.8578,4.0883,4.6092,7.5684,6.4106,8.4383,8.8009,...,8.6029,8.7962,7.2397,6.9928,7.0601,7.8337,7.5833,7.5884,7.9950,7.9230
WFDC17,13.3888,13.2917,13.2896,10.4692,10.8212,10.9328,12.8836,12.5422,13.1400,12.6570,...,4.7873,4.7235,5.3506,4.5972,4.5300,4.9617,4.6568,4.7580,4.5061,4.5910
A930017K11RIK,6.3764,5.8388,6.2428,5.6008,6.0460,5.8809,5.9013,6.4009,6.1559,6.1604,...,6.2544,6.4559,5.7543,5.8892,5.6953,5.6452,6.0652,6.0410,6.2380,5.9096
GATA5OS,6.3609,6.1392,6.1397,6.0923,5.9716,6.2431,6.1861,6.5377,6.3666,6.0134,...,6.1282,6.0076,5.9142,5.9645,6.0496,5.7017,5.9605,5.9461,6.0615,5.8648


In [8]:
import os

# Directory holding the ImmGen reference files. The default is the original
# machine-specific location; set the IMMGEN_DIR environment variable to run the
# notebook elsewhere without editing this cell.
IMMGEN_DIR = os.environ.get("IMMGEN_DIR", "C:/Users/murat_gga8ya6/Desktop/Thesis")

# Expression matrix: ';'-separated, ',' decimal mark, first column as row labels.
refDataset1 = pd.read_csv(IMMGEN_DIR + "/immgen_data.csv", sep=";", decimal=",", index_col=0)
refDataset1 = refDataset1.astype('float')

# Annotation table; its single data column is renamed to "cellType".
annot1 = pd.read_csv(IMMGEN_DIR + "/immgen_annot.csv", sep=";", index_col=0)
annot1.columns = ["cellType"]

In [3]:
from collections import Counter

In [5]:
# Cell types annotated on more than 19 reference samples. The Counter is built
# once instead of being rebuilt for every key.
cell_type_counts = Counter(annot.cellType)
[label for label, count in cell_type_counts.items() if count > 19]

['DC:monocyte-derived:immature', 'Monocyte', 'Macrophage:monocyte-derived']