In [1]:
from goatools.anno.gaf_reader import GafReader
from goatools.obo_parser import OBOReader, GODag
import torch
import torch.nn as nn
import torch.optim as optim
from torchsummary import summary

import numpy as np
import h5py

ModuleNotFoundError: No module named 'goatools'

## Summary

1. [Reading the TISCH dataset](#Reading-the-TISCH-dataset)

2. [Reading gaf (annotations) and obo (terms)](#Reading-gaf-(annotations)-and-obo-(terms))

## Reading the TISCH dataset

In [2]:
# Expression file for the glioma dataset GSE103224, relative to this notebook.
filename = "../data_tisch/Glioma_GSE103224_expression.h5"
# Opened read-only; the handle is kept open because the cells below read from it.
h5 = h5py.File(filename,'r')

In [3]:
h5.keys()

<KeysViewHDF5 ['matrix']>

In [30]:
h5['matrix']['indptr']

<HDF5 dataset "indptr": shape (17186,), type "<i4">

In [12]:
h5['matrix']['data']

<HDF5 dataset "data": shape (28973062,), type "<f4">

In [15]:
h5['matrix']['features']['name']

<HDF5 dataset "name": shape (28909,), type "|S100">

In [None]:
# List the members of the 'matrix' group that the following cells read from.
# (This cell previously held a stray, undefined name `h`, which raised
# NameError and stopped a Restart & Run All.)
h5['matrix'].keys()

In [5]:
h5['matrix']['barcodes']

<HDF5 dataset "barcodes": shape (17185,), type "|S200">

In [6]:
np.unique(np.array(h5['matrix']['indices'])).shape

(28909,)

In [7]:
h5['matrix']['data'][:10]

array([1.7912148 , 0.9311621 , 1.0722069 , 0.32524088, 0.5702607 ,
       0.32524088, 0.76690507, 1.4048493 , 1.9118304 , 0.76690507],
      dtype=float32)

In [8]:
h5['matrix']['features'].keys()

<KeysViewHDF5 ['_all_tag_keys', 'feature_type', 'genome', 'id', 'name']>

In [9]:
h5['matrix']['features']['id'][:100]

array([b'SLC7A10', b'AC007228.11', b'AC092667.2', b'ZNF367', b'SULT1B1',
       b'TRIM63', b'AL590762.11', b'HDHD2', b'MORF4L2-AS1',
       b'RP11-384C12.1', b'RP11-298I3.4', b'TMEM53', b'CTA-796E4.4',
       b'EIF4HP2', b'RP11-517B11.7', b'HRAT92', b'RN7SL504P',
       b'AC108039.1', b'RP1-249H1.3', b'PCDHB19P', b'ZBTB12', b'SLC25A14',
       b'FAAP100', b'PRORSD1P', b'DNM1', b'DHFRL1', b'CCP110', b'STARD8',
       b'RP11-314C16.1', b'ANKRD20A8P', b'TICRR', b'AC074367.1', b'SNX11',
       b'CNTLN', b'SKA2', b'CIB1', b'CAPN6', b'KB-1125A3.11', b'PCNPP3',
       b'NDUFB9', b'SUMO2P7', b'RP1-266L20.9', b'SH3TC2', b'YBX1P1',
       b'MTRR', b'RP11-3D4.2', b'GS1-166A23.2', b'RP11-6N17.10',
       b'CCDC186', b'ZNF784', b'CYCSP24', b'AL139319.1', b'PIAS1',
       b'RTN1', b'HOXA11', b'ABHD12', b'SLC16A5', b'RHOG', b'KATNBL1P6',
       b'KDELR2', b'FLT4', b'CTC-510F12.4', b'RP11-270L13.1', b'ZSCAN9',
       b'RP11-136F16.2', b'RP11-831F12.2', b'RP4-590F24.2',
       b'RP13-514E23.1', b'CTA-3

In [10]:
h5['matrix']['features']['name'][:100]

array([b'SLC7A10', b'AC007228.11', b'AC092667.2', b'ZNF367', b'SULT1B1',
       b'TRIM63', b'AL590762.11', b'HDHD2', b'MORF4L2-AS1',
       b'RP11-384C12.1', b'RP11-298I3.4', b'TMEM53', b'CTA-796E4.4',
       b'EIF4HP2', b'RP11-517B11.7', b'HRAT92', b'RN7SL504P',
       b'AC108039.1', b'RP1-249H1.3', b'PCDHB19P', b'ZBTB12', b'SLC25A14',
       b'FAAP100', b'PRORSD1P', b'DNM1', b'DHFRL1', b'CCP110', b'STARD8',
       b'RP11-314C16.1', b'ANKRD20A8P', b'TICRR', b'AC074367.1', b'SNX11',
       b'CNTLN', b'SKA2', b'CIB1', b'CAPN6', b'KB-1125A3.11', b'PCNPP3',
       b'NDUFB9', b'SUMO2P7', b'RP1-266L20.9', b'SH3TC2', b'YBX1P1',
       b'MTRR', b'RP11-3D4.2', b'GS1-166A23.2', b'RP11-6N17.10',
       b'CCDC186', b'ZNF784', b'CYCSP24', b'AL139319.1', b'PIAS1',
       b'RTN1', b'HOXA11', b'ABHD12', b'SLC16A5', b'RHOG', b'KATNBL1P6',
       b'KDELR2', b'FLT4', b'CTC-510F12.4', b'RP11-270L13.1', b'ZSCAN9',
       b'RP11-136F16.2', b'RP11-831F12.2', b'RP4-590F24.2',
       b'RP13-514E23.1', b'CTA-3

In [11]:
# HDF5 dataset holding one barcode (cell identifier) per cell.
barcodes = h5['matrix']['barcodes']

# Printed labels are in French: "Number of cells" / "First barcode examples".
print("Nombre de cellules: ", barcodes.shape[0])
print("Premiers exemples de barcodes")

# Display the first 50 barcodes as the cell output.
barcodes[:50]

Nombre de cellules:  17185
Premiers exemples de barcodes


array([b'PJ016_2', b'PJ016_4', b'PJ016_5', b'PJ016_6', b'PJ016_7',
       b'PJ016_10', b'PJ016_11', b'PJ016_14', b'PJ016_17', b'PJ016_19',
       b'PJ016_20', b'PJ016_24', b'PJ016_25', b'PJ016_26', b'PJ016_27',
       b'PJ016_28', b'PJ016_29', b'PJ016_30', b'PJ016_33', b'PJ016_35',
       b'PJ016_37', b'PJ016_40', b'PJ016_42', b'PJ016_43', b'PJ016_44',
       b'PJ016_45', b'PJ016_47', b'PJ016_49', b'PJ016_51', b'PJ016_52',
       b'PJ016_54', b'PJ016_55', b'PJ016_60', b'PJ016_62', b'PJ016_64',
       b'PJ016_65', b'PJ016_66', b'PJ016_67', b'PJ016_68', b'PJ016_69',
       b'PJ016_72', b'PJ016_73', b'PJ016_74', b'PJ016_75', b'PJ016_77',
       b'PJ016_78', b'PJ016_79', b'PJ016_80', b'PJ016_81', b'PJ016_82'],
      dtype='|S200')

In [12]:
h5['matrix']['indptr'][:100]

array([     0,   6810,  13589,  20347,  27286,  33850,  40210,  46613,
        52630,  58592,  64490,  70648,  76656,  82755,  88971,  94753,
       100453, 106209, 111929, 117616, 123546, 129270, 134975, 140676,
       146586, 152176, 158109, 164040, 169775, 176011, 181608, 187786,
       193660, 199636, 205001, 210442, 215723, 221297, 227903, 233453,
       239216, 244773, 250197, 255411, 261054, 266486, 271974, 277337,
       282749, 288174, 293649, 299145, 304326, 309735, 315144, 320529,
       325425, 330667, 335865, 341444, 346790, 352026, 356945, 362247,
       367103, 372725, 377963, 383498, 389186, 394598, 399527, 404442,
       409430, 414657, 419908, 425336, 430783, 435950, 440944, 446170,
       451615, 457496, 462475, 467290, 472295, 477578, 483045, 488228,
       493561, 498357, 504025, 509569, 514849, 519844, 525251, 530715,
       535405, 540879, 546241, 551343], dtype=int32)

In [13]:
h5['matrix']['data'][:100]

array([1.7912148 , 0.9311621 , 1.0722069 , 0.32524088, 0.5702607 ,
       0.32524088, 0.76690507, 1.4048493 , 1.9118304 , 0.76690507,
       0.32524088, 0.9311621 , 0.9311621 , 1.1957948 , 0.32524088,
       0.5702607 , 0.5702607 , 0.32524088, 0.5702607 , 0.32524088,
       1.85334   , 0.32524088, 0.9311621 , 1.305775  , 0.5702607 ,
       0.5702607 , 2.32481   , 0.32524088, 0.32524088, 0.5702607 ,
       1.4048493 , 2.116607  , 0.5702607 , 0.32524088, 1.4048493 ,
       0.32524088, 2.2051537 , 0.32524088, 1.1957948 , 0.5702607 ,
       0.32524088, 0.32524088, 0.32524088, 0.32524088, 0.32524088,
       1.305775  , 0.5702607 , 0.9311621 , 0.9311621 , 1.5776666 ,
       0.32524088, 0.32524088, 0.32524088, 0.9311621 , 0.32524088,
       0.5702607 , 0.5702607 , 0.32524088, 0.76690507, 0.32524088,
       0.32524088, 0.32524088, 0.32524088, 0.32524088, 0.32524088,
       0.32524088, 0.5702607 , 0.32524088, 0.5702607 , 0.76690507,
       0.5702607 , 0.76690507, 0.32524088, 0.76690507, 1.30577

In [14]:
h5['matrix']['indices'][6810]

5

In [15]:
h5['matrix']['features'].keys()

<KeysViewHDF5 ['_all_tag_keys', 'feature_type', 'genome', 'id', 'name']>

In [16]:
h5['matrix']['features']['name']

<HDF5 dataset "name": shape (28909,), type "|S100">

In [17]:
np.unique(np.array(h5['matrix']['features']['name'])).shape

(28909,)

In [18]:
np.array(h5['matrix']['indices']).max()

28908

In [19]:
j=np.array(h5['matrix']['indptr'])[1]

In [20]:
np.array(h5['matrix']['indices'])[1]

6

In [21]:
np.array(h5['matrix']['indices'])[j+1]

6

In [22]:
indcell = np.array(h5['matrix']['indptr'])

In [23]:
N_cells = indcell.shape[0]-1

In [24]:
indgenes = np.array(h5['matrix']['indices'])

In [25]:
indgenes.shape

(28973062,)

In [26]:
# Width of the dense matrix = number of feature rows in the file.
# The values in 'indices' are positions in the feature array (max is 28908
# for 28909 features), so the width must be the feature count, not the number
# of *distinct* names: a duplicated gene name would make the matrix too narrow
# and the fill loop below would raise IndexError.
N_genes = h5['matrix']['features']['name'].shape[0]

In [27]:
data = np.array(h5['matrix']['data'])

In [28]:
data[0]

1.7912148

In [29]:
expr_mat = np.zeros((N_cells, N_genes))

In [30]:
# Scatter each cell's sparse (gene index, value) pairs into its dense row.
# Consecutive entries of `indcell` delimit the slice of `indgenes`/`data`
# that belongs to one cell.
for row, (start, stop) in enumerate(zip(indcell[:-1], indcell[1:])):
    expr_mat[row, indgenes[start:stop]] = data[start:stop]

In [31]:
list(expr_mat[0])

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.7912148237228394,
 0.9311621189117432,
 1.0722068548202515,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.3252408802509308,
 0.0,
 0.0,
 0.0,
 0.5702607035636902,
 0.0,
 0.3252408802509308,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.7669050693511963,
 1.4048492908477783,
 0.0,
 0.0,
 0.0,
 1.9118304252624512,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.7669050693511963,
 0.3252408802509308,
 0.0,
 0.9311621189117432,
 0.0,
 0.9311621189117432,
 0.0,
 1.1957948207855225,
 0.0,
 0.0,
 0.0,
 0.3252408802509308,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.5702607035636902,
 0.5702607035636902,
 0.3252408802509308,
 0.0,
 0.5702607035636902,
 0.3252408802509308,
 0.0,
 1.8533400297164917,
 0.0,
 0.0,
 0.3252408802509308,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0

In [32]:
gene_names = np.array(h5['matrix']['features']['name'])

In [33]:
expr_mat.shape

(17185, 28909)

In [34]:
gene_names.shape

(28909,)

In [35]:
gene_names

array([b'SLC7A10', b'AC007228.11', b'AC092667.2', ..., b'KIAA1919',
       b'RP4-569M23.4', b'THAP6'], dtype='|S100')

## Reading gaf (annotations) and obo (terms)

In [35]:
ogaf = GafReader("../goa_human.gaf")

HMS:0:00:08.637758 606,840 annotations READ: ../goa_human.gaf 


In [36]:
obon = OBOReader("../go.obo")

In [37]:
# Peek at the first 10 records yielded by the OBO reader.
i = 0
for i, rec in enumerate(obon, start=1):
    print(rec)
    if i == 10:
        break

GO:0000001	mitochondrion inheritance [biological_process]
GO:0000002	mitochondrial genome maintenance [biological_process]
GO:0000003	reproduction [biological_process]
GO:0000005	obsolete ribosomal chaperone activity [molecular_function]obsolete
GO:0000006	high-affinity zinc transmembrane transporter activity [molecular_function]
GO:0000007	low-affinity zinc ion transmembrane transporter activity [molecular_function]
GO:0000008	obsolete thioredoxin [molecular_function]obsolete
GO:0000009	alpha-1,6-mannosyltransferase activity [molecular_function]
GO:0000010	trans-hexaprenyltranstransferase activity [molecular_function]
GO:0000011	vacuole inheritance [biological_process]


In [38]:
godag = GODag("../go.obo", optional_attrs={'consider', 'replaced_by'}, load_obsolete=True)

../go.obo: fmt(1.2) rel(2021-02-01) 50,515 GO Terms; optional_attrs(consider replaced_by)


In [39]:
godag['GO:0000001'].get_all_parents()

{'GO:0006996',
 'GO:0007005',
 'GO:0008150',
 'GO:0009987',
 'GO:0016043',
 'GO:0048308',
 'GO:0048311',
 'GO:0051179',
 'GO:0051640',
 'GO:0051641',
 'GO:0051646',
 'GO:0071840'}

In [40]:
godag['GO:0008150'].get_all_parents()

set()

In [41]:
godag['GO:0000001']

GOTerm('GO:0000001'):
  id:GO:0000001
  item_id:GO:0000001
  name:mitochondrion inheritance
  namespace:biological_process
  _parents: 2 items
    GO:0048308
    GO:0048311
  parents: 2 items
    GO:0048308	level-05	depth-05	organelle inheritance [biological_process]
    GO:0048311	level-05	depth-06	mitochondrion distribution [biological_process]
  children: 0 items
  level:6
  depth:7
  is_obsolete:False
  alt_ids: 0 items
  replaced_by:
  consider: 0 items

In [42]:
k = 3

# Print every non-obsolete term at level k that is a biological process
# or a molecular function.
for i, term in godag.items():
    if term.level != k or term.is_obsolete:
        continue
    if term.namespace in ('biological_process', 'molecular_function'):
        print(term)

GO:0000031	level-03	depth-03	mannosylphosphate transferase activity [molecular_function]
GO:0000036	level-03	depth-04	acyl carrier activity [molecular_function]
GO:0000062	level-03	depth-09	fatty-acyl-CoA binding [molecular_function]
GO:0000149	level-03	depth-03	SNARE binding [molecular_function]
GO:0000150	level-03	depth-03	recombinase activity [molecular_function]
GO:0000166	level-03	depth-04	nucleotide binding [molecular_function]
GO:0000170	level-03	depth-03	sphingosine hydroxylase activity [molecular_function]
GO:0000212	level-03	depth-08	meiotic spindle organization [biological_process]
GO:0000226	level-03	depth-06	microtubule cytoskeleton organization [biological_process]
GO:0000278	level-03	depth-03	mitotic cell cycle [biological_process]
GO:0000279	level-03	depth-03	M phase [biological_process]
GO:0000320	level-03	depth-03	re-entry into mitotic cell cycle [biological_process]
GO:0000332	level-03	depth-06	template for synthesis of G-rich strand of telomere DNA activity [molecul

# Selecting non-obsolete level-3 terms in the chosen namespaces

In [43]:
def select_terms(godag, namespaces, level, take_obsolete):
    """Return the subset of `godag` matching a level and a set of namespaces.

    Parameters
    ----------
    godag : mapping
        Maps a term id to a term object exposing `level`, `is_obsolete`
        and `namespace` attributes.
    namespaces : container
        Namespaces to keep; a term is kept only if `term.namespace in namespaces`.
    level : int
        Only terms whose `level` equals this value are kept.
    take_obsolete : bool
        When false, terms flagged `is_obsolete` are dropped.

    Returns
    -------
    dict
        `{term_id: term}` for the retained terms, in `godag` iteration order.
    """
    def _keep(term):
        if term.level != level:
            return False
        if term.is_obsolete and not take_obsolete:
            return False
        return term.namespace in namespaces

    return {go_id: term for go_id, term in godag.items() if _keep(term)}

selected_terms = select_terms(godag, ['biological_process', 'molecular_function'], 3, False)

In [44]:
selected_terms.keys()

dict_keys(['GO:0000031', 'GO:0000036', 'GO:0000062', 'GO:0000149', 'GO:0000150', 'GO:0000166', 'GO:0000170', 'GO:0000212', 'GO:0000226', 'GO:0000278', 'GO:0000279', 'GO:0000320', 'GO:0000332', 'GO:0000384', 'GO:0000386', 'GO:0000706', 'GO:0000709', 'GO:0000746', 'GO:0000747', 'GO:0000753', 'GO:0000755', 'GO:0000756', 'GO:0000758', 'GO:0000761', 'GO:0000768', 'GO:0000769', 'GO:0000900', 'GO:0000901', 'GO:0000902', 'GO:0000910', 'GO:0000913', 'GO:0000919', 'GO:0000981', 'GO:0001092', 'GO:0001094', 'GO:0001095', 'GO:0001096', 'GO:0001097', 'GO:0001098', 'GO:0001101', 'GO:0001216', 'GO:0001217', 'GO:0001402', 'GO:0001502', 'GO:0001505', 'GO:0001512', 'GO:0001525', 'GO:0001530', 'GO:0001539', 'GO:0001542', 'GO:0001543', 'GO:0001544', 'GO:0001547', 'GO:0001548', 'GO:0001549', 'GO:0001551', 'GO:0001552', 'GO:0001553', 'GO:0001554', 'GO:0001555', 'GO:0001556', 'GO:0001562', 'GO:0001565', 'GO:0001568', 'GO:0001618', 'GO:0001653', 'GO:0001659', 'GO:0001661', 'GO:0001662', 'GO:0001666', 'GO:00016

In [45]:
# Gene symbols present in the expression matrix, decoded from bytes to str
# so they can be compared with the symbols in the GAF annotations.
selected_genes = {symbol.decode('UTF-8') for symbol in gene_names}

## Dict of final gene–GO associations, plus the sets of selected GO terms and gene symbols

In [46]:
genes_go = {}      # gene symbol -> set of selected GO ids annotated to it
all_go = set()     # every selected GO id that annotates at least one kept gene
all_genes = set()  # every gene symbol with at least one selected GO id

for assoc in ogaf.get_associations():
    symbol, go_id = assoc.DB_Symbol, assoc.GO_ID

    # Keep only annotations linking a gene from the expression matrix
    # to one of the selected terms.
    if symbol not in selected_genes or go_id not in selected_terms:
        continue

    genes_go.setdefault(symbol, set()).add(go_id)
    all_go.add(go_id)
    all_genes.add(symbol)

In [47]:
# Positions in `gene_names` (i.e. columns of the expression matrix) of the
# genes that have at least one selected GO annotation.
kept_indices = [
    j for j, name in enumerate(gene_names)
    if name.decode('UTF-8') in all_genes
]
i = len(kept_indices)

In [48]:
len(kept_indices)

9549

In [49]:
genes_go

{'TRAV36DV7': {'GO:0002250', 'GO:0009617'},
 'TRDJ1': {'GO:0002250'},
 'CYP2D7': {'GO:0004497', 'GO:0006082', 'GO:0006805', 'GO:0016705'},
 'TRGV5': {'GO:0002250', 'GO:0045087'},
 'TRAV41': {'GO:0002250'},
 'MT-RNR1': {'GO:0043610'},
 'HLA-DRA': {'GO:0002250',
  'GO:0002504',
  'GO:0016032',
  'GO:0030247',
  'GO:0042605',
  'GO:0042608'},
 'TTC26': {'GO:0120170', 'GO:1905198'},
 'E2F8': {'GO:0000981', 'GO:0001217', 'GO:0042802'},
 'UBA6': {'GO:0007612', 'GO:0021764', 'GO:0021766', 'GO:0060996'},
 'ESYT2': {'GO:0006897', 'GO:0031210', 'GO:0042802'},
 'TRDV3': {'GO:0002250', 'GO:0002377', 'GO:0045087'},
 'UHRF1BP1L': {'GO:0062069'},
 'SHTN1': {'GO:0051015', 'GO:0061163'},
 'DAPL1': {'GO:0030154', 'GO:0097190'},
 'FEZF1': {'GO:0021772', 'GO:0043697'},
 'TMEM120B': {'GO:0034220'},
 'IRGM': {'GO:0006952', 'GO:0045087', 'GO:0061635'},
 'ARHGEF37': {'GO:0050790'},
 'PXDNL': {'GO:0006979', 'GO:0098869'},
 'ILVBL': {'GO:0030976'},
 'SYCE3': {'GO:0007131', 'GO:0007283'},
 'PLEKHG3': {'GO:005079

In [50]:
all_genes

{'ZNF829',
 'EIF5B',
 'GRM2',
 'LMO4',
 'RPL5',
 'P4HA1',
 'TPM4',
 'SCN11A',
 'HSP90AA5P',
 'SLU7',
 'IL10RB',
 'GCOM2',
 'DDX60L',
 'NOLC1',
 'TRIP11',
 'MYO1F',
 'HLA-DQA2',
 'EIF2B2',
 'HK1',
 'TTC39B',
 'SYDE2',
 'ZNF318',
 'SYTL3',
 'GPR173',
 'PARP15',
 'ATP2A3',
 'RAD17',
 'GRIP1',
 'NTNG1',
 'KANK1',
 'KLHL17',
 'DGKZ',
 'SYTL2',
 'CSNK1G1',
 'DICER1',
 'SCFD2',
 'SNX10',
 'ZNF891',
 'UNC5CL',
 'EIF1',
 'ZBTB43',
 'GADD45A',
 'NCOA5',
 'EDEM1',
 'CGREF1',
 'ARF4',
 'UPP1',
 'HLA-G',
 'LGALS4',
 'POFUT1',
 'EXOC3L2',
 'FGF11',
 'MCU',
 'ZNF260',
 'FSTL3',
 'CAMK2D',
 'GRIK5',
 'ZNF728',
 'LILRB3',
 'TSPAN12',
 'DGKD',
 'TAC1',
 'PLN',
 'TAS2R19',
 'MYD88',
 'TNFAIP2',
 'PRPF4B',
 'USP45',
 'ESM1',
 'XRCC1',
 'LGALS2',
 'NOTCH2',
 'ZSCAN5A',
 'ITGB7',
 'ZNF263',
 'DPM2',
 'TMPO',
 'PCBD1',
 'L3MBTL2',
 'GPC5',
 'GSK3A',
 'TSC2',
 'CHMP4B',
 'LGR6',
 'TNFRSF12A',
 'OLFM1',
 'KHDRBS1',
 'HOXC10',
 'SEC24A',
 'SEMA3A',
 'PIN1',
 'ADCY5',
 'POU5F1B',
 'C3AR1',
 'DDIT4',
 'TUBGCP2',


In [51]:
# Freeze both collections into sorted lists.
# Note: from here on `all_genes` and `all_go` are lists, no longer sets.
all_genes = sorted(all_genes)
all_go = sorted(all_go)

In [52]:
N_genes = len(all_genes)
N_go = len(all_go)

In [55]:
for t in genes_go['TFEB']:
    print(godag[t])

GO:0006959	level-03	depth-03	humoral immune response [biological_process]
GO:0001892	level-03	depth-05	embryonic placenta development [biological_process]
GO:0046983	level-03	depth-03	protein dimerization activity [molecular_function]
GO:0019899	level-03	depth-03	enzyme binding [molecular_function]
GO:0006914	level-03	depth-04	autophagy [biological_process]
GO:0002250	level-03	depth-03	adaptive immune response [biological_process]
GO:0000981	level-03	depth-03	DNA-binding transcription factor activity, RNA polymerase II-specific [molecular_function]


# Number of terms and genes retained

In [53]:
N_go

1387

In [54]:
N_genes

9549

In [55]:
all_genes

['A2M',
 'AAAS',
 'AACS',
 'AAK1',
 'AAMP',
 'AANAT',
 'AARSD1',
 'AASS',
 'AATF',
 'AATK',
 'ABAT',
 'ABCA1',
 'ABCA12',
 'ABCA2',
 'ABCA3',
 'ABCA4',
 'ABCA7',
 'ABCB1',
 'ABCB10',
 'ABCB4',
 'ABCB5',
 'ABCB6',
 'ABCC1',
 'ABCC2',
 'ABCC3',
 'ABCC4',
 'ABCC5',
 'ABCC6',
 'ABCC8',
 'ABCC9',
 'ABCD1',
 'ABCD2',
 'ABCD3',
 'ABCD4',
 'ABCE1',
 'ABCF1',
 'ABCF3',
 'ABCG1',
 'ABCG2',
 'ABCG4',
 'ABHD1',
 'ABHD10',
 'ABHD12',
 'ABHD12B',
 'ABHD13',
 'ABHD15',
 'ABHD16A',
 'ABHD17A',
 'ABHD17B',
 'ABHD17C',
 'ABHD2',
 'ABHD3',
 'ABHD5',
 'ABI1',
 'ABI2',
 'ABI3',
 'ABL1',
 'ABL2',
 'ABLIM1',
 'ABLIM2',
 'ABLIM3',
 'ABO',
 'ABR',
 'ABT1',
 'ACACA',
 'ACACB',
 'ACAD8',
 'ACAD9',
 'ACADL',
 'ACADM',
 'ACADSB',
 'ACADVL',
 'ACAN',
 'ACAT2',
 'ACBD3',
 'ACBD4',
 'ACBD5',
 'ACBD6',
 'ACBD7',
 'ACCS',
 'ACE',
 'ACE2',
 'ACHE',
 'ACIN1',
 'ACKR1',
 'ACKR2',
 'ACKR3',
 'ACO1',
 'ACO2',
 'ACOT11',
 'ACOT12',
 'ACOT7',
 'ACOT8',
 'ACOX1',
 'ACOX2',
 'ACOX3',
 'ACOXL',
 'ACR',
 'ACRBP',
 'ACSBG2',
 'ACS

In [56]:
gene_names

array([b'SLC7A10', b'AC007228.11', b'AC092667.2', ..., b'KIAA1919',
       b'RP4-569M23.4', b'THAP6'], dtype='|S100')

In [18]:
h5['matrix']['barcodes'][:100]

array([b'PJ016_2', b'PJ016_4', b'PJ016_5', b'PJ016_6', b'PJ016_7',
       b'PJ016_10', b'PJ016_11', b'PJ016_14', b'PJ016_17', b'PJ016_19',
       b'PJ016_20', b'PJ016_24', b'PJ016_25', b'PJ016_26', b'PJ016_27',
       b'PJ016_28', b'PJ016_29', b'PJ016_30', b'PJ016_33', b'PJ016_35',
       b'PJ016_37', b'PJ016_40', b'PJ016_42', b'PJ016_43', b'PJ016_44',
       b'PJ016_45', b'PJ016_47', b'PJ016_49', b'PJ016_51', b'PJ016_52',
       b'PJ016_54', b'PJ016_55', b'PJ016_60', b'PJ016_62', b'PJ016_64',
       b'PJ016_65', b'PJ016_66', b'PJ016_67', b'PJ016_68', b'PJ016_69',
       b'PJ016_72', b'PJ016_73', b'PJ016_74', b'PJ016_75', b'PJ016_77',
       b'PJ016_78', b'PJ016_79', b'PJ016_80', b'PJ016_81', b'PJ016_82',
       b'PJ016_83', b'PJ016_84', b'PJ016_85', b'PJ016_86', b'PJ016_87',
       b'PJ016_88', b'PJ016_89', b'PJ016_91', b'PJ016_92', b'PJ016_93',
       b'PJ016_94', b'PJ016_95', b'PJ016_96', b'PJ016_97', b'PJ016_98',
       b'PJ016_99', b'PJ016_100', b'PJ016_101', b'PJ016_103',
       

In [19]:
import pandas as pd

In [22]:
# Per-cell metadata table (tab-separated).
# A stray trailing `3` after the call made this line a SyntaxError; removed.
df = pd.read_csv('../data_tisch/Glioma_GSE103224_CellMetainfo_table.tsv', delimiter='\t')

In [23]:
df

Unnamed: 0,Cell,UMAP_1,UMAP_2,Celltype (malignancy),Celltype (major-lineage),Celltype (minor-lineage),Cluster,Patient,Source,Stage
0,PJ016_2,1.293257,10.748410,Malignant cells,AC-like Malignant,AC-like Malignant,18,PJ016,Tumor,Primary
1,PJ016_4,1.293991,10.674762,Malignant cells,AC-like Malignant,AC-like Malignant,18,PJ016,Tumor,Primary
2,PJ016_5,1.395297,11.095321,Malignant cells,AC-like Malignant,AC-like Malignant,18,PJ016,Tumor,Primary
3,PJ016_6,-6.619508,5.343307,Malignant cells,AC-like Malignant,AC-like Malignant,11,PJ016,Tumor,Primary
4,PJ016_7,1.064364,10.363591,Malignant cells,AC-like Malignant,AC-like Malignant,18,PJ016,Tumor,Primary
...,...,...,...,...,...,...,...,...,...,...
17180,PJ018_1929,-3.150876,1.861784,Malignant cells,OPC-like Malignant,OPC-like Malignant,16,PJ018,Tumor,Primary
17181,PJ018_1935,-2.560989,1.871758,Malignant cells,OPC-like Malignant,OPC-like Malignant,16,PJ018,Tumor,Primary
17182,PJ018_1936,-3.132766,2.135580,Malignant cells,OPC-like Malignant,OPC-like Malignant,16,PJ018,Tumor,Primary
17183,PJ018_1938,-1.445802,-0.715241,Malignant cells,OPC-like Malignant,OPC-like Malignant,24,PJ018,Tumor,Primary


In [27]:
np.array(df[df['Patient']=='PJ016'].index)

array([    0,     1,     2, ..., 16469, 16470, 16471])

In [29]:
df['Patient'].unique()

array(['PJ016', 'PJ032', 'PJ048', 'PJ017', 'PJ030', 'PJ035', 'PJ018',
       'PJ025'], dtype=object)