# Guide to Loading and Using HDF5/loom File Format Single-Cell Data #

In [1]:
import numpy as np
import pandas as pd
import h5py

## Filepaths Can Be Changed Here ##

In [2]:
# Location of the input .loom file that is opened with h5py.File() below.
# Edit this value to point at your own dataset.
in_filepath = "/gpfs/data/nneretti/data/datasets/scRNA_seq/example_out.loom"

A new h5py file connection is created using the h5py.File() method.

The returned connection is essentially the root group of the file and can be indexed like a dictionary.

In [3]:
# Open the file in read-only mode ("r"); the handle stays open until hdf5file.close() is called.
hdf5file = h5py.File(in_filepath, "r")

In [4]:
# Display the open file handle; its repr shows the file name and the access mode.
hdf5file

<HDF5 file "example_out.loom" (mode r)>

## Exploring the file ##
We will show how to explore and use an HDF5 file and how to access the main datasets of interest.

In [5]:
# List the names of the top-level members of the file.
hdf5file.keys()

<KeysViewHDF5 ['attrs', 'col_attrs', 'col_graphs', 'layers', 'matrix', 'row_attrs', 'row_graphs', 'scaled']>

In [6]:
# List the members stored under the "layers" group.
hdf5file["layers"].keys()

<KeysViewHDF5 ['normalized']>

In [7]:
# Look up the "normalized" layer; its repr reports the dataset's shape and dtype.
hdf5file["layers"]["normalized"]

<HDF5 dataset "normalized": shape (1234, 17594), type "<f8">

In [8]:
# List the row metadata fields stored under "row_attrs".
hdf5file["row_attrs"].keys()

<KeysViewHDF5 ['Barcode', 'MID', 'nCount_RNA', 'nCount_SCT', 'nFeature_RNA', 'nFeature_SCT', 'old.ident', 'orig.ident', 'percent.mito', 'percent.mt', 'percent.ribo', 'res.0.8', 'res.0.9', 'res.1', 'res.1.2']>

In [9]:
# Look up the "Barcode" row attribute; its repr reports the dataset's shape and dtype.
hdf5file["row_attrs"]["Barcode"]

<HDF5 dataset "Barcode": shape (1234,), type "|O">

In [10]:
# List the column metadata fields stored under "col_attrs".
hdf5file["col_attrs"].keys()

<KeysViewHDF5 ['Name']>

## Reading into memory ##
Here is an example of reading the whole array into memory.

First we get a raw count numpy array, then we put this into a pandas DataFrame

In [11]:
# Row metadata (the "row_attrs" group) holds the cell barcodes and column
# metadata (the "col_attrs" group) holds the gene names; both are converted
# to numpy string arrays.
cell_barcodes = np.array(hdf5file["row_attrs/Barcode"], dtype=str)
gene_names = np.array(hdf5file["col_attrs/Name"], dtype=str)
# Read the entire raw count dataset into a fresh in-memory numpy array.
raw_count_matrix = hdf5file["matrix"][()]
print(raw_count_matrix.dtype)
raw_count_matrix.shape

float64


(1234, 17594)

In [12]:
# Wrap the raw counts in a labelled table: cell barcodes label the rows,
# gene names label the columns.
count_df = pd.DataFrame(raw_count_matrix, index=cell_barcodes, columns=gene_names)
count_df

Unnamed: 0,Xkr4,Rp1,Sox17,Mrpl15,Lypla1,Gm37988,Tcea1,Rgs20,Atp6v1h,Npbwr1,...,Vamp7,Spry3,Tmlhe,AC133103.1,Csprs,AC125149.3,AC168977.1,PISD,DHRSX,CAAA01147332.1
AAACCTGGTCCTCCAT-4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAACGGGAGCTCCCAG-4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
AAAGCAACAAGTTGTC-4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0
AAAGCAAGTCACACGC-4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0
AAAGCAAGTGTAATGA-4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTATCTTTCGTGGACC-4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0
TGGTTAGAGGCATGGT-4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0
TTATGCTTCTATCCTA-4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0
TTCTCAAGTACACCGC-4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0


#### Another example, Scaled and Normalized data ####
Scaled and normalized data is located here:

In [13]:
# The "scaled" group carries its own matrix; read all of it into a fresh
# in-memory numpy array.
scaled_group = hdf5file["scaled"]
scaled_matrix = scaled_group["matrix"][()]

In [14]:
# Read the scaled group's "vargenes" dataset as a numpy string array; these
# names are used as the column labels of the scaled matrix.
variable_genes = np.array(scaled_group["vargenes"], dtype=str)

In [15]:
# Label the scaled matrix: cell barcodes label the rows, variable gene names
# label the columns.
scaled_df = pd.DataFrame(scaled_matrix, index=cell_barcodes, columns=variable_genes)
scaled_df

Unnamed: 0,Sox17,Npbwr1,3110035E14Rik,Prex2,A830018L16Rik,Msc,Kcnb2,Sbspon,Crispld1,B3gat2,...,Ina,Mirt1,Dusp5,Afap1l2,Ablim1,Pnlip,Csf2ra,Csprs,AC125149.3,AC168977.1
AAACCTGGTCCTCCAT-4,-0.389810,-0.002234,0.147991,0.029215,0.017594,-0.010550,-0.037063,-0.119546,-0.121452,0.089870,...,-0.022553,-0.217105,-0.513541,-0.128289,-0.454488,0.055433,-0.739308,-0.079176,-0.133580,-0.185398
AAACGGGAGCTCCCAG-4,-0.336413,-0.005402,-0.039682,-0.124821,0.014271,-0.011089,-0.034454,-0.098531,-0.108412,-0.046512,...,-0.039216,-0.125006,-0.379505,-0.189542,-0.384298,-0.020391,2.646950,-0.060354,-0.088561,-0.097287
AAAGCAACAAGTTGTC-4,-0.314689,-0.014356,-0.169104,-0.254909,-0.033705,-0.019301,-0.059423,-0.093948,-0.107659,-0.134619,...,-0.082605,-0.071715,-0.282000,-0.234238,-0.358785,-0.064570,-0.524086,-0.046376,-0.060659,-0.051883
AAAGCAAGTCACACGC-4,-0.288434,-0.014778,-0.253514,-0.320652,-0.028426,-0.018380,-0.054281,-0.083027,-0.100573,-0.196841,...,-0.085370,-0.028776,-0.222466,-0.261424,-0.323822,-0.099876,-0.477441,-0.038058,-0.039944,-0.009976
AAAGCAAGTGTAATGA-4,-0.356835,0.000769,0.066431,-0.022287,0.045710,-0.005711,-0.018608,-0.104001,-0.110453,0.026754,...,-0.009155,-0.170467,-0.458573,-0.153320,-0.409173,0.017233,0.753442,-0.071643,-0.111982,-0.137168
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTATCTTTCGTGGACC-4,-0.430895,-0.137370,-0.660051,-1.062269,-0.816642,-0.153595,-0.506266,-0.206920,-0.213197,-0.390656,...,-0.666795,-0.004009,0.154595,-0.436086,-0.563115,-0.125476,0.204734,0.019791,0.003832,-0.081676
TGGTTAGAGGCATGGT-4,-0.400720,-0.111608,3.225892,-0.906987,10.000000,-0.125169,-0.411226,-0.180784,-0.189464,-0.350475,...,7.851316,-0.008832,0.075877,-0.399619,-0.512512,-0.120410,2.471746,0.007707,-0.005184,-0.066234
TTATGCTTCTATCCTA-4,-0.423068,-0.131954,-0.646849,-1.033129,10.000000,-0.147542,10.000000,-0.200796,-0.207793,-0.385622,...,-0.641153,-0.002647,0.141276,-0.429894,-0.550492,-0.126364,0.177136,0.017701,0.003077,-0.076092
TTCTCAAGTACACCGC-4,-0.445826,-0.154130,-0.729741,-1.174402,-0.923046,-0.171846,-0.566922,-0.221929,-0.227325,-0.427617,...,10.000000,0.006662,0.216045,-0.464485,-0.589748,-0.134962,3.657875,0.029080,0.013312,-0.084314


#### Now we will also add a column for cell type to the DataFrame ####

In [16]:
# Read the "old.ident" row attribute as a numpy string array; its values are
# used as the cell-type label of each row.
cell_types = np.array(hdf5file["row_attrs"]["old.ident"], dtype=str)
cell_types

array(['Microglia', 'Microglia', 'Microglia', ..., 'Neurons', 'Neurons',
       'Neurons'], dtype='<U16')

In [21]:
# Add the cell-type labels as the first column of scaled_df.
# DataFrame.insert modifies the frame in place and raises ValueError when the
# column already exists, so the call is guarded to keep this cell safe to
# run more than once.
if "CellType" not in scaled_df.columns:
    scaled_df.insert(loc=0, column="CellType", value=cell_types)
scaled_df

Unnamed: 0,CellType,Sox17,Npbwr1,3110035E14Rik,Prex2,A830018L16Rik,Msc,Kcnb2,Sbspon,Crispld1,...,Ina,Mirt1,Dusp5,Afap1l2,Ablim1,Pnlip,Csf2ra,Csprs,AC125149.3,AC168977.1
AAACCTGGTCCTCCAT-4,Microglia,-0.389810,-0.002234,0.147991,0.029215,0.017594,-0.010550,-0.037063,-0.119546,-0.121452,...,-0.022553,-0.217105,-0.513541,-0.128289,-0.454488,0.055433,-0.739308,-0.079176,-0.133580,-0.185398
AAACGGGAGCTCCCAG-4,Microglia,-0.336413,-0.005402,-0.039682,-0.124821,0.014271,-0.011089,-0.034454,-0.098531,-0.108412,...,-0.039216,-0.125006,-0.379505,-0.189542,-0.384298,-0.020391,2.646950,-0.060354,-0.088561,-0.097287
AAAGCAACAAGTTGTC-4,Microglia,-0.314689,-0.014356,-0.169104,-0.254909,-0.033705,-0.019301,-0.059423,-0.093948,-0.107659,...,-0.082605,-0.071715,-0.282000,-0.234238,-0.358785,-0.064570,-0.524086,-0.046376,-0.060659,-0.051883
AAAGCAAGTCACACGC-4,Microglia,-0.288434,-0.014778,-0.253514,-0.320652,-0.028426,-0.018380,-0.054281,-0.083027,-0.100573,...,-0.085370,-0.028776,-0.222466,-0.261424,-0.323822,-0.099876,-0.477441,-0.038058,-0.039944,-0.009976
AAAGCAAGTGTAATGA-4,Microglia,-0.356835,0.000769,0.066431,-0.022287,0.045710,-0.005711,-0.018608,-0.104001,-0.110453,...,-0.009155,-0.170467,-0.458573,-0.153320,-0.409173,0.017233,0.753442,-0.071643,-0.111982,-0.137168
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTATCTTTCGTGGACC-4,Neurons,-0.430895,-0.137370,-0.660051,-1.062269,-0.816642,-0.153595,-0.506266,-0.206920,-0.213197,...,-0.666795,-0.004009,0.154595,-0.436086,-0.563115,-0.125476,0.204734,0.019791,0.003832,-0.081676
TGGTTAGAGGCATGGT-4,Neurons,-0.400720,-0.111608,3.225892,-0.906987,10.000000,-0.125169,-0.411226,-0.180784,-0.189464,...,7.851316,-0.008832,0.075877,-0.399619,-0.512512,-0.120410,2.471746,0.007707,-0.005184,-0.066234
TTATGCTTCTATCCTA-4,Neurons,-0.423068,-0.131954,-0.646849,-1.033129,10.000000,-0.147542,10.000000,-0.200796,-0.207793,...,-0.641153,-0.002647,0.141276,-0.429894,-0.550492,-0.126364,0.177136,0.017701,0.003077,-0.076092
TTCTCAAGTACACCGC-4,Neurons,-0.445826,-0.154130,-0.729741,-1.174402,-0.923046,-0.171846,-0.566922,-0.221929,-0.227325,...,10.000000,0.006662,0.216045,-0.464485,-0.589748,-0.134962,3.657875,0.029080,0.013312,-0.084314


## Accessing slices ##

We can also access slices of hdf5 datasets directly, without loading the whole dataset into memory. This can be useful for batch training of machine learning algorithms on very large datasets. We will show this using the normalized data matrix:

In [18]:
# Handle to the normalized layer; rows are read from it by indexing/slicing below.
normalized_counts_dataset = hdf5file["layers"]["normalized"]
# A single integer index reads one row of the matrix.
example_row = normalized_counts_dataset[0]
print("Example row shape and dtype: ", example_row.shape, example_row.dtype)
# Number of rows read per batch.
k = 5
# Step through the rows k at a time. The range runs over the full length so
# that the last, possibly shorter, batch is visited too (a slice that runs past
# the end is truncated to the rows that exist) instead of being silently skipped.
for i in range(0, len(normalized_counts_dataset), k):
    curr_slice = normalized_counts_dataset[i:i+k]
    # Only the first batch is needed for this demonstration.
    break
print("2d slice:")
print(curr_slice.dtype)
curr_slice.shape

Example row shape and dtype:  (17594,) float64
2d slice:
float64


(5, 17594)

## Finish ##
Finally, we make sure to close the connection to the file so that other processes can get read and write access to it.

In [19]:
# Close the file handle that was opened with h5py.File() at the top of the notebook.
hdf5file.close()