In [18]:
# Import required modules
import collections
import scipy.sparse as sp_sparse
from scipy.sparse import csr_matrix
import tables
import os
import scipy.io
import gzip
import shutil
import pandas as pd
import h5py

In [2]:
# Format CellRanger >= v1.2 <= v2.2
# Function for reading the h5 file and returning a namedtuple containing the gene IDs, gene names, barcodes and matrix for each file
GeneBCMatrix = collections.namedtuple(
    'GeneBCMatrix',
    ['gene_ids', 'gene_names', 'barcodes', 'matrix'],
)


def get_matrix_from_h5(filename, genome):
    """Load one genome group of an HDF5 file into a ``GeneBCMatrix``.

    Parameters
    ----------
    filename
        Path of the HDF5 file; it is opened read-only with PyTables.
    genome
        Name of the node, directly under the file root, to read from.

    Returns
    -------
    GeneBCMatrix or None
        ``gene_ids``, ``gene_names`` and ``barcodes`` hold the values read
        from the group's ``genes``, ``gene_names`` and ``barcodes`` nodes.
        ``matrix`` is a ``scipy.sparse.csc_matrix`` assembled from the
        ``data``, ``indices``, ``indptr`` and ``shape`` nodes.
        ``None`` is returned, after printing a message, when the file has
        no node named ``genome``.
    """
    # Nodes are read in this fixed order.
    node_names = ('genes', 'gene_names', 'barcodes',
                  'data', 'indices', 'indptr', 'shape')

    with tables.open_file(filename, 'r') as h5_file:
        try:
            genome_group = h5_file.get_node(h5_file.root, genome)
        except tables.NoSuchNodeError:
            print("That genome does not exist in this file.")
            return None

        contents = {
            node_name: getattr(genome_group, node_name).read()
            for node_name in node_names
        }
        sparse_counts = sp_sparse.csc_matrix(
            (contents['data'], contents['indices'], contents['indptr']),
            shape=contents['shape'],
        )
        return GeneBCMatrix(
            contents['genes'],
            contents['gene_names'],
            contents['barcodes'],
            sparse_counts,
        )

In [3]:
# Get a list of all the filtered h5 files to be processed.
# NOTE(review): this is an absolute, machine-specific path; point it at your
# own data directory before running.
path = "/Users/sha6hg/Desktop/IPF_scRNA/GSE122960/RawData/"
# The conversion loop opens each file by its bare name, so the working
# directory has to be `path`.
os.chdir(path)
# os.listdir returns entries in arbitrary order; sorting makes the processing
# order reproducible between runs and machines.
file_list = sorted(os.listdir(path))
# Keep only HDF5 files whose name marks them as filtered matrices, so that
# unrelated entries that merely contain "filtered" are not handed to the
# HDF5 reader.
file_list1 = [
    i for i in file_list
    if "filtered" in i and i.lower().endswith((".h5", ".hdf5"))
]

In [7]:
# Convert every filtered h5 file into a per-sample folder containing gzipped
# barcodes.tsv, features.tsv and matrix.mtx files.
def _decode_utf8(values):
    """Return the byte strings in the array `values` decoded to `str`."""
    return [value.decode('utf-8') for value in values.tolist()]


def _gzip_and_remove(filepath):
    """Compress `filepath` to `filepath + ".gz"`, then delete the original."""
    with open(filepath, 'rb') as f_in, gzip.open(filepath + ".gz", 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
    # Delete only once both handles are closed: removing a file that is still
    # open fails on Windows.
    os.remove(filepath)


for file in file_list1:
    # The name of the first top-level group is used as the genome.
    # The handle is closed before get_matrix_from_h5 reopens the same file.
    with h5py.File(file, 'r') as h5_file:
        genome = str(list(h5_file.keys())[0])
    gene_bc_matrix = get_matrix_from_h5(file, genome)

    # Create a directory for each sample, named after the text before the
    # first underscore of the file name. exist_ok lets the cell be re-run
    # without failing on folders created by a previous run.
    sample_name = file.split("_")[0]
    sample_directory = os.path.join(path, sample_name)
    os.makedirs(sample_directory, exist_ok=True)

    # BARCODES: one decoded barcode per line.
    barcodes_final = _decode_utf8(gene_bc_matrix.barcodes)
    barcodes_path = os.path.join(sample_directory, "barcodes.tsv")
    with open(barcodes_path, 'w') as barcodes_out:
        for item in barcodes_final:
            barcodes_out.write("%s\n" % item)

    # FEATURES: tab-separated gene id and gene name, no header and no index.
    # NOTE(review): only two columns are written; some readers of
    # "features.tsv" may expect a third feature-type column - confirm against
    # the downstream tool.
    gene_ids_final = _decode_utf8(gene_bc_matrix.gene_ids)
    gene_names_final = _decode_utf8(gene_bc_matrix.gene_names)
    features_df = pd.DataFrame(list(zip(gene_ids_final, gene_names_final)))
    features_path = os.path.join(sample_directory, "features.tsv")
    features_df.to_csv(features_path, header=False, index=False, sep="\t")

    # MATRIX: write the sparse matrix in Matrix Market format.
    matrix_path = os.path.join(sample_directory, "matrix.mtx")
    scipy.io.mmwrite(matrix_path, gene_bc_matrix.matrix)

    # GZIP exactly the three files written above. Listing the directory
    # instead would re-compress the .gz files left by an earlier run.
    for written_path in (barcodes_path, features_path, matrix_path):
        _gzip_and_remove(written_path)