# TGFB1 Part 1 - Importing the data

In this notebook, we take the data from the matrix, gene and cell files and create an AnnData matrix we can use for future analyses.

In [1]:
!echo hi

hi


In [2]:
from collections import defaultdict

import numpy as np
import pandas
import scanpy
import scanpy as sc
import anndata
import matplotlib.pyplot as plt
%matplotlib inline

import numpy as np
import pandas as pd
import scanpy as sc

sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_versions()



scanpy==1.4 anndata==0.6.19 numpy==1.16.3 scipy==1.2.1 pandas==0.24.2 scikit-learn==0.20.3 statsmodels==0.9.0 python-igraph==0.7.1 louvain==0.6.1 


# Barcode filtering

In order to filter the barcodes, we will open the barcodes done through Cell Ranger and return a set of the barcodes we want to filter the dropest matrix by. It is of the format:

    AAACCTGAGACATAAC-1

where ``AAACCTGAGACATAAC`` is a nucleotide barcode and ``1`` is the batch it corresponds to. We load this into a dictionary mapping barcodes to batches. We will not use the batches in the AnnData matrix, but it is still useful to cross-reference.

In [3]:
def get_barcode_set(path):
    barcodes = {}

    with open("barcodes.tsv", "r") as f:
        for line in f:
            barcode, batch = line.rstrip().split('-')
            batch = int(batch)
            
            barcodes[barcode] = batch
    
    return barcodes

Populate the dictionary from the Cell Ranger set.

In [4]:
pawel_barcodes = get_barcode_set("barcodes.tsv")
len(pawel_barcodes)

12267

# Loading genes

Create a function to load all the gene data from the dimension 1 CSV file, creating a list mapping gene IDs to genes.

The gene file is of the format:

    "1","TGFB1"

where `1` is a strictly increasing gene ID as found in the count matrix and `TGFB1` is the name of the gene. Create a list whose `n`th index is the name of the corresponding gene ID.

In [5]:
def load_genes(batch, variant):
    genes_result = []
    genes_path = "source/{}_{}_dim1.csv".format(batch, variant)

    with open(genes_path, "r") as genes:
        line_no = 0

        for gene in genes:
            if not line_no:
                line_no += 1
                continue

            id, name = gene.rstrip().replace('"', '').split(",")
            id = int(id)
            
            # These should be strictly increasing.
            assert id == line_no

            genes_result.append(name)
            line_no += 1
    
    return genes_result

In [6]:
genes1 = set(load_genes("S1", "exon"))
genes1

{'CCDC102B',
 'AC007283.2',
 'AC136475.6',
 'LINC01278',
 'LINC00921',
 'AC015813.6',
 'NCOA3',
 'C1orf162',
 'GPER1',
 'AC025580.3',
 'AL133330.1',
 'LINC02313',
 'NSFP1',
 'AL136984.1',
 'AL031719.2',
 'KIF27',
 'PPIAL4E',
 'AC117503.4',
 'CPSF1',
 'BTBD6',
 'ZNF530',
 'PCDHGC5',
 'RPL35AP31',
 'THOC5',
 'TMEM8A',
 'ZNF705G',
 'RFPL4A',
 'MRPL21',
 'SHROOM2',
 'RBM17P2',
 'AC009139.1',
 'ZNF841',
 'GAPDHP55',
 'AC009097.4',
 'PEX13',
 'ROM1',
 'HR',
 'SYPL2',
 'BX255925.2',
 'KBTBD2',
 'SLC35E2A',
 'PLEKHM1P1',
 'AC090152.1',
 'EML2',
 'FAM21FP',
 'SYBU',
 'AC138811.1',
 'NECAP1',
 'LINC02408',
 'AC103740.1',
 'PRPH',
 'MIR4697HG',
 'CCNB1',
 'LIPT1',
 'PEG13',
 'IPO7',
 'IL1RL2',
 'AC010469.1',
 'CSAG1',
 'CHKB-DT',
 'AC008813.1',
 'AL133373.2',
 'PGDP1',
 'LINC00309',
 'AL157777.1',
 'CFAP44',
 'AC073130.3',
 'AC008498.2',
 'DIDO1',
 'ZNF322P1',
 'SNAPC4',
 'SH3D19',
 'AC110373.1',
 'HOXA1',
 'BNIPL',
 'AC011472.1',
 'CLP1',
 'PHETA1',
 'AC111182.1',
 'CEP112',
 'AL133353.1',
 'CFH

In [7]:
genes2 = set(load_genes("S2", "exon"))
genes2

{'CCDC102B',
 'AC007283.2',
 'AC136475.6',
 'LINC01278',
 'LINC00921',
 'C11orf97',
 'AC015813.6',
 'NCOA3',
 'FOXD4L4',
 'AL512413.1',
 'C1orf162',
 'GPER1',
 'AC025580.3',
 'AL133330.1',
 'LINC02313',
 'NSFP1',
 'AL121809.2',
 'AL136984.1',
 'AC023469.1',
 'AL031719.2',
 'KIF27',
 'PPIAL4E',
 'AC117503.4',
 'CPSF1',
 'BTBD6',
 'ZNF530',
 'RPL35AP31',
 'THOC5',
 'TMEM8A',
 'MRPL21',
 'SHROOM2',
 'RBM17P2',
 'AC009139.1',
 'ZNF841',
 'GAPDHP55',
 'AC009097.4',
 'PEX13',
 'ROM1',
 'HR',
 'SYPL2',
 'KBTBD2',
 'B3GALT5',
 'SLC35E2A',
 'PLEKHM1P1',
 'AC090152.1',
 'EML2',
 'FAM21FP',
 'SYBU',
 'NECAP1',
 'LINC02408',
 'AC103740.1',
 'PRPH',
 'MIR4697HG',
 'CCNB1',
 'LIPT1',
 'IPO7',
 'IL1RL2',
 'AC010469.1',
 'AC144573.1',
 'RPL17P45',
 'CSAG1',
 'CHKB-DT',
 'AC008813.1',
 'AL133373.2',
 'PGDP1',
 'LINC00309',
 'CFAP44',
 'AC073130.3',
 'AC008498.2',
 'DIDO1',
 'ZNF322P1',
 'SNAPC4',
 'SH3D19',
 'AC110373.1',
 'HOXA1',
 'BNIPL',
 'AC011472.1',
 'CLP1',
 'PHETA1',
 'AC111182.1',
 'CEP112',


In [8]:
print(len(genes1), len(genes2), len(genes1) + len(genes2))

28926 28438 57364


In [9]:
symdiff = genes1.symmetric_difference(genes2)
symdiff

{'LIM2',
 'C11orf97',
 'AC006017.1',
 'FOXD4L4',
 'AL512413.1',
 'AL353743.4',
 'GIPC3',
 'AL121809.2',
 'AC023469.1',
 'PCDHGC5',
 'AC012498.2',
 'ZNF705G',
 'RFPL4A',
 'BX255925.2',
 'EIF3KP3',
 'B3GALT5',
 'PALD1',
 'AL390718.1',
 'FAM84A',
 'AC138811.1',
 'TAS2R19',
 'HSPA6',
 'AL122034.1',
 'TMEM273',
 'PEG13',
 'LINC00276',
 'AC144573.1',
 'RPL17P45',
 'AC078883.3',
 'AL138976.1',
 'AL157777.1',
 'AC022336.1',
 'RAD51AP1P1',
 'AC104447.1',
 'RPL17P3',
 'A2MP1',
 'RNU4-25P',
 'C5orf64',
 'AC106017.2',
 'IDO1',
 'KPNA7',
 'RPS23P2',
 'AC015908.1',
 'C17orf102',
 'HTR1DP1',
 'RIT2',
 'AL133227.1',
 'AL354707.3',
 'KCNMA1-AS3',
 'FP565260.3',
 'RNY1P10',
 'GTF2IP23',
 'LINC00500',
 'CALM2P1',
 'AL592295.3',
 'AC010301.1',
 'AC022148.2',
 'AC026523.2',
 'Z99774.1',
 'AC011411.1',
 'TPM3P4',
 'GAPDHP39',
 'AP001542.1',
 'DGCR5',
 'TEX36',
 'AL807757.1',
 'AC010280.2',
 'AL513548.3',
 'AC005828.1',
 'CARM1P1',
 'LINC01890',
 'AL109614.1',
 'AC083904.1',
 'AL163193.1',
 'NPY4R2',
 'NPM1P

In [10]:
len(symdiff)

5418

# Loading barcodes

Create a function to load barcode data from the dimension 2 CSV file. It is of the format:

    "3","AACCGTAATCGGGACCATCATCCC"

where `3` is a strictly increasing cell ID as found in the count matrix, and `AACCGTAATCGGGACCATCATCCC` is the full barcode according to this data set.

Whenever a barcode is encountered, we check if its last 16 nt's have been found in the Cell Ranger barcode set.

This returns a list whose `n`th element corresponds to the barcode of the cell with the corresponding ID, or `None` if the cell ID's barcode is not found in the Cell Ranger barcode set.

In [11]:
def load_barcodes(batch, variant, good_barcodes):
    result = []
    barcode_path = "source/{}_{}_dim2.csv".format(batch, variant)  
    
    with open(barcode_path, "r") as barcodes:
        # Skip R crap in first line.
        next(barcodes)
        
        for i, line in enumerate(barcodes):
            cell_id, full_barcode = line.rstrip().replace('"', '').split(",")
            barcode = full_barcode[-16:]
            cell_id = int(cell_id)
            
            # The cell IDs should be continuous.
            assert cell_id == (i + 1)
            
            if barcode not in good_barcodes:
                result.append(None)
            elif good_barcodes[barcode] != int(batch[-1]):
                result.append(None)
                #print("ERROR:", barcode, "is in good set but corresponds to batch", good_barcodes[barcode])
            else:
                result.append(barcode)
    
    return result

As an experiment, create a histogram to see how many cell IDs appear per cellular barcode. We can see that all of them except one have 4 cell IDs corresponding to a barcode, the rest of the barcode is the sample barcode.

In [12]:
# Uncomment to run experiment.

"""
barcodes = load_barcodes("S1", "exon", pawel_barcodes)
histogram = defaultdict(int)

for barcode in barcodes:
    if barcode:
        histogram[barcode] += 1

histogram
"""

'\nbarcodes = load_barcodes("S1", "exon", pawel_barcodes)\nhistogram = defaultdict(int)\n\nfor barcode in barcodes:\n    if barcode:\n        histogram[barcode] += 1\n\nhistogram\n'

# Loading transcript count matrix

The count matrix is of the format:

    %
    no_genes no_cells no_counts
    19 1 1
    34 1 1
    [...]
    33663 12318 1
    33665 12318 35
    33678 12318 1

where the first number represents the gene, the second number represents the cell, and the third number represents the count.

We then get a `cells` by `genes` count matrix indexed by the cell IDs and gene IDs as described above.

In [13]:
def load_transcript_matrix(batch, variant, genes, barcodes):
    matrix_path = "source/{}_{}.mmf".format(batch, variant)
    
    matrix = None
    
    with open(matrix_path, "r") as f:
        first_line = True
        
        for line in f:
            if "%" in line:
                continue
            
            gene, cell, count = [int(i) for i in line.rstrip().split()]
            
            if first_line:
                assert gene == len(genes)
                assert cell == len(barcodes)
                
                first_line = False
                matrix = np.ndarray(shape=(cell, gene), dtype=np.dtype(np.int16))
                continue
            
            
            gene_pos = gene - 1
            cell_pos = cell - 1
            
            matrix[cell_pos, gene_pos] = count
    
    return matrix

# Create Pandas DataFrame from count matrix

Given a count matrix of size `genes x cells`, a list of length `genes` and a list of length `cells`, create a Pandas DataFrame with index `cells` and column labels from `genes`.

As we have seen from the barcode loading function, we will have four rows in our table corresponding to the same barcode, and some of the data we get from the matrix does not correspond to a barcode in the Cell Ranger set. During this process, we will filter out all rows that do not have a valid Pawel barcode (i.e. index is None), and group and sum all (4) rows that have the same barcode.

In [14]:
def dataframe_from_matrix(m, genes, barcodes):
    assert m.shape == (len(barcodes), len(genes))
    
    # Create a data frame from the count matrix, keyed by the barcode
    # and gene annotations we created.
    result = pandas.DataFrame(m)
    result.index = barcodes
    result.columns = genes
    
    # First, get rid of all the rows whose barcode is None. This means those
    # cells were not found in the reference barcode set.
    result = result[result.index.notnull()]
    
    # Then, since each barcode appears to correspond to four cell IDs,
    # we get counts spread out across these four IDs. Let's collapse the duplicates.
    result = result.groupby(by=lambda x: x).sum()
    
    return result

# Construct DataFrame

Given a batch identifier list (i.e. `["S1", "S2"]`) and matrix name list (i.e. `["exon", "intron"]`), create a Pandas DataFrame from the corresponding count matrices and gene/cell dimension file for each. In addition, update a set `genes_set` with all genes that have been encountered in this count matrix, as these are not normalized by default across matrices in a batch (unlike cell IDs).

In [15]:
def construct_dataframes(batches, matrices, genes_set):
    for batch_name in batches:    
        prev_barcodes = None

        for matrix_name in matrices:
            print("Now processing", batch_name, matrix_name)

            genes = load_genes(batch_name, matrix_name)        
            print("Genes list loaded at", len(genes), "items.")

            barcodes = load_barcodes(batch_name, matrix_name, pawel_barcodes)
            print("Barcodes list loaded at", len(barcodes), "items.")

            print("Loading transcription matrix and converting to dataframe...")
            m = load_transcript_matrix(batch_name, matrix_name, genes, barcodes)
            m = dataframe_from_matrix(m, genes, barcodes)

            genes_set.update(m.columns)
            print("Total gene set is now", len(genes_set), "long.")

            if prev_barcodes is not None:
                assert m.index.equals(prev_barcodes)
            else:
                prev_barcodes = m.index
            
            # TODO REMOVE THIS
            return m

            m.to_pickle("/tmp/{}_{}.pickle".format(batch_name, matrix_name))
            print("Count matrix with dimensions", m.shape, "created and pickled for", batch_name, matrix_name)
            print("")

        print("")

# Normalize genes in DataFrames

Given a batch identifier list (i.e. `["S1", "S2"]`) and matrix name list (i.e. `["exon", "intron"]`), load the corresponding DataFrames. Take the `genes_set` generated from the previous process and add to each dataframe the genes that are not encountered in that count matrix, and the counts to 0 for all cells. This is needed as all count matrices need to be the same dimension when the AnnData layers are generated. To normalize the list of genes, we will sort the columns alphabetically as well.

In [16]:
def normalize_dataframe_genes(batches, matrices, genes_set):
    print("Normalizing dimensions of count matrices along genes...")

    for batch_name in batches:    
        for matrix_name in matrices:
            m = pandas.read_pickle("/tmp/{}_{}.pickle".format(batch_name, matrix_name))

            print("Old shape of", batch_name, matrix_name, "was", m.shape)
            missing_genes = genes_set - set(m.columns)
            for missing_gene in missing_genes:
                m[missing_gene] = 0
            print("New shape is", m.shape)

            print("Reordering genes in alphabetical order...")
            print("Old column order was", m.columns)
            m = m.reindex(sorted(m.columns), axis=1)
            print("New column order is", m.columns)

            m.to_pickle("/tmp/{}_{}.pickle".format(batch_name, matrix_name))
            print("Normalized count matrix for", batch_name, matrix_name, "pickled.")
            print("")

        print("")

# Create layered AnnData object from batch matrices

Create a layered AnnData object from frames for each individual batch. In practice, this takes all frames with names in a matrix list (i.e. `["intron", "exon", "spanning"]`) for each batch (i.e. `["S1", "S2"]`), and creates a layered AnnData object with layer names corresponding to `layer_names` (i.e. `["unspliced", "spliced", "ambiguous"]`) from these frames.

Frames should have identical dimensions, indices and column names for this to work.

The layer with name `spliced` is implicitly used as the main count matrix of the AnnData object.

In [17]:
def create_anndata_from_frames(batches, matrices, layer_names):
    results = []
    
    for batch in batches:
        result = None
        print("Now processing batch", batch, "...")
        
        for matrix, layer in zip(matrices, layer_names):
            print("Loading matrix", matrix, "...")
            m = pandas.read_pickle("/tmp/{}_{}.pickle".format(batch, matrix))
            
            if not result:
                print("Creating AnnData object...")
                
                result = anndata.AnnData(shape=m.shape)
                result.obs = pandas.DataFrame({"cellular_barcode": list(m.index)})
                result.var = pandas.DataFrame({"gene_names": list(m.columns)})
                
            if layer == "spliced":
                print("Layer is", layer, ", adding it as main count matrix to object...")
                result.X = m
            
            print("Adding count matrix as layer", layer, "to AnnData object...")
            result.layers[layer] = m.values
            print("")

        result.var_names = [gene for gene in result.var["gene_names"]]
        result.obs_names = [barcode for barcode in result.obs["cellular_barcode"]]
        result.obs_names_make_unique()
        results.append(result)
                
        print("Final AnnData object is", result)
        
        print("Saving layered AnnData object for batch", batch, "...")
        result.write("./write/{}-final.h5ad".format(batch))
        print("")

# Merge AnnData objects along batches

Given a list of AnnData objects corresponding to each batch, create one large AnnData object comprising all batches. Save merged AnnData object for future processing.

In [18]:
def create_merged_anndata(batches):
    result = None
    to_merge = []
    
    for batch in batches:
        if not result:
            print("Merged AnnData file not yet created, making it from batch", batch)
            result = anndata.read_h5ad("./write/{}-final.h5ad".format(batch))
            print("Merged AnnData file is now", result)
        else:
            print("Loading batch", batch)
            curr = anndata.read_h5ad("./write/{}-final.h5ad".format(batch), backed="r+")
            print("Loaded AnnData file is", curr, ", appending to list of objects to concatenate...")
            to_merge.append(curr)
        print("")
    
    print("")        
    print("Concatenating list", to_merge, "to original file")
    result = result.concatenate(*to_merge, batch_categories=batches, index_unique=None)
    print("Concatenated AnnData object is", result, ", saving it to disk...")
    result.write("./write/merged-final.h5ad")

# Run the procedures above
Create the variables for this experiment and run the procedures above, creating an AnnData matrix used in future notebooks.

In [19]:
batches = ('S1', 'S2')
matrices = ('exon', 'intron', 'spanning')
layer_names = ('spliced', 'unspliced', 'ambiguous')
all_genes = set()

In [20]:
m = construct_dataframes(batches, matrices, all_genes)

Now processing S1 exon
Genes list loaded at 28926 items.
Barcodes list loaded at 26709 items.
Loading transcription matrix and converting to dataframe...
Total gene set is now 28926 long.


In [21]:
m

Unnamed: 0,MT-CO1,RPL10,MORF4L2,TMSB4X,RPL3,PDXK,MYL9,EIF2S2,RPL13A,RPS4X,...,C20orf144,TLR10,PPIAP3,SST,AC009127.2,AC013400.1,AL021877.2,AC027796.4,AC011270.1,LINC01980
AAACCTGAGACATAAC,135,117,2,257,59,1,2,10,98,49,...,0,0,0,0,0,0,0,0,0,0
AAACCTGAGAGTAAGG,153,313,10,284,151,2,12,8,282,143,...,0,0,0,0,0,0,0,0,0,0
AAACCTGAGATCCCAT,94,244,2,83,90,0,8,1,134,116,...,0,0,0,0,0,0,0,0,0,0
AAACCTGCAAGCCGCT,112,263,1,123,126,0,10,4,167,108,...,0,0,0,0,0,0,0,0,0,0
AAACCTGCAATTGCTG,151,236,8,178,109,3,12,4,245,124,...,0,0,0,0,0,0,0,0,0,0
AAACCTGCACGGTAAG,227,350,5,322,155,4,5,4,235,133,...,0,0,0,0,0,0,0,0,0,0
AAACCTGGTAGCAAAT,163,72,5,284,16,0,2,6,84,40,...,0,0,0,0,0,0,0,0,0,0
AAACCTGGTAGGCTGA,76,180,3,145,88,0,10,4,148,100,...,0,0,0,0,0,0,0,0,0,0
AAACCTGGTCAGGACA,63,230,2,148,85,0,21,3,150,105,...,0,0,0,0,0,0,0,0,0,0
AAACCTGGTCTCAACA,66,169,0,56,71,0,4,5,125,72,...,0,0,0,0,0,0,0,0,0,0


In [None]:
normalize_dataframe_genes(batches, matrices, all_genes)

In [None]:
create_anndata_from_frames(batches, matrices, layer_names)

In [None]:
create_merged_anndata(batches)