# TGFB1 Part 1 - Importing the data

In this notebook, we take the data from the matrix, gene and cell files and create an AnnData matrix we can use for future analyses.

In [1]:
import numpy as np
import pandas
import scanpy
import scanpy as sc
import anndata
import matplotlib.pyplot as plt
%matplotlib inline

import numpy as np
import pandas as pd
import scanpy as sc

sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_versions()           # print the library versions this notebook was run with
results_file = './write/tgfb1-1.h5ad'  # the file that will store the analysis results



scanpy==1.4 anndata==0.6.18 numpy==1.16.2 scipy==1.2.1 pandas==0.24.2 scikit-learn==0.20.3 statsmodels==0.9.0 python-igraph==0.7.1 louvain==0.6.1 


# Load the sparse transcript count matrix

The count matrix is of the format:

    %
    no_genes no_cells no_counts
    19 1 1
    34 1 1
    [...]
    33663 12318 1
    33665 12318 35
    33678 12318 1

where the first number represents the gene, the second number represents the cell, and the third number represents the count.

First, we will load the data into a matrix called ``matrix``, accessing the count as ``matrix[gene][cell]``.
    

In [2]:
# Parse the sparse coordinate file into a dense (genes x cells) count array.
# `matrix` stays None if the file contains no size header.
matrix = None

with open("matrix.mtx", "r") as f:
    for line in f:
        # Banner/comment lines start with '%'; blank lines carry no data.
        if line.startswith("%") or not line.strip():
            continue

        fields = [int(field) for field in line.split()]

        if matrix is None:
            # The first data line is the size header: n_genes n_cells n_entries.
            n_genes, n_cells, _n_entries = fields
            # np.zeros rather than np.ndarray: np.ndarray leaves the buffer
            # uninitialised, so every (gene, cell) pair that is absent from the
            # file would otherwise hold arbitrary values instead of 0.
            matrix = np.zeros((n_genes, n_cells))
        else:
            gene, cell, count = fields
            # The file numbers genes and cells from 1; numpy indexes from 0.
            matrix[gene - 1, cell - 1] = count

In [3]:
matrix.shape

(33694, 12318)

Note, we first index by genes (33694), and then by cells (12318)

# Load batch labels

Load batch barcode data. It is of the format:

    AAACCTGAGACATAAC-1

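where ``AAACCTGAGACATAAC`` is a nucleotide barcode and ``1`` is the batch it corresponds to. The barcodes are loaded into the list `barcodes` and the batch label of each cell into `batch_per_cell`. The cell indices of each batch are loaded into a Python list of lists accessed as e.g. `batches[batch][5]`, where `batch` is the batch number minus one, and the result is the zero-based index (line number in `barcodes.tsv`) of the sixth cell in that batch.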

In [4]:
# Parse "<barcode>-<batch>" lines into three parallel structures:
#   barcodes[i]        nucleotide barcode of cell i (file order, 0-based)
#   batch_per_cell[i]  1-based batch label of cell i
#   batches[b]         indices of the cells belonging to batch b + 1
batches = [[], []]  # two batches expected; grown below if more appear
batch_per_cell = []
barcodes = []

with open("barcodes.tsv") as f:
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank (e.g. trailing) lines; they do not describe a cell.
            continue

        # Split on the last '-' only, so extra hyphens cannot break the unpacking.
        barcode, batch_label = line.rsplit("-", 1)
        batch = int(batch_label)
        if batch < 1:
            # batch - 1 would be a negative index and silently land in the last batch.
            raise ValueError("batch labels must be >= 1, got {!r}".format(line))

        # Make room for batch numbers beyond the two initially assumed.
        while len(batches) < batch:
            batches.append([])

        batches[batch - 1].append(len(barcodes))  # index of the cell being added
        barcodes.append(barcode)
        batch_per_cell.append(batch)

In [5]:
# NOTE(review): this displays the entire barcode list; a slice such as
# barcodes[:10] would keep the saved notebook output short.
barcodes

['AAACCTGAGACATAAC',
 'AAACCTGAGAGTAAGG',
 'AAACCTGAGATCCCAT',
 'AAACCTGCAAGCCGCT',
 'AAACCTGCAATTGCTG',
 'AAACCTGCACGGTAAG',
 'AAACCTGCATACGCCG',
 'AAACCTGGTAGCAAAT',
 'AAACCTGGTAGGCTGA',
 'AAACCTGGTCAGGACA',
 'AAACCTGGTCTCAACA',
 'AAACCTGTCAACACGT',
 'AAACCTGTCCGCATCT',
 'AAACGGGAGTATTGGA',
 'AAACGGGAGTGTGAAT',
 'AAACGGGCAAGTCATC',
 'AAACGGGCACCAGCAC',
 'AAACGGGCACCTATCC',
 'AAACGGGCATCATCCC',
 'AAACGGGGTAAGTAGT',
 'AAACGGGGTACCATCA',
 'AAACGGGGTAGAGTGC',
 'AAACGGGGTCTTCAAG',
 'AAACGGGGTGCACGAA',
 'AAACGGGGTGTATGGG',
 'AAACGGGTCCCAAGAT',
 'AAACGGGTCCGCAAGC',
 'AAACGGGTCGAGGTAG',
 'AAACGGGTCGATCCCT',
 'AAACGGGTCGGAGCAA',
 'AAAGATGAGAGTGACC',
 'AAAGATGAGGAGTTGC',
 'AAAGATGAGGTGCTTT',
 'AAAGATGCACCACCAG',
 'AAAGATGCACGACGAA',
 'AAAGATGGTAAACCTC',
 'AAAGATGGTTATTCTC',
 'AAAGATGGTTTGCATG',
 'AAAGCAAAGAGGTAGA',
 'AAAGCAAAGGCTAGAC',
 'AAAGCAAAGTGGACGT',
 'AAAGCAAGTAGCGCTC',
 'AAAGCAAGTCGCGGTT',
 'AAAGCAAGTGATAAAC',
 'AAAGCAATCAGATAAG',
 'AAAGCAATCAGTTAGC',
 'AAAGCAATCGTAGGAG',
 'AAAGCAATCTT

As expected, the last cell index in the second batch is 12317, one less than the number of cells (indices are zero-based)

In [6]:
batches[1][-1]

12317

# Load gene ids

Load a list of (gene ID, gene name) pairs from lines such as:
    
    ENSG00000277630 BX072566.1
    ENSG00000278384 AL354822.1

For example, `genes[i] = ("ENSG00000278384", "AL354822.1")`, where `i` is the zero-based line number of that gene in `genes.tsv`. To get all gene IDs, use `[gene[0] for gene in genes]` and to get all gene names, use `[gene[1] for gene in genes]`.


In [7]:
def _split_gene_row(row):
    """Split one genes.tsv row into a (gene_id, gene_name) tuple.

    Raises ValueError unless the row has exactly two whitespace-separated fields.
    """
    gene_id, gene_name = row.split()
    return gene_id, gene_name


# genes[i] is the (gene_id, gene_name) pair read from row i of genes.tsv.
genes = []

with open("genes.tsv") as gene_file:
    genes.extend(_split_gene_row(row) for row in gene_file)

As expected, the number of gene entries is equal to the number of gene rows in the count matrix (33694).

In [8]:
len(genes)

33694

# Check if there is a significant coverage disparity between batches

In [9]:
# Select the columns (cells) whose indices are listed for the first batch;
# rows remain genes. Indexing with a list copies the selected data.
batch_1 = matrix[:,batches[0]]

In [None]:
# Select the columns (cells) whose indices are listed for the second batch;
# rows remain genes. Indexing with a list copies the selected data.
batch_2 = matrix[:,batches[1]]

In [None]:
# Total count per cell in the first batch (sum over the gene axis).
cell_count_1 = np.sum(batch_1, axis=0)

In [None]:
# Total count per cell in the second batch (sum over the gene axis).
cell_count_2 = np.sum(batch_2, axis=0)

This is the number of cells in the first batch

In [None]:
len(cell_count_1)

This is the number of cells in the second batch

In [None]:
len(cell_count_2)

# Scale histograms by the number of cells

In [None]:
len(cell_count_1)//50

In [None]:
len(cell_count_2)//50

In [None]:
# Overlay the distributions of total counts per cell for the two batches.
fig, ax = plt.subplots()
# One bin per ~50 cells in each batch; max(1, ...) keeps the bin count valid
# when a batch has fewer than 50 cells.
ax.hist(cell_count_1, bins=max(1, len(cell_count_1)//50), histtype='step', label="batch 1")
ax.hist(cell_count_2, bins=max(1, len(cell_count_2)//50), histtype='step', color="r", label="batch 2")
ax.set_xlabel("total counts per cell")
ax.set_ylabel("number of cells")
ax.legend()
plt.show()

# Underscale the first batch

In [None]:
# Same comparison, but with coarser bins for the first batch
# (one bin per ~80 cells instead of ~50).
fig, ax = plt.subplots()
# max(1, ...) keeps the bin count valid for very small batches.
ax.hist(cell_count_1, bins=max(1, len(cell_count_1)//80), histtype='step', label="batch 1")
ax.hist(cell_count_2, bins=max(1, len(cell_count_2)//50), histtype='step', color="r", label="batch 2")
ax.set_xlabel("total counts per cell")
ax.set_ylabel("number of cells")
ax.legend()
plt.show()

# Let's convert our data into scanpy format

Our `matrix` will now be transposed, that is, it will be accessed as `matrix[cell][gene]`.

In [10]:
# Transpose from (genes, cells) to (cells, genes).
# NOTE(review): this rebinds `matrix`, so running the cell a second time flips
# the orientation back; run it exactly once after loading.
matrix = matrix.T

In [11]:
matrix.shape

(12318, 33694)

In [12]:
# Wrap the dense count array in a DataFrame; rows and columns carry default
# integer labels at this point.
df = pandas.DataFrame(matrix)

In [13]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,33684,33685,33686,33687,33688,33689,33690,33691,33692,33693
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Label each row with its cell barcode.
# NOTE(review): relies on barcodes.tsv listing cells in the same order as the
# cell numbering used in matrix.mtx — confirm.
df.index = barcodes
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,33684,33685,33686,33687,33688,33689,33690,33691,33692,33693
AAACCTGAGACATAAC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAACCTGAGAGTAAGG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAACCTGAGATCCCAT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAACCTGCAAGCCGCT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAACCTGCAATTGCTG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAACCTGCACGGTAAG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAACCTGCATACGCCG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAACCTGGTAGCAAAT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAACCTGGTAGGCTGA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAACCTGGTCAGGACA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Label each column with the gene name (second element of each `genes` tuple).
# NOTE(review): names are not de-duplicated here — confirm duplicate column
# labels are acceptable in `df`.
df.columns = [gene[1] for gene in genes]
df

Unnamed: 0,RP11-34P13.3,FAM138A,OR4F5,RP11-34P13.7,RP11-34P13.8,RP11-34P13.14,RP11-34P13.9,FO538757.3,FO538757.2,AP006222.2,...,AC007325.2,BX072566.1,AL354822.1,AC023491.2,AC004556.1,AC233755.2,AC233755.1,AC240274.1,AC213203.1,FAM231B
AAACCTGAGACATAAC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAACCTGAGAGTAAGG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAACCTGAGATCCCAT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAACCTGCAAGCCGCT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAACCTGCAATTGCTG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAACCTGCACGGTAAG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAACCTGCATACGCCG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAACCTGGTAGCAAAT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAACCTGGTAGGCTGA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAACCTGGTCAGGACA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Snapshot the labelled DataFrame to disk.
# NOTE(review): none of the visible cells read this file back — confirm it is still needed.
df.to_pickle('df-old.pickle')

In [None]:
# Per-gene annotation table, one row per gene in file order.
# `genes` holds (gene_id, gene_name) tuples, so element 0 is the Ensembl ID and
# element 1 the gene name; previously the two columns were labelled the wrong
# way round.
gene_annotation = pandas.DataFrame({
    "gene_names": [gene_name for _gene_id, gene_name in genes],
    "gene_ids": [gene_id for gene_id, _gene_name in genes],
})

In [None]:
gene_annotation.shape

In [None]:
# Per-cell annotation table with a single "batch" column holding the 1-based
# batch label of every cell, in file order.
batch_annotation = pandas.DataFrame({"batch": batch_per_cell})

In [None]:
batch_annotation.shape

In [None]:
# Bundle the counts (X), per-cell batch labels (obs) and per-gene annotation (var).
# NOTE(review): `matrix` is rebound from the dense array to the AnnData object,
# so this cell cannot be re-run on its own without re-running the cells above.
matrix = anndata.AnnData(X=matrix, obs=batch_annotation, var=gene_annotation)

In [None]:
matrix

# Running scanpy analysis

In [None]:
# Label genes (var) by gene name; duplicated names get a numeric suffix.
matrix.var_names = [gene_name for _gene_id, gene_name in genes]
matrix.var_names_make_unique()
# Label cells (obs) by "<barcode>-<batch>", i.e. the identifier format used in
# barcodes.tsv. Previously the names were just the batch numbers, which threw
# away the cell barcodes; the batch itself is still available in obs["batch"].
matrix.obs_names = [
    "{}-{}".format(barcode, batch)
    for barcode, batch in zip(barcodes, batch_per_cell)
]
matrix.obs_names_make_unique()

# Save imported matrix

In [None]:
# Persist the AnnData object to the path set in `results_file` in the first code cell.
matrix.write(results_file)