In [1]:
from Bio import SeqIO
import pandas as pd
import numpy as np
import re
import os
import random
import subprocess
from collections import defaultdict
from tqdm import tqdm
from collections import Counter

In [2]:
# Base directory of the datasets. process_filtered_contigs_csv appends
# '/<dataset_name>/vdj_v1_<dataset_name>_b_filtered_contig_annotations.csv'
# to this string to locate the annotation file of a dataset.
PATH = "~/data/paired/10x"

In [3]:
# Groups of values of the 'chain' column. They are used to select contigs
# (get_info_by_chain_type) and to build the bcr / tcr / trash summary printed
# by chains_simple_stats.
bcr = ['IGH', 'IGL', 'IGK']
tcr = ['TRA', 'TRB', 'TRG']
# 'Multi' marks contigs in which segments from multiple chains were present
# (see the note printed by chains_simple_stats).
trash = ['Multi', 'None']

In [4]:
# helper functions

def dict_to_sorted_df(dict_name, column_labels):
    """Build a DataFrame from a dict and return it sorted by its row index.

    Every key of ``dict_name`` becomes a row label and its value fills that
    row: a scalar value produces a single column, a list produces one column
    per element.

    Parameters
    ----------
    dict_name : dict
        Mapping of row label -> scalar or list of row values.
    column_labels : list of str
        Names assigned to the resulting columns; the number of labels must
        match the number of columns produced from the dict values.

    Returns
    -------
    pandas.DataFrame
        The frame with ``column_labels`` as columns, rows sorted by index.
    """
    df = pd.DataFrame.from_dict(dict_name, orient='index')
    df.columns = column_labels
    # ``axis`` has to be given by keyword: the positional form
    # ``sort_index(0)`` is deprecated in pandas 1.x and rejected with a
    # TypeError by pandas 2.x, where the argument is keyword-only.
    return df.sort_index(axis=0)

def add_perc_to_df(dict_name, column_name):
    """Add a percentage column computed from an existing count column.

    Reads the column named ``'# ' + column_name``, divides each entry by the
    total of that column, scales by 100 and stores the result under
    ``'% ' + column_name``. The frame passed in is modified in place and the
    same object is returned.
    """
    counts_label = '# ' + column_name
    share_label = '% ' + column_name

    counts = dict_name[counts_label]
    total = sum(counts)

    # Same evaluation order as a plain "counts / total * 100" expression.
    dict_name[share_label] = counts / total * 100
    return dict_name

In [5]:
def get_info_by_chain_type(filtered_contigs_info, chain_type):
    """Return the rows whose 'chain' value is one of those in ``chain_type``.

    ``chain_type`` is a collection of chain labels; the original row index
    and all columns of the selected rows are kept.
    """
    is_wanted_chain = filtered_contigs_info['chain'].isin(chain_type)
    return filtered_contigs_info.loc[is_wanted_chain]
    

def chains_simple_stats(filtered_contigs_info):
    """Print per-chain contig counts/percentages and a bcr/tcr/trash summary.

    The rows of ``filtered_contigs_info`` are grouped by their 'chain' value.
    For every chain the number of rows and its share of all rows are printed,
    followed by the summed shares of the module-level ``bcr``, ``tcr`` and
    ``trash`` chain groups. Nothing is returned.
    """
    # chain type -> number of contigs carrying that 'chain' value
    contigs_per_chain = {}
    for chain_name, chain_rows in filtered_contigs_info.groupby('chain'):
        contigs_per_chain[chain_name] = len(chain_rows)

    per_chain_table = add_perc_to_df(
        dict_to_sorted_df(contigs_per_chain, ['# contigs']),
        'contigs',
    )

    def percent_for(chain_group):
        # Total '% contigs' over the table rows whose label is in chain_group.
        in_group = per_chain_table.index.isin(chain_group)
        return sum(per_chain_table.loc[in_group]['% contigs'])

    summary_rows = {
        "bcr": [percent_for(bcr), "ok"],
        "tcr": [percent_for(tcr), "filter out"],
        "trash": [percent_for(trash), "igblast for fun + filter out"],
    }
    summary = dict_to_sorted_df(summary_rows, ["% contigs", "TODO"])

    # print stats
    total_contigs = sum(per_chain_table['# contigs'])
    print("> Number of all contigs: {}\n".format(total_contigs))

    print("> Contigs by chain types:\n")
    print(per_chain_table)
    print("\n(A value of Multi indicates that segments from multiple chains were present)")

    print("\n> Summary:\n")
    print(summary)
    

def cells_simple_stats(bcr_contigs_info):
    """Print how many chains each cell has and which chain combinations occur.

    The rows of ``bcr_contigs_info`` are grouped by 'barcode'. Two tables are
    printed: cells bucketed by their number of rows, and cells bucketed by the
    per-'chain' row counts of the cell (most frequent combination first).
    Nothing is returned.
    """
    chains_per_cell = {}   # cell barcode -> number of rows for that barcode
    combo_per_cell = {}    # cell barcode -> text form of its per-chain counts
    for cell_barcode, cell_rows in bcr_contigs_info.groupby('barcode'):
        chains_per_cell[cell_barcode] = len(cell_rows)
        combo_per_cell[cell_barcode] = str(dict(Counter(cell_rows['chain'])))

    # number of chains -> number of cells with that many chains
    cells_per_chain_count = dict(Counter(chains_per_cell.values()))
    summary = add_perc_to_df(
        dict_to_sorted_df(cells_per_chain_count, ['# cells']),
        'cells',
    )
    summary.index = [str(chain_count) + " chains" for chain_count in summary.index]

    # chain combination -> number of cells showing it, most common first
    cells_per_combo = dict(Counter(combo_per_cell.values()))
    accurate_counts = dict_to_sorted_df(cells_per_combo, ['# cells'])
    accurate_counts = add_perc_to_df(
        accurate_counts.sort_values(by='# cells', ascending=False),
        'cells',
    )

    # print stats
    print("> Number of all cells: {}\n".format(len(chains_per_cell)))

    print("> Cells by number of all chains\n")
    print(summary)

    print("\n> Cells by combination of chains\n")
    print(accurate_counts)
    
    

In [6]:
def process_filtered_contigs_csv(dataset_name):
    """Load the filtered contig annotations of a dataset and print its stats.

    The CSV is read from
    ``PATH/<dataset_name>/vdj_v1_<dataset_name>_b_filtered_contig_annotations.csv``.
    Contig-level statistics are printed for all rows, then cell-level
    statistics for the rows whose chain is in the module-level ``bcr`` list.
    Nothing is returned.
    """
    relative_csv = '/{}/vdj_v1_{}_b_filtered_contig_annotations.csv'.format(dataset_name, dataset_name)
    annotation_fname = PATH + relative_csv

    # read filtered_contig_annotations.csv
    contigs = pd.read_csv(annotation_fname, delimiter=',')

    print("\n\n=========== contigs ===========\n\n")
    chains_simple_stats(contigs)

    print("\n\n=========== bcr chains ===========\n\n")
    cells_simple_stats(get_info_by_chain_type(contigs, bcr))
    

# CD19

CD19+ B cells isolated from peripheral blood mononuclear cells (PBMCs) of a healthy donor, purchased from AllCells (Catalog #: PB010-0), 93% viable by Trypan blue stain.

CD19+ B cells are primary cells with relatively small amounts of RNA (~1pg RNA/cell).

Libraries were prepared following the Single Cell V(D)J Reagent Kits User Guide (CG000086 RevC).



In [7]:
# Print the contig-level and cell-level statistics for the 'cd19' dataset.
process_filtered_contigs_csv('cd19')





> Number of all contigs: 33486

> Contigs by chain types:

       # contigs  % contigs
IGH        11074  33.070537
IGK         7234  21.603058
IGL         5191  15.502001
Multi       7488  22.361584
None           4   0.011945
TRA          190   0.567401
TRB         2296   6.856597
TRG            9   0.026877

(A value of Multi indicates that segments from multiple chains were present)

> Summary:

       % contigs                          TODO
bcr    70.175596                            ok
tcr     7.450875                    filter out
trash  22.373529  igblast for fun + filter out




> Number of all cells: 9467

> Cells by number of all chains

          # cells    % cells
1 chains       81   0.855604
2 chains     5815  61.423894
3 chains     2737  28.910954
4 chains      639   6.749762
5 chains      156   1.647829
6 chains       33   0.348579
7 chains        5   0.052815
8 chains        1   0.010563

> Cells by combination of chains

                                # cells    %

# GM12878

B-lymphoblastoid cell line GM12878, purchased from Coriell (Catalog #: GM12878), 87% viable by Trypan blue stain.

GM12878 is a B-lymphoblastoid cell line with a high expression level of Ig transcripts (mostly of the IGHM/D and IGL isotypes).

Libraries were prepared following the Single Cell V(D)J Reagent Kits User Guide (CG000086 RevC).

In [8]:
# Print the contig-level and cell-level statistics for the 'gm12878' dataset.
process_filtered_contigs_csv('gm12878')





> Number of all contigs: 2711

> Contigs by chain types:

       # contigs  % contigs
IGH         1019  37.587606
IGK           17   0.627075
IGL         1188  43.821468
Multi        447  16.488381
TRA            9   0.331981
TRB           31   1.143489

(A value of Multi indicates that segments from multiple chains were present)

> Summary:

       % contigs                          TODO
bcr    82.036149                            ok
tcr     1.475470                    filter out
trash  16.488381  igblast for fun + filter out




> Number of all cells: 865

> Cells by number of all chains

          # cells    % cells
1 chains       14   1.618497
2 chains      475  54.913295
3 chains      259  29.942197
4 chains      102  11.791908
5 chains       15   1.734104

> Cells by combination of chains

                                # cells    % cells
{'IGL': 1, 'IGH': 1}                463  53.526012
{'IGL': 2, 'IGH': 1}                211  24.393064
{'IGL': 2, 'IGH': 2}                