In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import os
import pysam
import numpy as np
import cPickle as pickle
import matplotlib.pyplot as plt
import pandas as pd

In [3]:
from collections import Counter

In [4]:
import logging
logging.basicConfig(level=logging.INFO)

In [5]:
# !ls -a '/data/parastou/Star-Lab/test/10samples'

In [6]:
samples = ['AAACAT', 'AGGCGG', 'CACCGG', 'CATCAG', 'CGCCTC', 'GAAGAC', 'GCCTGC', 'GGACAT', 'TATCAT', 'TGTATT']

In [7]:
path = '/data/parastou/Star-Lab/test/10samples'

In [8]:
def generate_infile_names(path, sample):
    """Build the five input file paths for one sample.

    Every file lives directly under ``path`` and its name embeds ``sample``.

    Returns a 5-tuple, in this order: the tagged BAM file, then the
    ``emc``, ``mc``, ``emc_dup`` and ``mc_dup`` pickle files.
    """
    name_templates = (
        'HGsample_%s_NStar25.Aligned.out.tagged.bam',
        'HGsample_%s_NStar25.Aligned.out.tagged.emc.pkl',
        'HGsample_%s_NStar25.Aligned.out.tagged.mc.pkl',
        'HGsample_%s_NStar25.Aligned.out.tagged.emc_dup.pkl',
        'HGsample_%s_NStar25.Aligned.out.tagged.mc_dup.pkl',
    )
    bam, emc, mc, emc_dup, mc_dup = [
        os.path.join(path, template % sample) for template in name_templates
    ]

    return bam, emc, mc, emc_dup, mc_dup

In [9]:
def cluster_stats(clusterMap):
    """Tabulate the clusters in ``clusterMap`` as a DataFrame.

    One row is produced per key. The key is split by position:
    characters 0-9 become ``XM`` and everything from index 11 onwards
    becomes ``Ref``; the character at index 10 is dropped (presumably a
    separator -- TODO confirm against the code that builds the keys).
    ``Reads`` is the list of first elements of the 3-tuples stored
    under that key.
    """
    rows = [
        (cluster_key[:10], cluster_key[11:], [read for read, _, _ in members])
        for cluster_key, members in clusterMap.items()
    ]

    return pd.DataFrame(rows, columns=['XM', 'Ref', 'Reads'])

In [10]:
def unique_max_cls(clusterMap):
    """Count the XM values that occur in exactly one cluster.

    The clusters are tabulated with ``cluster_stats`` and grouped by
    their ``XM`` column; only groups consisting of a single row count.

    Returns ``(number of such single-row groups, number of distinct
    read entries across those rows)``.
    """
    n_unique = 0
    seen_reads = set()
    for _, rows in cluster_stats(clusterMap).groupby(['XM']):
        if len(rows) != 1:
            continue
        n_unique += 1
        # .item() unwraps the single 'Reads' cell, which holds a list of reads.
        seen_reads.update(rows['Reads'].item())

    return n_unique, len(seen_reads)

In [11]:
def generate_bam_view(pysam_iter):
    """Collect selected per-record fields of an alignment file into a DataFrame.

    ``pysam_iter`` is first ``reset()`` and then read through with
    ``fetch(until_eof=True)``. Records whose ``NH`` tag is not greater
    than 0 are skipped.

    Returns a DataFrame with the columns ``XM`` (the XM tag), ``QName``
    (query name), ``Ref`` (reference name), ``Start`` (reference start)
    and ``NH`` (the NH tag), one row per kept record.
    """
    pysam_iter.reset()

    rows = []
    for aln in pysam_iter.fetch(until_eof=True):
        hits = aln.get_tag('NH')
        if not hits > 0:
            continue
        rows.append((
            aln.get_tag('XM'),
            aln.query_name,
            aln.reference_name,
            aln.reference_start,
            hits,
        ))

    return pd.DataFrame(rows, columns=['XM', 'QName', 'Ref', 'Start', 'NH'])

In [12]:
def generate_umi_group_map(df):
    """Map every ``XM`` value in ``df`` to its number of distinct ``QName`` values.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``XM`` and ``QName``.

    Returns
    -------
    dict
        ``{xm_value: count_of_distinct_QName}``; empty when ``df`` has no rows.
    """
    # Group by the bare column label rather than the one-element list ['XM']:
    # pandas >= 2.0 yields 1-tuple group keys for a length-1 list grouper,
    # which would silently turn the dictionary keys into ('<xm>',) tuples.
    return {
        umi: len(set(group['QName']))
        for umi, group in df.groupby('XM')
    }

---------------------

In [13]:
def separate_singular_reads(pysamIter):
    """Split the read count of an alignment file by UMI-group size.

    The file is tabulated with ``generate_bam_view`` and reduced to
    per-group sizes with ``generate_umi_group_map``.

    Returns ``(reads in groups of size > 1, reads in the remaining
    groups, total over all groups)``.
    """
    group_sizes = list(
        generate_umi_group_map(generate_bam_view(pysamIter)).values()
    )

    in_clusters = sum(size for size in group_sizes if size > 1)
    on_their_own = sum(size for size in group_sizes if not size > 1)

    return in_clusters, on_their_own, sum(group_sizes)

In [14]:
def only_unique_mcs(cluster_map):
    """Select the clusters in which every member's third field equals 1.

    Each value of ``cluster_map`` is a collection of 3-tuples. A cluster
    qualifies when the set of third elements is exactly ``{1}``, so
    empty clusters never qualify.

    Returns ``(keys of the qualifying clusters, sum over those clusters
    of their number of distinct first elements)``.
    """
    kept_keys = []
    read_total = 0

    for cluster_key, members in cluster_map.items():
        if {nh for _, _, nh in members} != {1}:
            continue
        kept_keys.append(cluster_key)
        # Distinct first elements are counted per cluster, then summed.
        read_total += len({read for read, _, _ in members})

    return kept_keys, read_total

In [15]:
def separate_only_uniques_mcs(cluster_map):
    """Partition ``cluster_map`` by whether every member's third field equals 1.

    A cluster goes to the "only unique" side when the set of third
    elements of its 3-tuples is exactly ``{1}``; every other cluster,
    including an empty one, goes to the other side.

    Returns ``(other clusters, only-unique clusters)`` -- note the order.
    """
    mixed = {}
    unique_only = {}

    for cluster_key, members in cluster_map.items():
        all_unique = {nh for _, _, nh in members} == {1}
        target = unique_only if all_unique else mixed
        target[cluster_key] = members

    return mixed, unique_only

In [16]:
def mc_stats(cluster_map):
    """Summarise a cluster map.

    Returns ``(number of clusters, number of distinct first elements
    across the 3-tuples of all clusters)``.
    """
    distinct_reads = set()

    for members in cluster_map.values():
        distinct_reads.update(read for read, _, _ in members)

    return len(cluster_map), len(distinct_reads)

### 1 - Sample :  AAACAT

In [17]:
# Sample identifier for this section; it selects the input files below.
sample = 'AAACAT'
# f1: tagged BAM, f2: emc pickle, f3: mc pickle, f4: emc_dup pickle, f5: mc_dup pickle.
f1, f2, f3, f4, f5 = generate_infile_names(path, sample)

In [18]:
# Open this sample's tagged BAM file for binary reading.
# NOTE(review): no st.close() is visible in this notebook; later sections rebind
# `st`, which leaves closing the previous handle to garbage collection.
st = pysam.AlignmentFile(f1, 'rb')

In [19]:
# Pickle data is binary: open with 'rb' (text mode fails on Python 3) and use a
# context manager so the handle is closed as soon as the object is loaded.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open(f2, 'rb') as pkl_file:
    emc = pickle.load(pkl_file)

In [20]:
# Pickle data is binary: open with 'rb' (text mode fails on Python 3) and use a
# context manager so the handle is closed as soon as the object is loaded.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open(f3, 'rb') as pkl_file:
    mc = pickle.load(pkl_file)

In [21]:
# Pickle data is binary: open with 'rb' (text mode fails on Python 3) and use a
# context manager so the handle is closed as soon as the object is loaded.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open(f4, 'rb') as pkl_file:
    emc_dup = pickle.load(pkl_file)

In [22]:
# Pickle data is binary: open with 'rb' (text mode fails on Python 3) and use a
# context manager so the handle is closed as soon as the object is loaded.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open(f5, 'rb') as pkl_file:
    mc_dup = pickle.load(pkl_file)

-------------------------

Total number of reads in the raw .fastq file (number of lines divided by 4):

In [23]:
# A FASTQ record spans four lines, so reads = lines // 4. Floor division keeps
# the result an integer on Python 3 as well; the context manager closes the file.
with open('/data/parastou/UMI/data/HG/RawReads/HGsample_AAACAT.fastq') as fastq_file:
    n_fastq_reads = sum(1 for _ in fastq_file) // 4
n_fastq_reads

679800

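Number of reads in UMI groups with more than one read (non-singular), number of reads that are alone in their UMI group (singular), and total number of mapped reads: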

In [24]:
separate_singular_reads(st)

(543423, 25323, 568746)

### CASE 1 

In [25]:
a, b = separate_only_uniques_mcs(emc)
c, d = separate_only_uniques_mcs(mc)

EMCs, total reads

In [26]:
mc_stats(emc)

(9571, 11385)

MCs, total reads

In [27]:
mc_stats(mc)

(4349, 243738)

EMCs with only unique reads:

In [28]:
mc_stats(b)

(422, 1629)

MCs with only unique reads

In [29]:
mc_stats(d)

(106, 793)

Unique EMCs, total reads 

In [30]:
unique_max_cls(a)

(578, 4637)

Unique MCs, total reads 

In [31]:
unique_max_cls(c)

(496, 169553)

### CASE 2 

EMCs (with duplicate reads), total reads

In [32]:
mc_stats(emc_dup)

(6915, 4373)

MCs (with duplicate reads), total reads

In [33]:
mc_stats(mc_dup)

(1644, 290597)

### 2 - Sample :  AGGCGG

In [34]:
sample = 'AGGCGG'
f1, f2, f3, f4, f5 = generate_infile_names(path, sample)

In [35]:
st = pysam.AlignmentFile(f1, 'rb')

In [36]:
emc = pickle.load(open(f2, 'r'))

In [37]:
mc = pickle.load(open(f3, 'r'))

In [38]:
emc_dup = pickle.load(open(f4, 'r'))

In [39]:
mc_dup = pickle.load(open(f5, 'r'))

-------------------------

Total number of reads in .fastq file :

In [40]:
sum(1 for line in open('/data/parastou/UMI/data/HG/RawReads/HGsample_AGGCGG.fastq')) / 4

474200

Number of non-singular reads, singular reads, total mapped reads:

In [41]:
separate_singular_reads(st)

(267444, 33605, 301049)

### CASE 1 

In [42]:
a, b = separate_only_uniques_mcs(emc)
c, d = separate_only_uniques_mcs(mc)

EMCs, total reads

In [43]:
mc_stats(emc)

(23583, 20957)

MCs, total reads

In [44]:
mc_stats(mc)

(3438, 107967)

EMCs with only unique reads:

In [45]:
mc_stats(b)

(2998, 8170)

MCs with only unique reads

In [46]:
mc_stats(d)

(343, 1327)

Unique EMCs, total reads 

In [47]:
unique_max_cls(a)

(1640, 6017)

Unique MCs, total reads 

In [48]:
unique_max_cls(c)

(391, 72222)

### CASE 2 

EMCs (with duplicate reads), total reads

In [49]:
mc_stats(emc_dup)

(7831, 3317)

MCs (with duplicate reads), total reads

In [50]:
mc_stats(mc_dup)

(1945, 143522)

### 3 - Sample :  GGACAT

In [51]:
sample = 'GGACAT'
f1, f2, f3, f4, f5 = generate_infile_names(path, sample)

In [52]:
st = pysam.AlignmentFile(f1, 'rb')

In [53]:
emc = pickle.load(open(f2, 'r'))

In [54]:
mc = pickle.load(open(f3, 'r'))

In [55]:
emc_dup = pickle.load(open(f4, 'r'))

In [56]:
mc_dup = pickle.load(open(f5, 'r'))

-------------------------

Total number of reads in .fastq file :

In [57]:
sum(1 for line in open('/data/parastou/UMI/data/HG/RawReads/HGsample_GGACAT.fastq')) / 4

467804

Number of non-singular reads, singular reads, total mapped reads:

In [58]:
separate_singular_reads(st)

(324907, 29454, 354361)

### CASE 1 

In [59]:
a, b = separate_only_uniques_mcs(emc)
c, d = separate_only_uniques_mcs(mc)

EMCs, total reads

In [60]:
mc_stats(emc)

(18645, 13750)

MCs, total reads

In [61]:
mc_stats(mc)

(4267, 143383)

EMCs with only unique reads:

In [62]:
mc_stats(b)

(1089, 3083)

MCs with only unique reads

In [63]:
mc_stats(d)

(147, 520)

Unique EMCs, total reads 

In [64]:
unique_max_cls(a)

(910, 4242)

Unique MCs, total reads 

In [65]:
unique_max_cls(c)

(322, 74208)

### CASE 2 

EMCs (with duplicate reads), total reads

In [66]:
mc_stats(emc_dup)

(4979, 3035)

MCs (with duplicate reads), total reads

In [67]:
mc_stats(mc_dup)

(1698, 188358)

### 4 - Sample : TGTATT

In [68]:
sample = 'TGTATT'
f1, f2, f3, f4, f5 = generate_infile_names(path, sample)

In [69]:
st = pysam.AlignmentFile(f1, 'rb')

In [70]:
emc = pickle.load(open(f2, 'r'))

In [71]:
mc = pickle.load(open(f3, 'r'))

In [72]:
emc_dup = pickle.load(open(f4, 'r'))

In [73]:
mc_dup = pickle.load(open(f5, 'r'))

-------------------------

Total number of reads in .fastq file :

In [74]:
sum(1 for line in open('/data/parastou/UMI/data/HG/RawReads/HGsample_TGTATT.fastq')) / 4

654408

Number of non-singular reads, singular reads, total mapped reads:

In [75]:
separate_singular_reads(st)

(462126, 22533, 484659)

### CASE 1 

In [76]:
a, b = separate_only_uniques_mcs(emc)
c, d = separate_only_uniques_mcs(mc)

EMCs, total reads

In [77]:
mc_stats(emc)

(15156, 15209)

MCs, total reads

In [78]:
mc_stats(mc)

(5326, 200253)

EMCs with only unique reads:

In [79]:
mc_stats(b)

(1540, 4288)

MCs with only unique reads

In [80]:
mc_stats(d)

(178, 675)

Unique EMCs, total reads 

In [81]:
unique_max_cls(a)

(1168, 4543)

Unique MCs, total reads 

In [82]:
unique_max_cls(c)

(451, 133350)

### CASE 2 

EMCs (with duplicate reads), total reads

In [83]:
mc_stats(emc_dup)

(5123, 2449)

MCs (with duplicate reads), total reads

In [84]:
mc_stats(mc_dup)

(1965, 256776)

### 5 - Sample :  CACCGG

In [85]:
sample = 'CACCGG'
f1, f2, f3, f4, f5 = generate_infile_names(path, sample)

In [86]:
st = pysam.AlignmentFile(f1, 'rb')

In [87]:
emc = pickle.load(open(f2, 'r'))

In [88]:
mc = pickle.load(open(f3, 'r'))

In [89]:
emc_dup = pickle.load(open(f4, 'r'))

In [90]:
mc_dup = pickle.load(open(f5, 'r'))

-------------------------

Total number of reads in .fastq file :

In [91]:
sum(1 for line in open('/data/parastou/UMI/data/HG/RawReads/HGsample_CACCGG.fastq')) / 4

511167

Number of non-singular reads, singular reads, total mapped reads:

In [92]:
separate_singular_reads(st)

(354676, 19821, 374497)

### CASE 1 

In [93]:
a, b = separate_only_uniques_mcs(emc)
c, d = separate_only_uniques_mcs(mc)

EMCs, total reads

In [94]:
mc_stats(emc)

(8473, 9630)

MCs, total reads

In [95]:
mc_stats(mc)

(2918, 148353)

EMCs with only unique reads:

In [96]:
mc_stats(b)

(331, 1417)

MCs with only unique reads

In [97]:
mc_stats(d)

(49, 1318)

Unique EMCs, total reads 

In [98]:
unique_max_cls(a)

(480, 2967)

Unique MCs, total reads 

In [99]:
unique_max_cls(c)

(269, 89501)

### CASE 2 

EMCs (with duplicate reads), total reads

In [100]:
mc_stats(emc_dup)

(6148, 4269)

MCs (with duplicate reads), total reads

In [101]:
mc_stats(mc_dup)

(1365, 212542)

### 6 - Sample :  GAAGAC

In [102]:
sample = 'GAAGAC'
f1, f2, f3, f4, f5 = generate_infile_names(path, sample)

In [103]:
st = pysam.AlignmentFile(f1, 'rb')

In [104]:
emc = pickle.load(open(f2, 'r'))

In [105]:
mc = pickle.load(open(f3, 'r'))

In [106]:
emc_dup = pickle.load(open(f4, 'r'))

In [107]:
mc_dup = pickle.load(open(f5, 'r'))

-------------------------

Total number of reads in .fastq file :

In [108]:
sum(1 for line in open('/data/parastou/UMI/data/HG/RawReads/HGsample_GAAGAC.fastq')) / 4

729888

Number of non-singular reads, singular reads, total mapped reads:

In [109]:
separate_singular_reads(st)

(499091, 41343, 540434)

### CASE 1 

In [110]:
a, b = separate_only_uniques_mcs(emc)
c, d = separate_only_uniques_mcs(mc)

EMCs, total reads

In [111]:
mc_stats(emc)

(26626, 27814)

MCs, total reads

In [112]:
mc_stats(mc)

(7846, 185323)

EMCs with only unique reads:

In [113]:
mc_stats(b)

(2513, 7616)

MCs with only unique reads

In [114]:
mc_stats(d)

(443, 2009)

Unique EMCs, total reads 

In [115]:
unique_max_cls(a)

(1805, 8669)

Unique MCs, total reads 

In [116]:
unique_max_cls(c)

(623, 125750)

### CASE 2 

EMCs (with duplicate reads), total reads

In [117]:
mc_stats(emc_dup)

(9181, 5805)

MCs (with duplicate reads), total reads

In [118]:
mc_stats(mc_dup)

(2328, 277197)

### 7 - Sample :  CATCAG

In [119]:
sample = 'CATCAG'
f1, f2, f3, f4, f5 = generate_infile_names(path, sample)

In [120]:
st = pysam.AlignmentFile(f1, 'rb')

In [121]:
emc = pickle.load(open(f2, 'r'))

In [122]:
mc = pickle.load(open(f3, 'r'))

In [123]:
emc_dup = pickle.load(open(f4, 'r'))

In [124]:
mc_dup = pickle.load(open(f5, 'r'))

-------------------------

Total number of reads in .fastq file :

In [125]:
sum(1 for line in open('/data/parastou/UMI/data/HG/RawReads/HGsample_CATCAG.fastq')) / 4

265258

Number of non-singular reads, singular reads, total mapped reads:

In [126]:
separate_singular_reads(st)

(70729, 27028, 97757)

### CASE 1 

In [127]:
a, b = separate_only_uniques_mcs(emc)
c, d = separate_only_uniques_mcs(mc)

EMCs, total reads

In [128]:
mc_stats(emc)

(19486, 15302)

MCs, total reads

In [129]:
mc_stats(mc)

(2397, 13228)

EMCs with only unique reads:

In [130]:
mc_stats(b)

(2234, 5849)

MCs with only unique reads

In [131]:
mc_stats(d)

(251, 903)

Unique EMCs, total reads 

In [132]:
unique_max_cls(a)

(1282, 4258)

Unique MCs, total reads 

In [133]:
unique_max_cls(c)

(260, 7126)

### CASE 2 

EMCs (with duplicate reads), total reads

In [134]:
mc_stats(emc_dup)

(4996, 2066)

MCs (with duplicate reads), total reads

In [135]:
mc_stats(mc_dup)

(1155, 33484)

### 8 - Sample :  GCCTGC

In [136]:
sample = 'GCCTGC'
f1, f2, f3, f4, f5 = generate_infile_names(path, sample)

In [137]:
st = pysam.AlignmentFile(f1, 'rb')

In [138]:
emc = pickle.load(open(f2, 'r'))

In [139]:
mc = pickle.load(open(f3, 'r'))

In [140]:
emc_dup = pickle.load(open(f4, 'r'))

In [141]:
mc_dup = pickle.load(open(f5, 'r'))

-------------------------

Total number of reads in .fastq file :

In [142]:
sum(1 for line in open('/data/parastou/UMI/data/HG/RawReads/HGsample_GCCTGC.fastq')) / 4

997692

Number of non-singular reads, singular reads, total mapped reads:

In [143]:
separate_singular_reads(st)

(673089, 67514, 740603)

### CASE 1 

In [144]:
a, b = separate_only_uniques_mcs(emc)
c, d = separate_only_uniques_mcs(mc)

EMCs, total reads

In [145]:
mc_stats(emc)

(75365, 43351)

MCs, total reads

In [146]:
mc_stats(mc)

(19161, 241260)

EMCs with only unique reads:

In [147]:
mc_stats(b)

(4329, 11457)

MCs with only unique reads

In [148]:
mc_stats(d)

(1058, 3502)

Unique EMCs, total reads 

In [149]:
unique_max_cls(a)

(2667, 9270)

Unique MCs, total reads 

In [150]:
unique_max_cls(c)

(1190, 141441)

### CASE 2 

EMCs (with duplicate reads), total reads

In [151]:
mc_stats(emc_dup)

(10230, 7648)

MCs (with duplicate reads), total reads

In [152]:
mc_stats(mc_dup)

(5740, 381537)

### 9 - Sample :  TATCAT

In [153]:
sample = 'TATCAT'
f1, f2, f3, f4, f5 = generate_infile_names(path, sample)

In [154]:
st = pysam.AlignmentFile(f1, 'rb')

In [155]:
emc = pickle.load(open(f2, 'r'))

In [156]:
mc = pickle.load(open(f3, 'r'))

In [157]:
emc_dup = pickle.load(open(f4, 'r'))

In [158]:
mc_dup = pickle.load(open(f5, 'r'))

-------------------------

Total number of reads in .fastq file :

In [159]:
sum(1 for line in open('/data/parastou/UMI/data/HG/RawReads/HGsample_TATCAT.fastq')) / 4

986475

Number of non-singular reads, singular reads, total mapped reads:

In [160]:
separate_singular_reads(st)

(797327, 40750, 838077)

### CASE 1 

In [161]:
a, b = separate_only_uniques_mcs(emc)
c, d = separate_only_uniques_mcs(mc)

EMCs, total reads

In [162]:
mc_stats(emc)

(46140, 31101)

MCs, total reads

In [163]:
mc_stats(mc)

(9228, 351467)

EMCs with only unique reads:

In [164]:
mc_stats(b)

(3624, 8918)

MCs with only unique reads

In [165]:
mc_stats(d)

(573, 2261)

Unique EMCs, total reads 

In [166]:
unique_max_cls(a)

(2059, 7807)

Unique MCs, total reads 

In [167]:
unique_max_cls(c)

(729, 226073)

### CASE 2 

EMCs (with duplicate reads), total reads

In [168]:
mc_stats(emc_dup)

(9510, 5637)

MCs (with duplicate reads), total reads

In [169]:
mc_stats(mc_dup)

(3594, 462122)

### 10 - Sample :  CGCCTC

In [170]:
sample = 'CGCCTC'
f1, f2, f3, f4, f5 = generate_infile_names(path, sample)

In [171]:
st = pysam.AlignmentFile(f1, 'rb')

In [172]:
emc = pickle.load(open(f2, 'r'))

In [173]:
mc = pickle.load(open(f3, 'r'))

In [174]:
emc_dup = pickle.load(open(f4, 'r'))

In [175]:
mc_dup = pickle.load(open(f5, 'r'))

-------------------------

Total number of reads in .fastq file :

In [176]:
sum(1 for line in open('/data/parastou/UMI/data/HG/RawReads/HGsample_CGCCTC.fastq')) / 4

1021496

Number of non-singular reads, singular reads, total mapped reads:

In [177]:
separate_singular_reads(st)

(641462, 56087, 697549)

### CASE 1 

In [178]:
a, b = separate_only_uniques_mcs(emc)
c, d = separate_only_uniques_mcs(mc)

EMCs, total reads

In [179]:
mc_stats(emc)

(20242, 19530)

MCs, total reads

In [180]:
mc_stats(mc)

(18388, 306128)

EMCs with only unique reads:

In [181]:
mc_stats(b)

(1103, 3835)

MCs with only unique reads

In [182]:
mc_stats(d)

(234, 1899)

Unique EMCs, total reads 

In [183]:
unique_max_cls(a)

(1650, 7270)

Unique MCs, total reads 

In [184]:
unique_max_cls(c)

(1569, 211187)

### CASE 2 

EMCs (with duplicate reads), total reads

In [185]:
mc_stats(emc_dup)

(11646, 9481)

MCs (with duplicate reads), total reads

In [186]:
mc_stats(mc_dup)

(7259, 297891)