In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import os
import pysam
import numpy as np
import cPickle as pickle
import matplotlib.pyplot as plt
import pandas as pd

In [3]:
from collections import Counter

In [4]:
import logging
logging.basicConfig(level=logging.INFO)

In [220]:
def gapsofsize(l, length):
    
    x = sorted(list(set(l)))
    a = x[1:]
    b = x[:-1]
    it = zip(a,b)
    
    gaps = []
    for i,j in zip(a,b):
        if i - j > length:
            gaps += [j, i]
            
    return gaps

In [11]:
def decode_locus(locus):
    
    l = locus.split(':')
    ref = l[0]
    start = int(l[1])
    
    return ref, start

In [236]:
def loci_clusters(l, length):

    interval = sorted(l)
    gaps = gapsofsize(l, 1500)
    first = interval[0]
    last = interval[-1]
    gaps.append(first)
    gaps.append(last)
    gaps.sort()
    
    clusters = [(i, j + 50) for i, j in zip(*2 * [iter(gaps)]) if not i == j]

    if clusters:
        return clusters
    else:
        return None

In [None]:
def assign_cluster(clusters, loci, xm, ref):
    
    prefix = xm + ':' + ref + ':'
    cluster_map = [None] * len(loci)
    for index, locus in enumerate(loci):
        for i, j in clusters:
            if i <= locus <= j:
                cluster_map[index] = prefix + str(i) + ':' + str(j)
                break
             
    return cluster_map

In [285]:
def update_cluster_map(cluster_map, cluster_id, props):
    
    if not cluster_id in cluster_map:
        cluster_map.update({cluster_id:[props]})
    else:
        cluster_map[cluster_id].append(props)
    

In [144]:
in_file = '/data/parastou/Star-Lab/test/NStar25.Aligned.out.tagged.bam'

In [145]:
st = pysam.AlignmentFile(in_file, 'rb')

In [297]:
def generate_bam_view(st):
    
    st.reset()
    reads = st.fetch(until_eof=True)

    records = []
    for r in reads:
        if r.get_tag('NH') > 0:
            xm = r.get_tag('XM')
            qn = r.query_name
            ref = r.reference_name
            start = r.reference_start
            nh = r.get_tag('NH')
            records.append((xm, qn, ref, start, nh))
    df = pd.DataFrame(records, columns=['XM', 'QName', 'Ref', 'Start', 'NH'])
    
    return df

In [298]:
df = generate_bam_view(st)

In [299]:
df

Unnamed: 0,XM,QName,Ref,Start,NH
0,TGGGGTTACT,L183:338:CAGAAANXX:5:2103:19095:7288,chr1,17632,10
1,GTGCCAACCA,L183:338:CAGAAANXX:4:2313:19904:5163,chr1,18295,10
2,GGGGGCCTAA,L183:338:CAGAAANXX:5:1214:15648:39769,chr1,22502,8
3,ACCAATTCTT,L183:338:CAGAAANXX:4:2312:20639:64473,chr1,31317,8
4,ATGGAATTGA,L183:338:CAGAAANXX:5:1210:11806:84753,chr1,31497,10
5,GTGTGAGTTC,L183:338:CAGAAANXX:4:2102:5550:30193,chr1,38289,3
6,GTGTGAGTTC,L183:338:CAGAAANXX:5:2314:10559:68270,chr1,38289,3
7,GTGTGAGTTC,L183:338:CAGAAANXX:6:1310:13487:97612,chr1,38289,3
8,GCAGGGGTCC,L183:338:CAGAAANXX:6:2301:5021:36991,chr1,51793,968
9,GCAGGGGTCC,L183:338:CAGAAANXX:5:1105:12091:100933,chr1,51797,34


In [345]:
reads = set(df['QName'])

In [346]:
len(reads)

418210

In [300]:
def generate_cluster_map(df):

    cluster_map = {}
    for i, j in df.groupby(['XM','Ref']):

        if len(j) > 1:

            locs = list(j['Start'])
            clusters = loci_clusters(locs, 1500)

            if clusters:
                c_map = assign_cluster(clusters, locs, xm, ref)
                sub_map = {}
                xm, ref = i
                names = list(j['QName'])

                for index, item in enumerate(c_map):

                    if item:
                        props = names[index], locs[index]
                        update_cluster_map(cluster_map, item, props)
                        
    return cluster_map

In [329]:
def generate_umi_group_map(df):
    
    umi_group_map = {}
    for i, j in df.groupby(['XM']):
        size = len(set(j['QName']))
        umi_group_map.update({i:size})
                       
    return umi_group_map

In [330]:
%%time
z = generate_umi_group_map(df)

CPU times: user 5.55 s, sys: 110 ms, total: 5.66 s
Wall time: 5.68 s


In [336]:
maximal_cluster_map = {}
for key, value in cluster_map.items():
    
    a = key.split(':')
    xm = a[0]
    gs = z[xm]
    if  gs - 1 <= len(value) <= gs:
        maximal_cluster_map.update({key:value})

In [338]:
import cPickle as pickle

In [340]:
mcm = pickle.load(open('/data/parastou/pyUMI/pyUMI/MaximalClusters.pkl', 'rb'))

In [None]:
df.to_csv('ClusterMap.csv', sep='\t')

In [352]:
total_reads = [] 
for i in mcm.values():
    for item in i:
        total_reads.append(item[0])


In [354]:
len(set(total_reads))

26829

In [351]:
mcm

{'CAGGGCATTA:chr5:80651373:80651666': [('L183:338:CAGAAANXX:4:2102:12201:70159',
   80651373),
  ('L183:338:CAGAAANXX:5:1209:7777:68868', 80651412),
  ('L183:338:CAGAAANXX:5:1316:10188:80918', 80651412),
  ('L183:338:CAGAAANXX:4:2214:18379:15897', 80651435),
  ('L183:338:CAGAAANXX:5:2105:9829:29729', 80651435),
  ('L183:338:CAGAAANXX:5:2204:19940:86072', 80651616)],
 'CAGTGGAGGG:chr5_KI270897v1_alt:514009:514127': [('L183:338:CAGAAANXX:4:2314:14551:66910',
   514009),
  ('L183:338:CAGAAANXX:6:1311:2304:37304', 514009),
  ('L183:338:CAGAAANXX:4:2313:16884:84604', 514077),
  ('L183:338:CAGAAANXX:6:2308:10669:48145', 514077)],
 'ATTCGTAGGT:chrY:10197779:10197834': [('L183:338:CAGAAANXX:4:1105:19052:50497',
   10197779),
  ('L183:338:CAGAAANXX:4:1204:13256:48628', 10197784)],
 'AGTACTGCCG:chr11:10509712:10510079': [('L183:338:CAGAAANXX:4:2103:10649:8745',
   10509712),
  ('L183:338:CAGAAANXX:6:1207:18517:92493', 10509712),
  ('L183:338:CAGAAANXX:5:1105:9107:58308', 10510029)],
 'AAGTGATAAG

In [348]:
for item in mcm:
    if 'GTTTTTCCCC' in item:
        print 'mm for GTTTTTCCCC'
        print len()

mm for GTTTTTCCCC
mm for GTTTTTCCCC
mm for GTTTTTCCCC
mm for GTTTTTCCCC
mm for GTTTTTCCCC
mm for GTTTTTCCCC
mm for GTTTTTCCCC
mm for GTTTTTCCCC
