In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import pysam
import numpy as np
import cPickle as pickle
from collections import Counter
import pandas as pd

In [3]:
import matplotlib.pyplot as plt

In [4]:
import logging
logging.basicConfig(level=logging.INFO)

In [5]:
import Bio.SeqIO
import StringIO

In [6]:
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import generic_dna

In [7]:
import os

In [79]:
in_file = '/data/parastou/UMI/data/HG/RawReads/HGsample_AAATCA.fastq' #'/data/parastou/UMI/data/test.fastq'
in_umi = '/data/parastou/UMI/data/HG/RawReads/HGsample_AAATCA.umi' #'/data/parastou/UMI/data/test.umi'

---------------------

BUILD A TEST DATA STRUCTURE

In [84]:
def names_to_umi_dict(in_file, in_umi):
    
    qnames = []
    with open(in_file, 'rb') as f:
        for line in f:
            if 'L183:338:CAGAAANXX' in line:
                qnames.append(line.strip('@').strip('\n'))
    
    umis = []
    with open(in_umi, 'rb') as f:
        for line in f:
            umis.append(line.strip('\n'))
            
    qu_map = {}
    for index, q in enumerate(qnames):
        qu_map.update({q: umis[index]})

    return qu_map

In [85]:
validation_map = names_to_umi_dict(in_file, in_umi)

------------------------------

In [52]:
def fastq_umi_to_df(in_file, in_umi):

    fastq_records = []
    for record in Bio.SeqIO.parse(in_file, 'fastq'):
        r = (record.id, str(record.seq), record.letter_annotations["phred_quality"])
        fastq_records.append(r)
    
    umis = []
    with open(in_umi, 'r') as fu:
        for rec in fu:
            umis.append(rec.rstrip())
            
    records = []
    for index, (i,j,k) in enumerate(fastq_records):
        records.append((i,j,k,umis[index]))
    
    x = pd.DataFrame(records, columns=['NAME', 'SEQ', 'QUAL', 'XM'])

    return x

In [100]:
x = fastq_umi_to_df(in_file, in_umi)

TEST DATAFRAME COHERENCY WITH VALIDATION MAP

In [95]:
for key, value in validation_map.items():
    
    if not (x[x['NAME'] == key]['XM'].item()) == value:
        print 'Error!'

KeyboardInterrupt: 

-----------------------------

In [11]:
def modify_qual(quals):
    
    a = np.vstack(quals)
    new_qual = np.amax(a, axis = 0)

    return new_qual

In [60]:
def to_IOSeq_rec(row):
    
    qname = row['NAME']
    seq = row['SEQ']
    qual = row['QUAL']
    xm = row['XM']

    record = SeqRecord(Seq(seq, generic_dna), id=qname, name=qname, description='', dbxrefs=[])
    record.letter_annotations["phred_quality"] = qual

    return record, xm

In [104]:
def dedup_df_records(df):
    
    new_records = []
    dup_count = 0
    rec_count = 0

    for (xm, seq), g in df.groupby(['XM', 'SEQ']):

        if len(g) > 1:
            name = list(g['NAME'])[0] + 'M'
            qual = modify_qual(list(g['QUAL']))
            new_records.append((name, seq, qual, xm))
            dup_count += (len(g) - 1)
        else:
            new_records.append((g['NAME'].item(), seq, g['QUAL'].item(), xm))

        rec_count += 1
        
    print 'Number of duplicate records:\t\t%s ' %format(dup_count, ',')
    print 'Total records in deduplicated file:\t%s' %format(rec_count,',')
    print 'Checksum (duplicates + new):\t\t%s' %format(dup_count + rec_count, ',')
    
    return new_records

In [115]:
def df_to_file(df, out_file, out_umi):
    
    with open(out_file, 'w') as fq:
        with open(out_umi, 'w') as fu:
            for index, row in df.iterrows():

                record, xm = to_IOSeq_rec(row)
                    
                Bio.SeqIO.write(record, fq, 'fastq')
                fu.write(xm + '\n')

    fu.close()
    fq.close()
    print 'Write operation done successfully!'

-----------------------------------------

In [105]:
y = pd.DataFrame(dedup_df_records(x), columns = ['NAME', 'SEQ', 'QUAL', 'XM'])

Number of duplicate records:		427,029 
Total records in deduplicated file:	207,094
Checksum (duplicates + new):		634,123


CHECK DEDUPLICATED DATAFRAME COHERENCY WITH VALIDATION MAP

In [111]:
for index, row in x.iterrows():
    
    qname = row['NAME'].rstrip('M')
    xm = row['XM']
    
    if not xm == validation_map[qname]:
        print 'Error!'

In [112]:
path = '/data/parastou/UMI/data/HG/PreprocReads/'

out_file = os.path.join(path, '%s.dedup.fastq' % os.path.basename(in_file).split('.')[0])
out_umi = os.path.join(path, '%s.dedup.umi' % os.path.basename(in_umi).split('.')[0])

In [116]:
df_to_file(y, out_file, out_umi)

Write operation done successfully!


-----------------------------------

LOAD DEDUPLICATED FASTQ AND UMI FILE AND CHECK MAP COHERENCY

In [117]:
in_file2 = '/data/parastou/UMI/data/HG/PreprocReads/HGsample_AAATCA.dedup.fastq'
in_umi2 = '/data/parastou/UMI/data/HG/PreprocReads/HGsample_AAATCA.dedup.umi'
dc2 = names_to_umi_dict(in_file2, in_umi2)

In [120]:
check = []
for key, value in dc2.items():
    
    qname = key.rstrip('M')
    if validation_map[qname] == value:
        check.append('Coorect')