In [9]:
from __future__ import print_function
import itertools
import pandas as pd
import numpy as np
import os
import logging
import glob
import networkx as nx
import math
from tqdm import tqdm

In [10]:
# All entries under the quantification directory. glob.glob returns matches in
# arbitrary (filesystem-dependent) order, so sort them to keep every list
# derived from `sampdirs` reproducible between runs and machines.
sampdirs = sorted(glob.glob('../../sailfish_quant/*'))
# Name of the per-sample sub-directory that holds eq_classes.txt.
auxDir = "aux"
# `cutoff` and `netfile` are only referenced by the disabled call below.
cutoff = 0.5
netfile = 'netfile'
# output = rapObj.buildNetFile(sampdirs, netfile, cutoff, auxDir, False)
ofile ='filterGraph_svm.txt'
# Experimental design: condition name -> {sample label -> sample directory}.
expDict = {
    'scramble': {'SRR493366_quant': '../../sailfish_quant/SRR493366_quant',
                 'SRR493367_quant': '../../sailfish_quant/SRR493367_quant',
                 'SRR493368_quant': '../../sailfish_quant/SRR493368_quant'},
    'HOXA1KD': {'SRR493369_quant': '../../sailfish_quant/SRR493369_quant',
                'SRR493370_quant': '../../sailfish_quant/SRR493370_quant',
                'SRR493371_quant': '../../sailfish_quant/SRR493371_quant'}
}

In [11]:
class EquivCollection(object):
    """Accumulator for equivalence classes pooled over several samples.

    Attributes are plain public fields:
      * ``tnames``    -- list of transcript names (empty until ``setNames``).
      * ``eqClasses`` -- dict mapping a class key to its summed count.
      * ``hasNames``  -- True once ``setNames`` has been called.
    """

    def __init__(self):
        self.hasNames = False
        self.eqClasses = {}
        self.tnames = []

    def setNames(self, names):
        """Store ``names`` as the transcript-name list and flag it as loaded."""
        self.hasNames = True
        self.tnames = names

    def add(self, tids, count):
        """Add ``count`` to the running total kept for the class key ``tids``."""
        try:
            self.eqClasses[tids] += count
        except KeyError:
            # First time this class key is seen: start its total at `count`.
            self.eqClasses[tids] = count


In [12]:
def readEqClass(eqfile, eqCollection):
    """Read one equivalence-class file and merge it into ``eqCollection``.

    Layout as consumed here: the first line is the transcript count, the
    second the class count, followed by one transcript name per line and then
    one tab-separated class per line. In a class line the first field is
    ignored, the last field is the count, and the fields in between are
    integer transcript ids.

    Transcript names are handed to ``eqCollection.setNames`` only when the
    collection does not have names yet; otherwise those lines are skipped.
    Every class is forwarded to ``eqCollection.add``.
    """
    with open(eqfile) as handle:
        def next_line():
            return handle.readline().rstrip()

        numTran = int(next_line())
        numEq = int(next_line())
        print("\nfile: {}; # tran = {}; # eq = {}".format(eqfile, numTran, numEq))

        # The name section is always consumed so the class section lines up.
        names = [next_line() for _ in range(numTran)]
        if not eqCollection.hasNames:
            eqCollection.setNames(names)

        for _ in range(numEq):
            fields = [int(tok) for tok in next_line().split('\t')]
            eqCollection.add(tuple(fields[1:-1]), fields[-1])


In [13]:
def getCountsFromEquiv(eqCollection):
    """Return a dict mapping each transcript name to a pseudo-counted total.

    For every class in ``eqCollection.eqClasses`` the class count is credited
    in full to each transcript id in its key (ids index into
    ``eqCollection.tnames``). Afterwards 1.0 is added once per entry of
    ``tnames`` so that no transcript ends up with a zero total.
    """
    names = eqCollection.tnames
    totals = {}
    for members, reads in eqCollection.eqClasses.items():
        for idx in members:
            name = names[idx]
            totals[name] = totals.get(name, 0) + reads
    # Pseudo-count: guarantees a non-zero value for later divisions.
    for name in names:
        totals[name] = totals.get(name, 0) + 1.0
    return totals

In [14]:
# Pool the equivalence classes of all samples belonging to each condition.
conditions = expDict.keys()
logging.info("conditions = {}".format(conditions))
eqClasses = {}
for cond in conditions:
    print(expDict[cond])
    for sampNum, sampPath in expDict[cond].items():
        # One EquivCollection per condition, created when its first sample is read.
        collection = eqClasses.get(cond)
        if collection is None:
            collection = eqClasses[cond] = EquivCollection()
        eqPath = os.path.sep.join((sampPath, auxDir, "eq_classes.txt"))
        readEqClass(eqPath, collection)

{'SRR493368_quant': '../../sailfish_quant/SRR493368_quant', 'SRR493366_quant': '../../sailfish_quant/SRR493366_quant', 'SRR493367_quant': '../../sailfish_quant/SRR493367_quant'}

file: ../../sailfish_quant/SRR493368_quant/aux/eq_classes.txt; # tran = 107389; # eq = 101801

file: ../../sailfish_quant/SRR493366_quant/aux/eq_classes.txt; # tran = 107389; # eq = 95472

file: ../../sailfish_quant/SRR493367_quant/aux/eq_classes.txt; # tran = 107389; # eq = 98035
{'SRR493371_quant': '../../sailfish_quant/SRR493371_quant', 'SRR493370_quant': '../../sailfish_quant/SRR493370_quant', 'SRR493369_quant': '../../sailfish_quant/SRR493369_quant'}

file: ../../sailfish_quant/SRR493371_quant/aux/eq_classes.txt; # tran = 107389; # eq = 104868

file: ../../sailfish_quant/SRR493370_quant/aux/eq_classes.txt; # tran = 107389; # eq = 102891

file: ../../sailfish_quant/SRR493369_quant/aux/eq_classes.txt; # tran = 107389; # eq = 100141


In [15]:
# Per-sample input paths: the quant.sf table and the equivalence-class file.
sep = os.path.sep
sffiles = []
eqfiles = []
for sd in sampdirs:
    sffiles.append(sep.join((sd, 'quant.sf')))
    eqfiles.append(sep.join((sd, auxDir, 'eq_classes.txt')))

In [16]:
def process_data(eqClass_counter, eqfile, numSamp, firstSamp, tnames, dict_eqClass, dic_uniq_transcripts):
    """Merge one sample's equivalence classes into the shared dictionaries.

    File layout as consumed here: line 1 is the transcript count, line 2 the
    class count, then one transcript name per line, then one tab-separated
    class per line. In a class line the first field is ignored, the last
    field is the read count and the fields in between are integer transcript
    ids.

    Parameters
    ----------
    eqClass_counter : int
        Number of class lines processed before this call.
    eqfile : str
        Path of the equivalence-class file to read.
    numSamp : int
        Accepted for call compatibility; not used (an int argument cannot be
        updated in place for the caller).
    firstSamp : bool
        When true, transcript names are appended to ``tnames``; otherwise the
        name lines are skipped.
    tnames : list
        Mutated in place when ``firstSamp`` is true.
    dict_eqClass : dict
        Mutated in place: transcript-id tuple -> summed read count.
    dic_uniq_transcripts : dict
        Mutated in place: transcript id -> {transcript-id tuple -> summed
        read count} for every class that contains the transcript.

    Returns
    -------
    int
        ``eqClass_counter`` plus the number of class lines read from this
        file. Callers that ignore the return value keep working.

    Notes
    -----
    The inner dictionaries are keyed by the class's transcript-id tuple, not
    by a running line counter. A counter is only unique if the caller threads
    it through every call; when it restarts per file, unrelated classes from
    different samples receive the same id and are mistaken for shared
    classes. The tuple identifies the class itself, so identical classes from
    different samples are summed and different classes never collide.
    """
    print('Entered function')
    with open(eqfile) as ifile:
        numTran = int(ifile.readline().rstrip())
        numEq = int(ifile.readline().rstrip())
        if firstSamp:
            for _ in range(numTran):
                tnames.append(ifile.readline().rstrip())
        else:
            # Names were collected from an earlier sample; skip this section.
            for _ in range(numTran):
                ifile.readline()
        for _ in range(numEq):
            eqClass_counter += 1
            toks = list(map(int, ifile.readline().rstrip().split('\t')))
            tids = tuple(toks[1:-1])
            readCount = toks[-1]
            dict_eqClass[tids] = dict_eqClass.get(tids, 0) + readCount
            for ids in tids:
                per_class = dic_uniq_transcripts.setdefault(ids, {})
                per_class[tids] = per_class.get(tids, 0) + readCount
    return eqClass_counter


In [18]:
# For each condition, pool the equivalence classes of its samples and score
# every transcript pair that co-occurs in at least one class:
#     score = shared reads / (reads only in t1 + reads only in t2 + shared reads + 1)
cond = ['scramble', 'HOXA1KD']
conditions_data = []
condition1_data = None
condition2_data = None
for cond_val in cond:
    condition1_paths = expDict[cond_val].values()
    dict_eqClass = {}
    dic_uniq_transcripts = {}
    eqClass_counter = 0
    for path in condition1_paths:
        # tnames is rebuilt for every sample; later cells read the list left
        # behind by the last sample processed.
        tnames = []
        firstSamp = True
        numSamp = 0
        eqfile = os.path.sep.join([path, auxDir, "eq_classes.txt"])
        print(eqfile)
        process_data(eqClass_counter, eqfile, numSamp, firstSamp, tnames, dict_eqClass, dic_uniq_transcripts)

    # lst_uniq_tids and flag are not read in this cell; kept in case other
    # cells rely on them.
    lst_uniq_tids = dic_uniq_transcripts.keys()
    dict_prob = {}
    lst_keys = dict_eqClass.keys()
    flag = True
    for tup in lst_keys:
        for t1, t2 in itertools.combinations(tup, 2):
            flag = False
            # The score depends only on the two per-transcript dictionaries,
            # which do not change in this loop, so a pair seen in several
            # classes needs to be scored only once.
            if (t1, t2) in dict_prob:
                continue
            readCount_t1 = 0
            readCount_t2 = 0
            readCount_t1t2 = 0
            dic_t1 = dic_uniq_transcripts[t1]
            dic_t2 = dic_uniq_transcripts[t2]
            for eqClass, readCount in dic_t1.items():
                if eqClass in dic_t2:
                    readCount_t1t2 += readCount
                else:
                    readCount_t1 += readCount
            for eqClass, readCount in dic_t2.items():
                if eqClass not in dic_t1:
                    readCount_t2 += readCount
            # float() forces true division: with integer operands Python 2
            # would floor the ratio, and since the numerator is always smaller
            # than the denominator every score would collapse to 0.
            dict_prob[(t1, t2)] = readCount_t1t2 / float(readCount_t1 + readCount_t2 + readCount_t1t2 + 1)

    conditions_data.append(dict_prob)

condition1_data = conditions_data[0]
condition2_data = conditions_data[1]
        

../../sailfish_quant/SRR493368_quant/aux/eq_classes.txt
Entered function
../../sailfish_quant/SRR493366_quant/aux/eq_classes.txt
Entered function
../../sailfish_quant/SRR493367_quant/aux/eq_classes.txt
Entered function
../../sailfish_quant/SRR493371_quant/aux/eq_classes.txt
Entered function
../../sailfish_quant/SRR493370_quant/aux/eq_classes.txt
Entered function
../../sailfish_quant/SRR493369_quant/aux/eq_classes.txt
Entered function


In [19]:
# Absolute change in pair score between the two conditions, restricted to the
# transcript pairs that were scored in both.
probSuccess_diff = {
    pair: abs(condition2_data[pair] - cond1_val)
    for pair, cond1_val in condition1_data.items()
    if pair in condition2_data
}

In [21]:
# Keep the transcript pairs whose score differs by less than 0.2 between the
# two conditions, translate the ids to names and write them as a 3-column TSV.
new_probSuccess = {}
# The handle is deliberately NOT called `ofile`: that name already holds an
# output path string at notebook level, and rebinding it to a (soon closed)
# file object would break any cell that later uses it as a path.
with open('new_output_readCount3.txt', 'w') as out_handle:
    for k, v in probSuccess_diff.items():
        if v < 0.2:
            # k is a pair of transcript ids indexing into tnames.
            t1_name = tnames[k[0]]
            t2_name = tnames[k[1]]
            new_probSuccess[(t1_name, t2_name)] = v
            out_handle.write("{}\t{}\t{}\n".format(t1_name, t2_name, v))