In [2]:
from __future__ import print_function
import itertools
import pandas as pd
import numpy as np
import os
import logging
import glob
import networkx as nx
import math
from tqdm import tqdm




In [3]:
sampdirs = glob.glob('../../sailfish_quant/*')
auxDir = "aux"
cutoff = 0.5
netfile = 'netfile'
# output = rapObj.buildNetFile(sampdirs, netfile, cutoff, auxDir, False)
ofile ='filterGraph_svm.txt'
expDict = {
    'scramble': {'SRR493366_quant': '../../sailfish_quant/SRR493366_quant',
                 'SRR493367_quant': '../../sailfish_quant/SRR493367_quant',
                 'SRR493368_quant': '../../sailfish_quant/SRR493368_quant'},
    'HOXA1KD': {'SRR493369_quant': '../../sailfish_quant/SRR493369_quant',
                'SRR493370_quant': '../../sailfish_quant/SRR493370_quant',
                'SRR493371_quant': '../../sailfish_quant/SRR493371_quant'}
}

In [4]:
class EquivCollection(object):
    def __init__(self):
        self.tnames = []
        self.eqClasses = {}
        self.hasNames = False

    def setNames(self, names):
        self.tnames = names
        self.hasNames = True

    def add(self, tids, count):
        if tids in self.eqClasses:
            self.eqClasses[tids] += count
        else:
            self.eqClasses[tids] = count


In [5]:
def readEqClass(eqfile, eqCollection):
    with open(eqfile) as ifile:
        numTran = int(ifile.readline().rstrip())
        numEq = int(ifile.readline().rstrip())
        print("\nfile: {}; # tran = {}; # eq = {}".format(eqfile, numTran, numEq))
        if not eqCollection.hasNames:
            tnames = []
            for i in range(numTran):
                tnames.append(ifile.readline().rstrip())
            eqCollection.setNames(tnames)
        else:
            for i in range(numTran):
                ifile.readline()

        for i in range(numEq):
            toks = list(map(int, ifile.readline().rstrip().split('\t')))
            nt = toks[0]
            tids = tuple(toks[1:-1])
            count = toks[-1]
            eqCollection.add(tids, count)


In [6]:
def getCountsFromEquiv(eqCollection):
        countDict = {}
        tn = eqCollection.tnames
        for tids, count in eqCollection.eqClasses.items():
            for t in tids:
                if tn[t] in countDict:
                    countDict[tn[t]] += count
                else:
                    countDict[tn[t]] = count
        # ensure no division by 0
        for t in eqCollection.tnames:
            if t in countDict:
                countDict[t] += 1.0
            else:
                countDict[t] = 1.0
        return countDict

In [6]:
# Get just the set of condition names
conditions = expDict.keys()
logging.info("conditions = {}".format(conditions))
eqClasses = {}
for cond in conditions:
    print(expDict[cond])
    for sampNum, sampPath in expDict[cond].items():
        if cond not in eqClasses:
            eqClasses[cond] = EquivCollection()
        eqPath = os.path.sep.join([sampPath, auxDir, "eq_classes.txt"])
        readEqClass(eqPath, eqClasses[cond])

{'SRR493371_quant': '../../sailfish_quant/SRR493371_quant', 'SRR493369_quant': '../../sailfish_quant/SRR493369_quant', 'SRR493370_quant': '../../sailfish_quant/SRR493370_quant'}

file: ../../sailfish_quant/SRR493371_quant/aux/eq_classes.txt; # tran = 107389; # eq = 104868

file: ../../sailfish_quant/SRR493369_quant/aux/eq_classes.txt; # tran = 107389; # eq = 100141

file: ../../sailfish_quant/SRR493370_quant/aux/eq_classes.txt; # tran = 107389; # eq = 102891
{'SRR493366_quant': '../../sailfish_quant/SRR493366_quant', 'SRR493367_quant': '../../sailfish_quant/SRR493367_quant', 'SRR493368_quant': '../../sailfish_quant/SRR493368_quant'}

file: ../../sailfish_quant/SRR493366_quant/aux/eq_classes.txt; # tran = 107389; # eq = 95472

file: ../../sailfish_quant/SRR493367_quant/aux/eq_classes.txt; # tran = 107389; # eq = 98035

file: ../../sailfish_quant/SRR493368_quant/aux/eq_classes.txt; # tran = 107389; # eq = 101801


In [7]:
ambigCounts = {cond : getCountsFromEquiv(eqClasses[cond]) for cond in conditions}

sailfish = {}
for cond in conditions:
    sailfish[cond] = ambigCounts[cond]

In [8]:
ambigCounts

{'HOXA1KD': {'comp217262_c0_seq1': 1259.0,
  'comp203954_c0_seq2': 41.0,
  'comp215945_c0_seq2': 73.0,
  'comp215874_c0_seq12': 460.0,
  'comp325096_c0_seq1': 18.0,
  'comp187151_c0_seq1': 4.0,
  'comp455410_c0_seq1': 11.0,
  'comp212851_c0_seq12': 1849.0,
  'comp216029_c2_seq1': 1094.0,
  'comp215299_c0_seq4': 104.0,
  'comp59268_c0_seq1': 3.0,
  'comp217722_c0_seq1': 1004.0,
  'comp200000_c0_seq1': 12.0,
  'comp213086_c0_seq11': 89.0,
  'comp214875_c0_seq1': 13.0,
  'comp417354_c0_seq1': 8.0,
  'comp600578_c0_seq1': 7.0,
  'comp1328790_c0_seq1': 3.0,
  'comp36037_c0_seq1': 1.0,
  'comp35706_c0_seq1': 4.0,
  'comp593737_c0_seq1': 6.0,
  'comp195762_c0_seq1': 6.0,
  'comp213841_c0_seq3': 4.0,
  'comp964850_c0_seq1': 3.0,
  'comp207646_c0_seq2': 3.0,
  'comp1010167_c0_seq1': 3.0,
  'comp169056_c0_seq1': 553.0,
  'comp133028_c0_seq1': 1574.0,
  'comp205780_c0_seq1': 8.0,
  'comp86625_c0_seq1': 11.0,
  'comp217360_c0_seq1': 2133.0,
  'comp707630_c0_seq1': 11.0,
  'comp176233_c0_seq1': 13.

In [9]:
# ambigCounts = {cond : getCountsFromEquiv(eqClasses[cond]) for cond in conditions}

# sailfish = {}
# for cond in conditions:
#     sailfish[cond] = ambigCounts[cond]

logging.info("Done Reading")
count = 0
numTrimmed = 0
with open(netfile) as f, open(ofile, 'w') as ofile:
    data = pd.read_table(f, header=None)
    print(len(data))
    for i in tqdm(range(100)):
        count += 1
        #print("\r{} done".format(count), end="")
        #Alternative hypo
        x = data[0][i]
        print('x : ', x)
        y = data[1][i]
        print('y :', y)
        non_null=0
        x_all=0
        y_all=0
        # Calculating l1
        for cond in conditions:
            y_g = sailfish[cond][y]
            print('y_g', y_g)
            x_g = sailfish[cond][x]
            print('x_g', x_g)
            r = y_g / x_g
            print('r', r)
            non_null += (y_g * math.log(r*x_g)) - (r*x_g)
#             print('non_null',non_null)
            non_null += (x_g * math.log(x_g)) - x_g
            print('non_null',non_null)
            x_all += x_g
            print('x_all',x_all)
            y_all += y_g
            print('y_all', y_all)
        #null hypothesis
        null = 0
        r_all = y_all / x_all
        # Calculating l0
        print('.......')
        for cond in conditions:
            y_g = sailfish[cond][y]
            print('y_g', y_g)
            x_g = sailfish[cond][x]
            print('x_g', x_g)
            mean_x = (x_g + y_g) / (1+r_all)
            null += (y_g * math.log(r_all * mean_x)) - (r_all * mean_x)
            null += (x_g * math.log(mean_x)) - mean_x
            print('null : ', null)
            
        D = 2*(non_null-null)
        print('D', D)
        print('label', data[2][i])
        if D <= 20:
            ofile.write("{}\t{}\t{}\n".format(x, y, data[2][i]))
        else:
            numTrimmed += 1
logging.info("Trimmed {} edges".format(numTrimmed))


  0%|          | 0/100 [00:00<?, ?it/s]

244221
x : 

  1%|          | 1/100 [00:00<00:16,  6.13it/s]

 comp202220_c0_seq1
y : comp202220_c0_seq1
y_g 30.0
x_g 30.0
r 1.0
non_null 144.07184289972932
x_all 30.0
y_all 30.0
y_g 45.0
x_g 45.0
r 1.0
non_null 396.6714669790581
x_all 75.0
y_all 75.0
.......
y_g 30.0
x_g 30.0
null :  144.07184289972932
y_g 45.0
x_g 45.0
null :  396.6714669790581
D 0.0
label 1.1
x :  comp209419_c0_seq1
y : comp209419_c0_seq8
y_g 5794.0
x_g 5439.0
r 1.0652693509836366
non_null 85752.3113316014
x_all 5439.0
y_all 5794.0
y_g 3843.0
x_g 3644.0
r 1.0546103183315039
non_null 139869.31731467828
x_all 9083.0
y_all 9637.0
.......
y_g 5794.0
x_g 5439.0
null :  85752.28863551188
y_g 3843.0
x_g 3644.0
null :  139869.26057022513
D 0.11348890629597008
label 2.83441549106
x :  comp172653_c0_seq2
y : comp172653_c0_seq2
y_g 11.0
x_g 11.0
r 1.0
non_null 30.75369600156415
x_all 11.0
y_all 11.0
y_g 13.0
x_g 13.0
r 1.0
non_null 71.44237929556411
x_all 24.0
y_all 24.0
.......
y_g 11.0
x_g 11.0
null :  30.75369600156415
y_g 13.0
x_g 13.0
null :  71.44237929556411
D 0.0
label 1.1
x :  c

 14%|█▍        | 14/100 [00:00<00:10,  8.52it/s]

 comp212950_c0_seq11
y : comp212950_c0_seq12
y_g 454.0
x_g 541.0
r 0.8391866913123844
non_null 5187.355957766718
x_all 541.0
y_all 454.0
y_g 435.0
x_g 515.0
r 0.8446601941747572
non_null 10095.87743513212
x_all 1056.0
y_all 889.0
.......
y_g 454.0
x_g 541.0
null :  5187.354712791846
y_g 435.0
x_g 515.0
null :  10095.87488645243
D 0.0050973593824892305
label 29.6203340652
x :  comp70963_c0_seq1
y : comp70963_c0_seq1
y_g 17.0
x_g 17.0
r 1.0
non_null 62.329253697911355
x_all 17.0
y_all 17.0
y_g 6.0
x_g 6.0
r 1.0
non_null 71.83036732864801
x_all 23.0
y_all 23.0
.......
y_g 17.0
x_g 17.0
null :  62.329253697911355
y_g 6.0
x_g 6.0
null :  71.83036732864801
D 0.0
label 1.1
x :  comp213445_c0_seq67
y : comp213445_c0_seq74
y_g 1120.0
x_g 1117.0
r 1.0026857654431514
non_null 13466.168849564134
x_all 1117.0
y_all 1120.0
y_g 875.0
x_g 878.0
r 0.9965831435079726
non_null 23591.38845933868
x_all 1995.0
y_all 1995.0
.......
y_g 1120.0
x_g 1117.0
null :  13466.166837940822
y_g 875.0
x_g 878.0
null :  

 68%|██████▊   | 68/100 [00:00<00:02, 12.03it/s]

x_all 60.0
y_all 60.0
y_g 44.0
x_g 44.0
r 1.0
non_null 616.3300352514591
x_all 104.0
y_all 104.0
.......
y_g 60.0
x_g 60.0
null :  371.32134746665207
y_g 44.0
x_g 44.0
null :  616.3300352514591
D 0.0
label 1.1
x :  comp214021_c0_seq1
y : comp214021_c0_seq8
y_g 389.0
x_g 1913.0
r 0.20334553058024046
non_null 14473.27907020678
x_all 1913.0
y_all 389.0
y_g 376.0
x_g 1780.0
r 0.21123595505617979
non_null 27868.980773170693
x_all 3693.0
y_all 765.0
.......
y_g 389.0
x_g 1913.0
null :  14473.22334439498
y_g 376.0
x_g 1780.0
null :  27868.866041738427
D 0.22946286453225184
label 465.893355968
x :  comp215956_c0_seq1
y : comp215956_c0_seq11
y_g 226.0
x_g 196.0
r 1.153061224489796
non_null 1837.551383044718
x_all 196.0
y_all 226.0
y_g 644.0
x_g 551.0
r 1.1687840290381126
non_null 8285.515242499177
x_all 747.0
y_all 870.0
.......
y_g 226.0
x_g 196.0
null :  1837.5461279002802
y_g 644.0
x_g 551.0
null :  8285.508130932441
D 0.014223133472114569
label 10.0623292531
x :  comp211323_c2_seq4
y : comp

100%|██████████| 100/100 [00:00<00:00, 170.67it/s]

x_g 4.0
r 1.0
non_null 20.3614195558365
x_all 12.0
y_all 12.0
.......
y_g 8.0
x_g 8.0
null :  17.271064666877372
y_g 4.0
x_g 4.0
null :  20.3614195558365
D 0.0
label 1.1
x :  comp212358_c0_seq22
y : comp212358_c0_seq30
y_g 1419.0
x_g 1800.0
r 0.7883333333333333
non_null 20571.662692881764
x_all 1800.0
y_all 1419.0
y_g 1388.0
x_g 1884.0
r 0.7367303609341825
non_null 41550.23328615922
x_all 3684.0
y_all 2807.0
.......
y_g 1419.0
x_g 1800.0
null :  20571.203391836996
y_g 1388.0
x_g 1884.0
null :  41549.32074468912
D 1.8250829402095405
label inf
x :  comp215900_c0_seq33
y : comp215900_c0_seq36
y_g 87.0
x_g 29.0
r 3.0
non_null 370.1855853925565
x_all 29.0
y_all 87.0
y_g 63.0
x_g 15.0
r 4.2
non_null 593.8238261717563
x_all 44.0
y_all 150.0
.......
y_g 87.0
x_g 29.0
null :  370.01168568699603
y_g 63.0
x_g 15.0
null :  593.3749057986827
D 0.897840746147267
label 0.613876742653
x :  comp206870_c0_seq2
y : comp206870_c0_seq4
y_g 1624.0
x_g 445.0
r 3.649438202247191
non_null 12650.302629215892
x_




In [10]:
data = None
with open(netfile) as f:
    data = pd.read_table(f, header=None)

In [11]:
data

Unnamed: 0,0,1,2
0,comp202220_c0_seq1,comp202220_c0_seq1,1.100000e+00
1,comp209419_c0_seq1,comp209419_c0_seq8,2.834415e+00
2,comp172653_c0_seq2,comp172653_c0_seq2,1.100000e+00
3,comp210373_c0_seq32,comp210373_c0_seq44,1.331837e+02
4,comp213948_c0_seq18,comp213948_c0_seq24,1.851824e+00
5,comp214740_c0_seq13,comp214740_c0_seq26,5.776098e+00
6,comp165532_c0_seq1,comp165532_c0_seq1,1.100000e+00
7,comp812035_c0_seq1,comp812035_c0_seq1,1.100000e+00
8,comp214020_c0_seq3,comp214020_c0_seq4,2.729649e+00
9,comp213668_c0_seq18,comp213668_c0_seq18,1.100000e+00


In [12]:
sep = os.path.sep
sffiles = [sep.join([sd, 'quant.sf']) for sd in sampdirs]
eqfiles = [sep.join([sd, auxDir, 'eq_classes.txt']) for sd in sampdirs]

In [13]:
quant = pd.read_table(sffiles[0])
quant.set_index('Name', inplace=True)
quant

Unnamed: 0_level_0,Length,EffectiveLength,TPM,NumReads
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
comp4886_c0_seq1,232,74.8985,0.00000,0.0
comp4888_c0_seq1,276,111.4320,1.10047,1.0
comp4893_c0_seq1,211,59.1090,4.14919,2.0
comp4899_c0_seq1,218,64.1702,0.00000,0.0
comp4899_c0_seq2,243,83.4422,0.00000,0.0
comp4905_c0_seq1,220,65.7597,3.72956,2.0
comp4910_c0_seq1,239,80.3647,0.00000,0.0
comp4929_c0_seq1,201,52.0029,0.00000,0.0
comp4942_c0_seq1,214,61.2790,0.00000,0.0
comp4946_c0_seq1,266,102.7850,2.38609,2.0


In [9]:
eqfiles[0]

NameError: name 'eqfiles' is not defined

In [15]:
eqfile = eqfiles[0]
numSamp = 0
firstSamp = True
tnames = []
weightDict = {}
diagCounts = None
sumCounts = None
ambigCounts = None
firstSamp = True
numSamp = 0
tot = 0
eqClasses = {}

with open(eqfile) as ifile:
    numSamp += 1
    numTran = int(ifile.readline().rstrip())
    numEq = int(ifile.readline().rstrip())
#     logging.info("quant file: {}; eq file: {}; # tran = {}; # eq = {}".format(sffile, eqfile, numTran, numEq))
    if firstSamp:
        for i in range(numTran):
            tnames.append(ifile.readline().rstrip())
        diagCounts = np.zeros(len(tnames))
        sumCounts = np.zeros(len(tnames))
        ambigCounts = np.zeros(len(tnames))
    else:
        for i in range(numTran):
            ifile.readline()

    # for easy access to quantities of interest
    tpm = quant.loc[tnames, 'TPM'].values
    estCount = quant.loc[tnames, 'NumReads'].values
    efflens = quant.loc[tnames, 'EffectiveLength'].values
    sumCounts = np.maximum(sumCounts, estCount)

    for i in range(10): #numEq
        toks = list(map(int, ifile.readline().rstrip().split('\t')))
        print(toks)
        nt = toks[0]
        print('nt', nt)
        tids = tuple(toks[1:-1])
        print('tids', tids)
        count = toks[-1]
        print('count', count)
        if tids in eqClasses:
            eqClasses[tids] += count
        else:
            eqClasses[tids] = count

        # Add the contribution to the graph
        denom = sum([tpm[t] for t in tids])
        for t1, t2 in itertools.combinations(tids,2):
            tpm1 = tpm[t1]
            tpm2 = tpm[t2]
            w = count * ((tpm1 + tpm2) / denom)
            key = (t1, t2)
            if key in weightDict:
                weightDict[key] += w
            else:
                weightDict[key] = w
        for t in tids:
            diagCounts[t] += count * (tpm[t] / denom)
            ambigCounts[t] += count
    firstSamp = False

[1, 16960, 171]
nt 1
tids (16960,)
count 171
[8, 55802, 55803, 55804, 55807, 55810, 55811, 55812, 55813, 46]
nt 8
tids (55802, 55803, 55804, 55807, 55810, 55811, 55812, 55813)
count 46
[1, 13840, 3379]
nt 1
tids (13840,)
count 3379
[1, 14462, 10]
nt 1
tids (14462,)
count 10
[1, 79280, 1]
nt 1
tids (79280,)
count 1
[1, 28499, 129]
nt 1
tids (28499,)
count 129
[2, 60307, 60309, 72]
nt 2
tids (60307, 60309)
count 72
[1, 28528, 2]
nt 1
tids (28528,)
count 2
[1, 3631, 1]
nt 1
tids (3631,)
count 1
[1, 8719, 1]
nt 1
tids (8719,)
count 1


In [18]:
tnames = []
with open(eqfile) as ifile:
    for i in range(10):
        tnames.append(ifile.readline().rstrip())
    eqCollection.setNames(tnames)

NameError: name 'eqCollection' is not defined

In [21]:
netfile = pd.read_csv('netfile', sep='\t')

In [22]:
netfile

Unnamed: 0,comp202220_c0_seq1,comp202220_c0_seq1.1,1.1
0,comp209419_c0_seq1,comp209419_c0_seq8,2.834415e+00
1,comp172653_c0_seq2,comp172653_c0_seq2,1.100000e+00
2,comp210373_c0_seq32,comp210373_c0_seq44,1.331837e+02
3,comp213948_c0_seq18,comp213948_c0_seq24,1.851824e+00
4,comp214740_c0_seq13,comp214740_c0_seq26,5.776098e+00
5,comp165532_c0_seq1,comp165532_c0_seq1,1.100000e+00
6,comp812035_c0_seq1,comp812035_c0_seq1,1.100000e+00
7,comp214020_c0_seq3,comp214020_c0_seq4,2.729649e+00
8,comp213668_c0_seq18,comp213668_c0_seq18,1.100000e+00
9,comp206631_c0_seq2,comp206631_c0_seq25,1.315774e-01


In [28]:
eqfile = eqfiles[0]
numSamp = 0
firstSamp = True
tnames = []
firstSamp = True
numSamp = 0
eqClasses = {}

dict_pairTids_count = {} 
dict_eqClass = {}
dic_uniq_transcripts = {}

with open(eqfile) as ifile:
    numSamp += 1
    numTran = int(ifile.readline().rstrip())
    numEq = int(ifile.readline().rstrip())
    if firstSamp:
        for i in range(numTran):
            tnames.append(ifile.readline().rstrip())
    else:
        for i in range(numTran):
            ifile.readline()

    counter = 1
    for i in range(numEq): #numEq
        toks = list(map(int, ifile.readline().rstrip().split('\t')))
        tids = tuple(toks[1:-1])
#         print(tids)
        if tids not in dict_eqClass:
            dict_eqClass[tids] = i+1
            for ids in tids:
                if ids not in dic_uniq_transcripts:
                    dic_uniq_transcripts[ids] = set([i+1])
                else:
#                     print('duplicate')
                    set_counter = dic_uniq_transcripts[ids]
                    set_counter.add(i+1)
                    dic_uniq_transcripts[ids] = set_counter
                    

In [30]:
dict_eqClass.keys()

dict_keys([(68040, 68041, 68045, 68047, 68048, 68049, 68050, 68052, 68054), (13793,), (91298,), (12744,), (90249,), (15807,), (93552,), (14758,), (92519,), (11892,), (10843,), (52493, 52494), (39817, 39818, 39819), (73395, 73396), (47240, 47243, 47245, 47246), (67050,), (75630, 75637), (55815,), (12331,), (69048,), (50926,), (22214,), (68015,), (13936, 13937), (81533,), (77612, 77615), (39303,), (29426, 29427), (74290,), (65368,), (33899, 33900), (64335,), (59190,), (61188,), (35631, 35634), (48246, 48247, 48248, 48249, 48250), (48622, 48623), (80519, 80520, 80522, 80523, 80524, 80525, 80526, 80527, 80528, 80529, 80530, 80531, 80532, 80533, 80534, 80535, 80536, 80537, 80538, 80539, 80540, 80542, 80543, 80545, 80546, 80547, 80548, 80550, 80551, 80552, 80553, 80554), (40864,), (23601,), (39831,), (80469, 80470, 80471, 80472, 80473, 80474, 80475, 80477, 80478, 80479, 80480, 80482, 80483, 80484, 80486, 80487, 80489, 80491, 80492, 80493, 80494, 80495, 80496, 80497), (33918,), (20644,), (328

In [31]:
lst_uniq_tids = dic_uniq_transcripts.keys()
dict_prob = {}
lst_keys = dict_eqClass.keys()
for tup in lst_keys:
    for t1,t2 in itertools.combinations(tup,2):
        set_t1 = dic_uniq_transcripts[t1]
        set_t2 = dic_uniq_transcripts[t2]
        length_intersection = len(set_t1.intersection(set_t2))
        length_union = len(set_t1.union(set_t2))
        if length_intersection > 0:
            dict_prob[(t1,t2)] = length_intersection / length_union

In [42]:
from operator import itemgetter
sorted(dict_prob.items(), key=itemgetter(1), reverse=False)

[((61020, 61049), 0.034482758620689655),
 ((61005, 61030), 0.034482758620689655),
 ((61056, 61090), 0.034482758620689655),
 ((61020, 61085), 0.03571428571428571),
 ((79727, 79737), 0.03571428571428571),
 ((61056, 61061), 0.03571428571428571),
 ((61049, 61077), 0.03571428571428571),
 ((61005, 61077), 0.03571428571428571),
 ((61077, 61090), 0.03571428571428571),
 ((79730, 79737), 0.03571428571428571),
 ((61005, 61036), 0.037037037037037035),
 ((61077, 61085), 0.037037037037037035),
 ((61020, 61055), 0.037037037037037035),
 ((79737, 79758), 0.037037037037037035),
 ((61036, 61049), 0.037037037037037035),
 ((79733, 79737), 0.037037037037037035),
 ((61036, 61090), 0.037037037037037035),
 ((61020, 61042), 0.037037037037037035),
 ((61061, 61077), 0.037037037037037035),
 ((79737, 79740), 0.037037037037037035),
 ((61045, 61056), 0.037037037037037035),
 ((61007, 61056), 0.037037037037037035),
 ((61088, 61090), 0.038461538461538464),
 ((70529, 70578), 0.038461538461538464),
 ((79737, 79751), 0.038

In [44]:
netfile = pd.read_csv('netfile', sep='\t')

In [48]:
len(dict_prob)

134504

In [49]:
len(netfile)

244220

In [55]:
conditions = expDict.keys()
conditions

dict_keys(['scramble', 'HOXA1KD'])

In [8]:
# code analysis Part 1
conditions = expDict.keys()
for cond in conditions:
    print(expDict[cond])
    
    for sampNum, sampPath in expDict[cond].items():
        

{'SRR493368_quant': '../../sailfish_quant/SRR493368_quant', 'SRR493367_quant': '../../sailfish_quant/SRR493367_quant', 'SRR493366_quant': '../../sailfish_quant/SRR493366_quant'}
../../sailfish_quant/SRR493368_quant
../../sailfish_quant/SRR493367_quant
../../sailfish_quant/SRR493366_quant
{'SRR493369_quant': '../../sailfish_quant/SRR493369_quant', 'SRR493371_quant': '../../sailfish_quant/SRR493371_quant', 'SRR493370_quant': '../../sailfish_quant/SRR493370_quant'}
../../sailfish_quant/SRR493369_quant
../../sailfish_quant/SRR493371_quant
../../sailfish_quant/SRR493370_quant


In [18]:
condition1_paths = expDict['scramble'].values()
condition1_paths

dict_values(['../../sailfish_quant/SRR493368_quant', '../../sailfish_quant/SRR493367_quant', '../../sailfish_quant/SRR493366_quant'])

In [67]:
# Condition 1: dump

condition1_paths = expDict['scramble'].values()
# condition1_paths = expDict['HOXA1KD'].values()
dict_eqClass = {}
dic_uniq_transcripts = {}
eqClass_counter = 0
for path in condition1_paths:
    # initialize
    tnames = []
    firstSamp = True
    numSamp = 0
    eqfile = os.path.sep.join([path, auxDir, "eq_classes.txt"])
    print(eqfile)
    process_data(eqClass_counter, eqfile, numSamp, firstSamp, tnames, dict_eqClass, dic_uniq_transcripts)
#     [dict_eqClass, dic_uniq_transcripts, eqClass_counter] = 
    


../../sailfish_quant/SRR493368_quant/aux/eq_classes.txt
Entered function
../../sailfish_quant/SRR493367_quant/aux/eq_classes.txt
Entered function
../../sailfish_quant/SRR493366_quant/aux/eq_classes.txt
Entered function


In [31]:
def process_data(eqClass_counter, eqfile, numSamp, firstSamp, tnames, dict_eqClass, dic_uniq_transcripts):
    print('Entered function')
    with open(eqfile) as ifile:
        numSamp += 1
        numTran = int(ifile.readline().rstrip())
        numEq = int(ifile.readline().rstrip())
        if firstSamp:
            for i in range(numTran):
                tnames.append(ifile.readline().rstrip())
        else:
            for i in range(numTran):
                ifile.readline()
        for i in range(numEq): #numEq
            eqClass_counter += 1
            toks = list(map(int, ifile.readline().rstrip().split('\t')))
            tids = tuple(toks[1:-1])
            if tids not in dict_eqClass:
                dict_eqClass[tids] = eqClass_counter
                for ids in tids:
                    if ids not in dic_uniq_transcripts:
                        dic_uniq_transcripts[ids] = set([eqClass_counter])
                    else:
    #                     print('duplicate')
                        set_counter = dic_uniq_transcripts[ids]
                        set_counter.add(eqClass_counter)
                        dic_uniq_transcripts[ids] = set_counter


In [34]:
len(dic_uniq_transcripts)

105056

In [57]:
len(dic_uniq_transcripts)

106186

In [35]:
len(dict_eqClass)

116256

In [58]:
len(dict_eqClass)

118317

In [68]:
lst_uniq_tids = dic_uniq_transcripts.keys()
dict_prob = {}
lst_keys = dict_eqClass.keys()
for tup in lst_keys:
    for t1,t2 in itertools.combinations(tup,2):
        set_t1 = dic_uniq_transcripts[t1]
        set_t2 = dic_uniq_transcripts[t2]
        length_intersection = len(set_t1.intersection(set_t2))
        length_union = len(set_t1.union(set_t2))
        if length_intersection > 0:
            dict_prob[(t1,t2)] = length_intersection / length_union

In [69]:
from operator import itemgetter
dict_prob

{(64825, 64829): 0.25,
 (51631, 51638): 0.3333333333333333,
 (53454, 53458): 0.2,
 (56340, 56345): 0.2857142857142857,
 (54334, 54336): 0.19047619047619047,
 (63843, 63853): 0.16666666666666666,
 (54353, 54365): 0.4,
 (31080, 31081): 0.5,
 (67679, 67685): 0.35714285714285715,
 (72220, 72233): 0.13333333333333333,
 (37646, 37649): 0.16666666666666666,
 (54472, 54474): 0.5,
 (68040, 68041): 0.18181818181818182,
 (45362, 45385): 0.09523809523809523,
 (21039, 21040): 0.5,
 (62599, 62603): 0.24,
 (69761, 69771): 0.21052631578947367,
 (63398, 63400): 0.2,
 (76549, 76552): 0.14285714285714285,
 (65527, 65534): 0.6842105263157895,
 (78937, 78942): 0.38461538461538464,
 (49196, 49198): 0.4,
 (76226, 76233): 0.5,
 (71501, 71513): 0.5454545454545454,
 (67096, 67099): 0.5,
 (49550, 49554): 0.3333333333333333,
 (70314, 70316): 0.2857142857142857,
 (70586, 70601): 0.5,
 (41753, 41754): 0.1,
 (78672, 78674): 0.2,
 (64896, 64898): 0.5,
 (60461, 60477): 0.6666666666666666,
 (75630, 75637): 0.5,
 (62315

In [70]:
condition1_data = dict_prob
condition1_data

{(64825, 64829): 0.25,
 (51631, 51638): 0.3333333333333333,
 (53454, 53458): 0.2,
 (56340, 56345): 0.2857142857142857,
 (54334, 54336): 0.19047619047619047,
 (63843, 63853): 0.16666666666666666,
 (54353, 54365): 0.4,
 (31080, 31081): 0.5,
 (67679, 67685): 0.35714285714285715,
 (72220, 72233): 0.13333333333333333,
 (37646, 37649): 0.16666666666666666,
 (54472, 54474): 0.5,
 (68040, 68041): 0.18181818181818182,
 (45362, 45385): 0.09523809523809523,
 (21039, 21040): 0.5,
 (62599, 62603): 0.24,
 (69761, 69771): 0.21052631578947367,
 (63398, 63400): 0.2,
 (76549, 76552): 0.14285714285714285,
 (65527, 65534): 0.6842105263157895,
 (78937, 78942): 0.38461538461538464,
 (49196, 49198): 0.4,
 (76226, 76233): 0.5,
 (71501, 71513): 0.5454545454545454,
 (67096, 67099): 0.5,
 (49550, 49554): 0.3333333333333333,
 (70314, 70316): 0.2857142857142857,
 (70586, 70601): 0.5,
 (41753, 41754): 0.1,
 (78672, 78674): 0.2,
 (64896, 64898): 0.5,
 (60461, 60477): 0.6666666666666666,
 (75630, 75637): 0.5,
 (62315

In [52]:
import csv
def writeToCSV(filename, mydict):
    print (filename)
    with open(filename, 'w') as csv_file:
        writer = csv.writer(csv_file)
        for key, value in mydict.items():
            writer.writerow([key, value])

In [61]:
writeToCSV('condition2_probability_success.csv', dict_prob)

condition2_probability_success.csv


In [62]:
condition1_successProb = pd.read_csv('condition1_probability_success.csv')
condition2_successProb = pd.read_csv('condition2_probability_success.csv')


In [74]:
probSuccess_diff  = {}
for k,cond1_val in condition1_data.items():
    if k in condition2_data:
        cond2_val = condition2_data[k]
        diff = abs(cond2_val - cond1_val)
        probSuccess_diff[k] = diff

In [75]:
probSuccess_diff

{(64825, 64829): 0.08333333333333334,
 (51631, 51638): 0.0,
 (56340, 56345): 0.0,
 (54334, 54336): 0.0165631469979296,
 (63843, 63853): 0.012820512820512803,
 (54353, 54365): 0.04285714285714287,
 (67679, 67685): 0.09740259740259738,
 (72220, 72233): 0.020512820512820523,
 (62982, 62985): 0.0,
 (67912, 67914): 0.0,
 (68040, 68041): 0.015151515151515166,
 (45362, 45385): 0.0082815734989648,
 (62599, 62603): 0.0,
 (71144, 71178): 0.0,
 (69761, 69771): 0.0,
 (76549, 76552): 0.038961038961038974,
 (65527, 65534): 0.052631578947368474,
 (78937, 78942): 0.03205128205128205,
 (49196, 49198): 0.06666666666666671,
 (76226, 76233): 0.0,
 (71501, 71513): 0.054545454545454564,
 (67096, 67099): 0.125,
 (49550, 49554): 0.06666666666666671,
 (70314, 70316): 0.0,
 (70586, 70601): 0.020000000000000018,
 (41753, 41754): 0.024999999999999994,
 (65983, 65984): 0.14285714285714285,
 (60461, 60477): 0.0,
 (75630, 75637): 0.0,
 (62315, 62322): 0.0,
 (62154, 62166): 0.07500000000000001,
 (21923, 21924): 0.166

In [78]:
filterGraph = pd.read_csv('filterGraph.txt', sep='\t')

In [81]:
len(tnames)

107389

In [97]:
ofile = 'new_output.txt'
new_probSuccess = {}
with open('new_output.txt', 'w') as ofile:
    for k,v in probSuccess_diff.items():
        if v < 0.01 :   
            t1_name = tnames[k[0]]
            t2_name = tnames[k[1]]
            new_probSuccess[(t1_name, t2_name)] = v
            ofile.write("{}\t{}\t{}\n".format(t1_name, t2_name, v))

In [90]:
new_probSuccess

KeyError: 0

In [86]:
len(probSuccess_diff)

137005

In [87]:
len(filterGraph)

239832

In [88]:
ofile.write("{}\t{}\t{}\n".format(x, y, data[2][i]))

Unnamed: 0,comp202220_c0_seq1,comp202220_c0_seq1.1,1.1
0,comp209419_c0_seq1,comp209419_c0_seq8,2.834415e+00
1,comp172653_c0_seq2,comp172653_c0_seq2,1.100000e+00
2,comp210373_c0_seq32,comp210373_c0_seq44,1.331837e+02
3,comp213948_c0_seq18,comp213948_c0_seq24,1.851824e+00
4,comp214740_c0_seq13,comp214740_c0_seq26,5.776098e+00
5,comp165532_c0_seq1,comp165532_c0_seq1,1.100000e+00
6,comp812035_c0_seq1,comp812035_c0_seq1,1.100000e+00
7,comp214020_c0_seq3,comp214020_c0_seq4,2.729649e+00
8,comp213668_c0_seq18,comp213668_c0_seq18,1.100000e+00
9,comp206631_c0_seq2,comp206631_c0_seq25,1.315774e-01
