In [1]:
import re
from tqdm import tqdm
import sys

In [2]:
from prettytable import PrettyTable as PT

## <b> Parsing CDHIT CLSTR FILE </br>

In [3]:
CDH_seqs = {}
# Reading CDHIT transcripts to clusterID
clstr_file = open("cdhit/cdhit_pc2_transcripts_80.fa.clstr", "r")
clstr_data = clstr_file.read()
clstr_file.close()

rep = {"\t": ",", "at +/": "", "at -/": "","...": ",", "nt": "", "%": "", " ": ""}
rep = dict((re.escape(k), v) for k, v in rep.iteritems())
pattern = re.compile("|".join(rep.keys()))
clstr_data = pattern.sub(lambda m: rep[re.escape(m.group(0))], clstr_data)
all_clusters = clstr_data.split(">Cluster")

cdhit_tr_cluster = {}
for i in range(1, len(all_clusters), 1):
    cluster = all_clusters[i]
    cluster = cluster.split("\n")
    cluster_id = cluster[0]
    for item in cluster[1:-1]:
        if cluster_id in CDH_seqs:
            CDH_seqs[cluster_id] += 1
        else:
            CDH_seqs[cluster_id] = 1

## <b>Parsing kCluster Clusters file</b>

In [4]:
kCl_seqs = {}
with open("clusters_c66.0_66%_clusters.tsv", "r") as kCL:
    next(kCL)
    for line in kCL:
        cline = line.split()
        cluster_id = cline[0]
        no_of_seqs = len(cline[1].split(","))
        kCl_seqs[cluster_id] = no_of_seqs

## <b> Constructing kCl to CDH </b>

In [5]:
kCl_to_CDH = {"CC":dict(), "IC":dict(), "CM":dict(), "IM":dict()}
with open("uniq_bio_assess.tsv", 'r') as tsv:
    next(tsv)
    for line in tsv:
        line = line.split()
        kCl_ID = line[0]
        kCl_type = line[1]
        CDH_ID = line[2]
        CDH_type = line[3]
        
        if kCl_ID in kCl_to_CDH[kCl_type]:
            kCl_to_CDH[kCl_type][kCl_ID].append(CDH_ID)
        else:
            kCl_to_CDH[kCl_type][kCl_ID] = [CDH_ID]

## <b>CDHIT To kCluster</b>

In [7]:
CDH_to_kCl = {"CC":dict(), "IC":dict(), "CM":dict(), "IM":dict()}
with open("uniq_bio_assess.tsv", 'r') as tsv:
    next(tsv)
    for line in tsv:
        line = line.split()
        kCl_ID = line[0]
        kCl_type = line[1]
        CDH_ID = line[2]
        CDH_type = line[3]
        
        if CDH_ID in CDH_to_kCl[kCl_type]:
            CDH_to_kCl[CDH_type][CDH_ID].append(kCl_ID)
        else:
            CDH_to_kCl[CDH_type][CDH_ID] = [kCl_ID]

## kCl -> CDHIT Summary

In [60]:
a = []
summary = PT()
summary.field_names = ["kCl Type", "kCluster", "CDHIT", "kCluster Seqs", "CDHIT Seqs"]
summary_total=["Total", 0, 0, 0, 0]
for TYPE in ["CC","IC","IM","CM"]:
    values_len = 0
    keys_len = 0
    kCL_total_seqs = 0
    CDH_total_seqs = 0
    for key, val in kCl_to_CDH[TYPE].iteritems():
        values_len += len(val)
        keys_len += 1
        kCL_total_seqs += kCl_seqs[key]
        for _CDH_cluster in val:
            CDH_total_seqs += CDH_seqs[_CDH_cluster]
            ##
            if TYPE == "IM":
                a.append(_CDH_cluster)
    
    summary.add_row([TYPE, keys_len, values_len,kCL_total_seqs,CDH_total_seqs])
    summary_total[1] += keys_len
    summary_total[2] += values_len
    summary_total[3] += kCL_total_seqs
    summary_total[4] += CDH_total_seqs

summary.add_row(["---","---","---","---","---"])
summary.add_row(summary_total)

In [61]:
import collections
x = [item for item, count in collections.Counter(a).items() if count > 1]
o = open("d2.txt" , 'w')
o.write(str(x))
o.write("\n\n\n")
o.write(str(a))
o.close()

## CDHIT -> kCl Summary

In [47]:
b = []

In [33]:
summary2 = PT()
summary2.field_names = ["CDHIT Type", "CDHIT", "kCluster", "CDHIT Seqs", "kCluster Seqs"]

summary2_total=["Total", 0, 0, 0, 0]
for TYPE in ["CC","IC","IM","CM"]:
    values_len = 0
    keys_len = 0
    kCL_total_seqs = 0
    CDH_total_seqs = 0
    for key, val in CDH_to_kCl[TYPE].iteritems():
        values_len += len(val)
        keys_len += 1
        CDH_total_seqs += CDH_seqs[key]
        for _kCl_cluster in val:
            kCL_total_seqs += kCl_seqs[_kCl_cluster]
    
    summary2.add_row([TYPE, keys_len, values_len,CDH_total_seqs,kCL_total_seqs])
    summary2_total[1] += keys_len
    summary2_total[2] += values_len
    summary2_total[3] += CDH_total_seqs
    summary2_total[4] += kCL_total_seqs

summary2.add_row(["---","---","---","---","---"])
summary2.add_row(summary2_total)

In [46]:
print(summary)

+----------+----------+-------+---------------+------------+
| kCl Type | kCluster | CDHIT | kCluster Seqs | CDHIT Seqs |
+----------+----------+-------+---------------+------------+
|    CC    |  14101   | 20670 |     50383     |   54043    |
|    IC    |  10246   | 14235 |     30081     |   35262    |
|    IM    |    88    |  179  |      687      |    819     |
|    CM    |   188    |  315  |      1184     |    1659    |
|   ---    |   ---    |  ---  |      ---      |    ---     |
|  Total   |  24623   | 35399 |     82335     |   91783    |
+----------+----------+-------+---------------+------------+


In [35]:
print(summary2)

+------------+-------+----------+------------+---------------+
| CDHIT Type | CDHIT | kCluster | CDHIT Seqs | kCluster Seqs |
+------------+-------+----------+------------+---------------+
|     CC     |  9527 |   9551   |   23770    |     23715     |
|     IC     | 23544 |  24144   |   54874    |     148342    |
|     IM     |  481  |   492    |    2743    |      3348     |
|     CM     |  213  |   222    |    948     |      624      |
|    ---     |  ---  |   ---    |    ---     |      ---      |
|   Total    | 33765 |  34409   |   82335    |     176029    |
+------------+-------+----------+------------+---------------+


### <i>Assertions</i>
<ul>
<li>CDHIT Clusters: 33765</li>
<li>kClusters Clusters: 24623</li>
<li>No. Of Seqs: 82335</li>
</ul>

In [None]:
kCl_to_CDH["CC"]["4195"]

In [None]:
a = ",".join(a)

In [None]:
print len(a.split(","))

In [None]:
o = open("d.txt" , 'w')
o.write(str(a))
o.close()