In [1]:
from prettytable import PrettyTable as PT
import re

# <b>Pairwise 1 Clusters Count kCluster->CDH </b>

## Parsing clusters

In [2]:
# Count the member lines of every CD-HIT cluster.
# CDH_seqs maps cluster ID (the text after ">Cluster" on the header line,
# with spaces removed) -> number of member lines in that cluster.
CDH_seqs = {}

# Read the whole .clstr file. `with` closes the handle even if read() raises.
with open("cdhit/cdhit_pc2_transcripts_80.fa.clstr", "r") as clstr_file:
    clstr_data = clstr_file.read()

# Normalise the raw text in a single regex pass: tabs and "..." become commas;
# "at +/", "at -/", "nt", "%" and spaces are deleted.
# For the counting below, the part that matters is the space removal, which
# leaves a bare ID after ">Cluster" (assuming headers look like ">Cluster 0"
# -- TODO confirm against the CD-HIT output in use).
rep = {"\t": ",", "at +/": "", "at -/": "",
       "...": ",", "nt": "", "%": "", " ": ""}
# Keys are regex-escaped so they can be OR-ed together literally; the lambda
# re-escapes each match to look its replacement up again.
rep = dict((re.escape(k), v) for k, v in rep.items())
pattern = re.compile("|".join(rep.keys()))
clstr_data = pattern.sub(lambda m: rep[re.escape(m.group(0))], clstr_data)

# Element 0 is whatever precedes the first ">Cluster" marker and is skipped.
all_clusters = clstr_data.split(">Cluster")

cdhit_tr_cluster = {}  # never populated in this cell; kept so the name stays defined
for cluster in all_clusters[1:]:
    cluster = cluster.split("\n")
    cluster_id = cluster[0]
    # Count only non-blank member lines. This keeps the last member when the
    # file has no trailing newline and ignores stray blank lines.
    members = [item for item in cluster[1:] if item.strip()]
    if members:
        CDH_seqs[cluster_id] = CDH_seqs.get(cluster_id, 0) + len(members)


"""
Parsing kCluster Clusters file
"""
# kCl_seqs maps kCluster cluster ID (first whitespace-delimited column) to the
# number of comma-separated entries in the second column.
kCl_seqs = {}
with open("clusters_c66.0_66%_clusters.tsv", "r") as kCL:
    next(kCL)  # skip the first line (presumably a header row -- TODO confirm)
    for record in kCL:
        fields = record.split()
        # N comma-separated entries contain N-1 commas.
        kCl_seqs[fields[0]] = fields[1].count(",") + 1

# <b>kCl -->> CDHIT</b>

In [3]:
# kCl_to_CDH[kCl type][kCl cluster ID] -> list of CDHIT cluster IDs, in file order.
# CDH_to_type[CDHIT cluster ID]        -> that CDHIT cluster's type (last row wins).
kCl_to_CDH = dict((label, {}) for label in ("CC", "IC", "CM", "IM"))
CDH_to_type = {}
with open("uniq_bio_assess.tsv", 'r') as tsv:
    next(tsv)  # skip the first line (presumably a header row -- TODO confirm)
    for line in tsv:
        fields = line.split()
        kCl_ID, kCl_type = fields[0], fields[1]
        CDH_ID, CDH_type = fields[2], fields[3]

        CDH_to_type[CDH_ID] = CDH_type
        # Start a list the first time a kCluster ID is seen, then append.
        kCl_to_CDH[kCl_type].setdefault(kCl_ID, []).append(CDH_ID)

In [4]:
# For each kCluster type, tabulate: the number of kCluster clusters, the number
# of distinct CDHIT clusters they map to, and the sequence totals on each side.
# kCl_to_CDH_clstrs[kCl type][CDHIT type] counts those distinct CDHIT clusters
# by their own type.
empty_d = {"CC":0, "IC":0, "CM":0, "IM":0}
kCl_to_CDH_clstrs = dict((label, dict(empty_d)) for label in ("CC", "IC", "CM", "IM"))

summary = PT()
summary.field_names = ["kCl Type", "kCluster", "CDHIT", "kCluster Seqs", "CDHIT Seqs"]
summary_total = ["Total", 0, 0, 0, 0]

for TYPE in ["CC","IC","IM","CM"]:
    clusters_of_type = kCl_to_CDH[TYPE]

    # kCluster side: one entry per cluster ID of this type.
    keys_len = len(clusters_of_type)
    kCL_total_seqs = sum(kCl_seqs[kcl_id] for kcl_id in clusters_of_type)

    # CDHIT side: de-duplicate the cluster IDs reachable from this type.
    all_vals = set()
    for cdh_ids in clusters_of_type.values():
        all_vals.update(cdh_ids)

    values_len = len(all_vals)
    CDH_total_seqs = 0
    for _CDH_cluster in all_vals:
        kCl_to_CDH_clstrs[TYPE][CDH_to_type[_CDH_cluster]] += 1
        CDH_total_seqs += CDH_seqs[_CDH_cluster]

    row_counts = [keys_len, values_len, kCL_total_seqs, CDH_total_seqs]
    summary.add_row([TYPE] + row_counts)
    # Column 0 of summary_total is the "Total" label, so counts start at 1.
    for column, count in enumerate(row_counts, 1):
        summary_total[column] += count

summary.add_row(["---"] * 5)
summary.add_row(summary_total)

## Pairwise construction

In [5]:
# Render kCl_to_CDH_clstrs as a 4x4 table (rows: kCl type, columns: CDHIT type)
# and accumulate the grand total of all cells in total_clusters1.
kClToCDHIT = PT()
kClToCDHIT.field_names = ["kCl/CDH","CC","IC","CM","IM"]
total_clusters1 = 0
TYPES = ["CC","IC","CM","IM"]
for TYPE in TYPES:
    row = [kCl_to_CDH_clstrs[TYPE][T] for T in TYPES]
    total_clusters1 += sum(row)
    kClToCDHIT.add_row([TYPE] + row)

# <b>CDH -->> kCluster </b>

In [6]:
# Reverse mapping of the assessment table.
# CDH_to_kCl[CDHIT type][CDHIT cluster ID] -> list of kCluster IDs, in file order.
# kCl_to_type[kCluster ID]                 -> that kCluster's type (last row wins).
CDH_to_kCl = {"CC":dict(), "IC":dict(), "CM":dict(), "IM":dict()}
kCl_to_type = {}
with open("uniq_bio_assess.tsv", 'r') as tsv:
    next(tsv)  # skip the first line (presumably a header row -- TODO confirm)
    for line in tsv:
        line = line.split()
        kCl_ID = line[0]
        kCl_type = line[1]
        CDH_ID = line[2]
        CDH_type = line[3]
        kCl_to_type[kCl_ID] = kCl_type

        # The bucket that is tested for membership must be the bucket that is
        # written to: the CDHIT cluster's own type. Testing the kCluster-type
        # bucket instead can raise KeyError on the append, or replace an
        # existing list and drop kCluster IDs collected earlier.
        CDH_to_kCl[CDH_type].setdefault(CDH_ID, []).append(kCl_ID)

In [7]:
# For each CDHIT type, tabulate: the number of CDHIT clusters, the number of
# distinct kCluster clusters they map to, and the sequence totals on each side.
# CDH_to_kCl_clstrs[CDHIT type][kCl type] counts those distinct kCluster
# clusters by their own type.
empty_d = {"CC":0, "IC":0, "CM":0, "IM":0}
CDH_to_kCl_clstrs = dict((label, dict(empty_d)) for label in ("CC", "IC", "CM", "IM"))

summary2 = PT()
summary2.field_names = ["CDHIT Type", "CDHIT", "kCluster", "CDHIT Seqs", "kCluster Seqs"]

summary2_total = ["Total", 0, 0, 0, 0]
for TYPE in ["CC","IC","IM","CM"]:
    clusters_of_type = CDH_to_kCl[TYPE]

    # Gather the CDHIT IDs of this type and the distinct kCluster IDs they reach.
    all_keys = set(clusters_of_type)
    all_vals = set()
    for kcl_ids in clusters_of_type.values():
        all_vals.update(kcl_ids)

    # kCluster side.
    values_len = len(all_vals)
    kCL_total_seqs = 0
    for _kCl_cluster in all_vals:
        CDH_to_kCl_clstrs[TYPE][kCl_to_type[_kCl_cluster]] += 1
        kCL_total_seqs += kCl_seqs[_kCl_cluster]

    # CDHIT side.
    keys_len = len(all_keys)
    CDH_total_seqs = sum(CDH_seqs[cdh_id] for cdh_id in all_keys)

    row_counts = [keys_len, values_len, CDH_total_seqs, kCL_total_seqs]
    summary2.add_row([TYPE] + row_counts)
    # Column 0 of summary2_total is the "Total" label, so counts start at 1.
    for column, count in enumerate(row_counts, 1):
        summary2_total[column] += count

summary2.add_row(["---"] * 5)
summary2.add_row(summary2_total)

In [8]:
# Render CDH_to_kCl_clstrs as a 4x4 table (rows: CDHIT type, columns: kCl type)
# and accumulate the grand total of all cells in total_clusters2.
CDTokCl = PT()
CDTokCl.field_names = ["CDH/kCl","CC","IC","CM","IM"]
total_clusters2 = 0
TYPES = ["CC","IC","CM","IM"]
for TYPE in TYPES:
    row = [CDH_to_kCl_clstrs[TYPE][T] for T in TYPES]
    total_clusters2 += sum(row)
    CDTokCl.add_row([TYPE] + row)

## Printing Summaries

In [9]:
# Build the full two-part report (kCl -> CDHIT, then CDHIT -> kCl) as a list of
# output lines and emit it with a single print.
rule = "~" * 65
report_lines = [
    "kCl --->>> CDHIT",
    "Summary\n" + str(summary),
    rule,
    "Pairwise Matrix\n" + str(kClToCDHIT),
    # Label and value are separated by one extra space ("Total:  N").
    "Total: " + " " + str(total_clusters1),
    "\n\n",
    "CDHIT --->>> kCl",
    "Summary\n" + str(summary2),
    rule,
    "Pairwise Matrix\n" + str(CDTokCl),
    "Total: " + " " + str(total_clusters2),
]
print("\n".join(report_lines))

kCl --->>> CDHIT
Summary
+----------+----------+-------+---------------+------------+
| kCl Type | kCluster | CDHIT | kCluster Seqs | CDHIT Seqs |
+----------+----------+-------+---------------+------------+
|    CC    |  14101   | 20196 |     50383     |   51104    |
|    IC    |  10246   | 13321 |     30081     |   30563    |
|    IM    |    88    |  173  |      687      |    784     |
|    CM    |   188    |  304  |      1184     |    1488    |
|   ---    |   ---    |  ---  |      ---      |    ---     |
|  Total   |  24623   | 33994 |     82335     |   83939    |
+----------+----------+-------+---------------+------------+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Pairwise Matrix
+---------+------+-------+-----+-----+
| kCl/CDH |  CC  |   IC  |  CM |  IM |
+---------+------+-------+-----+-----+
|    CC   | 9264 | 10541 | 127 | 264 |
|    IC   | 257  | 12927 |  6  | 131 |
|    CM   |  6   |   64  | 106 | 128 |
|    IM   |  1   |   47  |  5  | 120 |
+---------