[Stats for collocation detection](http://www.scielo.org.mx/scielo.php?script=sci_arttext&pid=S1405-55462016000300327)
* What was this about?
    * Issue of multiple features in one location
      * Had to apply set to avoid this issue

In [13]:
import matplotlib.pyplot as plt
import pandas as pd
from collections import Counter
from utils import parse_cdhit, replace_val
import itertools
# from collections import defaultdict

In [14]:
data = pd.read_table("data/ENA_ML_input_ORFs")
data.head(2)

Unnamed: 0,Feature_id,contig_id,position_start,position_end,ORF_location_on_contig,annotation_val
0,ENA_AY095314_AY095314.2_3724_3921_10,AY095314.2,3724,3921,10,Unknown
1,ENA_JX889246_JX889246.1_45181_44900_68,JX889246.1,45181,44900,68,Uncharacterized protein


In [15]:
seq_to_clust = parse_cdhit("data/ENA.40.clstr")
list(seq_to_clust.items())[0:2]

[('ENA_KP211958_KP211958.1_65082_87137_88', '0'),
 ('ENA_AY940168_AY940168.2_57144_79082_90', '1')]

In [16]:
## Add a column to show the cluster to which each feature_id belongs
data["cluster_number"] = replace_val(data.Feature_id, seq_to_clust)
data.cluster_number = data.cluster_number.astype(int)
data.head(2)

Unnamed: 0,Feature_id,contig_id,position_start,position_end,ORF_location_on_contig,annotation_val,cluster_number
0,ENA_AY095314_AY095314.2_3724_3921_10,AY095314.2,3724,3921,10,Unknown,57065
1,ENA_JX889246_JX889246.1_45181_44900_68,JX889246.1,45181,44900,68,Uncharacterized protein,43194


In [17]:
# Add a contig sequential number in addition to contig id.
contig_id_to_int = {y:x for x,y in enumerate(data.contig_id.tolist())}
#### data.replace({'contig_id': contig_id_to_int})
data["contig_number"] = replace_val(data.contig_id, contig_id_to_int)

data.head(2)

Unnamed: 0,Feature_id,contig_id,position_start,position_end,ORF_location_on_contig,annotation_val,cluster_number,contig_number
0,ENA_AY095314_AY095314.2_3724_3921_10,AY095314.2,3724,3921,10,Unknown,57065,237296
1,ENA_JX889246_JX889246.1_45181_44900_68,JX889246.1,45181,44900,68,Uncharacterized protein,43194,234989


In [18]:
temp = data.groupby("contig_number").apply(lambda x: sorted(set(x["cluster_number"].values)))
subsets = pd.DataFrame({"contig_id": temp.index, "cluster_number": temp.values})
contig_counts = pd.DataFrame(Counter(itertools.chain(*subsets.cluster_number.values)).items())
contig_counts.columns = ["cluster_number", "nb_occurrences"]
contig_counts = contig_counts.set_index("cluster_number")
contig_counts.head()

Unnamed: 0_level_0,nb_occurrences
cluster_number,Unnamed: 1_level_1
3929,135
6051,134
8867,135
21058,122
25595,135


In [19]:
contigs_in_order = data.groupby("contig_number").apply(lambda g: list(g.sort_values("ORF_location_on_contig").cluster_number.values))
contigs_in_order.head()

contig_number
82638     [3929, 57763, 25595, 6051, 21058, 8867]
106079                 [6292, 30149, 18722, 3490]
109093    [57763, 25595, 6051, 21058, 8867, 3929]
118155                       [6292, 30149, 26181]
122079    [44650, 25595, 6051, 21058, 8867, 3929]
dtype: object

# Computing Modules of Size 2

In [20]:
def find_size_n_in_window(group, set_size, window_size):
    group.sort_values(by="ORF_location_on_contig")
    if window_size > len(group):
        window_size = len(group)
    unsorted_pairs = []
    pairs=[]
    contig_clusters = group.cluster_number.values
    window_size = min(len(contig_clusters), window_size)
    for i, line_id in enumerate(contig_clusters[:-(window_size- 1)]):
        unsorted_pairs.append(list(itertools.combinations(contig_clusters[i: i+window_size], set_size)))
    for pair in set(list(itertools.chain(*unsorted_pairs))):
        pairs.append(tuple(sorted(pair)))
    return list(set(pairs))

combination_2_contigs = data.groupby("contig_number").apply(find_size_n_in_window, set_size=2, window_size=8)
combination_2_contigs.head()

contig_number
82638     [(3929, 8867), (21058, 25595), (3929, 6051), (...
106079    [(6292, 30149), (18722, 30149), (3490, 18722),...
109093    [(3929, 8867), (21058, 25595), (3929, 6051), (...
118155       [(6292, 30149), (26181, 30149), (6292, 26181)]
122079    [(21058, 44650), (3929, 8867), (8867, 44650), ...
dtype: object

In [21]:
combination_2_counts = Counter(itertools.chain(*combination_2_contigs.values))
combination_2_counts = pd.DataFrame(combination_2_counts.items())
combination_2_counts.columns = ["combination", "nb_occurrences"]
combination_2_counts = combination_2_counts.set_index("combination")
combination_2_counts.head()

Unnamed: 0_level_0,nb_occurrences
combination,Unnamed: 1_level_1
"(3929, 8867)",135
"(21058, 25595)",122
"(3929, 6051)",134
"(3929, 57763)",25
"(8867, 25595)",135


In [22]:
combination_2_counts["item_0"] = contig_counts.loc[combination_2_counts.index.map(lambda x: x[0])].values
combination_2_counts["item_1"] = contig_counts.loc[combination_2_counts.index.map(lambda x: x[1])].values
combination_2_counts["Dice"] = (2 * combination_2_counts.nb_occurrences) / (combination_2_counts.item_0 + combination_2_counts.item_1)


In [23]:
combination_2_counts_filtered = combination_2_counts[(combination_2_counts.nb_occurrences > 10) & (combination_2_counts.Dice > 0.3)].sort_values(by=["nb_occurrences", "Dice"], ascending=False)
print(combination_2_counts_filtered.shape)
combination_2_counts_filtered.head()

(599, 4)


Unnamed: 0_level_0,nb_occurrences,item_0,item_1,Dice
combination,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(3929, 8867)",135,135,135,1.0
"(8867, 25595)",135,135,135,1.0
"(3929, 25595)",135,135,135,1.0
"(3929, 6051)",134,135,134,0.996283
"(6051, 25595)",134,134,135,0.996283


# Computing Modules of Size 3

In [24]:
combination_3_contigs = data.groupby("contig_number").apply(find_size_n_in_window, set_size=3, window_size=8)
combination_3_contigs.head()

contig_number
82638     [(6051, 21058, 25595), (3929, 6051, 25595), (6...
106079    [(6292, 18722, 30149), (3490, 18722, 30149), (...
109093    [(6051, 21058, 25595), (3929, 6051, 25595), (6...
118155                               [(6292, 26181, 30149)]
122079    [(6051, 21058, 25595), (3929, 6051, 25595), (3...
dtype: object

In [25]:
combination_3_counts = Counter(itertools.chain(*combination_3_contigs.values))
combination_3_counts = pd.DataFrame(combination_3_counts.items())
combination_3_counts.columns = ["combination", "nb_occurrences"]
combination_3_counts = combination_3_counts.set_index("combination")
combination_3_counts.head()

Unnamed: 0_level_0,nb_occurrences
combination,Unnamed: 1_level_1
"(6051, 21058, 25595)",121
"(3929, 6051, 25595)",134
"(6051, 21058, 57763)",25
"(3929, 6051, 8867)",134
"(6051, 8867, 25595)",134


In [26]:
combination_3_counts["item_0"] = contig_counts.loc[combination_3_counts.index.map(lambda x: x[0])].values
combination_3_counts["item_1"] = contig_counts.loc[combination_3_counts.index.map(lambda x: x[1])].values
combination_3_counts["item_2"] = contig_counts.loc[combination_3_counts.index.map(lambda x: x[2])].values
combination_3_counts["Dice"] = (3 * combination_3_counts.nb_occurrences) / (combination_3_counts.item_0 + combination_3_counts.item_1 + combination_3_counts.item_2)
combination_3_counts.head()

Unnamed: 0_level_0,nb_occurrences,item_0,item_1,item_2,Dice
combination,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"(6051, 21058, 25595)",121,134,122,135,0.928389
"(3929, 6051, 25595)",134,135,134,135,0.99505
"(6051, 21058, 57763)",25,134,122,25,0.266904
"(3929, 6051, 8867)",134,135,134,135,0.99505
"(6051, 8867, 25595)",134,134,135,135,0.99505


In [27]:
combination_3_counts_filtered = combination_3_counts[(combination_3_counts.nb_occurrences > 5) & (combination_3_counts.Dice > 0.2)].sort_values(by=["nb_occurrences", "Dice"], ascending=False)
print(combination_3_counts_filtered.shape)
combination_3_counts_filtered.head()

(73, 5)


Unnamed: 0_level_0,nb_occurrences,item_0,item_1,item_2,Dice
combination,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"(3929, 8867, 25595)",135,135,135,135,1.0
"(3929, 6051, 25595)",134,135,134,135,0.99505
"(3929, 6051, 8867)",134,135,134,135,0.99505
"(6051, 8867, 25595)",134,134,135,135,0.99505
"(8867, 21058, 25595)",122,135,122,135,0.933673


# Computing Modules of Size 4


In [28]:
combination_4_contigs = data.groupby("contig_number").apply(find_size_n_in_window, set_size=4, window_size=8)
combination_4_contigs.head()

contig_number
82638     [(3929, 6051, 8867, 57763), (3929, 8867, 21058...
106079                         [(3490, 6292, 18722, 30149)]
109093    [(3929, 6051, 8867, 57763), (3929, 8867, 21058...
118155                                                   []
122079    [(3929, 8867, 21058, 44650), (3929, 6051, 2559...
dtype: object

In [29]:
combination_4_counts = Counter(itertools.chain(*combination_4_contigs.values))
combination_4_counts = pd.DataFrame(combination_4_counts.items())
combination_4_counts.columns = ["combination", "nb_occurrences"]
combination_4_counts = combination_4_counts.set_index("combination")
combination_4_counts.head()


Unnamed: 0_level_0,nb_occurrences
combination,Unnamed: 1_level_1
"(3929, 6051, 8867, 57763)",25
"(3929, 8867, 21058, 25595)",122
"(6051, 21058, 25595, 57763)",25
"(3929, 6051, 8867, 25595)",134
"(6051, 8867, 21058, 25595)",121


In [30]:
combination_4_counts["item_0"] = contig_counts.loc[combination_4_counts.index.map(lambda x: x[0])].values
combination_4_counts["item_1"] = contig_counts.loc[combination_4_counts.index.map(lambda x: x[1])].values
combination_4_counts["item_2"] = contig_counts.loc[combination_4_counts.index.map(lambda x: x[2])].values
combination_4_counts["item_3"] = contig_counts.loc[combination_4_counts.index.map(lambda x: x[3])].values
combination_4_counts["Dice"] = (4 * combination_4_counts.nb_occurrences) / \
                               (combination_4_counts.item_0 + combination_4_counts.item_1 + combination_4_counts.item_2 + combination_4_counts.item_3)
combination_4_counts.head()

Unnamed: 0_level_0,nb_occurrences,item_0,item_1,item_2,item_3,Dice
combination,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"(3929, 6051, 8867, 57763)",25,135,134,135,25,0.2331
"(3929, 8867, 21058, 25595)",122,135,135,122,135,0.925996
"(6051, 21058, 25595, 57763)",25,134,122,135,25,0.240385
"(3929, 6051, 8867, 25595)",134,135,134,135,135,0.994434
"(6051, 8867, 21058, 25595)",121,134,135,122,135,0.920152


In [31]:
combination_4_counts_filtered = combination_4_counts[(combination_4_counts.nb_occurrences > 5) & (combination_4_counts.Dice > 0.2)].sort_values(by=["nb_occurrences", "Dice"], ascending=False)
print(combination_4_counts_filtered.shape)
combination_4_counts_filtered.head()

(53, 6)


Unnamed: 0_level_0,nb_occurrences,item_0,item_1,item_2,item_3,Dice
combination,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"(3929, 6051, 8867, 25595)",134,135,134,135,135,0.994434
"(3929, 8867, 21058, 25595)",122,135,135,122,135,0.925996
"(6051, 8867, 21058, 25595)",121,134,135,122,135,0.920152
"(3929, 6051, 21058, 25595)",121,135,134,122,135,0.920152
"(3929, 6051, 8867, 21058)",121,135,134,135,122,0.920152


### Computing Modules of Size 5

In [32]:
combination_5_contigs = data.groupby("contig_number").apply(find_size_n_in_window, set_size=5, window_size=8)
combination_5_contigs.head()

contig_number
82638     [(3929, 8867, 21058, 25595, 57763), (3929, 605...
106079                                                   []
109093    [(3929, 8867, 21058, 25595, 57763), (3929, 605...
118155                                                   []
122079    [(3929, 6051, 8867, 21058, 44650), (6051, 8867...
dtype: object

In [33]:
combination_5_counts = Counter(itertools.chain(*combination_5_contigs.values))
combination_5_counts = pd.DataFrame(combination_5_counts.items())
combination_5_counts.columns = ["combination", "nb_occurrences"]
combination_5_counts = combination_5_counts.set_index("combination")
combination_5_counts.head()


Unnamed: 0_level_0,nb_occurrences
combination,Unnamed: 1_level_1
"(3929, 8867, 21058, 25595, 57763)",25
"(3929, 6051, 8867, 21058, 25595)",121
"(3929, 6051, 8867, 21058, 57763)",25
"(3929, 6051, 21058, 25595, 57763)",25
"(3929, 6051, 8867, 25595, 57763)",25


In [34]:
combination_5_counts["item_0"] = contig_counts.loc[combination_5_counts.index.map(lambda x: x[0])].values
combination_5_counts["item_1"] = contig_counts.loc[combination_5_counts.index.map(lambda x: x[1])].values
combination_5_counts["item_2"] = contig_counts.loc[combination_5_counts.index.map(lambda x: x[2])].values
combination_5_counts["item_3"] = contig_counts.loc[combination_5_counts.index.map(lambda x: x[3])].values
combination_5_counts["item_4"] = contig_counts.loc[combination_5_counts.index.map(lambda x: x[4])].values


In [35]:
combination_5_counts["Dice"] = (5 * combination_5_counts.nb_occurrences) / \
                               (combination_5_counts.item_0 + combination_5_counts.item_1 + combination_5_counts.item_2 + combination_5_counts.item_3 + combination_5_counts.item_4)
combination_5_counts.head()

Unnamed: 0_level_0,nb_occurrences,item_0,item_1,item_2,item_3,item_4,Dice
combination,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
"(3929, 8867, 21058, 25595, 57763)",25,135,135,122,135,25,0.226449
"(3929, 6051, 8867, 21058, 25595)",121,135,134,135,122,135,0.91528
"(3929, 6051, 8867, 21058, 57763)",25,135,134,135,122,25,0.22686
"(3929, 6051, 21058, 25595, 57763)",25,135,134,122,135,25,0.22686
"(3929, 6051, 8867, 25595, 57763)",25,135,134,135,135,25,0.221631


In [36]:
combination_5_counts_filtered = combination_5_counts[(combination_5_counts.nb_occurrences > 2) & (combination_5_counts.Dice > 0.1)].sort_values(by=["nb_occurrences", "Dice"], ascending=False)
print(combination_5_counts_filtered.shape)
combination_5_counts_filtered

(180, 7)


Unnamed: 0_level_0,nb_occurrences,item_0,item_1,item_2,item_3,item_4,Dice
combination,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
"(3929, 6051, 8867, 21058, 25595)",121,135,134,135,122,135,0.915280
"(3929, 8867, 21058, 25595, 44650)",96,135,135,122,135,96,0.770465
"(3929, 6051, 8867, 21058, 44650)",95,135,134,135,122,96,0.763666
"(6051, 8867, 21058, 25595, 44650)",95,134,135,122,135,96,0.763666
"(3929, 6051, 21058, 25595, 44650)",95,135,134,122,135,96,0.763666
...,...,...,...,...,...,...,...
"(5911, 7884, 29868, 45051, 50392)",3,11,13,5,9,8,0.326087
"(7884, 11892, 35990, 45051, 50392)",3,13,8,9,9,8,0.319149
"(5911, 7884, 11892, 35990, 50392)",3,11,13,8,9,8,0.306122
"(5911, 7884, 35990, 45051, 50392)",3,11,13,9,9,8,0.300000


In [37]:
combination_5_counts_filtered[combination_5_counts_filtered.index.map(lambda x: 3929 in x)].head()


Unnamed: 0_level_0,nb_occurrences,item_0,item_1,item_2,item_3,item_4,Dice
combination,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
"(3929, 6051, 8867, 21058, 25595)",121,135,134,135,122,135,0.91528
"(3929, 8867, 21058, 25595, 44650)",96,135,135,122,135,96,0.770465
"(3929, 6051, 8867, 21058, 44650)",95,135,134,135,122,96,0.763666
"(3929, 6051, 21058, 25595, 44650)",95,135,134,122,135,96,0.763666
"(3929, 6051, 8867, 25595, 44650)",95,135,134,135,135,96,0.748031


In [38]:
counts_most_fre_clusters = Counter(list(itertools.chain(*combination_5_counts_filtered.index)))
{k: v for k, v in sorted(counts_most_fre_clusters.items(), key=lambda item: item[1], reverse=True)}

{7884: 63,
 6186: 60,
 5911: 59,
 45051: 57,
 35990: 47,
 50392: 47,
 63732: 46,
 11892: 40,
 3929: 38,
 6051: 38,
 8867: 38,
 25595: 38,
 54963: 35,
 33150: 34,
 60496: 30,
 20761: 30,
 29868: 22,
 42222: 19,
 7452: 16,
 46302: 13,
 14295: 12,
 28364: 11,
 22142: 10,
 29846: 10,
 21058: 9,
 26775: 6,
 44650: 5,
 57763: 5,
 19989: 5,
 62832: 4,
 6150: 3,
 7266: 3,
 7912: 3,
 52114: 3,
 32842: 3,
 44377: 2,
 55189: 2,
 15929: 2,
 26394: 2,
 42607: 2,
 45952: 2,
 54980: 2,
 3968: 2,
 12322: 2,
 17439: 2,
 46748: 2,
 8798: 2,
 56322: 1,
 36734: 1,
 3890: 1,
 19993: 1,
 26562: 1,
 40332: 1,
 46703: 1,
 3397: 1,
 12983: 1,
 1675: 1,
 6594: 1,
 23189: 1,
 34244: 1,
 60695: 1}

In [39]:
data[data["cluster_number"] ==60496]["annotation_val"].value_counts()

Unknown    11
Name: annotation_val, dtype: int64

In [40]:
combination_5_counts_filtered[combination_5_counts_filtered.index.map(lambda x: 3929 in x)]

Unnamed: 0_level_0,nb_occurrences,item_0,item_1,item_2,item_3,item_4,Dice
combination,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
"(3929, 6051, 8867, 21058, 25595)",121,135,134,135,122,135,0.91528
"(3929, 8867, 21058, 25595, 44650)",96,135,135,122,135,96,0.770465
"(3929, 6051, 8867, 21058, 44650)",95,135,134,135,122,96,0.763666
"(3929, 6051, 21058, 25595, 44650)",95,135,134,122,135,96,0.763666
"(3929, 6051, 8867, 25595, 44650)",95,135,134,135,135,96,0.748031
"(3929, 6051, 8867, 21058, 57763)",25,135,134,135,122,25,0.22686
"(3929, 6051, 21058, 25595, 57763)",25,135,134,122,135,25,0.22686
"(3929, 8867, 21058, 25595, 57763)",25,135,135,122,135,25,0.226449
"(3929, 6051, 8867, 25595, 57763)",25,135,134,135,135,25,0.221631
"(3929, 6051, 8867, 25595, 54963)",13,135,134,135,135,13,0.117754


In [41]:
top_5_3929 = combination_5_counts_filtered[combination_5_counts_filtered.index.map(lambda x: 3929 in x)].head().index
top_5_3929

Index([ (3929, 6051, 8867, 21058, 25595), (3929, 8867, 21058, 25595, 44650),
        (3929, 6051, 8867, 21058, 44650), (3929, 6051, 21058, 25595, 44650),
        (3929, 6051, 8867, 25595, 44650)],
      dtype='object', name='combination')

In [42]:
contigs = []
for combination in top_5_3929:
    out = contigs_in_order.map(lambda x: len(set(x).intersection(combination)) == len(combination))
    contigs.append(contigs_in_order[out].values)
list(itertools.chain(*contigs))

[[3929, 57763, 25595, 6051, 21058, 8867],
 [57763, 25595, 6051, 21058, 8867, 3929],
 [44650, 25595, 6051, 21058, 8867, 3929],
 [3929, 44650, 25595, 6051, 21058, 8867],
 [3929, 57763, 25595, 6051, 21058, 8867],
 [3929, 57763, 25595, 6051, 21058, 8867],
 [44650, 25595, 6051, 21058, 8867, 3929],
 [44650, 25595, 6051, 21058, 8867, 3929],
 [44650, 25595, 6051, 21058, 8867, 3929],
 [57763, 25595, 6051, 21058, 8867, 3929],
 [44650, 25595, 6051, 21058, 8867, 3929],
 [44650, 25595, 6051, 21058, 8867, 3929],
 [44650, 25595, 6051, 21058, 8867, 3929],
 [3929, 57763, 25595, 6051, 21058, 8867],
 [44650, 25595, 6051, 21058, 8867, 3929],
 [44650, 25595, 6051, 21058, 8867, 3929],
 [44650, 25595, 6051, 21058, 8867, 3929],
 [44650, 25595, 6051, 21058, 8867, 3929],
 [44650, 25595, 6051, 21058, 8867, 3929],
 [44650, 25595, 6051, 21058, 8867, 3929],
 [3929, 57763, 25595, 6051, 21058, 8867],
 [44650, 25595, 6051, 21058, 8867, 3929],
 [44650, 25595, 6051, 21058, 8867, 3929],
 [44650, 25595, 6051, 21058, 8867,

In [43]:
Counter(list(itertools.chain(*list(itertools.chain(*contigs)))))

Counter({3929: 502,
         57763: 25,
         25595: 502,
         6051: 501,
         21058: 502,
         8867: 502,
         44650: 476,
         63455: 15,
         64729: 6,
         33967: 1,
         44025: 1,
         28062: 1})

In [58]:
Counter(data[data.cluster_number == 60496].annotation_val)

Counter({'Unknown': 11})

In [66]:
for i,j in Counter(list(itertools.chain(*combination_5_counts_filtered.index.tolist()))):
        print(i, Counter(data[data.cluster_number == i].annotation_val))


45952 Counter({'Unknown': 7})
3968 Counter({'DNA polymerase': 6, 'DNA_directed DNA polymerase': 1})
56322 Counter({'Unknown': 3})
6150 Counter({'Unknown': 3})
25595 Counter({'External scaffolding protein D': 69, 'Unknown': 66})
1675 Counter({'Putative intein containing helicase': 3, 'Putative helicase': 2, 'DNA helicase': 1, 'Helicase': 1})
40332 Counter({'Unknown': 4})
52114 Counter({'Unknown': 4})
55189 Counter({'Uncharacterized protein': 4})
35990 Counter({'Unknown': 9})
5911 Counter({'Virion export protein': 5, 'IV': 4, 'Phage assembly protein': 1, 'IV protein': 1})
26775 Counter({'Unknown': 4})
20761 Counter({'Unknown': 9})
29846 Counter({'Unknown': 4})
19989 Counter({'Mannosyl_glycoprotein endo_beta_N_acetylglucosaminidase': 4})
7452 Counter({'Replication initiation factor': 4})
19993 Counter({'Unknown': 5})
26394 Counter({'Unknown': 7})
17439 Counter({'Unknown': 7})
46748 Counter({'Unknown': 7})
12322 Counter({'Unknown': 7})
6051 Counter({'Unknown': 134})
8867 Counter({'Unknown'