In [42]:
import pandas as pd
from collections import Counter
from utils import parse_cdhit, replace_val
import itertools
import numpy as np


In [176]:
data = pd.read_table("data/ENA_ML_input_ORFs")


In [177]:
data.head(2)

Unnamed: 0,Feature_id,contig_id,position_start,position_end,ORF_location_on_contig,annotation_val
0,ENA_AY095314_AY095314.2_3724_3921_10,AY095314.2,3724,3921,10,Unknown
1,ENA_JX889246_JX889246.1_45181_44900_68,JX889246.1,45181,44900,68,Uncharacterized protein


In [178]:
seq_to_clust = parse_cdhit("data/ENA.40.clstr")

# what does it look like
list(seq_to_clust.items())[0:2]

[('ENA_KP211958_KP211958.1_65082_87137_88', '0'),
 ('ENA_AY940168_AY940168.2_57144_79082_90', '1')]

In [179]:
## Add a column to show the cluster to which each feature_id belongs
data["feature_number"] = replace_val(data.Feature_id, seq_to_clust)
data.feature_number = data.feature_number.astype(int)
data.head(2)



Unnamed: 0,Feature_id,contig_id,position_start,position_end,ORF_location_on_contig,annotation_val,feature_number
0,ENA_AY095314_AY095314.2_3724_3921_10,AY095314.2,3724,3921,10,Unknown,57065
1,ENA_JX889246_JX889246.1_45181_44900_68,JX889246.1,45181,44900,68,Uncharacterized protein,43194


In [180]:
# Add a contig sequential number in addition to contig id.
contig_id_to_int = {y:x for x,y in enumerate(data.contig_id.tolist())}
#### data.replace({'contig_id': contig_id_to_int})
data["contig_number"] = replace_val(data.contig_id, contig_id_to_int)
data.head(2)

Unnamed: 0,Feature_id,contig_id,position_start,position_end,ORF_location_on_contig,annotation_val,feature_number,contig_number
0,ENA_AY095314_AY095314.2_3724_3921_10,AY095314.2,3724,3921,10,Unknown,57065,237296
1,ENA_JX889246_JX889246.1_45181_44900_68,JX889246.1,45181,44900,68,Uncharacterized protein,43194,234989


* Generate a table with all the feature per contig


In [34]:
temp = data.groupby("contig_number").apply(lambda x: x["feature_number"].values)
subsets = pd.DataFrame({"contig_id": temp.index, "features": temp.values})
subsets.head(2)

Unnamed: 0,contig_id,features
0,82638,"[3929, 6051, 21058, 8867, 57763, 25595]"
1,106079,"[18722, 6292, 30149, 3490]"


* coun the occurrences of each feature in all the contigs. 
  * Needed to compute the probability of a pair

In [66]:
feature_counts = pd.DataFrame(Counter(itertools.chain(*subsets.features.values)).items())
feature_counts.columns = ["feature", "nb_occurrences"]
feature_counts = feature_counts.set_index("feature")
feature_counts

Unnamed: 0_level_0,nb_occurrences
feature,Unnamed: 1_level_1
3929,135
6051,134
21058,122
8867,135
57763,25
...,...
54550,1
43098,1
57365,1
11839,1


* Count the occurrences of each pair of features that occur on at least 1 contig
* We generate generate all the possible pairs (tuples) of features 


In [54]:
# Generate all the subset of size 2 for each contig
subsets_size_2 = data.groupby("contig_number").apply(lambda x: list(itertools.combinations(x["feature_number"].values, 2)))
subsets_size_2.head(2)


contig_number
82638     [(3929, 6051), (3929, 21058), (3929, 8867), (3...
106079    [(18722, 6292), (18722, 30149), (18722, 3490),...
dtype: object

In [56]:
subsets_size_2.iloc[1]

[(18722, 6292),
 (18722, 30149),
 (18722, 3490),
 (6292, 30149),
 (6292, 3490),
 (30149, 3490)]

* We make sure the tuple is sorted so it's easy to compute frequencies

In [59]:
co_occurences_2 = []
for item in list(itertools.chain(*subsets_size_2.values)):
    if item[0] > item[1]:
        co_occurences_2.append((item[1], item[0]))
    else:
        co_occurences_2.append((item[0], item[1]))  
co_occurences_2[:10]    

[(3929, 6051),
 (3929, 21058),
 (3929, 8867),
 (3929, 57763),
 (3929, 25595),
 (6051, 21058),
 (6051, 8867),
 (6051, 57763),
 (6051, 25595),
 (8867, 21058)]

* Count the number of occurrences of each pair of features

In [64]:
co_occurences__2_counts = pd.DataFrame(Counter(co_occurences_2).items())
co_occurences__2_counts.columns = ["combination", "nb_occurrences"]
co_occurences__2_counts = co_occurences__2_counts.sort_values(by="nb_occurrences", ascending=False)
co_occurences__2_counts.head(2)

Unnamed: 0,combination,nb_occurrences
524534,"(1937, 6879)",178
523141,"(5188, 9330)",175


In [88]:
co_occurences__2_counts.shape

(6988165, 2)

* Only keep those that occur frequently
```python
>>> sum(co_occurences__2_counts.nb_occurrences > 10)
263730
```


In [155]:
co_occurences__2_counts = co_occurences__2_counts[co_occurences__2_counts.nb_occurrences > 10]
co_occurences__2_counts.shape

(263730, 4)

In [156]:
subset_size = len(co_occurences__2_counts.iloc[0].combination)
for i in range(subset_size):
    a = co_occurences__2_counts.combination.apply(lambda x: x[i])
    co_occurences__2_counts.loc[:, f"item_{i}"] = feature_counts.loc[a].nb_occurrences.values

co_occurences__2_counts.head(10)

Unnamed: 0,combination,nb_occurrences,item_0,item_1
524534,"(1937, 6879)",178,264,183
523141,"(5188, 9330)",175,175,175
523140,"(5188, 5262)",175,175,175
528811,"(5262, 9330)",175,175,175
511640,"(3671, 5262)",174,174,175
522610,"(3797, 5188)",174,174,175
528721,"(9330, 14120)",174,175,174
528720,"(5262, 14120)",174,175,174
523136,"(5188, 14120)",174,175,174
511641,"(3671, 9330)",174,174,175


In [112]:
co_occurences__2_counts.tail(10)

Unnamed: 0,combination,nb_occurrences,item_0,item_1
3564221,"(47994, 58305)",11,6,11
1127922,"(17842, 38805)",11,17,11
3564301,"(30626, 58305)",11,6,11
3564218,"(24404, 58305)",11,6,11
1146342,"(15225, 43863)",11,12,11
1127923,"(35515, 38805)",11,13,11
3564305,"(46694, 58305)",11,6,11
1145396,"(49606, 58207)",11,66,14
1127926,"(21292, 38805)",11,20,11
3564294,"(47092, 58305)",11,6,11


* compute the Dice Coefficient

In [170]:
co_occurences__2_counts.iloc[:,[2,3]].min(axis=1)

524534     183
523141     175
523140     175
528811     175
511640     174
          ... 
1127923     11
3564305      6
1145396     14
1127926     11
3564294      6
Length: 263730, dtype: int64

In [174]:
co_occurences__2_counts["Dice"] = co_occurences__2_counts["nb_occurrences"] / co_occurences__2_counts.iloc[:,[2,3]].min(axis=1)
co_occurences__2_counts.head(10)

Unnamed: 0,combination,nb_occurrences,item_0,item_1,Dice
524534,"(1937, 6879)",178,264,183,0.972678
523141,"(5188, 9330)",175,175,175,1.0
523140,"(5188, 5262)",175,175,175,1.0
528811,"(5262, 9330)",175,175,175,1.0
511640,"(3671, 5262)",174,174,175,1.0
522610,"(3797, 5188)",174,174,175,1.0
528721,"(9330, 14120)",174,175,174,1.0
528720,"(5262, 14120)",174,175,174,1.0
523136,"(5188, 14120)",174,175,174,1.0
511641,"(3671, 9330)",174,174,175,1.0


* What is the top hit?
  (1937, 6879) has  an occurrence rate of 0.9726

In [183]:
data[data.feature_number == 1937]['annotation_val'].head(5)

662     Ribonucleoside_diphosphate reductase
1063    Ribonucleoside_diphosphate reductase
1069    Ribonucleoside_diphosphate reductase
1335    Ribonucleoside_diphosphate reductase
1912    Ribonucleoside_diphosphate reductase
Name: annotation_val, dtype: object

In [185]:
data[data.feature_number == 6879]['annotation_val']

170       Unknown
755       Unknown
2764      Unknown
5850      Unknown
7642      Unknown
           ...   
228043    Unknown
233414    Unknown
233774    Unknown
233929    Unknown
234767    Unknown
Name: annotation_val, Length: 183, dtype: object

* If 6879 is unknown, are any of the other features associated with 1937 konw

In [195]:
co_occurences__2_counts[co_occurences__2_counts.combination.apply(lambda x: 1937 in  x)].head(10)

Unnamed: 0,combination,nb_occurrences,item_0,item_1,Dice
524534,"(1937, 6879)",178,264,183,0.972678
527895,"(1937, 5262)",172,264,175,0.982857
523113,"(1937, 5188)",172,264,175,0.982857
527896,"(1937, 9330)",172,264,175,0.982857
515638,"(1937, 8643)",171,264,172,0.994186
522671,"(1937, 3797)",171,264,174,0.982759
527670,"(1937, 14438)",171,264,173,0.988439
527891,"(1937, 14120)",171,264,174,0.982759
511613,"(1937, 3671)",171,264,174,0.982759
515309,"(1937, 19807)",170,264,171,0.994152


In [191]:
data[data.feature_number == 5262]['annotation_val']

1944      DNA primase_helicase
3300      DNA primase_helicase
3366      DNA primase_helicase
4688      DNA primase_helicase
5660      DNA primase_helicase
                  ...         
227452    DNA primase_helicase
227640    DNA primase_helicase
229896    DNA primase_helicase
234640    DNA primase_helicase
235863    DNA primase_helicase
Name: annotation_val, Length: 175, dtype: object

In [194]:
data[data.feature_number == 8643]['annotation_val']

226       RecA_like protein
1239                Unknown
1443      RecA_like protein
4673      RecA_like protein
6829      RecA_like protein
                ...        
217832              Unknown
218224    RecA_like protein
223380    RecA_like protein
229546              Unknown
230338    RecA_like protein
Name: annotation_val, Length: 172, dtype: object

In [196]:
data[data.feature_number == 3797]['annotation_val']

24        Gp17 terminase large subunit
3796      Gp17 terminase large subunit
3927                           Unknown
4671      Gp17 terminase large subunit
4747      Gp17 terminase large subunit
                      ...             
232512    Gp17 terminase large subunit
233211    Gp17 terminase large subunit
233997    Gp17 terminase large subunit
235142    Gp17 terminase large subunit
236928    Gp17 terminase large subunit
Name: annotation_val, Length: 174, dtype: object

* Repeating the analysis work for larger subsets of features
  * Start with paird what have surprise factor, by either
  1. merging most frequent pairs iterative  add another item   
  2. Start from seed and extending to include all pairs and finding the frequent items
