* Issue of multiple feature on one location 
  * Had to apply set to avoid this issue

In [327]:
import pandas as pd
from collections import Counter
from utils import parse_cdhit, replace_val
import itertools
import numpy as np
from tqdm.notebook import tqdm

In [328]:
data = pd.read_table("data/ENA_ML_input_ORFs")

In [329]:
data.head(2)

Unnamed: 0,Feature_id,contig_id,position_start,position_end,ORF_location_on_contig,annotation_val
0,ENA_AY095314_AY095314.2_3724_3921_10,AY095314.2,3724,3921,10,Unknown
1,ENA_JX889246_JX889246.1_45181_44900_68,JX889246.1,45181,44900,68,Uncharacterized protein


In [330]:
seq_to_clust = parse_cdhit("data/ENA.40.clstr")

# what does it look like
list(seq_to_clust.items())[0:2]

[('ENA_KP211958_KP211958.1_65082_87137_88', '0'),
 ('ENA_AY940168_AY940168.2_57144_79082_90', '1')]

In [331]:
## Add a column to show the cluster to which each feature_id belongs
data["feature_number"] = replace_val(data.Feature_id, seq_to_clust)
data.feature_number = data.feature_number.astype(int)
data.head(2)



Unnamed: 0,Feature_id,contig_id,position_start,position_end,ORF_location_on_contig,annotation_val,feature_number
0,ENA_AY095314_AY095314.2_3724_3921_10,AY095314.2,3724,3921,10,Unknown,57065
1,ENA_JX889246_JX889246.1_45181_44900_68,JX889246.1,45181,44900,68,Uncharacterized protein,43194


In [332]:
# Add a contig sequential number in addition to contig id.
contig_id_to_int = {y:x for x,y in enumerate(data.contig_id.tolist())}
#### data.replace({'contig_id': contig_id_to_int})
data["contig_number"] = replace_val(data.contig_id, contig_id_to_int)
data.head(2)

Unnamed: 0,Feature_id,contig_id,position_start,position_end,ORF_location_on_contig,annotation_val,feature_number,contig_number
0,ENA_AY095314_AY095314.2_3724_3921_10,AY095314.2,3724,3921,10,Unknown,57065,237296
1,ENA_JX889246_JX889246.1_45181_44900_68,JX889246.1,45181,44900,68,Uncharacterized protein,43194,234989


* Generate a table with all the features per contig


In [59]:
# temp cell
data[data.feature_number == 13032].Feature_id

9109      ENA_KJ645900_KJ645900.1_246857_246081_265
33694        ENA_KJ645900_KJ645900.1_16565_17224_19
34898        ENA_KJ645900_KJ645900.1_13577_14275_15
140728    ENA_KJ645900_KJ645900.1_133148_132465_136
142078    ENA_KJ645900_KJ645900.1_330172_330837_352
179374    ENA_KJ645900_KJ645900.1_335709_336482_359
Name: Feature_id, dtype: object

In [374]:
temp = data.groupby("contig_number").apply(lambda x: sorted(set(x["feature_number"].values)))
subsets = pd.DataFrame({"contig_id": temp.index, "features": temp.values})
subsets.head(2)

Unnamed: 0,contig_id,features
0,82638,"[3929, 6051, 8867, 21058, 25595, 57763]"
1,106079,"[3490, 6292, 18722, 30149]"


In [375]:
# temp
len(subsets[subsets.contig_id == 237307].features.values[0])

339

* count the occurrences of each feature in all the contigs. 
  * Needed to compute the probability of a pair

In [376]:
feature_counts = pd.DataFrame(Counter(itertools.chain(*subsets.features.values)).items())
feature_counts.columns = ["feature", "nb_occurrences"]
feature_counts = feature_counts.set_index("feature")
feature_counts

Unnamed: 0_level_0,nb_occurrences
feature,Unnamed: 1_level_1
3929,135
6051,134
8867,135
21058,122
25595,135
...,...
59063,1
60147,1
60148,1
60622,1


* Count the occurrences of each pair of features that occur on at least 1 contig
* We generate generate all the possible pairs (tuples) of features 


In [393]:
# Generate all the subset of size 2 for each contig
subsets_size_2 = subsets.apply(lambda x: list(itertools.combinations(x["features"], 2)), axis=1)
subsets_size_2.index = subsets.contig_id
subsets_size_2.head(2)


contig_id
82638     [(3929, 6051), (3929, 8867), (3929, 21058), (3...
106079    [(3490, 6292), (3490, 18722), (3490, 30149), (...
dtype: object

In [394]:
#temp
# subsets_size_2[subsets_size_2.apply(lambda x: (10118, 17210) in x ).values]
len(subsets_size_2.loc[237307])

57291

* We make sure the tuple is sorted so it's easy to compute frequencies

In [395]:
co_occurences_2 = []
for item in list(itertools.chain(*subsets_size_2.values)):
    if item[0] > item[1]:
        co_occurences_2.append((item[1], item[0]))
    else:
        co_occurences_2.append((item[0], item[1]))  
co_occurences_2[:10]    

[(3929, 6051),
 (3929, 8867),
 (3929, 21058),
 (3929, 25595),
 (3929, 57763),
 (6051, 8867),
 (6051, 21058),
 (6051, 25595),
 (6051, 57763),
 (8867, 21058)]

* Count the number of occurrences of each pair of features

In [396]:
co_occurences__2_counts = pd.DataFrame(Counter(co_occurences_2).items())
co_occurences__2_counts.columns = ["combination", "nb_occurrences"]
co_occurences__2_counts = co_occurences__2_counts.sort_values(by="nb_occurrences", ascending=False)
co_occurences__2_counts.head(2)

Unnamed: 0,combination,nb_occurrences
508135,"(1937, 6879)",178
510515,"(5188, 9330)",175


In [397]:
co_occurences__2_counts.shape

(6987559, 2)

* Only keep those that occur frequently
```python
>>> sum(co_occurences__2_counts.nb_occurrences > 10)
263730
```


In [398]:
co_occurences__2_counts = co_occurences__2_counts[co_occurences__2_counts.nb_occurrences > 10]
co_occurences__2_counts.shape

(257945, 2)

In [399]:
subset_size = len(co_occurences__2_counts.iloc[0].combination)
for i in range(subset_size):
    a = co_occurences__2_counts.combination.apply(lambda x: x[i])
    co_occurences__2_counts.loc[:, f"item_{i}"] = feature_counts.loc[a].nb_occurrences.values

co_occurences__2_counts.head(10)

Unnamed: 0,combination,nb_occurrences,item_0,item_1
508135,"(1937, 6879)",178,264,183
510515,"(5188, 9330)",175,175,175
510497,"(5188, 5262)",175,175,175
510896,"(5262, 9330)",175,175,175
509525,"(3797, 5188)",174,174,175
509150,"(3671, 9330)",174,174,175
510530,"(5188, 14120)",174,175,174
509545,"(3797, 9330)",174,174,175
509130,"(3671, 5188)",174,174,175
509132,"(3671, 5262)",174,174,175


In [400]:
co_occurences__2_counts.tail(10)

Unnamed: 0,combination,nb_occurrences,item_0,item_1
1422998,"(45536, 61763)",11,12,16
1420845,"(32314, 37658)",11,11,16
1143899,"(5142, 32083)",11,73,11
1143952,"(5482, 22035)",11,22,11
1423103,"(47119, 49743)",11,12,16
1144486,"(8144, 41206)",11,17,11
1422227,"(38128, 48030)",11,11,19
1420909,"(32401, 35715)",11,24,13
1422230,"(38128, 49040)",11,11,16
1143954,"(5482, 23207)",11,22,11


* Compute the Dice Coefficient

In [403]:
co_occurences__2_counts["Dice"] = co_occurences__2_counts["nb_occurrences"] / co_occurences__2_counts.iloc[:,[2,3]].product(axis=1)
co_occurences__2_counts.sort_values(by=["nb_occurrences", "Dice"], ascending=False)



Unnamed: 0,combination,nb_occurrences,item_0,item_1,Dice
508135,"(1937, 6879)",178,264,183,0.003684
510515,"(5188, 9330)",175,175,175,0.005714
510497,"(5188, 5262)",175,175,175,0.005714
510896,"(5262, 9330)",175,175,175,0.005714
509525,"(3797, 5188)",174,174,175,0.005714
...,...,...,...,...,...
2002712,"(14667, 40947)",11,140,52,0.001511
1572165,"(8440, 61607)",11,65,114,0.001484
1804394,"(8440, 16356)",11,65,115,0.001472
1572078,"(8440, 23265)",11,65,116,0.001459


* What is the top hit?
  (1937, 6879) has  an occurrence rate of 0.9726

In [402]:
data[data.feature_number == 1937]['annotation_val'].head(5)

662     Ribonucleoside_diphosphate reductase
1063    Ribonucleoside_diphosphate reductase
1069    Ribonucleoside_diphosphate reductase
1335    Ribonucleoside_diphosphate reductase
1912    Ribonucleoside_diphosphate reductase
Name: annotation_val, dtype: object

In [185]:
data[data.feature_number == 6879]['annotation_val']

170       Unknown
755       Unknown
2764      Unknown
5850      Unknown
7642      Unknown
           ...   
228043    Unknown
233414    Unknown
233774    Unknown
233929    Unknown
234767    Unknown
Name: annotation_val, Length: 183, dtype: object

* If 6879 is unknown, are any of the other features associated with 1937 konw

In [404]:
co_occurences__2_counts[co_occurences__2_counts.combination.apply(lambda x: 1937 in  x)].head(10)

Unnamed: 0,combination,nb_occurrences,item_0,item_1,Dice
508135,"(1937, 6879)",178,264,183,0.003684
508145,"(1937, 9330)",172,264,175,0.003723
508127,"(1937, 5262)",172,264,175,0.003723
508125,"(1937, 5188)",172,264,175,0.003723
508118,"(1937, 3671)",171,264,174,0.003723
508120,"(1937, 3797)",171,264,174,0.003723
508140,"(1937, 8643)",171,264,172,0.003766
508160,"(1937, 14120)",171,264,174,0.003723
508162,"(1937, 14438)",171,264,173,0.003744
508158,"(1937, 13017)",170,264,172,0.003744


In [407]:
data[data.feature_number == 5262]['annotation_val']

1944      DNA primase_helicase
3300      DNA primase_helicase
3366      DNA primase_helicase
4688      DNA primase_helicase
5660      DNA primase_helicase
                  ...         
227452    DNA primase_helicase
227640    DNA primase_helicase
229896    DNA primase_helicase
234640    DNA primase_helicase
235863    DNA primase_helicase
Name: annotation_val, Length: 175, dtype: object

In [408]:
data[data.feature_number == 8643]['annotation_val']

226       RecA_like protein
1239                Unknown
1443      RecA_like protein
4673      RecA_like protein
6829      RecA_like protein
                ...        
217832              Unknown
218224    RecA_like protein
223380    RecA_like protein
229546              Unknown
230338    RecA_like protein
Name: annotation_val, Length: 172, dtype: object

In [409]:
data[data.feature_number == 3797]['annotation_val']

24        Gp17 terminase large subunit
3796      Gp17 terminase large subunit
3927                           Unknown
4671      Gp17 terminase large subunit
4747      Gp17 terminase large subunit
                      ...             
232512    Gp17 terminase large subunit
233211    Gp17 terminase large subunit
233997    Gp17 terminase large subunit
235142    Gp17 terminase large subunit
236928    Gp17 terminase large subunit
Name: annotation_val, Length: 174, dtype: object

* Repeating the analysis work for larger subsets of features
  * Start with paird what have surprise factor, by either
  1. merging most frequent pairs iterative  add another item   
  2. Start from seed and extending to include all pairs and finding the frequent items


In [412]:
co_occurences__2_counts.shape

(257945, 5)

In [None]:
co_occurences_3 = []

i = 0
for subset in co_occurences__2_counts.combination.values:
    i+=1
    for feature_group in subsets.features.values:
        if subset[0]  in feature_group and  subset[1] in feature_group:
            for feature in feature_group:
                if feature not in subset:
                    temp_group = sorted(list(subset) + [feature])
                    co_occurences_3.append(temp_group)            
    if i%100==0:
        print(i)

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
