* Issue of multiple feature on one location 
  * Had to apply set to avoid this issue

In [327]:
import pandas as pd
from collections import Counter
from utils import parse_cdhit, replace_val
import itertools
import numpy as np
from tqdm.notebook import tqdm

In [328]:
data = pd.read_table("data/ENA_ML_input_ORFs")

In [329]:
data.head(2)

Unnamed: 0,Feature_id,contig_id,position_start,position_end,ORF_location_on_contig,annotation_val
0,ENA_AY095314_AY095314.2_3724_3921_10,AY095314.2,3724,3921,10,Unknown
1,ENA_JX889246_JX889246.1_45181_44900_68,JX889246.1,45181,44900,68,Uncharacterized protein


In [330]:
seq_to_clust = parse_cdhit("data/ENA.40.clstr")

# what does it look like
list(seq_to_clust.items())[0:2]

[('ENA_KP211958_KP211958.1_65082_87137_88', '0'),
 ('ENA_AY940168_AY940168.2_57144_79082_90', '1')]

In [331]:
## Add a column to show the cluster to which each feature_id belongs
data["feature_number"] = replace_val(data.Feature_id, seq_to_clust)
data.feature_number = data.feature_number.astype(int)
data.head(2)



Unnamed: 0,Feature_id,contig_id,position_start,position_end,ORF_location_on_contig,annotation_val,feature_number
0,ENA_AY095314_AY095314.2_3724_3921_10,AY095314.2,3724,3921,10,Unknown,57065
1,ENA_JX889246_JX889246.1_45181_44900_68,JX889246.1,45181,44900,68,Uncharacterized protein,43194


In [332]:
# Add a contig sequential number in addition to contig id.
contig_id_to_int = {y:x for x,y in enumerate(data.contig_id.tolist())}
#### data.replace({'contig_id': contig_id_to_int})
data["contig_number"] = replace_val(data.contig_id, contig_id_to_int)
data.head(2)

Unnamed: 0,Feature_id,contig_id,position_start,position_end,ORF_location_on_contig,annotation_val,feature_number,contig_number
0,ENA_AY095314_AY095314.2_3724_3921_10,AY095314.2,3724,3921,10,Unknown,57065,237296
1,ENA_JX889246_JX889246.1_45181_44900_68,JX889246.1,45181,44900,68,Uncharacterized protein,43194,234989


* Generate a table with all the features per contig


In [333]:
temp = data.groupby("contig_number").apply(lambda x: set(x["feature_number"].values))
subsets = pd.DataFrame({"contig_id": temp.index, "features": temp.values})
subsets.head(2)

Unnamed: 0,contig_id,features
0,82638,"{21058, 6051, 57763, 8867, 3929, 25595}"
1,106079,"{3490, 18722, 6292, 30149}"


In [347]:
data[data.feature_number == 10118]

Unnamed: 0,Feature_id,contig_id,position_start,position_end,ORF_location_on_contig,annotation_val,feature_number,contig_number
71244,ENA_KJ645900_KJ645900.1_12716_13486_14,KJ645900.1,12716,13486,14,Unknown,10118,237307
81712,ENA_KJ645900_KJ645900.1_18285_19043_21,KJ645900.1,18285,19043,21,Unknown,10118,237307
123848,ENA_KJ645900_KJ645900.1_5716_6642_7,KJ645900.1,5716,6642,7,Unknown,10118,237307
152686,ENA_KJ645900_KJ645900.1_14999_15829_17,KJ645900.1,14999,15829,17,Putative E3 ubiquitin ligase,10118,237307
159326,ENA_KJ645900_KJ645900.1_336560_337399_360,KJ645900.1,336560,337399,360,Membrane_associated lipoprotein,10118,237307
167461,ENA_KJ645900_KJ645900.1_11745_12623_13,KJ645900.1,11745,12623,13,Unknown,10118,237307
227347,ENA_KJ645900_KJ645900.1_295633_294791_312,KJ645900.1,295633,294791,312,Membrane_associated lipoprotein,10118,237307
232545,ENA_KJ645900_KJ645900.1_6771_7586_8,KJ645900.1,6771,7586,8,Unknown,10118,237307


In [334]:
# temp
len(set(subsets[subsets.contig_id == 237307].features.values[0]))

339

* count the occurrences of each feature in all the contigs. 
  * Needed to compute the probability of a pair

In [335]:
feature_counts = pd.DataFrame(Counter(itertools.chain(*subsets.features.values)).items())
feature_counts.columns = ["feature", "nb_occurrences"]
feature_counts = feature_counts.set_index("feature")
feature_counts

Unnamed: 0_level_0,nb_occurrences
feature,Unnamed: 1_level_1
21058,122
6051,134
57763,25
8867,135
3929,135
...,...
10480,1
60147,1
29940,1
60148,1


* Count the occurrences of each pair of features that occur on at least 1 contig
* We generate generate all the possible pairs (tuples) of features 


In [336]:
# Generate all the subset of size 2 for each contig
subsets_size_2 = data.groupby("contig_number").apply(lambda x: list(itertools.combinations(x["feature_number"].values, 2)))
subsets_size_2.head(2)


contig_number
82638     [(3929, 6051), (3929, 21058), (3929, 8867), (3...
106079    [(18722, 6292), (18722, 30149), (18722, 3490),...
dtype: object

In [337]:
subsets_size_2.iloc[1]

[(18722, 6292),
 (18722, 30149),
 (18722, 3490),
 (6292, 30149),
 (6292, 3490),
 (30149, 3490)]

* We make sure the tuple is sorted so it's easy to compute frequencies

In [338]:
co_occurences_2 = []
for item in list(itertools.chain(*subsets_size_2.values)):
    if item[0] > item[1]:
        co_occurences_2.append((item[1], item[0]))
    else:
        co_occurences_2.append((item[0], item[1]))  
co_occurences_2[:10]    

[(3929, 6051),
 (3929, 21058),
 (3929, 8867),
 (3929, 57763),
 (3929, 25595),
 (6051, 21058),
 (6051, 8867),
 (6051, 57763),
 (6051, 25595),
 (8867, 21058)]

* Count the number of occurrences of each pair of features

In [339]:
co_occurences__2_counts = pd.DataFrame(Counter(co_occurences_2).items())
co_occurences__2_counts.columns = ["combination", "nb_occurrences"]
co_occurences__2_counts = co_occurences__2_counts.sort_values(by="nb_occurrences", ascending=False)
co_occurences__2_counts.head(2)

Unnamed: 0,combination,nb_occurrences
524534,"(1937, 6879)",178
523141,"(5188, 9330)",175


In [340]:
co_occurences__2_counts.shape

(6988165, 2)

* Only keep those that occur frequently
```python
>>> sum(co_occurences__2_counts.nb_occurrences > 10)
263730
```


In [342]:
co_occurences__2_counts = co_occurences__2_counts[co_occurences__2_counts.nb_occurrences > 10]
co_occurences__2_counts.shape

(263730, 2)

In [343]:
subset_size = len(co_occurences__2_counts.iloc[0].combination)
for i in range(subset_size):
    a = co_occurences__2_counts.combination.apply(lambda x: x[i])
    co_occurences__2_counts.loc[:, f"item_{i}"] = feature_counts.loc[a].nb_occurrences.values

co_occurences__2_counts.head(10)

Unnamed: 0,combination,nb_occurrences,item_0,item_1
524534,"(1937, 6879)",178,264,183
523141,"(5188, 9330)",175,175,175
523140,"(5188, 5262)",175,175,175
528811,"(5262, 9330)",175,175,175
511640,"(3671, 5262)",174,174,175
522610,"(3797, 5188)",174,174,175
528721,"(9330, 14120)",174,175,174
528720,"(5262, 14120)",174,175,174
523136,"(5188, 14120)",174,175,174
511641,"(3671, 9330)",174,174,175


In [344]:
co_occurences__2_counts.tail(10)

Unnamed: 0,combination,nb_occurrences,item_0,item_1
3564221,"(47994, 58305)",11,6,6
1127922,"(17842, 38805)",11,17,11
3564301,"(30626, 58305)",11,6,6
3564218,"(24404, 58305)",11,6,6
1146342,"(15225, 43863)",11,12,11
1127923,"(35515, 38805)",11,13,11
3564305,"(46694, 58305)",11,6,6
1145396,"(49606, 58207)",11,66,14
1127926,"(21292, 38805)",11,20,11
3564294,"(47092, 58305)",11,6,6


* Compute the Dice Coefficient

In [346]:
co_occurences__2_counts["Dice"] = co_occurences__2_counts["nb_occurrences"] / co_occurences__2_counts.iloc[:,[2,3]].product(axis=1)
co_occurences__2_counts.sort_values(by="Dice", ascending=False)



Unnamed: 0,combination,nb_occurrences,item_0,item_1,Dice
6814005,"(10118, 17210)",96,1,1,96.000000
6814027,"(7075, 17210)",84,1,1,84.000000
6813956,"(6083, 17210)",72,1,1,72.000000
6811021,"(13032, 17210)",72,1,1,72.000000
6813932,"(17210, 17210)",66,1,1,66.000000
...,...,...,...,...,...
2073476,"(5188, 31897)",15,175,96,0.000893
2072220,"(14044, 31897)",15,177,96,0.000883
2072276,"(7889, 31897)",15,179,96,0.000873
2073500,"(31897, 41336)",14,96,170,0.000858


* What is the top hit?
  (1937, 6879) has  an occurrence rate of 0.9726

In [183]:
data[data.feature_number == 1937]['annotation_val'].head(5)

662     Ribonucleoside_diphosphate reductase
1063    Ribonucleoside_diphosphate reductase
1069    Ribonucleoside_diphosphate reductase
1335    Ribonucleoside_diphosphate reductase
1912    Ribonucleoside_diphosphate reductase
Name: annotation_val, dtype: object

In [185]:
data[data.feature_number == 6879]['annotation_val']

170       Unknown
755       Unknown
2764      Unknown
5850      Unknown
7642      Unknown
           ...   
228043    Unknown
233414    Unknown
233774    Unknown
233929    Unknown
234767    Unknown
Name: annotation_val, Length: 183, dtype: object

* If 6879 is unknown, are any of the other features associated with 1937 konw

In [195]:
co_occurences__2_counts[co_occurences__2_counts.combination.apply(lambda x: 1937 in  x)].head(10)

Unnamed: 0,combination,nb_occurrences,item_0,item_1,Dice
524534,"(1937, 6879)",178,264,183,0.972678
527895,"(1937, 5262)",172,264,175,0.982857
523113,"(1937, 5188)",172,264,175,0.982857
527896,"(1937, 9330)",172,264,175,0.982857
515638,"(1937, 8643)",171,264,172,0.994186
522671,"(1937, 3797)",171,264,174,0.982759
527670,"(1937, 14438)",171,264,173,0.988439
527891,"(1937, 14120)",171,264,174,0.982759
511613,"(1937, 3671)",171,264,174,0.982759
515309,"(1937, 19807)",170,264,171,0.994152


In [326]:
data[data.feature_number == 5262]['annotation_val']

AttributeError: 'DataFrame' object has no attribute 'feature_number'

In [325]:
data[data.feature_number == 8643]['annotation_val']

AttributeError: 'DataFrame' object has no attribute 'feature_number'

In [324]:
data[data.feature_number == 3797]['annotation_val']

AttributeError: 'DataFrame' object has no attribute 'feature_number'

* Repeating the analysis work for larger subsets of features
  * Start with paird what have surprise factor, by either
  1. merging most frequent pairs iterative  add another item   
  2. Start from seed and extending to include all pairs and finding the frequent items


In [233]:
co_occurences__2_counts.sort_values(by="Dice", ascending=False)

Unnamed: 0,combination,nb_occurrences,item_0,item_1,Dice
6807705,"(12777, 17210)",12,1,12,12.000000
6814109,"(15279, 17210)",12,1,12,12.000000
6814111,"(4221, 17210)",12,1,12,12.000000
6814162,"(1071, 17210)",12,1,12,12.000000
6814163,"(17210, 51489)",12,12,1,12.000000
...,...,...,...,...,...
2073227,"(31897, 46049)",14,96,158,0.145833
2073521,"(15458, 31897)",14,155,96,0.145833
2071791,"(5941, 31897)",14,156,96,0.145833
2072904,"(31897, 58348)",13,96,95,0.136842


In [230]:
co_occurences_3 = []

i = 0
for subset in co_occurences__2_counts.combination.values:
    i+=1
    for feature_group in subsets.features.values:
        if subset[0]  in feature_group and  subset[1] in feature_group:
            for feature in feature_group:
                if feature not in subset:
                    temp_group = sorted(list(subset) + [feature])
                    co_occurences_3.append(temp_group)            
    if i%100==0:
        print(i)

Exception ignored in: <function tqdm.__del__ at 0x44e216280>
Traceback (most recent call last):
  File "/Users/mahdi/mambaforge/lib/python3.9/site-packages/tqdm/std.py", line 1147, in __del__
    self.close()
  File "/Users/mahdi/mambaforge/lib/python3.9/site-packages/tqdm/notebook.py", line 286, in close
    self.disp(bar_style='danger', check_delay=False)
AttributeError: 'tqdm_notebook' object has no attribute 'disp'


100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000


KeyboardInterrupt: 