[Stats for collocation detection](http://www.scielo.org.mx/scielo.php?script=sci_arttext&pid=S1405-55462016000300327)
* What was this about?
    * Issue of multiple features in one location
      * Had to apply set to avoid this issue

In [10]:
import pandas as pd
from collections import Counter
from utils import parse_cdhit, replace_val
import itertools


In [11]:
data = pd.read_table("data/ENA_ML_input_ORFs")
data.head(2)

Unnamed: 0,Feature_id,contig_id,position_start,position_end,ORF_location_on_contig,annotation_val
0,ENA_AY095314_AY095314.2_3724_3921_10,AY095314.2,3724,3921,10,Unknown
1,ENA_JX889246_JX889246.1_45181_44900_68,JX889246.1,45181,44900,68,Uncharacterized protein


In [12]:
seq_to_clust = parse_cdhit("data/ENA.40.clstr")
list(seq_to_clust.items())[0:2]

[('ENA_KP211958_KP211958.1_65082_87137_88', '0'),
 ('ENA_AY940168_AY940168.2_57144_79082_90', '1')]

In [13]:
## Add a column to show the cluster to which each feature_id belongs
data["cluster_number"] = replace_val(data.Feature_id, seq_to_clust)
data.cluster_number = data.cluster_number.astype(int)
data.head(2)

Unnamed: 0,Feature_id,contig_id,position_start,position_end,ORF_location_on_contig,annotation_val,cluster_number
0,ENA_AY095314_AY095314.2_3724_3921_10,AY095314.2,3724,3921,10,Unknown,57065
1,ENA_JX889246_JX889246.1_45181_44900_68,JX889246.1,45181,44900,68,Uncharacterized protein,43194


In [14]:
# Add a contig sequential number in addition to contig id.
contig_id_to_int = {y:x for x,y in enumerate(data.contig_id.tolist())}
#### data.replace({'contig_id': contig_id_to_int})
data["contig_number"] = replace_val(data.contig_id, contig_id_to_int)
data.head(2)

Unnamed: 0,Feature_id,contig_id,position_start,position_end,ORF_location_on_contig,annotation_val,cluster_number,contig_number
0,ENA_AY095314_AY095314.2_3724_3921_10,AY095314.2,3724,3921,10,Unknown,57065,237296
1,ENA_JX889246_JX889246.1_45181_44900_68,JX889246.1,45181,44900,68,Uncharacterized protein,43194,234989


#### Testing code with AB012574.1

In [15]:
x = data[data.contig_id == "AB012574.1"].sort_values(by="ORF_location_on_contig")
x

Unnamed: 0,Feature_id,contig_id,position_start,position_end,ORF_location_on_contig,annotation_val,cluster_number,contig_number
163381,ENA_AB012574_AB012574.1_1_591_1,AB012574.1,1,591,1,Phage replication protein,18804,221381
165466,ENA_AB012574_AB012574.1_595_948_2,AB012574.1,595,948,2,Unknown,34298,221381
169624,ENA_AB012574_AB012574.1_960_1205_3,AB012574.1,960,1205,3,Unknown,48370,221381
129210,ENA_AB012574_AB012574.1_1216_1449_4,AB012574.1,1216,1449,4,Unknown,50392,221381
31977,ENA_AB012574_AB012574.1_1583_3058_5,AB012574.1,1583,3058,5,Unknown,4917,221381
110108,ENA_AB012574_AB012574.1_3060_3374_6,AB012574.1,3060,3374,6,Unknown,39228,221381
221381,ENA_AB012574_AB012574.1_3371_4513_7,AB012574.1,3371,4513,7,Unknown,7275,221381
60438,ENA_AB012574_AB012574.1_5516_4821_8,AB012574.1,5516,4821,8,Unknown,15178,221381
28717,ENA_AB012574_AB012574.1_6049_5696_9,AB012574.1,6049,5696,9,Unknown,32777,221381
177073,ENA_AB012574_AB012574.1_6488_6042_10,AB012574.1,6488,6042,10,Unknown,25521,221381


In [27]:
def sort_pair(pair):
    if pair[0] > pair[1]:
        return (pair[1], pair[0])
    else:
        return pair

def find_pairs_in_window(group, window_size=8):
    unsorted_pairs = []
    pairs=[]
    contig_clusters = group.cluster_number.values
    window_size = min(len(contig_clusters), window_size)
    for i, line_id in enumerate(contig_clusters[:-(window_size-1)]):
        unsorted_pairs.append(list(itertools.combinations(contig_clusters[i: i+window_size], 2)))
    for pair in set(list(itertools.chain(*unsorted_pairs))):
        pairs.append(sort_pair(pair))
    return list(set(pairs))

In [36]:
combination_2_contigs = data.groupby("contig_number").apply(find_pairs_in_window)
combination_2_contigs

contig_number
82638     [(3929, 8867), (21058, 25595), (3929, 6051), (...
106079    [(6292, 30149), (18722, 30149), (3490, 18722),...
109093    [(3929, 8867), (21058, 25595), (3929, 6051), (...
118155       [(6292, 30149), (26181, 30149), (6292, 26181)]
122079    [(21058, 44650), (3929, 8867), (8867, 44650), ...
                                ...                        
237325    [(36368, 42687), (8296, 11445), (4523, 51220),...
237326    [(34497, 46153), (7687, 60163), (46153, 52331)...
237327    [(33875, 39830), (6713, 42039), (13145, 43591)...
237328    [(1737, 29433), (38072, 49163), (24481, 29433)...
237329    [(12100, 39039), (986, 45573), (35403, 64398),...
Length: 2480, dtype: object

In [37]:
combination_2 = list(itertools.chain(*combination_2_contigs.values))
combination_2_counts =  Counter(combination_2)


combination_2_counts = pd.DataFrame(combination_2_counts.items())
combination_2_counts.columns = ["combination", "nb_occurrences"]
combination_2_counts.head(2)

Unnamed: 0,combination,nb_occurrences
0,"(3929, 8867)",135
1,"(21058, 25595)",122


In [46]:
temp = data.groupby("contig_number").apply(lambda x: sorted(set(x["cluster_number"].values)))
subsets = pd.DataFrame({"contig_id": temp.index, "cluster_number": temp.values})
subsets.head(2)

Unnamed: 0,contig_id,cluster_number
0,82638,"[3929, 6051, 8867, 21058, 25595, 57763]"
1,106079,"[3490, 6292, 18722, 30149]"


In [47]:
contig_counts = pd.DataFrame(Counter(itertools.chain(*subsets.cluster_number.values)).items())
contig_counts.columns = ["cluster_number", "nb_occurrences"]
contig_counts = contig_counts.set_index("cluster_number")
contig_counts.head()

Unnamed: 0_level_0,nb_occurrences
cluster_number,Unnamed: 1_level_1
3929,135
6051,134
8867,135
21058,122
25595,135


In [66]:
subset_size = len(combination_2_counts.iloc[0].combination)
for i in range(subset_size):
    a = combination_2_counts.combination.apply(lambda x: x[i])
    combination_2_counts.loc[:, f"item_{i}"] = contig_counts.loc[a].nb_occurrences.values

combination_2_counts["Dice"] =     (2 * combination_2_counts["nb_occurrences"]) / (combination_2_counts["item_0"]  + combination_2_counts["item_1"] )
combination_2_counts.head(20)

Unnamed: 0,combination,nb_occurrences,item_0,item_1,Dice
0,"(3929, 8867)",135,135,135,1.0
1,"(21058, 25595)",122,122,135,0.949416
2,"(3929, 6051)",134,135,134,0.996283
3,"(3929, 57763)",25,135,25,0.3125
4,"(8867, 25595)",135,135,135,1.0
5,"(3929, 21058)",122,135,122,0.949416
6,"(8867, 57763)",25,135,25,0.3125
7,"(6051, 25595)",134,134,135,0.996283
8,"(6051, 8867)",134,134,135,0.996283
9,"(21058, 57763)",25,122,25,0.340136


In [72]:
# Filtering co-occurrences that are less than 10
combination_2_counts[combination_2_counts.nb_occurrences > 10].sort_values(by=["nb_occurrences", "Dice"], ascending=False)

Unnamed: 0,combination,nb_occurrences,item_0,item_1,Dice
0,"(3929, 8867)",135,135,135,1.000000
4,"(8867, 25595)",135,135,135,1.000000
12,"(3929, 25595)",135,135,135,1.000000
2,"(3929, 6051)",134,135,134,0.996283
7,"(6051, 25595)",134,134,135,0.996283
...,...,...,...,...,...
229653,"(1937, 28686)",11,264,135,0.055138
417802,"(1937, 62412)",11,264,146,0.053659
284612,"(1937, 47127)",11,264,149,0.053269
473017,"(1937, 26582)",11,264,171,0.050575
