In [44]:
# import statements
import numpy as np
from ipynb.fs.full.data_explore import DataExplorer
from collections import Counter

In [43]:
class FunctionPrediction():
    """Predict a protein's cluster from the clusters of its graph neighbors.

    All data is read from a ``DataExplorer`` built for the given organism.
    This class reads three attributes of it:

    * ``adj_list[protein_id]`` -- sized iterable of neighbor records whose
      first element is the neighbor's id,
    * ``annotation_list[protein_id]`` -- iterable of cluster ids for a protein,
    * ``cluster_sizes[cluster_id]`` -- sortable size of a cluster.
    """

    def __init__(self, organism_name):
        """Create the data explorer for ``organism_name``."""
        # get all the annotation, adjacency list data from the explorer class
        self.data_explorer = DataExplorer(organism_name)

    def majority_rule(self, protein_id, t=1):
        """Return the most common cluster among ``protein_id``'s neighbors.

        Each neighbor casts one vote: the t-th smallest (by ``cluster_sizes``)
        of the clusters it is annotated with, or its largest cluster when it
        belongs to fewer than ``t`` clusters. Neighbors without any annotation
        cast no vote. Also prints the neighbor count of ``protein_id``.

        Args:
            protein_id: key into the explorer's ``adj_list``.
            t: 1-based rank, in order of increasing cluster size, of the
                cluster that represents each neighbor. Must be >= 1.

        Returns:
            The cluster id with the most votes (on a tie, the one that was
            voted for first), or ``None`` when no neighbor can vote.

        Raises:
            ValueError: if ``t`` is less than 1.
        """
        # A non-positive t would become a negative index below and silently
        # select from the *largest* end of the size ordering.
        if t < 1:
            raise ValueError("t must be >= 1, got {}".format(t))

        explorer = self.data_explorer
        neighbors = explorer.adj_list[protein_id]
        print("{} has {} neighbors".format(protein_id, len(neighbors)))

        cluster_sizes = explorer.cluster_sizes
        votes = Counter()
        for nbor in neighbors:
            # Stable sort: clusters of equal size keep their annotation order.
            by_size = sorted(
                explorer.annotation_list[nbor[0]],
                key=lambda cluster_id: cluster_sizes[cluster_id],
            )
            if not by_size:
                # Unannotated neighbor: nothing to vote with.
                continue
            # Clamp so a neighbor with fewer than t clusters uses its largest.
            votes[by_size[min(t - 1, len(by_size) - 1)]] += 1

        if not votes:
            return None
        return votes.most_common(1)[0][0]

In [38]:
# Build a predictor, passing "ecoli" as the organism name.
f = FunctionPrediction("ecoli")
# Majority-rule cluster for this protein with the default t=1; as the cell's
# last expression, the returned cluster id is shown as the cell output.
f.majority_rule("362663.ECP_0002")

362663.ECP_0002 has 631 neighbors
1053
['0', '1049', '1050', '1051', '1052', '1053', '1054', '1120', '1121', '1122', '1134', '12', '16', '21', '23', '27', '29', '30', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '8']
