In [1]:
# import statements
import json
import pprint
import matplotlib.pyplot as plt
import math
import statistics
import os
import numpy as np
import copy
from ipynb.fs.full.data_explore import DataExplorer
from ipynb.fs.full.function_prediction import FunctionPrediction
from collections import Counter
import random



1117


In [2]:
# RawResults class holds all info 
class RawResults():
    def __init__(self, organism_name, n, r):
        
        # folder name
        self.organism_name = organism_name
        # length of list of proteins
        self.n = n
        # radius or timestep
        self.r = r
        
        # adjustable variables for raw scoring that we currently do not use
        self.true_positive_reward = 1
        self.false_positive_penalty = -0.25
        self.false_negative_penalty = -0.25
        
        # DataExplorer
        self.data_explorer = DataExplorer(self.organism_name)
        # used to get the actual list of annotations for a protein: self.annotation_dict[self.p[p]]
        self.annotation_dict = self.data_explorer.annotation_list
        # used in get_test_proteins() to gather self.p
        self.names_set = self.data_explorer.filtered_names_set
        self.names_list = self.data_explorer.filtered_names_list
        # used in get_global/local_percentile_ to get list of glosters sorted by size
        self.global_clusters_sorted = self.data_explorer.clusters_sorted
        
        # gathering list of test proteins
        self.p = self.get_test_proteins(n)
        print(self.p)
        # FunctionPrediction
        function_predictor = FunctionPrediction(self.organism_name)
        # raw results 
#         print("starting majority")
        self.majority_approach_results = function_predictor.majority_rule(self.p, r=self.r)
#         print("did majority")
#         print("starting ff")
        self.functional_flow_results = function_predictor.functional_flow(self.p, t=self.r)
#         print("did majority")
        self.alignment_approach_results = function_predictor.alignment_approach(self.p, r=self.r)

    # returns a random size n list of proteins to test
    def get_test_proteins(self, n):
#         p = []
#         for x in range(n):
#             i = random.randint(0, len(self.names_set))
#             while self.names_list[i] in p:
#                 i = random.randint(0, len(self.names_set))
#                 while len(self.annotation_dict[self.names_list[i]]) == 0:
#                     i = random.randint(0, len(self.names_set))
#             p.append(self.names_list[i])
        kz = self.data_explorer.adj_list.keys()
        return random.sample(kz, n)

In [3]:
# compares results from FunctionPrediction to actual
def compare(p, annotation, results):
    true_positive = []
    false_positive = []
    false_negative = []
    for i in range(len(p)):
        protein = p[i]
        true_positive.append([set(), 0])
        false_positive.append([set(), 0])
        false_negative.append([set(), 0])
        actual_clusters = copy.deepcopy(annotation[protein])
        result_clusters = results[protein]
        if len(result_clusters) >= len(actual_clusters):
            for j in range(len(actual_clusters)):
                cluster = result_clusters[j]
                if cluster in actual_clusters:
                    true_positive[i][0].add(cluster)
                    true_positive[i][1] += 1
                    actual_clusters.remove(cluster)
                elif cluster not in actual_clusters:
                    false_positive[i][0].add(cluster)
                    false_positive[i][1] += 1
            for cluster in actual_clusters:
                false_negative[i][0].add(cluster)
                false_negative[i][1] += 1
        elif len(result_clusters) < len(actual_clusters):
            for j in range(len(result_clusters)):
                cluster = actual_clusters[j]
                if cluster in result_clusters:
                    true_positive[i][0].add(cluster)
                    true_positive[i][1] += 1
                    result_clusters.remove(cluster)
                elif cluster not in result_clusters:
                    false_negative[i][0].add(cluster)
                    false_negative[i][1] += 1
            for cluster in result_clusters:
                false_positive[i][0].add(cluster)
                false_positive[i][1] += 1
    return true_positive, false_positive, false_negative

In [4]:
# finds where actual protein clusters fall in the global cluster size distribution
def get_global_percentile_actual(p, true_positive, annotation_dict, global_clusters_sorted):
    percentile = []
    for protein in range(len(p)):
        for c in annotation_dict[p[protein]]:
            i = global_clusters_sorted.index(c)
            percentile.append((i/len(global_clusters_sorted)*100)//1)
    listof100actual = []
    totallen = 0
    for protein in range(len(p)):
        totallen = len(annotation_dict[p[protein]])
    for i in range(100):
        listof100actual.append((percentile.count(i)/totallen*100)//(len(p)))
    return listof100actual

# finds where true positive predictions of protein clusters fall in the global cluster size distribution
def get_global_percentile(p, true_positive, annotation_dict, global_clusters_sorted):
    percentile = []
    for protein in range(len(p)):
        for c in annotation_dict[p[protein]]:
            i = global_clusters_sorted.index(c)
            percentile.append((i/len(global_clusters_sorted)*100)//1)
    listof100actual = []
    totallen = 0
    for protein in range(len(p)):
        totallen = len(annotation_dict[p[protein]])
    for i in range(100):
        listof100actual.append((percentile.count(i)))
    listof100avg = []
    for i in range(100):
        listof100avg.append([])
    percentile = []
    for protein in range(len(p)):
        for tp in true_positive[protein][0]:
            i = global_clusters_sorted.index(tp)
            percentile.append((i/len(global_clusters_sorted)*100)//1)
        listof100result = []
        for i in range(100):
            listof100result.append(percentile.count(i))
        for i in range(100):
            listof100avg[i].append(listof100result[i])
    for i in range(100):
        totalfori = 0
        for j in range(len(listof100avg[i])):
            totalfori += listof100avg[i][j]
        totalfori/=len(listof100avg[i])
        listof100avg[i] = totalfori
    listof100 = []
    for i in range(100):
        if listof100actual[i] == 0:
            listof100.append(0)
        else:
            listof100.append(((listof100avg[i]/listof100actual[i])*100)//1)
    return listof100
        
# finds where actual protein clusters fall in the local cluster size distribution
def get_local_percentile(p, true_positive, annotation_dict, global_clusters_sorted):
    clusters = []
    cluster_len_total = 0
    for protein in range(len(p)):
        cluster_unsorted = annotation_dict[p[protein]]
        cluster_sorted = []
        for c in global_clusters_sorted:
            if c in cluster_unsorted:
                cluster_sorted.append(c)
        clusters.append(cluster_sorted)
        cluster_len_total += len(cluster_sorted)
    percentile = []
    for i in range(100):
        percentile.append(0)
    for i in range(len(p)):
        tpi = true_positive[i]
        ci = clusters[i]
        for tp in tpi[0]:
            percentile[int((ci.index(tp)/len(ci)*100)//1)]+=1
    for i in range(100):
        if percentile[i] != 0:
            percentile[i] == (percentile[i]/(cluster_len_total/100)*100*100)
    return percentile        

In [5]:
class MajorityApproachResults():
    def __init__(self, raw_results):
        self.global_title = 'Global Specificity with Majority Approach'
        self.global_color = 'red'
        self.local_title = 'Local Specificity with Majority Approach'
        self.local_color = 'orange'
        
        # info gathered from raw_results
        self.p = raw_results.p
        self.annotation_dict = raw_results.annotation_dict
        self.global_clusters_sorted = raw_results.global_clusters_sorted
        self.raw_results = raw_results.majority_approach_results
        #self.tp_reward = raw_results.true_positive_reward
        #self.fp_penalty = raw_results.false_positive_penalty
        #self.fn_penalty = raw_results.false_negative_penalty
        
        # 2D lists of length self.n corresponding to the proteins in self.p: [[{cluster_ids},count]]
        self.true_positive, self.false_positive, self.false_negative = (
            compare(self.p, self.annotation_dict, self.raw_results))
        
        # not currently used
        #self.scores = score(self.p, self.true_positive, self.false_positive, self.false_negative, 
                            #self.tp_reward, self.fp_penalty, self.fn_penalty, self.annotation_dict)
        
        self.global_percentile_actual = get_global_percentile_actual(self.p, self.true_positive, self.annotation_dict,
                                                                     self.global_clusters_sorted)
        self.global_percentile = get_global_percentile(self.p, self.true_positive, self.annotation_dict, 
                                                       self.global_clusters_sorted)
        self.local_percentile = get_local_percentile(self.p, self.true_positive, self.annotation_dict, 
                                                     self.global_clusters_sorted)

In [6]:
class FunctionalFlowResults():
    def __init__(self, raw_results):
        self.global_title = 'Global Specificity with Functional Flow'
        self.global_color = 'gold'
        self.local_title = 'Local Specificity with Functional Flow'
        self.local_color = 'green'
        
        # info gathered from raw_results
        self.p = raw_results.p
        self.annotation_dict = raw_results.annotation_dict
        self.global_clusters_sorted = raw_results.global_clusters_sorted
        self.raw_results = raw_results.functional_flow_results
        #self.tp_reward = raw_results.true_positive_reward
        #self.fp_penalty = raw_results.false_positive_penalty
        #self.fn_penalty = raw_results.false_negative_penalty
        
        # 2D lists of length self.n corresponding to the proteins in self.p: [[{cluster_ids},count]]
        self.true_positive, self.false_positive, self.false_negative = (
            compare(self.p, self.annotation_dict, self.raw_results))
        
        # not currently used
        #self.scores = score(self.p, self.true_positive, self.false_positive, self.false_negative, 
                            #self.tp_reward, self.fp_penalty, self.fn_penalty, self.annotation_dict)
        
        self.global_percentile_actual = get_global_percentile_actual(self.p, self.true_positive, self.annotation_dict,
                                                                     self.global_clusters_sorted)
        self.global_percentile = get_global_percentile(self.p, self.true_positive, self.annotation_dict, 
                                                       self.global_clusters_sorted)
        self.local_percentile = get_local_percentile(self.p, self.true_positive, self.annotation_dict, 
                                                     self.global_clusters_sorted)

In [7]:
class AlignmentApproachResults():
    def __init__(self, raw_results):
        self.global_title = 'Global Specificity with Alignment Approach'
        self.global_color = 'blue'
        self.local_title = 'Local Specificity with Alignment Approach'
        self.local_color = 'purple'
        
        # info gathered from raw_results
        self.p = raw_results.p
        self.annotation_dict = raw_results.annotation_dict
        self.global_clusters_sorted = raw_results.global_clusters_sorted
        self.raw_results = raw_results.alignment_approach_results
        #self.tp_reward = raw_results.true_positive_reward
        #self.fp_penalty = raw_results.false_positive_penalty
        #self.fn_penalty = raw_results.false_negative_penalty
        
        # 2D lists of length self.n corresponding to the proteins in self.p: [[{cluster_ids},count]]
        self.true_positive, self.false_positive, self.false_negative = (
            compare(self.p, self.annotation_dict, self.raw_results))
        
        # not currently used
        #self.scores = score(self.p, self.true_positive, self.false_positive, self.false_negative, 
                            #self.tp_reward, self.fp_penalty, self.fn_penalty, self.annotation_dict)
        
        self.global_percentile_actual = get_global_percentile_actual(self.p, self.true_positive, self.annotation_dict,
                                                                     self.global_clusters_sorted)
        self.global_percentile = get_global_percentile(self.p, self.true_positive, self.annotation_dict, 
                                                       self.global_clusters_sorted)
        self.local_percentile = get_local_percentile(self.p, self.true_positive, self.annotation_dict, 
                                                     self.global_clusters_sorted)

In [8]:
# histograms of actual protein clusters in the global cluster size distribution
def get_global_actual_bucket_1(results):
    percentile = []
    for i in range(100):
        percentile.append([])
    for result in results:
        for p in range(len(result.global_percentile_actual)):
            percentile[p].append(result.global_percentile_actual[p])
    for i in range(len(percentile)):
        totalfori = 0
        for j in range(len(percentile[i])):
            totalfori += percentile[i][j]
        totalfori//= len(percentile[i])
        percentile[i] = totalfori
    left = []
    for i in range(100):
        left.append(i)
    height = percentile
    plt.bar(left, height, width = 1, color ='black')
    plt.ylim((0,100))
    plt.xlabel("Global Cluster Size Percentiles")
    plt.ylabel("Percent of Protein Clusters in Percentile")
    plt.title("Global Cluster Size Distribution")
    plt.show()
    
def get_global_actual_bucket_2(results):
    percentile = []
    for i in range(100):
        percentile.append([])
    for result in results:
        for p in range(len(result.global_percentile_actual)):
            percentile[p].append(result.global_percentile_actual[p])
    for i in range(len(percentile)):
        totalfori = 0
        for j in range(len(percentile[i])):
            totalfori += percentile[i][j]
        totalfori//= len(percentile[i])
        percentile[i] = totalfori
    fr_this_time = []
    for i in range(50):
        fr_this_time.append(0)
    for i in range(50):
        totalfori = 0
        for j in range(2):
            totalfori+=percentile[2*i+j]
        totalfori/=2
        fr_this_time[i] = totalfori
    left = []
    for i in range(50):
        left.append(i)
    height = fr_this_time
    tick_label = ['0','','','','','10','','','','','20','','','','','30','','','','', '40','','','','',
                 '50','','','','','60','','','','','70','','','','','80','','','','', '90','','','','']
    plt.bar(left, height, tick_label = tick_label, width = 1, color = 'black')
    plt.ylim((0,100))
    plt.xlabel("Global Cluster Size Percentiles")
    plt.ylabel("Percent of Protein Clusters in Percentile")
    plt.title("Global Cluster Size Distribution")
    plt.show()
    
def get_global_actual_bucket_4(results):
    percentile = []
    for i in range(100):
        percentile.append([])
    for result in results:
        for p in range(len(result.global_percentile_actual)):
            percentile[p].append(result.global_percentile_actual[p])
    for i in range(len(percentile)):
        totalfori = 0
        for j in range(len(percentile[i])):
            totalfori += percentile[i][j]
        totalfori//= len(percentile[i])
        percentile[i] = totalfori
    fr_this_time = []
    for i in range(25):
        fr_this_time.append(0)
    for i in range(25):
        totalfori = 0
        for j in range(4):
            totalfori+=percentile[4*i+j]
        totalfori/=4
        fr_this_time[i] = totalfori
    left = []
    for i in range(25):
        left.append(i)
    height = fr_this_time
    tick_label = ['0','','','','','20','','','','','40','','','','','60','','','','', '80','','','','']
    plt.bar(left, height, tick_label = tick_label, width = 1, color = 'black')
    plt.ylim((0,100))
    plt.xlabel("Global Cluster Size Percentiles")
    plt.ylabel("Percent of Protein Clusters in Percentile")
    plt.title("Global Cluster Size Distribution")
    plt.show()
    
def get_global_actual_bucket_5(results):
    percentile = []
    for i in range(100):
        percentile.append([])
    for result in results:
        for p in range(len(result.global_percentile_actual)):
            percentile[p].append(result.global_percentile_actual[p])
    for i in range(len(percentile)):
        totalfori = 0
        for j in range(len(percentile[i])):
            totalfori += percentile[i][j]
        totalfori//= len(percentile[i])
        percentile[i] = totalfori
    fr_this_time = []
    for i in range(20):
        fr_this_time.append(0)
    for i in range(20):
        totalfori = 0
        for j in range(5):
            totalfori+=percentile[5*i+j]
        totalfori/=5
        fr_this_time[i] = totalfori
    left = []
    for i in range(20):
        left.append(i)
    height = fr_this_time
    tick_label = ['0','5','10','15','20','25','30','35','40','45','50','55','60','65','70','75','80','85','90','95']
    plt.bar(left, height, tick_label = tick_label, width = 1, color = 'black')
    plt.ylim((0,100))
    plt.xlabel("Global Cluster Size Percentiles")
    plt.ylabel("Percent of Protein Clusters in Percentile")
    plt.title("Global Cluster Size Distribution")
    plt.show()

In [9]:
# histograms of true positive predictions of protein clusters in global cluster size distribution
def get_global_bucket_1(results):
    percentile = []
    for i in range(100):
        percentile.append([])
    for result in results:
        for p in range(len(result.global_percentile)):
            percentile[p].append(result.global_percentile[p])
    for i in range(len(percentile)):
        totalfori = 0
        for j in range(len(percentile[i])):
            totalfori += percentile[i][j]
        totalfori//=len(percentile[i])
        percentile[i] = totalfori
    left = []
    for i in range(100):
        left.append(i)
    height = percentile
    tick_label = ['0','5','10','15','20','25','30','35','40','45','50','55','60','65','70','75','80','85','90','95']
    plt.bar(left, height, width = 1, color = results[0].global_color)
    plt.ylim((0,100))
    plt.xlabel("Specificity of Function Assignment")
    plt.ylabel("Frequency of Correct Assignment")
    plt.title(results[0].global_title)
    plt.show()
    
def get_global_bucket_2(results):
    percentile = []
    for i in range(100):
        percentile.append([])
    for result in results:
        for p in range(len(result.global_percentile)):
            percentile[p].append(result.global_percentile[p])
    for i in range(len(percentile)):
        totalfori = 0
        for j in range(len(percentile[i])):
            totalfori += percentile[i][j]
        totalfori//=len(percentile[i])
        percentile[i] = totalfori
    fr_this_time = []
    for i in range(50):
        fr_this_time.append(0)
    for i in range(50):
        totalfori = 0
        for j in range(2):
            totalfori+=percentile[2*i+j]
        totalfori/=2
        fr_this_time[i] = totalfori
    left = []
    for i in range(50):
        left.append(i)
    height = fr_this_time
    tick_label = ['0','','','','','10','','','','','20','','','','','30','','','','', '40','','','','',
                 '50','','','','','60','','','','','70','','','','','80','','','','', '90','','','','']
    plt.bar(left, height, tick_label = tick_label, width = 1, color = results[0].global_color)
    plt.ylim((0,100))
    plt.xlabel("Specificity of Function Assignment")
    plt.ylabel("Frequency of Correct Assignment")
    plt.title(results[0].global_title)
    plt.show()

def get_global_bucket_4(results):
    percentile = []
    for i in range(100):
        percentile.append([])
    for result in results:
        for p in range(len(result.global_percentile)):
            percentile[p].append(result.global_percentile[p])
    for i in range(len(percentile)):
        totalfori = 0
        for j in range(len(percentile[i])):
            totalfori += percentile[i][j]
        totalfori//=len(percentile[i])
        percentile[i] = totalfori
    fr_this_time = []
    for i in range(25):
        fr_this_time.append(0)
    for i in range(25):
        totalfori = 0
        for j in range(4):
            totalfori+=percentile[4*i+j]
        totalfori/=4
        fr_this_time[i] = totalfori
    left = []
    for i in range(25):
        left.append(i)
    height = fr_this_time
    tick_label = ['0','','','','','20','','','','','40','','','','','60','','','','', '80','','','','']
    plt.bar(left, height, tick_label = tick_label, width = 1, color = results[0].global_color)
    plt.ylim((0,100))
    plt.xlabel("Specificity of Function Assignment")
    plt.ylabel("Frequency of Correct Assignment")
    plt.title(results[0].global_title)
    plt.show() 

def get_global_bucket_5(results):
    percentile = []
    for i in range(100):
        percentile.append([])
    for result in results:
        for p in range(len(result.global_percentile)):
            percentile[p].append(result.global_percentile[p])
    for i in range(len(percentile)):
        totalfori = 0
        for j in range(len(percentile[i])):
            totalfori += percentile[i][j]
        totalfori//=len(percentile[i])
        percentile[i] = totalfori
    fr_this_time = []
    for i in range(20):
        fr_this_time.append(0)
    for i in range(20):
        totalfori = 0
        for j in range(5):
            totalfori+=percentile[5*i+j]
        totalfori/=5
        fr_this_time[i] = totalfori
    left = []
    for i in range(20):
        left.append(i)
    height = fr_this_time
    tick_label = ['0','5','10','15','20','25','30','35','40','45','50','55','60','65','70','75','80','85','90','95']
    plt.bar(left, height, tick_label = tick_label, width = 1, color = results[0].global_color)
    plt.ylim((0,100))
    plt.xlabel("Specificity of Function Assignment")
    plt.ylabel("Frequency of Correct Assignment")
    plt.title(results[0].global_title)
    plt.show()   

In [10]:
# histograms of true positive predictions of protein clusters in local cluster size distribution
def get_local_bucket_1(results):
    percentile = []
    for i in range(100):
        percentile.append([])
    for i in range(100):
        for result in results:
            percentile[i].append(result.local_percentile[i]*100)
    for i in range(len(percentile)):
        totalfori = 0
        for j in range(len(percentile[i])):
            totalfori += percentile[i][j]
        totalfori//=len(percentile[i])
        percentile[i] = totalfori
    left = []
    for i in range(100):
        left.append(i)
    height = percentile
    plt.bar(left, height, width = 1, color = results[0].local_color)
    plt.ylim((0,100))
    plt.xlabel("Specificity of Function Assignment")
    plt.ylabel("Frequency of Correct Assignment")
    plt.title(results[0].local_title)
    plt.show()

def get_local_bucket_2(results):
    percentile = []
    for i in range(100):
        percentile.append([])
    for i in range(100):
        for result in results:
            percentile[i].append(result.local_percentile[i]*100)
    for i in range(len(percentile)):
        totalfori = 0
        for j in range(len(percentile[i])):
            totalfori += percentile[i][j]
        totalfori//=len(percentile[i])
        percentile[i] = totalfori
    fr_this_time = []
    for i in range(50):
        fr_this_time.append(0)
    for i in range(50):
        totalfori = 0
        for j in range(2):
            totalfori+=percentile[2*i+j]
        totalfori/=2
        fr_this_time[i] = totalfori
    left = []
    for i in range(50):
        left.append(i)
    height = fr_this_time
    tick_label = ['0','','','','','10','','','','','20','','','','','30','','','','', '40','','','','',
                 '50','','','','','60','','','','','70','','','','','80','','','','', '90','','','','']
    plt.bar(left, height, tick_label = tick_label, width = 1, color = results[0].local_color)
    plt.ylim((0,100))
    plt.xlabel("Specificity of Function Assignment")
    plt.ylabel("Frequency of Correct Assignment")
    plt.title(results[0].local_title)
    plt.show()

def get_local_bucket_4(results):
    percentile = []
    for i in range(100):
        percentile.append([])
    for i in range(100):
        for result in results:
            percentile[i].append(result.local_percentile[i]*100)
    for i in range(len(percentile)):
        totalfori = 0
        for j in range(len(percentile[i])):
            totalfori += percentile[i][j]
        totalfori//=len(percentile[i])
        percentile[i] = totalfori
    fr_this_time = []
    for i in range(25):
        fr_this_time.append(0)
    for i in range(25):
        totalfori = 0
        for j in range(4):
            totalfori+=percentile[4*i+j]
        totalfori/=4
        fr_this_time[i] = totalfori
    left = []
    for i in range(25):
        left.append(i)
    height = fr_this_time
    tick_label = ['0','','','','','20','','','','','40','','','','','60','','','','', '80','','','','']
    plt.bar(left, height, tick_label = tick_label, width = 1, color = results[0].local_color)
    plt.ylim((0,100))
    plt.xlabel("Specificity of Function Assignment")
    plt.ylabel("Frequency of Correct Assignment")
    plt.title(results[0].local_title)
    plt.show()

def get_local_bucket_5(results):
    percentile = []
    for i in range(100):
        percentile.append([])
    for i in range(100):
        for result in results:
            percentile[i].append(result.local_percentile[i]*100)
    for i in range(len(percentile)):
        totalfori = 0
        for j in range(len(percentile[i])):
            totalfori += percentile[i][j]
        totalfori//=len(percentile[i])
        percentile[i] = totalfori
    fr_this_time = []
    for i in range(20):
        fr_this_time.append(0)
    for i in range(20):
        totalfori = 0
        for j in range(5):
            totalfori+=percentile[5*i+j]
        totalfori/=5
        fr_this_time[i] = totalfori
    left = []
    for i in range(20):
        left.append(i)
    height = fr_this_time
    tick_label = ['0','5','10','15','20','25','30','35','40','45','50','55','60','65','70','75','80','85','90','95']
    plt.bar(left, height, tick_label = tick_label, width = 1, color = results[0].local_color)
    plt.ylim((0,100))
    plt.xlabel("Specificity of Function Assignment")
    plt.ylabel("Frequency of Correct Assignment")
    plt.title(results[0].local_title)
    plt.show()

In [11]:
# majority results held here
majority_list_radius_1 = []
majority_list_radius_2 = []
majority_list_radius_3 = []
majority_list_radius_4 = []
majority_list_radius_5 = []

# flow results held here
flow_list_radius_1 = []
flow_list_radius_2 = []
flow_list_radius_3 = []
flow_list_radius_4 = []
flow_list_radius_5 = []

# align results held here
align_list_radius_1 = []
align_list_radius_2 = []
align_list_radius_3 = []
align_list_radius_4 = []
align_list_radius_5 = []

In [None]:
print("RADIUS 1")
for i in range(50):
    print("ITER {} / {}".format(i+1, 50))
    
    try:
        results = RawResults("ecoli", 5, 1)
        majority_results = MajorityApproachResults(results)
        majority_list_radius_1.append(majority_results)
        flow_results = FunctionalFlowResults(results)
        flow_list_radius_1.append(flow_results)
        align_results = AlignmentApproachResults(results)
        align_list_radius_1.append(align_results)
    except:
        pass

RADIUS 1
ITER 1 / 50
['362663.ECP_3575', '362663.ECP_0089', '362663.ECP_2491', '362663.ECP_0159', '362663.ECP_0672']
ITER 2 / 50
ITER 3 / 50
ITER 4 / 50
['362663.ECP_2056', '362663.ECP_3657', '362663.ECP_0723', '362663.ECP_3106', '362663.ECP_0766']
ITER 5 / 50
['362663.ECP_0751', '362663.ECP_1101', '362663.ECP_1258', '362663.ECP_2153', '362663.ECP_4179']
starting majority
---- MAJORITY APPROACH ----
WORKING NBORHOOD SIZE: 12
WORKING NBORHOOD SIZE: 15
WORKING NBORHOOD SIZE: 15
WORKING NBORHOOD SIZE: 13
WORKING NBORHOOD SIZE: 1
did majority
starting ff
---- FLOW APPROACH ----
WORKING NBORHOOD SIZE: 56
did majority
ITER 6 / 50
['362663.ECP_1396', '362663.ECP_4552', '362663.ECP_3109', '362663.ECP_0183', '362663.ECP_1400']
ITER 7 / 50
ITER 8 / 50


In [None]:
print("RADIUS 2")
for i in range(50):
    print("ITER {} / {}".format(i+1, 50))
    
    try:
        results = RawResults("ecoli", 1, 2)
        majority_results = MajorityApproachResults(results)
        majority_list_radius_2.append(majority_results)
        flow_results = FunctionalFlowResults(results)
        flow_list_radius_2.append(flow_results)
#         align_results = AlignmentApproachResults(results)
#         align_list_radius_2.append(align_results)
    except:
        continue

RADIUS 2
ITER 1 / 50
['362663.ECP_3981']
starting majority
---- MAJORITY APPROACH ----
WORKING NBORHOOD SIZE: 114
did majority
starting ff
---- FLOW APPROACH ----
WORKING NBORHOOD SIZE: 114
ITER 2 / 50
ITER 3 / 50
['362663.ECP_2341']
ITER 4 / 50
ITER 5 / 50
ITER 6 / 50
ITER 7 / 50
ITER 8 / 50
ITER 9 / 50
ITER 10 / 50
ITER 11 / 50
ITER 12 / 50
ITER 13 / 50
ITER 14 / 50
ITER 15 / 50
ITER 16 / 50
ITER 17 / 50
ITER 18 / 50
ITER 19 / 50
ITER 20 / 50
ITER 21 / 50
ITER 22 / 50
ITER 23 / 50
ITER 24 / 50
['362663.ECP_1754']


In [12]:
print("RADIUS 3")
for i in range(50):
    print("ITER {} / {}".format(i+1, 50))
    
    try:
        results = RawResults("ecoli", 1, 3)
        majority_results = MajorityApproachResults(results)
        majority_list_radius_3.append(majority_results)
        flow_results = FunctionalFlowResults(results)
        flow_list_radius_3.append(flow_results)
#         align_results = AlignmentApproachResults(results)
#         align_list_radius_3.append(align_results)
    except:
        continue

RADIUS 3
ITER 1 / 50
['362663.ECP_4454']
---- MAJORITY APPROACH ----
WORKING NBORHOOD SIZE: 1706
---- FLOW APPROACH ----
WORKING NBORHOOD SIZE: 1706
ITER 2 / 50
['362663.ECP_0268']
---- MAJORITY APPROACH ----
WORKING NBORHOOD SIZE: 168
---- FLOW APPROACH ----
WORKING NBORHOOD SIZE: 168
ITER 3 / 50
['362663.ECP_4001']
---- MAJORITY APPROACH ----
WORKING NBORHOOD SIZE: 80
---- FLOW APPROACH ----
WORKING NBORHOOD SIZE: 80
ITER 4 / 50
['362663.ECP_2125']
---- MAJORITY APPROACH ----
WORKING NBORHOOD SIZE: 27
---- FLOW APPROACH ----
WORKING NBORHOOD SIZE: 27
ITER 5 / 50
['362663.ECP_3012']
---- MAJORITY APPROACH ----
WORKING NBORHOOD SIZE: 59
---- FLOW APPROACH ----
WORKING NBORHOOD SIZE: 59
ITER 6 / 50
['362663.ECP_2584']
---- MAJORITY APPROACH ----
WORKING NBORHOOD SIZE: 503
---- FLOW APPROACH ----
WORKING NBORHOOD SIZE: 503
ITER 7 / 50
['362663.ECP_2545']
---- MAJORITY APPROACH ----
WORKING NBORHOOD SIZE: 664
---- FLOW APPROACH ----
WORKING NBORHOOD SIZE: 664
ITER 8 / 50
['362663.ECP_4197

In [None]:
print("RADIUS 4")
for i in range(50):
    print("ITER {} / {}".format(i+1, 50))
    
    try:
        results = RawResults("ecoli", 1, 4)
        majority_results = MajorityApproachResults(results)
        majority_list_radius_4.append(majority_results)
        flow_results = FunctionalFlowResults(results)
        flow_list_radius_4.append(flow_results)
#         align_results = AlignmentApproachResults(results)
#         align_list_radius_4.append(align_results)
    except:
        continue

In [None]:
print("RADIUS 5")
for i in range(50):
    print("ITER {} / {}".format(i+1, 50))
    
    try:
        results = RawResults("ecoli", 1, 5)
        majority_results = MajorityApproachResults(results)
        majority_list_radius_5.append(majority_results)
        flow_results = FunctionalFlowResults(results)
        flow_list_radius_5.append(flow_results)
#         align_results = AlignmentApproachResults(results)
#         align_list_radius_5.append(align_results)
    except:
        continue

In [None]:
# global stats 
get_global_actual_bucket_1(majority_list_radius_1)
get_global_actual_bucket_2(majority_list_radius_1)
get_global_actual_bucket_4(majority_list_radius_1)
get_global_actual_bucket_5(majority_list_radius_1)

# majority radius 1
get_global_bucket_1(majority_list_radius_1)
get_global_bucket_2(majority_list_radius_1)
get_global_bucket_4(majority_list_radius_1)
get_global_bucket_5(majority_list_radius_1)

get_local_bucket_1(majority_list_radius_1)
get_local_bucket_2(majority_list_radius_1)
get_local_bucket_4(majority_list_radius_1)
get_local_bucket_5(majority_list_radius_1)

# flow radius 1 
get_global_bucket_1(flow_list_radius_1)
get_global_bucket_2(flow_list_radius_1)
get_global_bucket_4(flow_list_radius_1)
get_global_bucket_5(flow_list_radius_1)

get_local_bucket_1(flow_list_radius_1)
get_local_bucket_2(flow_list_radius_1)
get_local_bucket_4(flow_list_radius_1)
get_local_bucket_5(flow_list_radius_1)

# align radius 1
get_global_bucket_1(align_list_radius_1)
get_global_bucket_2(align_list_radius_1)
get_global_bucket_4(align_list_radius_1)
get_global_bucket_5(align_list_radius_1)

get_local_bucket_1(align_list_radius_1)
get_local_bucket_2(align_list_radius_1)
get_local_bucket_4(align_list_radius_1)
get_local_bucket_5(align_list_radius_1)

In [None]:
# global stats 
get_global_actual_bucket_1(majority_list_radius_2)
get_global_actual_bucket_2(majority_list_radius_2)
get_global_actual_bucket_4(majority_list_radius_2)
get_global_actual_bucket_5(majority_list_radius_2)

# majority radius 2
get_global_bucket_1(majority_list_radius_2)
get_global_bucket_2(majority_list_radius_2)
get_global_bucket_4(majority_list_radius_2)
get_global_bucket_5(majority_list_radius_2)

get_local_bucket_1(majority_list_radius_2)
get_local_bucket_2(majority_list_radius_2)
get_local_bucket_4(majority_list_radius_2)
get_local_bucket_5(majority_list_radius_2)

# flow radius 2 
get_global_bucket_1(flow_list_radius_2)
get_global_bucket_2(flow_list_radius_2)
get_global_bucket_4(flow_list_radius_2)
get_global_bucket_5(flow_list_radius_2)

get_local_bucket_1(flow_list_radius_2)
get_local_bucket_2(flow_list_radius_2)
get_local_bucket_4(flow_list_radius_2)
get_local_bucket_5(flow_list_radius_2)

# align radius 2
get_global_bucket_1(align_list_radius_2)
get_global_bucket_2(aign_list_radius_2)
get_global_bucket_4(align_list_radius_2)
get_global_bucket_5(align_list_radius_2)

get_local_bucket_1(align_list_radius_2)
get_local_bucket_2(align_list_radius_2)
get_local_bucket_4(align_list_radius_2)
get_local_bucket_5(align_list_radius_2)

In [None]:
# global stats 
get_global_actual_bucket_1(majority_list_radius_3)
get_global_actual_bucket_2(majority_list_radius_3)
get_global_actual_bucket_4(majority_list_radius_3)
get_global_actual_bucket_5(majority_list_radius_3)

# majority radius 3
get_global_bucket_1(majority_list_radius_3)
get_global_bucket_2(majority_list_radius_3)
get_global_bucket_4(majority_list_radius_3)
get_global_bucket_5(majority_list_radius_3)

get_local_bucket_1(majority_list_radius_3)
get_local_bucket_2(majority_list_radius_3)
get_local_bucket_4(majority_list_radius_3)
get_local_bucket_5(majority_list_radius_3)

# flow radius 3
get_global_bucket_1(flow_list_radius_3)
get_global_bucket_2(flow_list_radius_3)
get_global_bucket_4(flow_list_radius_3)
get_global_bucket_5(flow_list_radius_3)

get_local_bucket_1(flow_list_radius_3)
get_local_bucket_2(flow_list_radius_3)
get_local_bucket_4(flow_list_radius_3)
get_local_bucket_5(flow_list_radius_3)

# align radius 3
get_global_bucket_1(align_list_radius_3)
get_global_bucket_2(align_list_radius_3)
get_global_bucket_4(align_list_radius_3)
get_global_bucket_5(align_list_radius_3)

get_local_bucket_1(align_list_radius_3)
get_local_bucket_2(align_list_radius_3)
get_local_bucket_4(align_list_radius_3)
get_local_bucket_5(align_list_radius_3)

In [None]:
# global stats 
get_global_actual_bucket_1(majority_list_radius_4)
get_global_actual_bucket_2(majority_list_radius_4)
get_global_actual_bucket_4(majority_list_radius_4)
get_global_actual_bucket_5(majority_list_radius_4)

# majority radius 4
get_global_bucket_1(majority_list_radius_4)
get_global_bucket_2(majority_list_radius_4)
get_global_bucket_4(majority_list_radius_4)
get_global_bucket_5(majority_list_radius_4)

get_local_bucket_1(majority_list_radius_4)
get_local_bucket_2(majority_list_radius_4)
get_local_bucket_4(majority_list_radius_4)
get_local_bucket_5(majority_list_radius_4)

# flow radius 4
get_global_bucket_1(flow_list_radius_4)
get_global_bucket_2(flow_list_radius_4)
get_global_bucket_4(flow_list_radius_4)
get_global_bucket_5(flow_list_radius_4)

get_local_bucket_1(flow_list_radius_4)
get_local_bucket_2(flow_list_radius_4)
get_local_bucket_4(flow_list_radius_4)
get_local_bucket_5(flow_list_radius_4)

# align radius 4
get_global_bucket_1(align_list_radius_4)
get_global_bucket_2(align_list_radius_4)
get_global_bucket_4(align_list_radius_4)
get_global_bucket_5(align_list_radius_4)

get_local_bucket_1(align_list_radius_4)
get_local_bucket_2(align_list_radius_4)
get_local_bucket_4(align_list_radius_4)
get_local_bucket_5(align_list_radius_4)

In [None]:
# global stats 
get_global_actual_bucket_1(majority_list_radius_5)
get_global_actual_bucket_2(majority_list_radius_5)
get_global_actual_bucket_4(majority_list_radius_5)
get_global_actual_bucket_5(majority_list_radius_5)

# majority radius 5
get_global_bucket_1(majority_list_radius_5)
get_global_bucket_2(majority_list_radius_5)
get_global_bucket_4(majority_list_radius_5)
get_global_bucket_5(majority_list_radius_5)

get_local_bucket_1(majority_list_radius_5)
get_local_bucket_2(majority_list_radius_5)
get_local_bucket_4(majority_list_radius_5)
get_local_bucket_5(majority_list_radius_5)

# flow radius 5
get_global_bucket_1(flow_list_radius_5)
get_global_bucket_2(flow_list_radius_5)
get_global_bucket_4(flow_list_radius_5)
get_global_bucket_5(flow_list_radius_5)

get_local_bucket_1(flow_list_radius_5)
get_local_bucket_2(flow_list_radius_5)
get_local_bucket_4(flow_list_radius_5)
get_local_bucket_5(flow_list_radius_5)

# align radius 5
get_global_bucket_1(align_list_radius_5)
get_global_bucket_2(align_list_radius_5)
get_global_bucket_4(align_list_radius_5)
get_global_bucket_5(align_list_radius_5)

get_local_bucket_1(align_list_radius_5)
get_local_bucket_2(align_list_radius_5)
get_local_bucket_4(align_list_radius_5)
get_local_bucket_5(align_list_radius_5)

In [None]:
# not used currently
def score(p, true_pos, false_pos, false_neg, tp_reward, fp_penalty, fn_penalty, annotations):
    all_scores = []
    for i in range(len(p)):
        score_sum = 0
        score_sum += true_pos[i][1]*tp_reward
        score_sum += false_pos[i][1]*fp_penalty
        score_sum += false_neg[i][1]*fn_penalty
        score_percent = score_sum/len(annotations[p[i]])
        all_scores.append(score_percent)
    return all_scores

In [None]:
# not currently used
def get_accuracy(results):
    temp_dict = dict()
    temp_set = set()
    x1 = []
    y1 = []
    for result in results:
        temp_set.add(result.r)
    for result in results:
        temp = 0
        for p in range(len(result.p)):
            temp += ((result.true_positive_majority[p][1])/len(result.annotation_dict[result.p[p]]))
        temp_dict[result.r] = (temp/len(result.p))
    for x in temp_dict:
        x1.append(x)
        y1.append(temp_dict[x])
    plt.plot(x1, y1, label = "Majority Approach")

    temp_dict = dict()
    temp_set = set()
    x2 = []
    y2 = []
    for result in results:
        temp_set.add(result.r)
    for result in results:
        temp = 0
        for p in range(len(result.p)):
            temp += ((result.true_positive_flow[p][1])/len(result.annotation_dict[result.p[p]]))
        temp_dict[result.r] = (temp/len(result.p))
    for x in temp_dict:
        x2.append(x)
        y2.append(temp_dict[x])
    print(x2, y2)
    plt.plot(x2, y2, label = "Functional Flow")

    temp_dict = dict()
    temp_set = set()
    x3 = []
    y3 = []
    for result in results:
        temp_set.add(result.r)
    for result in results:
        temp = 0
        for p in range(len(result.p)):
            temp += ((result.true_positive_align[p][1])/len(result.annotation_dict[result.p[p]]))
        temp_dict[result.r] = (temp/len(result.p))
    for x in temp_dict:
        x3.append(x)
        y2.append(temp_dict[x])
    plt.plot(x3, y3, label = "Alignment Approach")

    plt.xlabel("radius r or timestep t")
    plt.ylabel('accuracy')
    plt.title('Accuracy Comparison')
    plt.legend()
    plt.show()

In [None]:
# used for presentation
# plots proximity v intersection of function
left = [1,2,3,4]
height = [83, 64, 55, 47]
tick_label = ['r=1', 'r=2', 'r=3', 'r=4']
plt.bar(left, height, tick_label = tick_label,
        width = 1, color = ['orange', 'gold'])
plt.xlabel('proximity of neighbors')
plt.ylabel('intersection of function')
plt.title('')
#plt.show()