In [None]:
# import statements
import json
import pprint
import matplotlib.pyplot as plt
import math
import statistics
import os
import numpy as np
import copy
from ipynb.fs.full.data_explore import DataExplorer
from ipynb.fs.full.function_prediction import FunctionPrediction
from collections import Counter
import random

In [None]:
# results class holds all info 
class Results():
    """Run and score both function-prediction approaches for one organism.

    Construction does all the work: it loads the organism's data through
    DataExplorer, draws a random sample of proteins, runs the majority-rule and
    functional-flow predictors from FunctionPrediction on that sample, and
    stores the comparison counts, scores and percentile lists as attributes.

    Args:
        organism_name: passed unchanged to DataExplorer and FunctionPrediction.
        n: number of proteins to sample.
        t: forwarded to FunctionPrediction.functional_flow as ``t``.
        r: forwarded to FunctionPrediction.majority_rule as ``r``.
    """

    def __init__(self, organism_name, n, t, r):
        # inputs
        self.organism_name = organism_name
        self.n = n
        self.t = t
        self.r = r

        # adjustables !
        self.correct_reward = 1
        self.false_positive_penalty = -0.25
        self.false_negative_penalty = -0.25

        # gathered from DataExplorer
        data_explorer = DataExplorer(self.organism_name)
        self.annotation_dict = data_explorer.annotation_list
        self.names_set = data_explorer.filtered_names_set
        self.names_list = data_explorer.filtered_names_list
        self.global_clusters_sorted = data_explorer.clusters_sorted

        # using defined method
        self.p = self.test_proteins(n)

        # gathered from FunctionPrediction
        function_predictor = FunctionPrediction(self.organism_name)
        self.majority_approach_results = function_predictor.majority_rule(self.p, r=self.r)
        self.functional_flow_results = function_predictor.functional_flow(self.p, t=self.t)

        # scoring majority approach
        self.correct_majority, self.false_positive_majority, self.false_negative_majority = (
            compare_majority(self.p, self.annotation_dict, self.majority_approach_results))
        self.scores_majority = score(self.p, self.correct_majority,
                                     self.false_positive_majority, self.false_negative_majority,
                                     self.correct_reward, self.false_positive_penalty,
                                     self.false_negative_penalty, self.annotation_dict)
        self.global_percentile_majority = self.get_global_percentile_majority()
        self.local_percentile_majority = self.get_local_percentile_majority()

        # scoring functional flow
        self.correct_flow, self.false_positive_flow, self.false_negative_flow = (
            compare_flow(self.p, self.annotation_dict, self.functional_flow_results))
        self.scores_flow = score(self.p, self.correct_flow,
                                 self.false_positive_flow, self.false_negative_flow,
                                 self.correct_reward, self.false_positive_penalty,
                                 self.false_negative_penalty, self.annotation_dict)
        self.global_percentile_flow = self.get_global_percentile_flow()
        self.local_percentile_flow = self.get_local_percentile_flow()

    def test_proteins(self, n):
        """Return ``n`` distinct, annotated protein names chosen at random.

        Candidates are the entries of ``self.names_list`` (the filtered names)
        that have at least one annotation in ``self.annotation_dict``.

        Args:
            n: how many names to draw; values <= 0 give an empty list.

        Returns:
            A list of ``n`` distinct protein names.

        Raises:
            ValueError: if fewer than ``n`` annotated candidates exist.
        """
        if n <= 0:
            return []
        # dict.fromkeys drops repeated names while keeping list order, so the
        # sample below can never contain the same protein twice.
        candidates = list(dict.fromkeys(
            name for name in self.names_list
            if len(self.annotation_dict.get(name, ())) > 0))
        if n > len(candidates):
            raise ValueError(
                "requested %d proteins but only %d annotated candidates are available"
                % (n, len(candidates)))
        # random.sample draws without replacement. The previous implementation
        # used random.randint(0, len(...)), whose upper bound is inclusive and
        # could index one past the end of names_list, and it only rejected
        # unannotated proteins on a re-draw after a duplicate.
        return random.sample(candidates, n)

    def _global_percentiles(self, correct):
        """Position of every correct cluster in the global ranking, as a percentage.

        ``correct`` is one of the per-protein ``[set_of_clusters, count]`` lists
        produced by compare_majority / compare_flow. Raises ValueError if a
        cluster is missing from ``self.global_clusters_sorted``.
        """
        ranking = self.global_clusters_sorted
        return [ranking.index(cluster) / len(ranking) * 100
                for idx in range(len(self.p))
                for cluster in correct[idx][0]]

    def _local_percentiles(self, correct):
        """Like _global_percentiles, but ranked within each protein's own clusters.

        For every sampled protein the global ranking is restricted to the
        clusters listed for that protein in ``self.annotation_dict``; each
        correct cluster's index in that restricted list is reported as a
        percentage of the restricted list's length.
        """
        percentiles = []
        for idx, protein in enumerate(self.p):
            annotated = self.annotation_dict[protein]
            local_ranking = [cluster for cluster in self.global_clusters_sorted
                             if cluster in annotated]
            for cluster in correct[idx][0]:
                percentiles.append(
                    local_ranking.index(cluster) / len(local_ranking) * 100)
        return percentiles

    def get_global_percentile_majority(self):
        """Global-ranking percentiles of the majority approach's correct clusters."""
        return self._global_percentiles(self.correct_majority)

    def get_local_percentile_majority(self):
        """Per-protein-ranking percentiles of the majority approach's correct clusters."""
        return self._local_percentiles(self.correct_majority)

    def get_global_percentile_flow(self):
        """Global-ranking percentiles of functional flow's correct clusters."""
        return self._global_percentiles(self.correct_flow)

    def get_local_percentile_flow(self):
        """Per-protein-ranking percentiles of functional flow's correct clusters."""
        return self._local_percentiles(self.correct_flow)
    
                
        
# shared comparison used by compare_majority and compare_flow
def _compare_predictions(p, annotation, predictions):
    """Split each protein's predictions into correct / false positive / false negative.

    For a protein with k annotated clusters, only the first k predicted
    clusters are examined. When fewer than k predictions exist, all of them
    are examined and the unmatched annotations count as false negatives
    (the earlier code raised IndexError in that case).

    Args:
        p: list of protein names to evaluate.
        annotation: mapping protein name -> collection of actual clusters.
            It is copied per protein and never modified.
        predictions: mapping protein name -> indexable sequence of predicted
            clusters.

    Returns:
        Three lists aligned with ``p``; every element is ``[set, count]``
        holding the clusters in that category and how many were counted.

    Raises:
        KeyError: if a protein is missing from ``annotation`` or ``predictions``.
    """
    correct = []
    false_positive = []
    false_negative = []
    for protein in p:
        hits = [set(), 0]
        extras = [set(), 0]
        missed = [set(), 0]
        # Work on a copy: matched clusters are removed as they are found so
        # whatever is left afterwards is exactly the set of misses.
        remaining = copy.deepcopy(annotation[protein])
        predicted = predictions[protein]
        # Fix the cut-off before the loop; `remaining` shrinks while matching.
        top_k = min(len(remaining), len(predicted))
        for j in range(top_k):
            cluster = predicted[j]
            if cluster in remaining:
                hits[0].add(cluster)
                hits[1] += 1
                remaining.remove(cluster)
            else:
                extras[0].add(cluster)
                extras[1] += 1
        for cluster in remaining:
            missed[0].add(cluster)
            missed[1] += 1
        correct.append(hits)
        false_positive.append(extras)
        false_negative.append(missed)
    return correct, false_positive, false_negative


# compares results from majority approach to actual
def compare_majority(p, annotation, majority_results):
    """Compare majority-approach predictions with the actual annotations.

    Returns ``(correct, false_positive, false_negative)``: per-protein
    ``[set, count]`` lists for clusters found in both, only in the
    prediction, and only in the actual annotation respectively.
    """
    return _compare_predictions(p, annotation, majority_results)


# compares results from functional flow to actual
def compare_flow(p, annotation, flow_results):
    """Compare functional-flow predictions with the actual annotations.

    Returns ``(correct, false_positive, false_negative)``: per-protein
    ``[set, count]`` lists for clusters found in both, only in the
    prediction, and only in the actual annotation respectively.
    """
    return _compare_predictions(p, annotation, flow_results)

# per-protein weighted score; called by Results.__init__ for both approaches
def score(p, correct, false_pos, false_neg, c_reward, fp_penalty, fn_penalty, annotations):
    """Turn the comparison counts into one normalised score per protein.

    For each protein the correct, false-positive and false-negative counts
    (element ``[1]`` of the per-protein entries) are weighted by ``c_reward``,
    ``fp_penalty`` and ``fn_penalty``, summed, and divided by the number of
    annotations that protein has.

    Args:
        p: list of protein names.
        correct, false_pos, false_neg: lists aligned with ``p`` whose items
            are ``[set, count]`` pairs.
        c_reward, fp_penalty, fn_penalty: numeric weights for each count.
        annotations: mapping protein name -> collection of actual clusters.

    Returns:
        A list of scores aligned with ``p``. A protein with no annotations
        scores 0.0 instead of raising ZeroDivisionError.
    """
    all_scores = []
    for i, protein in enumerate(p):
        weighted = (correct[i][1] * c_reward
                    + false_pos[i][1] * fp_penalty
                    + false_neg[i][1] * fn_penalty)
        annotation_count = len(annotations[protein])
        if annotation_count == 0:
            # Nothing to normalise by; avoid dividing by zero.
            all_scores.append(0.0)
        else:
            all_scores.append(weighted / annotation_count)
    return all_scores
            

In [None]:
#creating graphs
def _plot_percentile_histogram(percentiles, color, title):
    """Draw and show a 20-bin histogram of percentile values over 0-100.

    Args:
        percentiles: flat list of percentile values to bin.
        color: bar colour passed to matplotlib.
        title: figure title.
    """
    # Keyword arguments replace the old positional (bins, range) pair, which
    # required a local variable that shadowed the builtin ``range``.
    plt.hist(percentiles, bins=20, range=(0, 100), color=color,
             histtype='bar', rwidth=0.8)
    plt.xlabel("Specificity of Function Assignment")
    plt.ylabel("Frequency of Correct Assignment")
    plt.title(title)
    plt.show()


def get_global_majority(results):
    """Histogram of ``global_percentile_majority`` pooled over all ``results``."""
    pooled = [value for result in results
              for value in result.global_percentile_majority]
    _plot_percentile_histogram(pooled, 'red',
                               "Global Specificity with Majority Approach")


def get_local_majority(results):
    """Histogram of ``local_percentile_majority`` pooled over all ``results``."""
    pooled = [value for result in results
              for value in result.local_percentile_majority]
    _plot_percentile_histogram(pooled, 'orange',
                               "Local Specificity with Majority Approach")


def get_global_flow(results):
    """Histogram of ``global_percentile_flow`` pooled over all ``results``."""
    pooled = [value for result in results
              for value in result.global_percentile_flow]
    _plot_percentile_histogram(pooled, 'blue',
                               "Global Specificity with Functional Flow")


def get_local_flow(results):
    """Histogram of ``local_percentile_flow`` pooled over all ``results``."""
    pooled = [value for result in results
              for value in result.local_percentile_flow]
    _plot_percentile_histogram(pooled, 'green',
                               "Local Specificity with Functional Flow")
    
def _mean_accuracy(result, correct):
    """Average, over the sampled proteins, of correct count / annotation count.

    ``correct`` is ``result.correct_majority`` or ``result.correct_flow``.
    Raises ZeroDivisionError if ``result.p`` is empty or a sampled protein
    has no annotations.
    """
    total = 0
    for idx, protein in enumerate(result.p):
        total += correct[idx][1] / len(result.annotation_dict[protein])
    return total / len(result.p)


def get_accuracy(results):
    """Plot mean accuracy of both approaches against their parameter.

    The majority approach is plotted against each result's ``r`` and
    functional flow against each result's ``t``. When several results share
    the same parameter value the last one in ``results`` is the one plotted.
    Points are drawn in increasing parameter order so the line does not
    double back when ``results`` is not already sorted.

    Args:
        results: iterable of Results objects.
    """
    majority_by_r = {}
    flow_by_t = {}
    for result in results:
        majority_by_r[result.r] = _mean_accuracy(result, result.correct_majority)
        flow_by_t[result.t] = _mean_accuracy(result, result.correct_flow)

    x1 = sorted(majority_by_r)
    y1 = [majority_by_r[x] for x in x1]
    plt.plot(x1, y1, label = "Majority Approach")

    x2 = sorted(flow_by_t)
    y2 = [flow_by_t[x] for x in x2]
    plt.plot(x2, y2, label = "Functional Flow")

    plt.xlabel("radius r or timestep t")
    plt.ylabel('accuracy')
    plt.title('Accuracy Comparison')
    plt.legend()
    plt.show()

In [None]:
# Collects one Results object per run; the cells below only fill it for the graphs.
results_list = list()

In [None]:
# Run with t = r = 1. A failed run is reported and skipped (best effort) so
# the remaining cells can still execute.
try:
    results = Results("ecoli", 10, 1, 1)
    results_list.append(results)
except Exception as exc:  # a bare `except:` would also swallow KeyboardInterrupt
    print("1")
    print("  run skipped:", repr(exc))

In [None]:
# Run with t = r = 2. A failed run is reported and skipped (best effort) so
# the remaining cells can still execute.
try:
    results = Results("ecoli", 10, 2, 2)
    results_list.append(results)
except Exception as exc:  # a bare `except:` would also swallow KeyboardInterrupt
    print("2")
    print("  run skipped:", repr(exc))

In [None]:
# Run with t = r = 3. A failed run is reported and skipped (best effort) so
# the remaining cells can still execute.
try:
    results = Results("ecoli", 10, 3, 3)
    results_list.append(results)
except Exception as exc:  # a bare `except:` would also swallow KeyboardInterrupt
    print("3")
    print("  run skipped:", repr(exc))

In [None]:
# Run with t = r = 4. A failed run is reported and skipped (best effort) so
# the remaining cells can still execute.
try:
    results = Results("ecoli", 10, 4, 4)
    results_list.append(results)
except Exception as exc:  # a bare `except:` would also swallow KeyboardInterrupt
    print("4")
    print("  run skipped:", repr(exc))

In [None]:
# Run with t = r = 5. A failed run is reported and skipped (best effort) so
# the remaining cells can still execute.
try:
    results = Results("ecoli", 10, 5, 5)
    results_list.append(results)
except Exception as exc:  # a bare `except:` would also swallow KeyboardInterrupt
    print("5")
    print("  run skipped:", repr(exc))

In [None]:
# Run with t = r = 6. A failed run is reported and skipped (best effort) so
# the remaining cells can still execute.
try:
    results = Results("ecoli", 10, 6, 6)
    results_list.append(results)
except Exception as exc:  # a bare `except:` would also swallow KeyboardInterrupt
    print("6")
    print("  run skipped:", repr(exc))

In [None]:
# Run with t = r = 7. A failed run is reported and skipped (best effort) so
# the remaining cells can still execute.
try:
    results = Results("ecoli", 10, 7, 7)
    results_list.append(results)
except Exception as exc:  # a bare `except:` would also swallow KeyboardInterrupt
    print("7")
    print("  run skipped:", repr(exc))

In [None]:
# Run with t = r = 8. A failed run is reported and skipped (best effort) so
# the remaining cells can still execute.
try:
    results = Results("ecoli", 10, 8, 8)
    results_list.append(results)
except Exception as exc:  # a bare `except:` would also swallow KeyboardInterrupt
    print("8")
    print("  run skipped:", repr(exc))

In [None]:
# Run with t = r = 9. A failed run is reported and skipped (best effort) so
# the remaining cells can still execute.
try:
    results = Results("ecoli", 10, 9, 9)
    results_list.append(results)
except Exception as exc:  # a bare `except:` would also swallow KeyboardInterrupt
    # The handler used to append `results` here. On failure that name still
    # refers to the previous cell's object (or is undefined on a fresh kernel),
    # so a stale run was silently recorded as the t = r = 9 result.
    print("9")
    print("  run skipped:", repr(exc))

In [None]:
# Run with t = r = 10. A failed run is reported and skipped (best effort) so
# the remaining cells can still execute.
try:
    results = Results("ecoli", 10, 10, 10)
    results_list.append(results)
except Exception as exc:  # a bare `except:` would also swallow KeyboardInterrupt
    print("10")
    print("  run skipped:", repr(exc))

In [None]:
# Majority approach: global then local specificity histograms over the first
# four collected runs.
get_global_majority([results_list[i] for i in range(4)])
get_local_majority([results_list[i] for i in range(4)])

In [None]:
# Functional flow: global then local specificity histograms over the first
# four collected runs, matching the majority-approach cell.
# NOTE(review): this cell previously read resultsmod1..resultsmod4, which are
# not defined anywhere in the visible notebook (NameError on a fresh kernel).
# Confirm that results_list is the intended source.
get_global_flow([results_list[0], results_list[1], results_list[2], results_list[3]])
get_local_flow([results_list[0], results_list[1], results_list[2], results_list[3]])

In [None]:
# Local specificity for both approaches, over the first four collected runs.
get_local_majority([results_list[i] for i in range(4)])
get_local_flow([results_list[i] for i in range(4)])

In [None]:
# Accuracy comparison of both approaches over the first four collected runs.
get_accuracy([results_list[i] for i in range(4)])

In [None]:
# plots proximity v intersection of function
# x-coordinates of the bars (plt.bar centres each bar on its x value by default,
# so these are bar centres, not left edges)
left = [1,2,3,4]

# heights of bars
# NOTE(review): these values are hard-coded; presumably copied from an earlier
# run. Confirm their source before relying on the figure.
height = [83, 64, 55, 47]

# labels for bars
tick_label = ['r=1', 'r=2', 'r=3', 'r=4']

# plotting a bar chart
plt.bar(left, height, tick_label = tick_label,
        width = 1, color = ['orange', 'gold'])

# naming the x-axis
plt.xlabel('proximity of neighbors')
# naming the y-axis
plt.ylabel('intersection of function')
# plot title (was an empty string, which left the figure untitled)
plt.title('Intersection of Function by Neighbor Proximity')

# function to show the plot
plt.show()