In [None]:
# import statements
import json
import pprint
import matplotlib.pyplot as plt
import math
import statistics
import os
import numpy as np
import copy
from ipynb.fs.full.data_explore import DataExplorer
from ipynb.fs.full.function_prediction import FunctionPrediction
from collections import Counter
import random

In [None]:
# RawResults class holds all info 
class RawResults():
    """Run every prediction method on one random sample of proteins.

    Construction loads a ``DataExplorer`` for the organism, samples ``n`` test
    proteins from its adjacency list, and stores the raw predictions of each
    ``FunctionPrediction`` method so they can be scored later.

    Attributes set here:
        organism_name, n, r: the constructor arguments.
        annotation_dict: ``data_explorer.annotation_list``.
        names_set / names_list: ``data_explorer.filtered_names_set`` / ``_list``.
        global_clusters_sorted: ``data_explorer.clusters_sorted``.
        p: the sampled test proteins.
        majority_approach_results, functional_flow_results: raw predictions.
        alignment_approach_results: raw predictions, or ``None`` when
            ``do_alignment`` is false.
    """

    def __init__(self, organism_name, n, r, do_alignment=True):
        """Sample ``n`` proteins of ``organism_name`` and predict with radius/timestep ``r``.

        Args:
            organism_name: folder name of the organism, passed to ``DataExplorer``
                and ``FunctionPrediction``.
            n: number of test proteins to sample.
            r: radius (majority rule, alignment) or timestep count (functional flow).
            do_alignment: when false, the alignment approach is skipped.
        """
        # folder name
        self.organism_name = organism_name
        # length of list of proteins
        self.n = n
        # radius or timestep
        self.r = r

        # adjustable variables for raw scoring that we currently do not use
        self.true_positive_reward = 1
        self.false_positive_penalty = -0.25
        self.false_negative_penalty = -0.25

        # DataExplorer
        self.data_explorer = DataExplorer(self.organism_name)
        # used to get the actual list of annotations for a protein: self.annotation_dict[self.p[p]]
        self.annotation_dict = self.data_explorer.annotation_list
        # used in get_test_proteins() to gather self.p
        self.names_set = self.data_explorer.filtered_names_set
        self.names_list = self.data_explorer.filtered_names_list
        # used in get_global/local_percentile_ to get list of clusters sorted by size
        self.global_clusters_sorted = self.data_explorer.clusters_sorted

        # gathering list of test proteins
        self.p = self.get_test_proteins(n)
        print(self.p)
        # FunctionPrediction
        function_predictor = FunctionPrediction(self.organism_name)
        # raw results
        self.majority_approach_results = function_predictor.majority_rule(self.p, r=self.r)
        self.functional_flow_results = function_predictor.functional_flow(self.p, t=self.r)

        # Always define the attribute so consumers can test for None instead of
        # tripping over a missing attribute when alignment was skipped.
        self.alignment_approach_results = None
        if do_alignment:
            self.alignment_approach_results = function_predictor.alignment_approach(self.p, r=self.r)

    def get_test_proteins(self, n):
        """Return a random list of ``n`` distinct proteins from the adjacency list.

        Raises:
            ValueError: if ``n`` exceeds the number of proteins (from ``random.sample``).
        """
        # random.sample() needs a sequence; a dict_keys view is rejected with
        # TypeError on Python 3.11+, so materialise it first.
        candidates = list(self.data_explorer.adj_list.keys())
        return random.sample(candidates, n)

In [None]:
# compares results from FunctionPrediction to actual
def compare(p, annotation, results):
    """Score predicted clusters against the annotated clusters of each protein.

    For every protein only the first ``k`` predictions are scored, where ``k`` is
    the number of annotated clusters (all predictions are scored when there are
    fewer than ``k``). Each scored prediction is a true positive if it matches a
    not-yet-matched annotation, otherwise a false positive; annotations left
    unmatched are false negatives.

    Neither ``annotation`` nor ``results`` is modified.

    Args:
        p: list of protein identifiers.
        annotation: mapping protein -> collection of actual cluster ids.
        results: mapping protein -> sequence of predicted cluster ids.

    Returns:
        ``(true_positive, false_positive, false_negative)``: three lists aligned
        with ``p``; each entry is ``[set_of_cluster_ids, count]``.
    """
    true_positive = []
    false_positive = []
    false_negative = []
    for protein in p:
        hits = [set(), 0]
        spurious = [set(), 0]
        missed = [set(), 0]

        # Work on a private copy so matched annotations can be crossed off.
        unmatched = list(annotation[protein])
        # Slicing also copies, so the caller's prediction list is left intact.
        scored_predictions = results[protein][:len(unmatched)]

        for cluster in scored_predictions:
            if cluster in unmatched:
                hits[0].add(cluster)
                hits[1] += 1
                unmatched.remove(cluster)
            else:
                spurious[0].add(cluster)
                spurious[1] += 1

        # Whatever was never predicted is a false negative.
        for cluster in unmatched:
            missed[0].add(cluster)
            missed[1] += 1

        true_positive.append(hits)
        false_positive.append(spurious)
        false_negative.append(missed)
    return true_positive, false_positive, false_negative

In [None]:
# finds where actual protein clusters fall in the global cluster size distribution
def get_global_percentile_actual(p, true_positive, annotation_dict, global_clusters_sorted):
    """Distribution of the actual annotations over the global cluster percentiles.

    Each annotated cluster of each protein in ``p`` is placed in the percentile
    bin ``floor(position / len(global_clusters_sorted) * 100)`` of its position
    in ``global_clusters_sorted``.

    Args:
        p: list of protein identifiers.
        true_positive: unused; kept so the signature matches the sibling
            percentile functions.
        annotation_dict: mapping protein -> collection of cluster ids.
        global_clusters_sorted: list of all cluster ids in sorted order.

    Returns:
        A list of 100 values: the floored percentage of all annotations of the
        sampled proteins that fall in each bin. All zeros when there are no
        annotations.

    Raises:
        ValueError: if an annotated cluster is missing from ``global_clusters_sorted``.
    """
    n_clusters = len(global_clusters_sorted)
    bin_counts = [0] * 100
    total_annotations = 0
    for protein in p:
        for cluster in annotation_dict[protein]:
            position = global_clusters_sorted.index(cluster)
            bin_counts[int((position / n_clusters * 100) // 1)] += 1
            # Accumulate over every protein; the previous code kept only the
            # last protein's annotation count.
            total_annotations += 1

    if total_annotations == 0:
        return [0] * 100
    return [(count / total_annotations * 100) // 1 for count in bin_counts]

# finds where true positive predictions of protein clusters fall in the global cluster size distribution
def get_global_percentile(p, true_positive, annotation_dict, global_clusters_sorted):
    """Percentage of annotations correctly predicted, per global cluster percentile.

    Every cluster is assigned to the percentile bin
    ``floor(position / len(global_clusters_sorted) * 100)`` of its position in
    ``global_clusters_sorted``. For each bin the result is the number of true
    positives in the bin divided by the number of actual annotations in the bin,
    as a floored percentage. The totals are taken over all proteins in ``p``, so
    the result does not depend on the order of ``p``.

    Args:
        p: list of protein identifiers.
        true_positive: list aligned with ``p``; entry ``i`` is
            ``[set_of_cluster_ids, count]`` as produced by ``compare``.
        annotation_dict: mapping protein -> collection of cluster ids.
        global_clusters_sorted: list of all cluster ids in sorted order.

    Returns:
        A list of 100 values; ``0`` for bins that contain no annotation.

    Raises:
        ValueError: if a cluster is missing from ``global_clusters_sorted``.
    """
    n_clusters = len(global_clusters_sorted)

    def bin_of(cluster):
        # Same binning expression as the other percentile helpers.
        return int((global_clusters_sorted.index(cluster) / n_clusters * 100) // 1)

    actual_counts = [0] * 100
    hit_counts = [0] * 100
    for i, protein in enumerate(p):
        for cluster in annotation_dict[protein]:
            actual_counts[bin_of(cluster)] += 1
        # Count each protein's true positives exactly once. The previous code
        # kept appending to one list and averaged the running totals, which made
        # the result depend on protein order.
        for cluster in true_positive[i][0]:
            hit_counts[bin_of(cluster)] += 1

    listof100 = []
    for hits, actual in zip(hit_counts, actual_counts):
        if actual == 0:
            listof100.append(0)
        else:
            listof100.append((hits / actual * 100) // 1)
    return listof100
        
# finds where actual protein clusters fall in the local cluster size distribution
def get_local_percentile(p, true_positive, annotation_dict, global_clusters_sorted):
    """Count true positives per local cluster percentile.

    For each protein its annotated clusters are ordered as they appear in
    ``global_clusters_sorted``. A true positive at index ``k`` of ``m`` ordered
    clusters is counted in bin ``floor(k / m * 100)``.

    The return value is a list of raw counts, not percentages. The previous code
    ended with a normalisation loop written with ``==`` instead of ``=``, so it
    never changed anything. It has been removed rather than enabled because
    ``get_local`` scales these raw counts itself.
    NOTE(review): confirm that raw counts are the intended contract.

    Args:
        p: list of protein identifiers.
        true_positive: list aligned with ``p``; entry ``i`` is
            ``[set_of_cluster_ids, count]`` as produced by ``compare``.
        annotation_dict: mapping protein -> collection of (hashable) cluster ids.
        global_clusters_sorted: list of all cluster ids in sorted order.

    Returns:
        A list of 100 integer counts.

    Raises:
        ValueError: if a true positive is not among the protein's ordered clusters.
    """
    percentile = [0] * 100
    for i, protein in enumerate(p):
        # Set membership avoids rescanning the annotation list for every global cluster.
        annotated = set(annotation_dict[protein])
        ranked = [cluster for cluster in global_clusters_sorted if cluster in annotated]
        for tp in true_positive[i][0]:
            percentile[int((ranked.index(tp) / len(ranked) * 100) // 1)] += 1
    return percentile

In [None]:
class MajorityApproachResults():
    """Scored majority-approach predictions of one ``RawResults`` run, plus plot styling."""

    def __init__(self, raw_results):
        """Score ``raw_results.majority_approach_results`` and derive the percentile lists."""
        # titles and colours used when plotting
        self.global_title, self.global_color = 'Global Specificity with Majority Approach', '#ff7979'
        self.local_title, self.local_color = 'Local Specificity with Majority Approach', '#eb4d4b'

        # inputs taken over from raw_results
        self.p = raw_results.p
        self.annotation_dict = raw_results.annotation_dict
        self.global_clusters_sorted = raw_results.global_clusters_sorted
        self.raw_results = raw_results.majority_approach_results

        # one [{cluster_ids}, count] entry per protein in self.p
        scored = compare(self.p, self.annotation_dict, self.raw_results)
        self.true_positive, self.false_positive, self.false_negative = scored

        # every percentile helper takes the same four arguments
        percentile_args = (self.p, self.true_positive, self.annotation_dict,
                           self.global_clusters_sorted)
        self.global_percentile_actual = get_global_percentile_actual(*percentile_args)
        self.global_percentile = get_global_percentile(*percentile_args)
        self.local_percentile = get_local_percentile(*percentile_args)

In [None]:
class FunctionalFlowResults():
    """Scored functional-flow predictions of one ``RawResults`` run, plus plot styling."""

    def __init__(self, raw_results):
        """Score ``raw_results.functional_flow_results`` and derive the percentile lists."""
        # titles and colours used when plotting
        self.global_title, self.global_color = 'Global Specificity with Functional Flow', '#7ed6df'
        self.local_title, self.local_color = 'Local Specificity with Functional Flow', '#22a6b3'

        # inputs taken over from raw_results
        self.p = raw_results.p
        self.annotation_dict = raw_results.annotation_dict
        self.global_clusters_sorted = raw_results.global_clusters_sorted
        self.raw_results = raw_results.functional_flow_results

        # one [{cluster_ids}, count] entry per protein in self.p
        scored = compare(self.p, self.annotation_dict, self.raw_results)
        self.true_positive, self.false_positive, self.false_negative = scored

        # every percentile helper takes the same four arguments
        percentile_args = (self.p, self.true_positive, self.annotation_dict,
                           self.global_clusters_sorted)
        self.global_percentile_actual = get_global_percentile_actual(*percentile_args)
        self.global_percentile = get_global_percentile(*percentile_args)
        self.local_percentile = get_local_percentile(*percentile_args)

In [None]:
class AlignmentApproachResults():
    """Scored alignment-approach predictions of one ``RawResults`` run, plus plot styling."""

    def __init__(self, raw_results):
        """Score ``raw_results.alignment_approach_results`` and derive the percentile lists.

        Raises:
            ValueError: if ``raw_results`` carries no alignment predictions
                (``RawResults`` only produces them when ``do_alignment`` is true).
        """
        self.global_title = 'Global Specificity with Alignment Approach'
        self.global_color = '#badc58'
        self.local_title = 'Local Specificity with Alignment Approach'
        self.local_color = '#6ab04c'

        # info gathered from raw_results
        self.p = raw_results.p
        self.annotation_dict = raw_results.annotation_dict
        self.global_clusters_sorted = raw_results.global_clusters_sorted
        # The attribute is optional on RawResults; fail with a clear message
        # instead of an AttributeError deep inside the scoring code.
        self.raw_results = getattr(raw_results, "alignment_approach_results", None)
        if self.raw_results is None:
            raise ValueError(
                "raw_results has no alignment predictions; "
                "construct RawResults with do_alignment=True")

        # 2D lists of length self.n corresponding to the proteins in self.p: [[{cluster_ids},count]]
        self.true_positive, self.false_positive, self.false_negative = (
            compare(self.p, self.annotation_dict, self.raw_results))

        self.global_percentile_actual = get_global_percentile_actual(self.p, self.true_positive, self.annotation_dict,
                                                                     self.global_clusters_sorted)
        self.global_percentile = get_global_percentile(self.p, self.true_positive, self.annotation_dict,
                                                       self.global_clusters_sorted)
        self.local_percentile = get_local_percentile(self.p, self.true_positive, self.annotation_dict,
                                                     self.global_clusters_sorted)

In [None]:
# histograms of true positive predictions of protein clusters in global cluster size distribution
def get_global(results, r, a):
    """Plot and save the global-specificity histogram for one method at radius ``r``.

    The ``global_percentile`` lists of all ``results`` are floor-averaged per
    percentile, adjacent percentiles are averaged pairwise into 50 bars, and the
    bar chart is written to ``results/<a>/global-2-<r>-<a>.png`` and shown.

    Args:
        results: non-empty list of result objects exposing ``global_percentile``
            (100 values) and ``global_title``.
        r: radius, used only in the output file name.
        a: method key, one of ``"maj"``, ``"ff"``, ``"sa"``.

    Raises:
        ValueError: if ``a`` is not a known method key or ``results`` is empty.
    """
    palette = {
        "maj": ("#ff5e57", "#ff3f34"),
        "ff": ("#0be881", "#05c46b"),
        "sa": ("#4bcffa", "#0fbcf9"),
    }
    # Validate up front; previously an unknown key left `color` unbound and an
    # empty list surfaced as a ZeroDivisionError further down.
    if a not in palette:
        raise ValueError("unknown method {!r}; expected one of {}".format(a, sorted(palette)))
    if not results:
        raise ValueError("no results to plot for method {!r} at r={}".format(a, r))
    color, edgecolor = palette[a]

    # collect the value of every trial per percentile
    percentile = [[] for _ in range(100)]
    for result in results:
        for idx in range(len(result.global_percentile)):
            percentile[idx].append(result.global_percentile[idx])
    # floor-average across trials
    averaged = [sum(values) // len(values) for values in percentile]
    # merge adjacent percentiles so that each bar covers two of them
    height = [(averaged[2 * i] + averaged[2 * i + 1]) / 2 for i in range(50)]

    left = list(range(50))
    # label every fifth bar with the percentile it starts at (0, 10, ..., 90)
    tick_label = [str(2 * i) if i % 5 == 0 else '' for i in range(50)]
    plt.bar(left, height, tick_label = tick_label, width = 1, color = color, edgecolor=edgecolor, alpha=0.8)
    plt.ylim((0,100))
    plt.xlabel("Global Cluster Size Percentiles")
    plt.ylabel("% Correctly Predicted")
    plt.title(results[0].global_title)

    bucket = "global-2"
    out_path = "results/{}/{}-{}-{}.png".format(a, bucket, str(r), a)
    # savefig does not create missing directories
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    plt.savefig(out_path)
    plt.show()

In [None]:
# histograms of true positive predictions of protein clusters in local cluster size distribution
def get_local(results, r, a):
    """Plot and save the local-specificity histogram for one method at radius ``r``.

    The ``local_percentile`` lists of all ``results`` are multiplied by 100 and
    floor-averaged per percentile, adjacent percentiles are averaged pairwise
    into 50 bars, and the bar chart is written to
    ``results/<a>/local-2-<r>-<a>.png`` and shown.

    Args:
        results: non-empty list of result objects exposing ``local_percentile``
            (100 values) and ``local_title``.
        r: radius, used only in the output file name.
        a: method key, one of ``"maj"``, ``"ff"``, ``"sa"``.

    Raises:
        ValueError: if ``a`` is not a known method key or ``results`` is empty.
    """
    palette = {
        "maj": ("#ff5e57", "#ff3f34"),
        "ff": ("#0be881", "#05c46b"),
        "sa": ("#4bcffa", "#0fbcf9"),
    }
    # Validate up front; previously an unknown key left `color` unbound and an
    # empty list surfaced as a ZeroDivisionError further down.
    if a not in palette:
        raise ValueError("unknown method {!r}; expected one of {}".format(a, sorted(palette)))
    if not results:
        raise ValueError("no results to plot for method {!r} at r={}".format(a, r))
    color, edgecolor = palette[a]

    # collect the scaled value of every trial per percentile
    percentile = [[result.local_percentile[i]*100 for result in results] for i in range(100)]
    # floor-average across trials
    averaged = [sum(values) // len(values) for values in percentile]
    # merge adjacent percentiles so that each bar covers two of them
    height = [(averaged[2 * i] + averaged[2 * i + 1]) / 2 for i in range(50)]

    left = list(range(50))
    # label every fifth bar with the percentile it starts at (0, 10, ..., 90)
    tick_label = [str(2 * i) if i % 5 == 0 else '' for i in range(50)]
    plt.bar(left, height, tick_label = tick_label, width = 1, color = color, edgecolor=edgecolor, alpha=0.8)
    plt.ylim((0,100))
    plt.xlabel("Local Cluster Size Percentiles")
    plt.ylabel("% Correctly Predicted")
    plt.title(results[0].local_title)

    bucket = "local-2"
    out_path = "results/{}/{}-{}-{}.png".format(a, bucket, str(r), a)
    # savefig does not create missing directories
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    plt.savefig(out_path)
    plt.show()

In [None]:
# results held here
# one list of per-trial result objects for each tested radius (1 through 5), per method
majority_list = dict((radius, []) for radius in range(1, 6))
flow_list = dict((radius, []) for radius in range(1, 6))
align_list = dict((radius, []) for radius in range(1, 6))

In [None]:
# run a trial for all methods at radius r
def run_trial(r, n_iter=50):
    """Run ``n_iter`` trials of all three methods at radius ``r``.

    Each iteration samples 5 E. coli proteins via ``RawResults`` and scores the
    majority, functional-flow and alignment predictions. The three result
    objects are appended to ``majority_list[r]``, ``flow_list[r]`` and
    ``align_list[r]`` only when all of them were built, so the lists stay in
    step. A failing iteration is reported and skipped.

    Args:
        r: radius / timestep passed to ``RawResults``.
        n_iter: number of iterations to attempt.
    """
    for i in range(n_iter):
        print("ITER {} / {}".format(i+1, n_iter))

        try:
            results = RawResults("ecoli", 5, r, do_alignment=True)

            # perform majority neighbor
            majority_results = MajorityApproachResults(results)
            # perform functional flow
            flow_results = FunctionalFlowResults(results)
            # perform sequence alignment
            align_results = AlignmentApproachResults(results)
        except Exception as exc:
            # `Exception` rather than a bare except, so Ctrl-C still stops the run;
            # report the cause instead of hiding it.
            print("ERROR! CONTINUING FROM ITER {}...".format(i+1))
            print("  cause: {}: {}".format(type(exc).__name__, exc))
            continue

        # Record only complete iterations.
        majority_list.setdefault(r, []).append(majority_results)
        flow_list.setdefault(r, []).append(flow_results)
        align_list.setdefault(r, []).append(align_results)

In [None]:
# gather global and local stats for all methods at radius r
def get_radii_stats(r):
    """Plot the global and local specificity of all three methods at radius ``r``.

    Reads the trials recorded for ``r`` in the module-level ``majority_list``,
    ``flow_list`` and ``align_list`` dictionaries.
    """
    # Look the trials up by radius; the previous code referenced
    # `*_list_radius_1` variables that are not defined anywhere.
    get_global(majority_list[r], r=r, a="maj")
    get_global(flow_list[r], r=r, a="ff")
    get_global(align_list[r], r=r, a="sa")

    get_local(majority_list[r], r=r, a="maj")
    get_local(flow_list[r], r=r, a="ff")
    get_local(align_list[r], r=r, a="sa")

In [None]:
# run trials for all tested radii
# radii 1 through 5, in ascending order
for radius in range(1, 6):
    run_trial(radius)

In [None]:
# gather global and local statistics for all tested radii
# radii 1 through 5, in ascending order
for radius in range(1, 6):
    get_radii_stats(radius)

### Testing Implementations
The code and classes above test each of the three methods implemented in the paper for radii 1 through 5. To run a trial of all three methods, call the `run_trial(r)` function, where `r` is the radius. To see the plots of global and local specificity once the trial has run, call the `get_radii_stats(r)` function.

In [None]:
class AccuracyResults():
    """Accuracy metrics (Jaccard, precision, recall, F1) for the recorded trials.

    The metric methods take plain label collections. ``compute_scores`` and
    ``visualize_accuracies`` read the trials recorded in the module-level
    ``majority_list``, ``flow_list`` and ``align_list`` dictionaries.
    """

    def __init__(self, organism_name):
        """Load the ``DataExplorer`` of ``organism_name``."""
        self.organism_name = organism_name
        self.data_explorer = DataExplorer(organism_name)

    def _label_mass(self, labels, weights=None):
        """Size of a label set: its length, or the sum of ``1 / weights[label]`` when weighted."""
        if weights:
            return sum(1 / weights[label] for label in labels)
        return len(labels)

    def jaccard(self, pred_labels, true_labels, weights=None):
        """Weighted or unweighted Jaccard index of predicted and true labels.

        Smaller weights indicate higher importance (each label counts ``1 / weight``).

        Raises:
            ZeroDivisionError: if both label collections are empty.
            KeyError: if a label has no entry in ``weights``.
        """
        pred_labels = set(pred_labels)
        true_labels = set(true_labels)

        intersection_size = self._label_mass(pred_labels & true_labels, weights)
        union_size = self._label_mass(pred_labels | true_labels, weights)

        return intersection_size / union_size

    def precision(self, pred_labels, true_labels, weights=None):
        """Weighted or unweighted precision of the predicted labels.

        Smaller weights indicate higher importance (each label counts ``1 / weight``).

        Raises:
            ZeroDivisionError: if there are no predicted labels.
            KeyError: if a label has no entry in ``weights``.
        """
        pred_labels = set(pred_labels)
        true_labels = set(true_labels)

        intersection_size = self._label_mass(pred_labels & true_labels, weights)
        pred_size = self._label_mass(pred_labels, weights)

        return intersection_size / pred_size

    def recall(self, pred_labels, true_labels, weights=None):
        """Weighted or unweighted recall of the predicted labels.

        Smaller weights indicate higher importance (each label counts ``1 / weight``).

        Raises:
            ZeroDivisionError: if there are no true labels.
            KeyError: if a label has no entry in ``weights``.
        """
        pred_labels = set(pred_labels)
        true_labels = set(true_labels)

        intersection_size = self._label_mass(pred_labels & true_labels, weights)
        false_negative_size = self._label_mass(true_labels - pred_labels, weights)

        return intersection_size / (intersection_size + false_negative_size)

    def f1_score(self, pred_labels, true_labels, weights=None):
        """Harmonic mean of precision and recall; ``0.0`` when both are zero.

        Raises:
            ZeroDivisionError: if precision or recall is undefined (no predicted
                or no true labels).
        """
        # These are methods; the previous bare `precision(...)` / `recall(...)`
        # calls raised NameError.
        precision_score = self.precision(pred_labels, true_labels, weights)
        recall_score = self.recall(pred_labels, true_labels, weights)

        if precision_score + recall_score == 0:
            # No correct prediction at all: F1 is 0 by convention.
            return 0.0
        return (2 * precision_score * recall_score) / (precision_score + recall_score)

    def compute_scores(self, method, weights=None):
        """Average every metric over the recorded trials of one method, per radius.

        Args:
            method: ``"maj"``, ``"ff"`` or ``"sa"``.
            weights: optional mapping label -> weight for the weighted variants.
                Defaults to ``self.data_explorer.cluster_sizes``.
                NOTE(review): the default assumes the weights are cluster sizes
                (smaller cluster = more important); confirm.

        Returns:
            ``{radius: {metric: {"weighted": avg, "nonweighted": avg}}}`` for the
            radii 1-5 (plus any other recorded radius). A metric that is
            undefined for a sample (division by zero, or a label without a
            weight) skips that sample for that metric only; entries with no
            usable sample stay ``0``.

        Raises:
            ValueError: if ``method`` is not a known method key.
        """
        trials_by_method = {"maj": majority_list, "ff": flow_list, "sa": align_list}
        if method not in trials_by_method:
            raise ValueError("unknown method {!r}; expected one of {}".format(
                method, sorted(trials_by_method)))
        trials_by_radius = trials_by_method[method]

        if weights is None:
            weights = self.data_explorer.cluster_sizes

        metrics = {
            "jaccard": self.jaccard,
            "precision": self.precision,
            "recall": self.recall,
            "f1": self.f1_score,
        }

        def blank():
            return {name: {"weighted": 0, "nonweighted": 0} for name in metrics}

        scores = {radius: blank() for radius in range(1, 6)}
        for radius in trials_by_radius:
            scores.setdefault(radius, blank())

        for radius, trials in trials_by_radius.items():
            # One sample counter per metric and weighting, so an undefined
            # metric no longer distorts the average of the others.
            counts = blank()
            for trial in trials:
                for i in range(len(trial.p)):
                    tp = trial.true_positive[i][0]
                    fp = trial.false_positive[i][0]

                    pred_labels = list(tp) + list(fp)
                    true_labels = self.data_explorer.annotation_list[trial.p[i]]

                    for name, metric in metrics.items():
                        for weighting, metric_weights in (("weighted", weights),
                                                          ("nonweighted", None)):
                            try:
                                value = metric(pred_labels, true_labels, metric_weights)
                            except (ZeroDivisionError, KeyError):
                                # metric undefined for this sample
                                continue
                            scores[radius][name][weighting] += value
                            counts[name][weighting] += 1

            for name in metrics:
                for weighting in ("weighted", "nonweighted"):
                    if counts[name][weighting]:
                        scores[radius][name][weighting] /= counts[name][weighting]

        print("--- {} SCORES (ACCURACY) ---".format(method))
        pprint.pprint(scores)

        return scores

    def visualize_accuracies(self, w, method, scores_by_approach=None):
        """Bar chart of one metric for the three approaches over radii 1-5.

        Args:
            w: ``"weighted"`` or ``"nonweighted"``.
            method: metric key to plot: ``"jaccard"``, ``"precision"``,
                ``"recall"`` or ``"f1"``.
            scores_by_approach: optional mapping ``{"maj": ..., "ff": ..., "sa": ...}``
                of ``compute_scores`` results. Computed here when omitted.

        The figure is saved to ``results/<method>-scores-<w>.png`` and shown.
        """
        if scores_by_approach is None:
            scores_by_approach = {
                approach: self.compute_scores(approach) for approach in ("maj", "ff", "sa")
            }

        radii = [1, 2, 3, 4, 5]

        def heights(approach):
            # one bar per radius, so the heights always match the x positions
            return [round(scores_by_approach[approach][radius][method][w], 3)
                    for radius in radii]

        fig, ax = plt.subplots()

        width=0.3
        x = np.arange(len(radii))

        b1 = ax.bar(x - width, heights("maj"), width, label="Majority", color="#ff5e57",edgecolor="#ff3f34")
        b2 = ax.bar(x, heights("sa"), width, label="Alignment", color="#4bcffa",edgecolor="#0fbcf9")
        b3 = ax.bar(x + width, heights("ff"), width, label="Flow", color="#0be881",edgecolor="#05c46b")

        ax.bar_label(b1, padding=3)
        ax.bar_label(b2, padding=3)
        ax.bar_label(b3, padding=3)
        ax.set_xticks(x, ["r={}".format(radius) for radius in radii])
        ax.set_ylim([0, 1.1])

        # The title now reflects what is plotted (it was hard-coded to weighted F1).
        ax.set_title("Average {} {} Scores ({})".format(
            w.capitalize(), method.capitalize(), self.organism_name))
        ax.set_ylabel("{} Score ({})".format(method, w))
        ax.set_xlabel("Tested Radii")

        fig.tight_layout()
        plt.legend(loc='upper right')

        # savefig does not create missing directories
        os.makedirs("results", exist_ok=True)
        plt.savefig("results/{}-scores-{}.png".format(method, w))
        plt.show()

### Testing Implementations (pt. 2)
Once you have gathered data about each implementation for a number of radii, you can visualize and analyze the accuracy of each model through the Jaccard index, precision, recall, and F1 score. To compute the accuracy scores, call the `compute_scores(m)` method of `AccuracyResults`, where `m` is one of `{"maj", "ff", "sa"}`, corresponding to the majority, functional flow, and sequence alignment methods. To plot the scores, call `visualize_accuracies(w, metric)`, where `w` is one of `{"weighted", "nonweighted"}` and `metric` is one of `{"jaccard", "precision", "recall", "f1"}`.

In [None]:
def cluster_plot(explorer=None):
    """Plot the cluster sizes in descending order and save the figure to ``cluster-dist.png``.

    Also prints the median cluster size.

    Args:
        explorer: object exposing ``cluster_sizes`` (mapping cluster -> size).
            When omitted, a ``DataExplorer("ecoli")`` is created, matching the
            hard-coded E. coli title.
    """
    if explorer is None:
        # No longer relies on a notebook-global `data_explorer` that no cell defines.
        explorer = DataExplorer("ecoli")

    clusters = explorer.cluster_sizes
    sizes = sorted(clusters.values(), reverse=True)

    print(statistics.median(sizes))

    plt.plot(sizes, color = "#0984e3")
    plt.fill_between(range(len(sizes)), 0, sizes, color="#74b9ff", alpha=0.2)

    plt.title("Cluster Size Distribution (E. coli)")
    plt.xlabel("Cluster ID")
    plt.ylabel("Cluster size")
    plt.savefig("cluster-dist.png")

# The call belongs at top level; indented into the body it recursed without end.
cluster_plot()

In [None]:
def cluster_protein(explorer=None):
    """Histogram of the sizes of the clusters annotated to every protein.

    For each protein in the adjacency list, the size of each of its annotated
    clusters is collected (a cluster is counted once per protein it annotates).
    The 30-bin histogram is saved to ``protein-dist.png``.

    Args:
        explorer: object exposing ``adj_list``, ``annotation_list`` and
            ``cluster_sizes``. When omitted, a ``DataExplorer("ecoli")`` is
            created, matching the hard-coded E. coli title.
    """
    if explorer is None:
        # No longer relies on a notebook-global `data_explorer` that no cell defines.
        explorer = DataExplorer("ecoli")

    # every protein of the network is used, not a random sample
    sizes = [
        explorer.cluster_sizes[c]
        for protein in explorer.adj_list.keys()
        for c in explorer.annotation_list[protein]
    ]

    plt.title("Protein Cluster Distribution (E. coli)")
    plt.xlabel("Cluster size")
    plt.ylabel("Frequency")

    plt.hist(sizes, bins=30, color="#74b9ff", edgecolor='#0984e3', alpha=0.8)
    plt.savefig("protein-dist.png")

cluster_protein()

### Network Statistics (p. 2)
The code above visualizes cluster sizes, showing the cluster size hierarchy and the class imbalance present in many STRING organisms.