In [None]:
import sys
import itertools
import pandas as pd
import numpy as np
import gseapy as gp
from matplotlib import pyplot as plt
from functools import reduce  # intersect more than two arrays

#from scipy.interpolate import interp1d, splev, splrep  # to connect plot smoothly
#from decimal import *    # for displaying decimal values
#getcontext().prec = 8    # number of decimal digits
% matplotlib inline

# --- Configuration -----------------------------------------------------------

# Root directory of the GSEA results; the loader looks for
# <root>/<permutation number>/<iteration>/gseapy.gsea.gene_set.report.csv.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
general_output_dir = '/Users/Miko/Desktop/CCBB/Network/output/rep_perm'

# A pathway counts as significant when its FDR is at or below this value.
fdr_cutoff = 0.3

# Permutation numbers that were run.
perm_num_all = [5, 10, 50, 100, 500, 600]

# Iteration identifiers (1 through 10) available for each permutation number.
i_all = list(range(1, 11))

In [None]:
def sig_pathways(perm_num, i_all, base_dir=None, cutoff=None):
    """Collect the significant pathways reported by each iteration.

    For every iteration ``i`` in ``i_all`` this reads
    ``<base_dir>/<perm_num>/<i>/gseapy.gsea.gene_set.report.csv`` and keeps
    the ``Term`` values of the rows whose ``fdr`` is at or below ``cutoff``.

    Parameters
    ----------
    perm_num : int
        Permutation number; used as the first sub-directory name.
    i_all : iterable
        Iteration identifiers; each one is used as the second sub-directory
        name.
    base_dir : str, optional
        Root directory of the results. Defaults to the module-level
        ``general_output_dir``.
    cutoff : float, optional
        Inclusive FDR threshold. Defaults to the module-level ``fdr_cutoff``.

    Returns
    -------
    list of list
        One list of significant ``Term`` values per iteration, in the order
        of ``i_all``.
    """
    # Fall back to the notebook-level configuration only when the caller did
    # not supply explicit values, so existing two-argument calls are unchanged.
    if base_dir is None:
        base_dir = general_output_dir
    if cutoff is None:
        cutoff = fdr_cutoff

    sig_path_list = []
    for i in i_all:
        # locate the report written for this permutation number / iteration
        output_file = "/".join(
            (base_dir, str(perm_num), str(i), "gseapy.gsea.gene_set.report.csv")
        )
        df_output = pd.read_csv(output_file)

        # single .loc lookup: rows passing the FDR filter, 'Term' column only
        sig_path = df_output.loc[df_output['fdr'] <= cutoff, 'Term'].tolist()
        print("number of sig. path: ", len(sig_path))
        sig_path_list.append(sig_path)

    return sig_path_list

In [None]:
def repeatability_ratio(perm_num):
    """Compute the repeatability ratio for one permutation number.

    The repeatability ratio is the number of pathways that are significant in
    every iteration (intersection) divided by the number of pathways that are
    significant in at least one iteration (union). The iterations used are
    those listed in the module-level ``i_all``.

    Parameters
    ----------
    perm_num : int
        Permutation number whose iterations are compared.

    Returns
    -------
    float
        Intersection size divided by union size. NaN when the union is empty
        (no iteration produced a significant pathway), because the ratio is
        undefined in that case.
    """
    sig_path_sets = [set(paths) for paths in sig_pathways(perm_num, i_all)]
    print(len(sig_path_sets))

    # Pathways significant in every iteration; an empty collection of
    # iterations has nothing in common.
    if sig_path_sets:
        intersection = set.intersection(*sig_path_sets)
    else:
        intersection = set()
    print(len(intersection))

    # Pathways significant in at least one iteration.
    union = set().union(*sig_path_sets)
    print(len(union))

    if union:
        rep_ratio = float(len(intersection)) / float(len(union))
    else:
        # Avoid ZeroDivisionError: with no significant pathway anywhere the
        # ratio is undefined, so report NaN instead of crashing.
        rep_ratio = float('nan')
    print(rep_ratio)
    return rep_ratio

### Master cell

In [None]:
# Build a table of [permutation number, repeatability ratio] pairs,
# one row per entry of perm_num_all.

## TODO:
# calculate repeatability for each pair of the 10 iterations

rep_ratio_list = []
for perm_num in perm_num_all:
    # visually separate the diagnostic output of each permutation number
    print("\n")
    print(perm_num)
    rep_ratio_list += [[perm_num, repeatability_ratio(perm_num)]]

print(rep_ratio_list)

In [None]:
# Sanity check: show the container type of the collected [perm_num, ratio] rows.
type(rep_ratio_list)

In [None]:
# Convert the list of [perm_num, ratio] rows into a 2-D numpy array
# (one row per permutation number) and show the resulting type.
data = np.asarray(rep_ratio_list)
type(data)

In [None]:
# Each row of `data` is [perm_num, ratio]. Transposing turns the two columns
# into two rows, so unpacking yields x = permutation numbers and
# y = repeatability ratios.
x, y = np.transpose(data)

In [None]:
# Graph the dots and connect them with straight lines
#plt.scatter(x,y)
#plt.plot(x, y)

In [None]:
# Plot repeatability against permutation number. The x axis is logarithmic
# (y stays linear) because the permutation numbers increase roughly
# exponentially.

# Error bars are meant to be the standard deviation of the repeatability
# across the 10 iterations, which is not computed yet (see the TODO about
# pairwise repeatability). Until it is, draw no error bars rather than random
# placeholder values, which would be misleading and differ on every run.
y_err = None  # TODO: replace with one standard deviation per permutation number

fig, ax = plt.subplots()
ax.set_xscale('log')
ax.errorbar(x, y, yerr=y_err, fmt='.-', markersize=10)

ax.set_ylabel('Repeatability')
ax.set_xlabel('Permutation number')

#plt.savefig('foo.png')