# Setup pipeline and initialize the three models

In [None]:
%load_ext autoreload

import os, sys
sys.path.insert(0, "../")

%autoreload 2
from flowset import *

import matplotlib as mpl
mpl.rcParams['figure.dpi'] = 300



from os import listdir
from os.path import isfile, join
from upsetplot import plot, from_contents,UpSet



In [None]:
def download_and_unzip(download_url_link, dir_path, zipped_filename,destination_dir_name):
    """Download a zip archive and extract it below ``dir_path``.

    Based on
    https://www.tutorialsbuddy.com/download-and-unzip-a-zipped-file-in-python

    Args:
        download_url_link: URL of the zip archive to fetch.
        dir_path: Directory in which the downloaded archive is stored; it is
            created when it does not exist yet.
        zipped_filename: File name given to the downloaded archive.
        destination_dir_name: Directory, relative to ``dir_path``, that
            receives the extracted members.
    """
    # Imported locally so the helper does not depend on a star import
    # elsewhere in the notebook to provide these modules.
    import urllib.request
    import zipfile

    archive_path = os.path.join(dir_path, zipped_filename)
    extract_dir = os.path.join(dir_path, destination_dir_name)

    # urlretrieve cannot write into a directory that does not exist.
    if dir_path:
        os.makedirs(dir_path, exist_ok=True)

    print("Download starting")
    urllib.request.urlretrieve(download_url_link, archive_path)
    print("Download complete")

    print("unzipping file starting")
    with zipfile.ZipFile(archive_path, "r") as zip_file:
        zip_file.extractall(extract_dir)
    print("unzipping complete")


# Skip the download when ReactomePathways.gmt is already present in the
# working directory; otherwise fetch the zip and extract it in place.
if not os.path.exists("ReactomePathways.gmt"):
    download_and_unzip("https://reactome.org/download/current/ReactomePathways.gmt.zip", ".", "ReactomePathways.gmt.zip", ".")

In [None]:
# Read the tab-separated expression summary; "NA" entries are parsed as nulls.
# `pl` is presumably polars, provided by the flowset star import.
# NOTE(review): newer polars releases name this keyword `separator` instead of
# `sep` — confirm against the pinned polars version.
exprData = pl.read_csv("./summarised_simulated_scdata_random.tsv", has_header=True, sep="\t", null_values=["NA"])
print(exprData)

In [None]:
def pl_hist(df, column, n_bins=100):
    """Show a density-normalised histogram of one column of ``df``.

    Args:
        df: Table that supports ``select`` with a ``pl.col`` expression.
        column: Name of the column to plot.
        n_bins: Number of histogram bins.
    """
    column_values = df.select(pl.col(column))

    figure, axis = plt.subplots(nrows=1, ncols=1, tight_layout=True)
    axis.hist(column_values, bins=n_bins, density=True)

    # Render the figure, then release the current figure again.
    plt.show()
    plt.close()
# Inspect the distributions of the "mean.cluster" and "expr.cluster" columns.
pl_hist(exprData, "mean.cluster")
pl_hist(exprData, "expr.cluster")

In [None]:
# Fuzzify the expression values for the four clusters using seven membership
# levels with the given centers. No `shape` is passed here, so the library
# default applies; the result feeds the model named `fa_triangular`.
# NOTE(review): this cell calls `fuzzify_exprvalues`, whereas the gaussian and
# crisp cells call `exprDF2LongDF` with the same arguments — confirm both names
# exist in flowset.
explDFRaw, mfFuzzy = FlowAnalysis.fuzzify_exprvalues(exprData, stepsize=0.1,seriesOrder=["cluster.wildtype", "cluster.knockout01", "cluster.knockout02", "cluster.knockout03"],mfLevels = ["NO","NOLOW","LOW","LOWMED","MED","MEDHIGH","HIGH"],centers=[.1,.5,.9,1.3,1.7,2.1,2.5])
explDFWide = FlowAnalysis.to_vwide(explDFRaw, mfFuzzy)
print(explDFWide)

In [None]:
# Build the flow model on the wide table, keyed by the "gene" column; each
# pair is presumably (state name, short display label).
# NOTE(review): the first label is "K01" (digit zero) while the others use the
# letter O ("KO2", "KO3") — confirm whether "KO1" was intended.
fa_triangular = FlowAnalysis(explDFWide, "gene", (("wildtype","WT"), ("knockout01","K01"),  ("knockout02","KO2"),  ("knockout03","KO3")), mfFuzzy) 
fa_triangular.plot_flows()

In [None]:

# Same levels and centers as the first model, but with shape="gauss".
# explDFRaw, mfFuzzy and explDFWide are overwritten here; the earlier model
# object keeps whatever it was constructed with.
explDFRaw, mfFuzzy = FlowAnalysis.exprDF2LongDF(exprData, stepsize=0.1,seriesOrder=["cluster.wildtype", "cluster.knockout01", "cluster.knockout02", "cluster.knockout03"],mfLevels = ["NO","NOLOW","LOW","LOWMED","MED","MEDHIGH","HIGH"],centers=[.1,.5,.9,1.3,1.7,2.1,2.5],shape="gauss")
explDFWide = FlowAnalysis.to_vwide(explDFRaw, mfFuzzy)
print(explDFWide)
# NOTE(review): label "K01" uses a digit zero, unlike "KO2"/"KO3" — confirm.
fa_gaussian = FlowAnalysis(explDFWide, "gene",(("wildtype","WT"),  ("knockout01","K01"),  ("knockout02","KO2"),  ("knockout03","KO3")), mfFuzzy) 
fa_gaussian.plot_flows()


In [None]:
# Same levels and centers again, now with shape="crisp" and sdcolName=None
# (presumably disabling the use of a standard-deviation column — verify in
# flowset).
explDFRaw, mfFuzzy = FlowAnalysis.exprDF2LongDF(exprData, stepsize=0.1,seriesOrder=["cluster.wildtype", "cluster.knockout01", "cluster.knockout02", "cluster.knockout03"],mfLevels = ["NO","NOLOW","LOW","LOWMED","MED","MEDHIGH","HIGH"],centers=[.1,.5,.9,1.3,1.7,2.1,2.5], sdcolName=None, shape="crisp")
explDFWide = FlowAnalysis.to_vwide(explDFRaw, mfFuzzy)
print(explDFWide)
# NOTE(review): label "K01" uses a digit zero, unlike "KO2"/"KO3" — confirm.
fa_crisp = FlowAnalysis(explDFWide, "gene",(("wildtype","WT"),  ("knockout01","K01"),  ("knockout02","KO2"),  ("knockout03","KO3")), mfFuzzy) 
fa_crisp.plot_flows()


In [None]:

# Call hist_level_membershipsum on each of the three models, in the order
# crisp, gaussian, triangular.
fa_crisp.hist_level_membershipsum()
fa_gaussian.hist_level_membershipsum()
fa_triangular.hist_level_membershipsum()


In [None]:
# Gene-set file passed to every pathway analysis in the cells that follow.
pw_file="go_human_filtered.bp.gmt"
# Later cells read rp[key][0] as a pathway name and rp[key][1] as an iterable
# of genes (presumably the pathway's members).
rp = fa_crisp.read_gmt_file(pw_file)


In [None]:
# Table of simulated pathway alterations (tab-separated).
alteredPathways = pd.read_csv('simulated_changingPathways_random.tsv', sep='\t')

# The first occurrence of every distinct row defines one pattern.
first_occurrences = alteredPathways[~alteredPathways.duplicated()]

pattern_groups = {}
for row_label, pattern_row in first_occurrences.iterrows():
    # Key: the row's values rendered as strings and joined with '-'.
    pattern_key = '-'.join(pattern_row.astype(str))
    # Value: index labels of all rows that equal this pattern row.
    row_matches = [
        candidate_row.equals(pattern_row)
        for candidate_label, candidate_row in alteredPathways.iterrows()
    ]
    pattern_groups[pattern_key] = alteredPathways[row_matches].index

# Last expression of the cell: displayed by the notebook.
pattern_groups

In [None]:
# Load the table of simulated gene changes.
# NOTE(review): no index_col is given; the gene identifiers end up in the
# index only if the file has one header field fewer than data fields —
# confirm, because the index is what is collected at the end of this cell.
changed_genes= pd.read_csv("./simulated_changingGenes_random.tsv",sep="\t")

# Plot every row as one line, before filtering, on a log-scaled y axis.
fig = plt.figure() 
ax = fig.add_subplot(2, 1, 1) 
for index, row in changed_genes.iterrows(): 
    plt.plot(row,label=index)

ax.set_yscale('log')

# Keep only rows whose maximum across columns exceeds 0.2.
changed_genes=changed_genes[changed_genes.max(axis=1)>0.2] 

# Same plot again for the filtered rows.
fig = plt.figure() 
ax = fig.add_subplot(2, 1, 1) 
for index, row in changed_genes.iterrows(): 
    plt.plot(row,label=index)

ax.set_yscale('log')

# Despite the name this is a set of the remaining index labels.
changed_genes_list=set(changed_genes.index)


# Basic overview of the memberships of the altered genes in the different states and models

In [None]:

# Pathway names in the iteration order of rp; positions in this list are used
# to map a name back to its rp key.
pwnames=[rp[x][0] for x in rp]

for key, value in pattern_groups.items():
    # For each pathway name of the group, look up the rp entry at the same
    # position as the name in pwnames and take its second element.
    union_geneset=[rp[list(rp.keys())[pwnames.index(p)]][1] for p in value]
    # Flatten the per-pathway lists into one list.
    union_geneset = [item for sublist in union_geneset for item in sublist]
    # Restrict to entries that are also in changed_genes_list (drops duplicates).
    union_geneset=list(set(union_geneset).intersection(changed_genes_list))

    print(union_geneset)

    # Same three plots for each model.
    fa_crisp.plot_state_memberships(union_geneset,"Crisp: Pattern -> "+key,cluster_genes=True)
    fa_crisp.plot_genes_membership(union_geneset)
    fa_crisp.plot_genes(union_geneset,min_flow=.010)


    fa_gaussian.plot_state_memberships(union_geneset,"Gaussian: Pattern -> "+key,cluster_genes=True)
    fa_gaussian.plot_genes_membership(union_geneset)
    fa_gaussian.plot_genes(union_geneset,min_flow=.010)

    fa_triangular.plot_state_memberships(union_geneset,"Triangular: Pattern -> "+key,cluster_genes=True)
    fa_triangular.plot_genes_membership(union_geneset)
    fa_triangular.plot_genes(union_geneset,min_flow=.01)




In [None]:

# Plot the single gene TLR4 in all three models with min_flow=0.05.
fa_crisp.plot_genes(["TLR4"],min_flow=0.05)
fa_gaussian.plot_genes(["TLR4"],min_flow=0.05)
fa_triangular.plot_genes(["TLR4"],min_flow=0.05)


# Analyse "UP" pattern
The first pattern is analysed.
First only the genes changed through the altered pathways are considered, followed by a pathway enrichment analysis.

In [None]:
# Analyse simulated UP pattern

up_geneset=[]
up_pathways=[]
# The first key of pattern_groups is taken as the UP pattern; this relies on
# the insertion order of the dict.
key=list(pattern_groups.keys())[0]
value=pattern_groups[key]
# Map each pathway name of the group to its rp entry (via its position in
# pwnames) and collect the entry's second element.
union_geneset=[rp[list(rp.keys())[pwnames.index(p)]][1] for p in value]
# The rp keys of the same pathways.
up_pathways=[list(rp.keys())[pwnames.index(p)] for p in value]
up_geneset_temp = [item for sublist in union_geneset for item in sublist]

# Flows matching the pattern "=", "<<", "=" (presumably one symbol per
# transition between consecutive states).
# NOTE(review): this cell queries fa_crisp, while the tUP and DOWN cells query
# fa_triangular — confirm the returned flows are interchangeable across models.
relFlows = set().union(*
    [
    fa_crisp.flow_finder(["=", "<<", "="], verbose=True)
    ]
)


# Keep only entries that are also in changed_genes_list.
up_geneset=set(up_geneset_temp).intersection(changed_genes_list)

print(len(up_geneset))

In [None]:
# Hard-coded gene list compared against the simulated UP genes (presumably a
# WGCNA module, given the output file name below).
module2=["TLR4","HYKK","LACC1","PPP2R5B","CD81","ESYT1"]
print(len(module2))
# How many module genes are in the filtered UP set ...
print(sum([ x in up_geneset for x in module2] ))
# ... and how many are in the unfiltered pathway gene list.
print(sum([ x in  set(up_geneset_temp)   for x in module2] ))


# Third argument: all values of the symbol column of fa_crisp.flows.
confusion_matrix(up_geneset,module2,list(fa_crisp.flows.select(pl.col(fa_crisp.symbol_column)).to_series()),outfile="./confusion_matrices/Genes_pattern1_wgcna.csv")


In [None]:

# n_genes is set to the size of the simulated UP gene set.
genes_to_consider=len(up_geneset)
# For each model: plot the flow memberships for relFlows, then write a
# confusion matrix CSV from the returned scores.
genes, scores_df,_=fa_crisp.plot_flow_memberships(use_flows=relFlows,color_genes=up_geneset,n_genes=genes_to_consider)
fa_crisp.get_confusion_matrix(scores_df,up_geneset,genes_to_consider,outfile="./confusion_matrices/Genes_pattern1_crisp.csv")

genes, scores_df,_=fa_gaussian.plot_flow_memberships(use_flows=relFlows,color_genes=up_geneset,n_genes=genes_to_consider)
fa_gaussian.get_confusion_matrix(scores_df,up_geneset,genes_to_consider,outfile="./confusion_matrices/Genes_pattern1_gaussian.csv")

genes, scores_df,_=fa_triangular.plot_flow_memberships(use_flows=relFlows,color_genes=up_geneset,n_genes=genes_to_consider)
fa_triangular.get_confusion_matrix(scores_df,up_geneset,genes_to_consider,outfile="./confusion_matrices/Genes_pattern1_triangular.csv")

In [None]:

# For each model: score the pathways for relFlows, keep rows with
# adj_pval < 0.05, report how many of the simulated UP pathways are among
# them, and write a confusion matrix CSV.
pwScores_crisp = fa_crisp.analyse_pathways_grouped(use_flows=relFlows,pathways_file=pw_file)
pwScores_signif_crisp=pwScores_crisp[pwScores_crisp["adj_pval"]<0.05]
print("Crisp) Found: "+ str(pwScores_signif_crisp[pwScores_signif_crisp["pwid"].isin(up_pathways)].shape[0])+" / "+str(len(up_pathways))+" with  "+ str(pwScores_signif_crisp.shape[0]) +" significant")
confusion_matrix(up_pathways,pwScores_signif_crisp["pwid"],pwScores_crisp["pwid"],outfile="./confusion_matrices/Pathways_pattern1_crisp.csv")

pwScores_gaussian = fa_gaussian.analyse_pathways_grouped(use_flows=relFlows,pathways_file=pw_file)
pwScores_signif_gaussian=pwScores_gaussian[pwScores_gaussian["adj_pval"]<0.05]
print("Gaussian) Found: "+ str(pwScores_signif_gaussian[pwScores_signif_gaussian["pwid"].isin(up_pathways)].shape[0])+" / "+str(len(up_pathways))+" with  "+ str(pwScores_signif_gaussian.shape[0]) +" significant")
confusion_matrix(up_pathways,pwScores_signif_gaussian["pwid"],pwScores_gaussian["pwid"],outfile="./confusion_matrices/Pathways_pattern1_gaussian.csv")

pwScores_triangular = fa_triangular.analyse_pathways_grouped(use_flows=relFlows,pathways_file=pw_file)
pwScores_signif_triangular=pwScores_triangular[pwScores_triangular["adj_pval"]<0.05]
print("Triangular) Found: "+ str(pwScores_signif_triangular[pwScores_signif_triangular["pwid"].isin(up_pathways)].shape[0])+" / "+str(len(up_pathways))+" with  "+ str(pwScores_signif_triangular.shape[0]) +" significant")
confusion_matrix(up_pathways,pwScores_signif_triangular["pwid"],pwScores_triangular["pwid"],outfile="./confusion_matrices/Pathways_pattern1_triangular.csv")

# Significant pathway ids per model, for the overlap plot.
results={ 'triangular': pwScores_signif_triangular["pwid"], 'gaussian': pwScores_signif_gaussian["pwid"], 'crisp': pwScores_signif_crisp["pwid"] }


# UpSet plot of the overlaps between the three result sets.
upset=from_contents(results)
upsetpl = UpSet(upset, orientation='vertical')
upsetpl.plot()    

plt.suptitle("Overlaps of genesets found with the different fuzzy concepts")
plt.show()

# Temporary UP (tUP) pattern


In [None]:
# Analyse tUP pattern

tup_geneset=[]
tup_pathways=[]
# The second key of pattern_groups is taken as the tUP pattern; this relies
# on the insertion order of the dict.
key=list(pattern_groups.keys())[1]
value=pattern_groups[key]
# Map each pathway name of the group to its rp entry (via its position in
# pwnames) and collect the entry's second element.
union_geneset=[rp[list(rp.keys())[pwnames.index(p)]][1] for p in value]
# The rp keys of the same pathways.
tup_pathways=[list(rp.keys())[pwnames.index(p)] for p in value]
tup_geneset_temp = [item for sublist in union_geneset for item in sublist]

# Flows matching the pattern "<<", "=", ">>", queried on fa_triangular.
relFlows = set().union(*
    [

    fa_triangular.flow_finder(["<<", "=", ">>"], verbose=True)
    ]
)
# Keep only entries that are also in changed_genes_list.
tup_geneset=set(tup_geneset_temp).intersection(changed_genes_list)
print(len(tup_geneset))

In [None]:
# Hard-coded gene list compared against the simulated tUP genes (presumably a
# WGCNA module, given the output file name below).
module1=["IKBKB","ROCK1","ROCK2","RC3H2","ETFB","PRKN","PARK7","TNF","SNX","IL4I1","RC3H1","ATF5","MARK4","MCPH1","PHH","FNBP1L","GGPS1"]
print(len(module1))
# How many module genes are in the filtered tUP set ...
print(sum([ x in tup_geneset for x in module1] ))
# ... and how many are in the unfiltered pathway gene list.
print(sum([ x in  set(tup_geneset_temp)   for x in module1] ))

# Third argument: all values of the symbol column of fa_crisp.flows.
confusion_matrix(tup_geneset,module1,list(fa_crisp.flows.select(pl.col(fa_crisp.symbol_column)).to_series()),outfile="./confusion_matrices/Genes_pattern2_wgcna.csv")


In [None]:

# n_genes is set to the size of the simulated tUP gene set.
genes_consider=len(tup_geneset)

# For each model: plot the flow memberships for relFlows, then write a
# confusion matrix CSV from the returned scores.
genes, scores_df,_=fa_crisp.plot_flow_memberships(use_flows=relFlows,color_genes=tup_geneset,n_genes=genes_consider)
fa_crisp.get_confusion_matrix(scores_df,tup_geneset,genes_consider,outfile="./confusion_matrices/Genes_pattern2_crisp.csv")

genes, scores_df,_=fa_gaussian.plot_flow_memberships(use_flows=relFlows,color_genes=tup_geneset,n_genes=genes_consider)
fa_gaussian.get_confusion_matrix(scores_df,tup_geneset,genes_consider,outfile="./confusion_matrices/Genes_pattern2_gaussian.csv")

genes, scores_df,_=fa_triangular.plot_flow_memberships(use_flows=relFlows,color_genes=tup_geneset,n_genes=genes_consider)
fa_triangular.get_confusion_matrix(scores_df,tup_geneset,genes_consider,outfile="./confusion_matrices/Genes_pattern2_triangular.csv")

In [None]:

# For each model: score the pathways for relFlows, keep rows with
# adj_pval < 0.05, report how many of the simulated tUP pathways are among
# them, and write a confusion matrix CSV.
pwScores_crisp = fa_crisp.analyse_pathways_grouped(use_flows=relFlows,pathways_file=pw_file)
pwScores_signif_crisp=pwScores_crisp[pwScores_crisp["adj_pval"]<0.05]
print("Crisp) Found: "+ str(pwScores_signif_crisp[pwScores_signif_crisp["pwid"].isin(tup_pathways)].shape[0])+" / "+str(len(tup_pathways))+" with  "+ str(pwScores_signif_crisp.shape[0]) +" significant")
confusion_matrix(tup_pathways,pwScores_signif_crisp["pwid"],pwScores_crisp["pwid"],outfile="./confusion_matrices/Pathways_pattern2_crisp.csv")

pwScores_gaussian = fa_gaussian.analyse_pathways_grouped(use_flows=relFlows,pathways_file=pw_file)
pwScores_signif_gaussian=pwScores_gaussian[pwScores_gaussian["adj_pval"]<0.05]
print("Gaussian) Found: "+ str(pwScores_signif_gaussian[pwScores_signif_gaussian["pwid"].isin(tup_pathways)].shape[0])+" / "+str(len(tup_pathways))+" with  "+ str(pwScores_signif_gaussian.shape[0]) +" significant")
confusion_matrix(tup_pathways,pwScores_signif_gaussian["pwid"],pwScores_gaussian["pwid"],outfile="./confusion_matrices/Pathways_pattern2_gaussian.csv")

pwScores_triangular = fa_triangular.analyse_pathways_grouped(use_flows=relFlows,pathways_file=pw_file)
pwScores_signif_triangular=pwScores_triangular[pwScores_triangular["adj_pval"]<0.05]
print("Triangular) Found: "+ str(pwScores_signif_triangular[pwScores_signif_triangular["pwid"].isin(tup_pathways)].shape[0])+" / "+str(len(tup_pathways))+" with  "+ str(pwScores_signif_triangular.shape[0]) +" significant")
confusion_matrix(tup_pathways,pwScores_signif_triangular["pwid"],pwScores_triangular["pwid"],outfile="./confusion_matrices/Pathways_pattern2_triangular.csv")

# Significant pathway ids per model, for the overlap plot.
results={ 'triangular': pwScores_signif_triangular["pwid"], 'gaussian': pwScores_signif_gaussian["pwid"], 'crisp': pwScores_signif_crisp["pwid"] }


# UpSet plot of the overlaps between the three result sets.
upset=from_contents(results)
upsetpl = UpSet(upset, orientation='vertical')
upsetpl.plot()    

plt.suptitle("Overlaps of genesets found with the different fuzzy concepts")
plt.show()

# DOWN pattern

In [None]:
# Analyse simulated DOWN pattern

down_geneset=[]
down_pathways=[]
# The third key of pattern_groups is taken as the DOWN pattern; this relies
# on the insertion order of the dict.
key=list(pattern_groups.keys())[2]
value=pattern_groups[key]
# Map each pathway name of the group to its rp entry (via its position in
# pwnames) and collect the entry's second element.
union_geneset=[rp[list(rp.keys())[pwnames.index(p)]][1] for p in value]
# The rp keys of the same pathways.
down_pathways=[list(rp.keys())[pwnames.index(p)] for p in value]
# Flattened (unfiltered) list; unlike the UP/tUP cells there is no separate
# *_temp variable, the name is reused below for the filtered set.
down_geneset = [item for sublist in union_geneset for item in sublist]

# Flows matching the pattern "=", ">>", "=", queried on fa_triangular.
relFlows = set().union(*
    [
    fa_triangular.flow_finder(["=", ">>", "="], verbose=True)
    ]
)

# Keep only entries that are also in changed_genes_list.
down_geneset=set(down_geneset).intersection(changed_genes_list)
print(len(down_geneset))

In [None]:

# n_genes is set to the size of the simulated DOWN gene set.
gene_to_consider=len(down_geneset)
# For each model: plot the flow memberships for relFlows, then write a
# confusion matrix CSV from the returned scores.
genes, scores_df,_=fa_crisp.plot_flow_memberships(use_flows=relFlows,color_genes=down_geneset,n_genes=gene_to_consider)
fa_crisp.get_confusion_matrix(scores_df,down_geneset,gene_to_consider,outfile="./confusion_matrices/Genes_pattern3_crisp.csv")

genes, scores_df,_=fa_gaussian.plot_flow_memberships(use_flows=relFlows,color_genes=down_geneset,n_genes=gene_to_consider)
fa_gaussian.get_confusion_matrix(scores_df,down_geneset,gene_to_consider,outfile="./confusion_matrices/Genes_pattern3_gaussian.csv")

genes, scores_df,_=fa_triangular.plot_flow_memberships(use_flows=relFlows,color_genes=down_geneset,n_genes=gene_to_consider)
fa_triangular.get_confusion_matrix(scores_df,down_geneset,gene_to_consider,outfile="./confusion_matrices/Genes_pattern3_triangular.csv")

In [None]:

# For each model: score the pathways for relFlows, keep rows with
# adj_pval < 0.05, report how many of the simulated DOWN pathways are among
# them, and write a confusion matrix CSV.
pwScores_crisp = fa_crisp.analyse_pathways_grouped(use_flows=relFlows,pathways_file=pw_file)
pwScores_signif_crisp=pwScores_crisp[pwScores_crisp["adj_pval"]<0.05]
print("Crisp) Found: "+ str(pwScores_signif_crisp[pwScores_signif_crisp["pwid"].isin(down_pathways)].shape[0])+" / "+str(len(down_pathways))+" with  "+ str(pwScores_signif_crisp.shape[0]) +" significant")
confusion_matrix(down_pathways,pwScores_signif_crisp["pwid"],pwScores_crisp["pwid"],outfile="./confusion_matrices/Pathways_pattern3_crisp.csv")

pwScores_gaussian = fa_gaussian.analyse_pathways_grouped(use_flows=relFlows,pathways_file=pw_file)
pwScores_signif_gaussian=pwScores_gaussian[pwScores_gaussian["adj_pval"]<0.05]
print("Gaussian) Found: "+ str(pwScores_signif_gaussian[pwScores_signif_gaussian["pwid"].isin(down_pathways)].shape[0])+" / "+str(len(down_pathways))+" with  "+ str(pwScores_signif_gaussian.shape[0]) +" significant")
confusion_matrix(down_pathways,pwScores_signif_gaussian["pwid"],pwScores_gaussian["pwid"],outfile="./confusion_matrices/Pathways_pattern3_gaussian.csv")

pwScores_triangular = fa_triangular.analyse_pathways_grouped(use_flows=relFlows,pathways_file=pw_file)
pwScores_signif_triangular=pwScores_triangular[pwScores_triangular["adj_pval"]<0.05]
print("Triangular) Found: "+ str(pwScores_signif_triangular[pwScores_signif_triangular["pwid"].isin(down_pathways)].shape[0])+" / "+str(len(down_pathways))+" with  "+ str(pwScores_signif_triangular.shape[0]) +" significant")
confusion_matrix(down_pathways,pwScores_signif_triangular["pwid"],pwScores_triangular["pwid"],outfile="./confusion_matrices/Pathways_pattern3_triangular.csv")

# Significant pathway ids per model, for the overlap plot.
results={ 'triangular': pwScores_signif_triangular["pwid"], 'gaussian': pwScores_signif_gaussian["pwid"], 'crisp': pwScores_signif_crisp["pwid"] }


# UpSet plot of the overlaps between the three result sets.
upset=from_contents(results)
upsetpl = UpSet(upset, orientation='vertical')
upsetpl.plot()    

plt.suptitle("Overlaps of genesets found with the different fuzzy concepts")
plt.show()







In [None]:

# Collect the confusion-matrix files written by the earlier cells.
import os
cm_folder="./confusion_matrices"
# Regular files only; sub-directories are skipped.
files = [f for f in os.listdir(cm_folder) if os.path.isfile(join(cm_folder, f))]
files

def get_stats(file, folder=None):
    """Compute classification metrics from a stored 2x2 confusion matrix.

    The CSV is read with its first row as header and its first column as
    index. Cell ``[0, 0]`` is treated as true positives, ``[0, 1]`` as false
    positives, ``[1, 0]`` as false negatives and ``[1, 1]`` as true
    negatives, which is the layout the precision and recall formulas of this
    function have always used.
    NOTE(review): confirm this layout against the code that writes the CSVs.

    Sensitivity is recall by definition, so both now share one formula, and
    specificity is TN / (TN + FP) in the same layout. Previously sensitivity
    duplicated precision and specificity used the transposed layout.

    Args:
        file: File name of the confusion-matrix CSV.
        folder: Directory containing ``file``; defaults to the module-level
            ``cm_folder``.

    Returns:
        Tuple ``(file, accuracy, sensitivity, specificity, f1, precision,
        recall, jaccard)``. Undefined ratios (zero denominator) come back as
        NaN/inf from the numpy scalars rather than raising.
    """
    if folder is None:
        folder = cm_folder
    cm1 = pd.read_csv(os.path.join(folder, file), header=0, index_col=0)
    total = cm1.sum(axis=1).sum(axis=0)

    # Name the four cells once instead of repeating iloc lookups.
    tp = cm1.iloc[0, 0]
    fp = cm1.iloc[0, 1]
    fn = cm1.iloc[1, 0]
    tn = cm1.iloc[1, 1]

    accuracy = (tp + tn) / total
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    sensitivity = recall
    specificity = tn / (tn + fp)
    f1 = 2 * tp / (2 * tp + fn + fp)
    jaccard = tp / (tp + fp + fn)

    return (file, accuracy, sensitivity, specificity, f1, precision, recall, jaccard)

# One row per confusion-matrix file, one column per metric.
df = pd.DataFrame(columns=['acc','sens','spec','f1','precision','recall','jaccard'], index=files)

for file in files:
    file,accuracy,sensitivity,specificity,f1,precision,recall,jaccard=get_stats(file)
    df.loc[file] = pd.Series({'acc':accuracy, 'sens':sensitivity, 'spec':specificity, 'f1':f1,'precision':precision,'recall':recall,'jaccard':jaccard})
df=df.sort_index()

# Metrics that came back as NaN are reported as 0.
df=df.fillna(0)
# File name up to the first "." in upper case, e.g. "GENES_PATTERN1_CRISP";
# the three "_"-separated parts are split into cond1..cond3.
df["short_name"]=[x.split(".")[0].upper() for x in df.index]
df["cond1"]=[x.split("_")[0] for x in df["short_name"]]
df["cond2"]=[x.split("_")[1] for x in df["short_name"]]
df["cond3"]=[x.split("_")[2] for x in df["short_name"] ]

# Order rows by the first and third name part.
df=df.sort_values(['cond1', 'cond3'], ascending=[True, True])


# One x position per file; the four metrics are drawn as stems with small
# horizontal offsets so they do not overlap.
my_range=np.arange(len(df.index))
fig = plt.figure(figsize=(10, 5))
ax = plt.axes()
ax.set_facecolor("white")
plt.grid(True,axis="y")
#plt.stem(my_range-0.3,df['acc'],'r',label="acc")
#plt.stem(my_range-0.15,df['spec'],"b",label="spec")
plt.stem(my_range,df['recall'],"tab:orange",label="recall")
plt.stem(my_range-0.15,df['precision'],"tab:blue",label="prec")
plt.stem(my_range+0.15,df['f1'],"tab:green",label="f1")
plt.stem(my_range+0.3,df['jaccard'],"tab:red",label="jaccard")


plt.xticks( my_range, df['short_name'], rotation = 90,)
# Legend is placed outside the axes, to the right.
plt.legend(loc=(1.04, 0),fontsize=10)

plt.xticks(size = 12)
plt.yticks(size = 10)
plt.savefig('./benchmark.pdf', bbox_inches='tight')
    

In [None]:
# Show the full metrics table behind the benchmark plot.
print(df)