In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
def write_gene_list_file(gene_list, filename):
    """Write gene identifiers to a text file, one identifier per line.

    Args:
        gene_list: Iterable of strings; each entry is written followed by a
            newline character.
        filename: Path of the output file. It is opened in write mode, so an
            existing file at that path is overwritten.
    """
    with open(filename, "w") as handle:
        # Plain concatenation means a non-string entry raises TypeError
        # rather than being coerced to text.
        handle.writelines(entry + "\n" for entry in gene_list)

In [3]:
# Load the per-model prediction table; the first CSV column becomes the index.
df_test = pd.read_csv(
    "../benchmark/prediction_values_all_models.csv.zip",
    index_col=0,
)

In [4]:
# Quick look at the table: the available columns, then the distinct label values.
print(df_test.columns, df_test["label"].unique(), sep="\n")

Index(['seq', 'id', 'description', 'protein_ac', 'gene', 'label',
       'cov_0.1_min_seq_id_0.1_e_0.001_cluster', 'query', 'group_split_0',
       'group_split_1', 'group_split_2', 'group_split_3', 'group_split_4',
       'protein_id', 'transfactor_0', 'transfactor_1', 'transfactor_2',
       'transfactor_3', 'transfactor_4', 'transfactor_ensemble', 'ablation_0',
       'ablation_1', 'ablation_2', 'ablation_3', 'ablation_4',
       'ablation_ensemble', 'cnn_lstm_0', 'cnn_lstm_1', 'cnn_lstm_2',
       'cnn_lstm_3', 'cnn_lstm_4', 'cnn_lstm_ensemble', 'svm_lin_0',
       'svm_lin_1', 'svm_lin_2', 'svm_lin_3', 'svm_lin_4', 'svm_lin_ensemble',
       'svm_rbf_0', 'svm_rbf_1', 'svm_rbf_2', 'svm_rbf_3', 'svm_rbf_4',
       'svm_rbf_ensemble'],
      dtype='object')
['candidate' '0.0' '1.0']


In [5]:
# Goal: compare the enriched terms of the high- vs. low-scoring candidate set.
# The candidate set only holds experimental candidates, so the model scores
# may help to sort out the useful ones.

# 1) Host factors (positive labels) against a background of all genes.
# Background: the protein accession of every row in the table.
BG_positives = df_test["protein_ac"].to_list()
write_gene_list_file(BG_positives, "BG_positives.txt")
print(f"Number of genes in Background positives: {len(BG_positives)}")

# Gene list: accessions of the rows labelled '1.0' (the label is compared as a string).
List_positives = df_test.loc[df_test["label"].eq("1.0"), "protein_ac"].to_list()
write_gene_list_file(List_positives, "List_positives.txt")
print(f"Number of genes in List positives: {len(List_positives)}")

Number of genes in Background positives: 20415
Number of genes in List positives: 1045


In [6]:
# 2) Candidate set: build the common background, then split it by model score.
# Rows without a value in "group_split_0" are treated as the candidates.
# NOTE(review): confirm this selects the same rows as label == 'candidate'.
df = df_test.loc[df_test["group_split_0"].isna()]
BG_candidate = df["protein_ac"].to_list()
write_gene_list_file(BG_candidate, "BG_candidate.txt")
print(f"Genes in Background candidate: {len(BG_candidate)}")

# Score cut-off and the prediction column it is applied to.
threshold = 0.571461
col = "transfactor_ensemble"

# Candidates scoring at or below the cut-off.
negatives = df.loc[df[col].le(threshold), "protein_ac"].to_list()
filename = f"{col}_below_{threshold}.txt"
write_gene_list_file(negatives, filename)
print(f"Genes in candidate {col} negatives: {len(negatives)}")

# Candidates scoring strictly above the cut-off.
positives = df.loc[df[col].gt(threshold), "protein_ac"].to_list()
filename = f"{col}_above_{threshold}.txt"
write_gene_list_file(positives, filename)
print(f"Genes in candidate {col} positives: {len(positives)}")

Genes in Background candidate: 3936
Genes in candidate transfactor_ensemble negatives: 3053
Genes in candidate transfactor_ensemble positives: 883
