## Initial Setup

In [None]:
import sys

# NOTE(review): machine-specific absolute path; every data path in the
# notebook is built from it, so it must be adapted on other machines.
base_path = "/media/paulati/Nuevo vol/paula/ingebi/2020/agustina_mazzella/github/arabidopsis_phospho"

# Make the project's local packages (e.g. `preparation`) importable.
# `sys.path` is reached through the module on purpose: the Imports cell binds
# the bare name `path` to `os.path`, so `from sys import path` here would
# clobber it whenever this cell is re-run.
if base_path not in sys.path:
    sys.path.append(base_path)

from preparation import util

## Imports

In [None]:
from os import path, remove
import pandas as pd
import numpy as np
import re
from sklearn.metrics import confusion_matrix, recall_score, precision_score
import seaborn as sns
import matplotlib.pyplot as plt

## Arabidopsis all ids

In [None]:
all_arabidopsisis_file_path = path.join(base_path, "data/preproc/all_ids.zip")
all_arabidopsisis = pd.read_csv(all_arabidopsisis_file_path, header = None)
print(all_arabidopsisis.shape)
all_arabidopsisis_codes = all_arabidopsisis.iloc[:, 0].unique()
len(all_arabidopsisis_codes)

## Experimental data

In [None]:
experimental_file_path = path.join(base_path, "data/preproc/experimental_ids.zip")
experimental_data = pd.read_csv(experimental_file_path, sep = "\t")
experimental_data

## Common

In [None]:
def performance(data, score_column_name, threshold, true_value_column_name):
    """Compute binary-classification metrics for one predictor column.

    A row is predicted positive when its score is not null (``threshold is
    None``) or when its score is ``>= threshold``. A row is truly positive
    when ``true_value_column_name`` is not null.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding both the score column and the ground-truth column.
    score_column_name : str
        Column with the predictor output.
    threshold : float or None
        Minimum score to call a row positive; ``None`` means "any non-null
        value is a positive prediction".
    true_value_column_name : str
        Column whose non-null entries mark the true positives.

    Returns
    -------
    tuple
        ``(fpr, tpr, matrix, fdr)`` with ``fpr = FP / (FP + TN)``,
        ``tpr = TP / (TP + FN)``, ``matrix = confusion_matrix(y_true, y_pred)``
        and ``fdr = FP / (FP + TP)``. A rate whose denominator is zero is
        returned as NaN.
    """
    if threshold is None:
        y_pred = data[score_column_name].notnull()
    else:
        y_pred = data[score_column_name] >= threshold

    y_true = data[true_value_column_name].notnull()

    y_pred_neg = np.logical_not(y_pred)
    y_true_neg = np.logical_not(y_true)

    # The four cells of the confusion matrix, each counted once.
    tp = np.logical_and(y_pred, y_true).sum()
    fp = np.logical_and(y_pred, y_true_neg).sum()
    tn = np.logical_and(y_pred_neg, y_true_neg).sum()
    fn = np.logical_and(y_pred_neg, y_true).sum()

    def _rate(numerator, denominator):
        # Explicit NaN for an empty denominator instead of relying on numpy's
        # 0/0 behaviour, which yields NaN but emits a RuntimeWarning.
        return numerator / denominator if denominator > 0 else np.nan

    fpr = _rate(fp, fp + tn)
    fdr = _rate(fp, fp + tp)
    tpr = _rate(tp, tp + fn)

    matrix = confusion_matrix(y_true, y_pred)

    return fpr, tpr, matrix, fdr

## Phosphat

In [None]:
phosphat_2020_file_path = path.join(base_path, "data/raw/HiconfPred_psite_20200624.zip")
data_phosphat = pd.read_csv(phosphat_2020_file_path, sep=",")
data_phosphat.head()                             

In [None]:
data_phosphat["code_unified"] = data_phosphat.code.apply(lambda x: x.split(".")[0])
data_phosphat.head()

In [None]:
all_phosphat_ids = data_phosphat.code_unified.unique()
len(all_phosphat_ids)

In [None]:
# Set membership is O(1); `x in <numpy array>` rescans the whole array for
# every id, which makes the original comprehension quadratic.
all_arabidopsisis_code_set = set(all_arabidopsisis_codes)
valid_phosphat_ids = [x for x in all_phosphat_ids if x in all_arabidopsisis_code_set]
len(valid_phosphat_ids)

In [None]:
phosphat_pred = pd.DataFrame(valid_phosphat_ids, columns=["code"])
phosphat_pred_file_path = path.join(base_path, "data/preproc/phosphat_prediction_2020.csv")
#phosphat_pred.to_csv(phosphat_pred_file_path, index = False, header = True)

phosphat_pred_zip_file_path = phosphat_pred_file_path.replace(".csv", ".zip")

### Phosphat performance

In [None]:
all_arabidopsisis_codes_df = pd.DataFrame(all_arabidopsisis_codes, columns = ['code'])
all_arabidopsisis_codes_df.index = all_arabidopsisis_codes_df.code
all_arabidopsisis_codes_df

In [None]:
phosphat_pred = pd.read_csv(phosphat_pred_zip_file_path, sep= "\t")

In [None]:
# Index both frames by gene code so they can be joined on the index, and
# rename their column so the source of each value stays visible after the join.
# NOTE(review): this cell is not idempotent. After the renames below the
# `code` attribute no longer exists on either frame, so re-running the cell
# without re-loading `phosphat_pred` / `experimental_data` fails.
# NOTE(review): assigning a one-element list to `.columns` assumes each frame
# has exactly one column — confirm against the input files.
phosphat_pred.index = phosphat_pred.code
phosphat_pred.columns = ["code_phosphat"]
# `experimental_data` is modified in place; later cells read its
# `code_experimental` column and rely on the code index set here.
experimental_data.index = experimental_data.code
experimental_data.columns = ["code_experimental"]
# Outer join: keep every id present in the prediction or in the experimental set.
phostphat_pred_exp = phosphat_pred.join(experimental_data, how = "outer")
# Left join onto the full id list: one row per known Arabidopsis code, with
# nulls where an id has no prediction and/or no experimental evidence.
phostphat_pred_exp_all = all_arabidopsisis_codes_df.join(phostphat_pred_exp, how = "left")
phostphat_pred_exp_all

In [None]:
fpr, tpr, matrix, fdr = performance(phostphat_pred_exp_all, "code_phosphat", None, "code_experimental")
    
print(fpr)

print(tpr)

print(matrix)

print(fdr)

## Musite Deep

The value to be analyzed is **post-translational modification (PTM) score**

### Input Preparation

In [None]:
fasta_proteins_file_path = path.join(base_path, "data/raw/Araport11_genes.201606.pep.fasta")
print(fasta_proteins_file_path)
fasta_proteins_file_path_gz = fasta_proteins_file_path + ".gz"
print(fasta_proteins_file_path_gz)

if not path.isfile(fasta_proteins_file_path):
    util.extractgz(fasta_proteins_file_path_gz, fasta_proteins_file_path)

In [None]:
lines_per_file = 7000
f_out_base_file_path = path.join(base_path,  "data/preproc/musitedeep/file{}_Araport11_genes.201606.pep.fasta")
f_out_base_file_path

Split the FASTA file into chunks of `lines_per_file` sequences each, to be processed by https://www.musite.net/

In [None]:
def write_fasta_chunk(records, file_index):
    """Write one chunk of FASTA records to its numbered output file.

    Each element of ``records`` is a complete record: the header line followed
    by its sequence lines, newlines included. The output path is
    ``f_out_base_file_path`` formatted with ``file_index``.
    """
    out_file_path = f_out_base_file_path.format(file_index)
    print(len(records))
    print(out_file_path)
    with open(out_file_path, 'w') as f_out:
        f_out.writelines(records)


file_index = 1
output_lines = []   # complete records collected for the current chunk
current_line = ''   # record being accumulated (header + sequence lines)

with open(fasta_proteins_file_path, 'r') as f:
    for line in f:
        if line.startswith('>'):
            # A new header closes the previous record.
            if current_line:
                output_lines.append(current_line)
                # Flush as soon as the chunk is full. Doing it here, rather
                # than in a separate branch of the loop, guarantees that the
                # line being read is never dropped.
                if len(output_lines) == lines_per_file:
                    write_fasta_chunk(output_lines, file_index)
                    file_index += 1
                    output_lines = []
            current_line = line
        else:
            current_line += line

# The last record has no following header to close it, so add it explicitly.
if current_line:
    output_lines.append(current_line)

# Save the last (possibly partial) chunk of sequences.
if output_lines:
    write_fasta_chunk(output_lines, file_index)

# Delete the extracted FASTA file; an earlier cell re-extracts it from the
# .gz archive when the file is missing.
remove(fasta_proteins_file_path)

### Results analysis

Combine all results in a single file `musiteDeep_prediction_all_file_path`

In [None]:
musiteDeep_prediction_files_count = 7
musite_results_base_path = path.join(base_path, 'data/results_preproc/musitedeep')
musiteDeep_prediction_file_path = path.join(musite_results_base_path, 'file{}_Prediction_results.txt')
musiteDeep_prediction_all_file_path = path.join(musite_results_base_path, "Prediction_results_all.txt")
musiteDeep_prediction_all_scores_file_path = path.join(musite_results_base_path, "Prediction_results_all_scores.txt")
musiteDeep_prediction_score_by_protein_file_path = path.join(musite_results_base_path, "Prediction_results_score_by_protein.txt")
musiteDeep_prediction_score_by_id_base_file_path = path.join(musite_results_base_path, "Prediction_results_score_by_id_base.txt")

musiteDeep_prediction_all_zip_file_path = musiteDeep_prediction_all_file_path.replace(".txt", ".zip")
musiteDeep_prediction_all_scores_zip_file_path = musiteDeep_prediction_all_scores_file_path.replace(".txt", ".zip")
musiteDeep_prediction_score_by_protein_zip_file_path = musiteDeep_prediction_score_by_protein_file_path.replace(".txt", ".zip")
musiteDeep_prediction_score_by_id_base_zip_file_path = musiteDeep_prediction_score_by_id_base_file_path.replace(".txt", ".zip")

In [None]:
# NOTE(review): this cell is an exact duplicate of the configuration cell
# directly above it; it re-assigns the same values and could be deleted.

# Number of per-chunk MusiteDeep result files expected (file1 .. file7).
musiteDeep_prediction_files_count = 7
musite_results_base_path = path.join(base_path, 'data/results_preproc/musitedeep')
# Template for the per-chunk result files; `{}` is filled with the chunk number.
musiteDeep_prediction_file_path = path.join(musite_results_base_path, 'file{}_Prediction_results.txt')
musiteDeep_prediction_all_file_path = path.join(musite_results_base_path, "Prediction_results_all.txt")
musiteDeep_prediction_all_scores_file_path = path.join(musite_results_base_path, "Prediction_results_all_scores.txt")
musiteDeep_prediction_score_by_protein_file_path = path.join(musite_results_base_path, "Prediction_results_score_by_protein.txt")
musiteDeep_prediction_score_by_id_base_file_path = path.join(musite_results_base_path, "Prediction_results_score_by_id_base.txt")

# Zipped counterparts of the files above. `str.replace` substitutes every
# ".txt" occurrence in the path, not only the extension.
musiteDeep_prediction_all_zip_file_path = musiteDeep_prediction_all_file_path.replace(".txt", ".zip")
musiteDeep_prediction_all_scores_zip_file_path = musiteDeep_prediction_all_scores_file_path.replace(".txt", ".zip")
musiteDeep_prediction_score_by_protein_zip_file_path = musiteDeep_prediction_score_by_protein_file_path.replace(".txt", ".zip")
musiteDeep_prediction_score_by_id_base_zip_file_path = musiteDeep_prediction_score_by_id_base_file_path.replace(".txt", ".zip")

In [None]:
class ScoreItem:
    """One site prediction laid out as a row of the scores table.

    The row has four slots: ``[protein_id, Phosphothreonine, Phosphoserine,
    Phosphotyrosine]``. Only the slot matching ``prediction_type`` holds
    ``prediction_score``; the other two score slots are ``None``.
    """

    # Position of each supported prediction type inside the result row.
    _SCORE_SLOT = {
        'Phosphothreonine': 1,
        'Phosphoserine': 2,
        'Phosphotyrosine': 3,
    }

    def __init__(self, protein_id, prediction_type, prediction_score):
        # An unsupported prediction type raises KeyError from the lookup.
        slot = self._SCORE_SLOT[prediction_type]
        row = [protein_id, None, None, None]
        row[slot] = prediction_score
        self.result_row = row

    def get_row(self):
        """Return the four-element row built by the constructor."""
        return self.result_row

In [None]:
score_rows = []    # one ScoreItem row per parsed site prediction
protein_ids = []   # protein id of every data line, with or without a usable score
to_discard = []    # header lines ('>...' and 'ID...'), analysed later in the notebook

row_index = 0  # NOTE(review): not used anywhere in the visible notebook

for i in range(1, musiteDeep_prediction_files_count + 1):

    musiteDeep_prediction_file_path_n = musiteDeep_prediction_file_path.format(i)

    if not path.isfile(musiteDeep_prediction_file_path_n):
        # Still best-effort, but report the gap instead of skipping silently.
        print("missing prediction file: " + musiteDeep_prediction_file_path_n)
        continue

    print(musiteDeep_prediction_file_path_n)

    # `with` closes the file even if a line fails to parse.
    with open(musiteDeep_prediction_file_path_n, 'r') as f:

        for line in f:

            if line[0] == '>' or line[0:2] == 'ID':
                # Header lines carry no score; keep them for later inspection.
                to_discard.append(line)
                continue

            # str.split always returns at least one element, so `parts[0]`
            # is safe without a length check.
            parts = line.split('\t')

            protein_id = parts[0]
            protein_ids.append(protein_id)

            # column PTMscores: second-to-last tab-separated field
            # (the only field when the line contains no tab)
            prediction = parts[len(parts) - 2]

            # presumably formatted as "<type>:<score>" — TODO confirm
            prediction_parts = prediction.split(':')

            if len(prediction_parts) > 1:
                prediction_score = prediction_parts[1]
                prediction_type = prediction_parts[0]
                score_item = ScoreItem(protein_id, prediction_type, prediction_score)
                score_rows.append(score_item.get_row())
            else:
                # No "<type>:<score>" pair on this line; show it for inspection.
                print(prediction_parts)
    

In [None]:
scores_df = pd.DataFrame(score_rows, columns = ['protein_id', 'Phosphothreonine', 'Phosphoserine', 'Phosphotyrosine'])

In [None]:
scores_df.shape

In [None]:
#scores_df.to_csv(musiteDeep_prediction_all_file_path, sep='\t', index = None)

Analyze `to_discard` data.

How many ids are in `to_discard` data?

In [None]:
to_discard_ids = []

for line in to_discard:
    if line.startswith('>'):
        # The id is the first whitespace-delimited token after '>'.
        # Splitting on '|' alone keeps the whole description as part of the id
        # when a header lacks its first pipe.
        header_tokens = line[1:].split('|')[0].split()
        if header_tokens:
            to_discard_ids.append(header_tokens[0])

to_discard_ids_unique = np.unique(to_discard_ids)

print(len(to_discard))
print(len(to_discard_ids_unique))

How many of these have no prediction in `scores_df`?

(This cell takes too much time, run only when required)

In [None]:
#for x in to_discard_ids_unique:
#    if x not in scores_df.protein_id.values:
#        print(x)

Ids that are included in the results list but have no associated score:

AT1G33355.1

AT1G64633.1

AT2G07617.1

AT2G21105.1

AT2G29925.1

AT5G38150.1 PLASTID MOVEMENT IMPAIRED protein (DUF827)

ATMG00665.1

<code>
    
>AT1G33355.1 | hypothetical protein | Chr1:12089639-12089662 FORWARD LENGTH=7 | 201606
MRKVLEN
    
>AT1G64633.1 | hypothetical protein | Chr1:24019584-24019586 REVERSE LENGTH=1 | 201606
M

>AT2G07617.1 | hypothetical protein | Chr2:3262540-3262563 REVERSE LENGTH=7 | 201606
MKMDGLR
    
>AT2G21105.1 | hypothetical protein | Chr2:9048192-9048203 FORWARD LENGTH=3 | 201606
FKD
    
>AT2G29925.1 | hypothetical protein | Chr2:12755040-12755237 REVERSE LENGTH=24 | 201606
MILVKWQQLKELKVKIRIWVRVLQ

This one has a score, but a pipe (|) is missing from its header, so it is not parsed properly:
    
>AT5G38150.1 PLASTID MOVEMENT IMPAIRED protein (DUF827) | Chr5:15223113-15225192 REVERSE LENGTH=1740 | 201606
MLNRAMENSDMKRNSSTLLDLPVVKSSLVVEAIHMSRKKLGWYNESRRDSETVKARVEAG
LSEVKKSVEELALLIKRSNRSAGFQEKDMEVLKMEEKYAEVMRVLEVVKEEVSRVKLDVS
SVLIERVAAEEKVEELRFKTEGGLRLLESLKKEIEVANEEHLMVALGKIEALKGYKEIER
QREGKAIKVLDLLVERNKRIKNMLEEAERSKDIEIELFETSTDVEMLETQLKLFKKMERR
VQGRDSSSMSRSNRSFGRGKYSLSVLKEVTEGKKEELASVKVEIFRVMTVMDALRNEIIR
ARDETACLGKILREDDVKIEKLNSKILIEKSKLEVVSIAEERISSLAENFVGSLEKIKKS
RNAAKKEEFLFKEEKTVTKAETQKTKLDIDKKESELNSKLDELEKVKHTEALVLEKLESL
VEDMMESREMESEHCSTITISRFEYEYLSKHASQAEETAEKKVAAAAAWVEALKASTKSF
LMKTETLMRESEMTKAEEEREVFRMERSLSTKRLVEGEIQKIKRNSEAEGYISPKPVGKF
TPVQRGKPRRYSSVGTPTFFVIKKKKKVPRLAKFFSRRS  
    
>ATMG00665.1 | NADH dehydrogenase 5B | ChrM:190740-190761 REVERSE LENGTH=7 | 201606
DMMIGLG

    
</code>




## Scores musiteDeep by protein

In [None]:
data = pd.read_csv(musiteDeep_prediction_all_zip_file_path, sep='\t')

scores = data.groupby(by='protein_id').max()

protein_score_df = scores.reset_index()

protein_score_df.fillna(-1, inplace = True)

#protein_score_df.to_csv(musiteDeep_prediction_all_scores_file_path, sep = '\t', index = None)


In [None]:
# Row-wise maximum over the three score columns (positions 1-3; position 0 is
# protein_id after reset_index). The columns are selected explicitly with
# .iloc because integer keys on a label-indexed row (`x[1]`) rely on a
# deprecated positional fallback. Re-running the cell stays correct: the
# `max_score` column is appended at position 4, outside the slice.
protein_score_df["max_score"] = protein_score_df.iloc[:, 1:4].max(axis=1)
protein_score_df.head()

In [None]:
#protein_score_df.to_csv(musiteDeep_prediction_score_by_protein_file_path, sep='\t', index = None)

## Scores musiteDeep by base id

In [None]:
score_by_protein = pd.read_csv(musiteDeep_prediction_score_by_protein_zip_file_path, sep='\t')

print(score_by_protein.shape)

score_by_protein


In [None]:
def get_id_base(data):
    """Return the part of an identifier that precedes its first dot.

    Parameters
    ----------
    data : str
        Identifier such as ``"AT1G01010.1"``.

    Returns
    -------
    str
        The text before the first ``'.'``, or ``data`` unchanged when it
        contains no dot.
    """
    # str.split always returns at least one element, so the first piece can be
    # taken directly; maxsplit=1 avoids splitting the rest of the string.
    return data.split('.', 1)[0]

In [None]:
# Map the id column directly instead of building a row object for every
# record with DataFrame.apply(axis=1).
score_by_protein["protein_id_base"] = score_by_protein["protein_id"].map(get_id_base)

In [None]:
print(score_by_protein.shape)
score_by_protein

There are no null `max_score` values (a null score is encoded as -1):

In [None]:
(score_by_protein.max_score == -1).sum()

In [None]:
max_values = pd.DataFrame(score_by_protein.groupby('protein_id_base')['max_score'].max())
max_values = max_values.rename(columns={"max_score": "max_value"})

min_values = pd.DataFrame(score_by_protein.groupby('protein_id_base')['max_score'].min())
min_values = min_values.rename(columns={"max_score": "min_value"})

mean_values = pd.DataFrame(score_by_protein.groupby('protein_id_base')['max_score'].mean())
mean_values = mean_values.rename(columns={"max_score": "mean_value"})

In [None]:
tmp1 = min_values.join(max_values, how="outer")
score_by_id_base = tmp1.join(mean_values, how="outer")

In [None]:
score_by_id_base.reset_index(inplace = True)
score_by_id_base.columns = ['protein_id_base','min_score','max_score', 'mean_score']
score_by_id_base

Checking all ids are valid

In [None]:
# Vectorised membership test; `x in <numpy array>` inside a comprehension
# rescans the whole array for every id.
bool(score_by_id_base.protein_id_base.isin(all_arabidopsisis_codes).all())

In [None]:
#score_by_id_base.to_csv(musiteDeep_prediction_score_by_id_base_file_path, sep= "\t", index = False, header = True)

## Join MusiteDeep scores by base id - Experimental data

In [None]:
musite_scores_file_path = musiteDeep_prediction_score_by_id_base_file_path.replace(".txt", ".zip")
musite_scores_data = pd.read_csv(musite_scores_file_path, sep = "\t")
musite_scores_data

In [None]:
musite_scores_data.index = musite_scores_data.protein_id_base
#experimental_data.index = experimental_data.code
musite_experimental_data = musite_scores_data.join(experimental_data, how = "outer")
musite_experimental_data

In [None]:
# label = 1 when the id has experimental evidence, 0 otherwise. `notna()` is
# used instead of `x is not np.NaN`: that is an identity test, which only
# works when the missing value is the very same NaN object, and the `np.NaN`
# alias no longer exists in NumPy 2.0.
musite_experimental_data["label"] = musite_experimental_data.code_experimental.notna().astype(int)
musite_experimental_data["color"] = musite_experimental_data.label.map({1: "orange", 0: "blue"})

In [None]:
musite_experimental_data

In [None]:
score_threshold = 0.85 # from notebook roc.ipynb

In [None]:
musite_experimental_data['prediction'] = 0
mask = musite_experimental_data.max_score >= score_threshold
musite_experimental_data.loc[mask, 'prediction'] = 1
columns = ['protein_id_base', 'max_score', 'prediction']
musite_prediction = musite_experimental_data[columns]
musite_prediction

In [None]:
musite_prediction_file_path = path.join(base_path, "data/results_preproc/musitedeep/musiteDeep_prediction_thr.txt")
#musite_prediction.to_csv(musite_prediction_file_path, sep="\t", index = None, header = True)

In [None]:
len(musite_prediction.protein_id_base.unique())

In [None]:
len(all_arabidopsisis_codes)

There are 7 (27655 - 27648) ids not scored by musiteDeep

### MusiteDeep performance

#### threshold = 0.5

In [None]:
fpr_05, tpr_05, matrix_05, fdr_05 = performance(musite_experimental_data, "max_score", 0.5, "code_experimental")
print(fpr_05) 
print(tpr_05)
print(matrix_05)
print(fdr_05)


#### threshold = 0.85

In [None]:
fpr_085, tpr_085, matrix_085, fdr_085 = performance(musite_experimental_data, "max_score", 0.85, "code_experimental")
print(fpr_085) 
print(tpr_085)
print(matrix_085)
print(fdr_085)

## PredOr: phosphat + musiteDeep

In [None]:
musite_prediction_file_path = path.join(base_path, "data/results_preproc/musitedeep/musiteDeep_prediction_thr.zip")
musite_prediction = pd.read_csv(musite_prediction_file_path, sep="\t")
musite_prediction

In [None]:
musite_prediction.prediction.sum()

In [None]:
phosphat_pred_file_path = path.join(base_path, "data/preproc/phosphat_prediction_2020.zip")
phosphat_pred =  pd.read_csv(phosphat_pred_file_path, sep="\t")
phosphat_pred.index = phosphat_pred.code
phosphat_pred

In [None]:
musite_prediction_mask = musite_prediction.prediction == 1
musite_pred = musite_prediction.loc[musite_prediction_mask, 'protein_id_base'].values
predOr = np.concatenate([musite_pred, phosphat_pred.code.values])
predOr_result = pd.DataFrame(np.unique(predOr), columns = ['code'])
predOr_result

In [None]:
all_valid_codes = len(np.unique([x for x in predOr_result.code if x in all_arabidopsisis_codes])) == predOr_result.shape[0]
all_valid_codes

In [None]:
predOr_file_path_out = path.join(base_path, "data/results_preproc/predOr_phosphat_musiteDeep.csv")
#predOr_result.to_csv(predOr_file_path_out, index = None)

Combining musiteDeep and Phosphat results

In [None]:
result = musite_prediction.loc[musite_prediction_mask, :]
result.index = result.protein_id_base
phosphat_musitedeep = result.join(phosphat_pred, lsuffix = '_musitedeep', rsuffix='_phosphat', how="outer")
phosphat_musitedeep

How many MusiteDeep predicted positives are not in the Phosphat predictions?

In [None]:
mask_1 = pd.notnull(phosphat_musitedeep.protein_id_base).values & pd.isnull(phosphat_musitedeep.code).values
mask_1.sum()

How many Phosphat predicted positives are not in the MusiteDeep predictions?

In [None]:
mask_2 = pd.isnull(phosphat_musitedeep.protein_id_base).values & pd.notnull(phosphat_musitedeep.code).values
mask_2.sum()

## PredOr performance metrics

In [None]:
#predOr_file_path = path.join(base_path, "data/results_preproc/predOr_phosphat_musiteDeep.zip")
predOr_file_path = path.join(base_path, "data/results_preproc/PredRS.zip")
predOr_data = pd.read_csv(predOr_file_path)
predOr_data.index = predOr_data.code
predOr_data

In [None]:
all_arabidopsisis_codes

In [None]:
experimental_data

In [None]:
exp_predOr = pd.DataFrame(all_arabidopsisis_codes, columns = ["arabidopsisis_codes"])
exp_predOr.index = exp_predOr.arabidopsisis_codes
print(exp_predOr.shape)
exp_predOr = exp_predOr.join(predOr_data, how = 'left', lsuffix = "_arabidopsis", rsuffix = "_predOr")
exp_predOr = exp_predOr.join(experimental_data, how = 'left', lsuffix = "_predOr", rsuffix = "_exp")

y_true_mask = [1 if x else 0 for x in pd.notnull(exp_predOr.code_experimental)]
exp_predOr["y_true"] = y_true_mask
y_pred_mask = [1 if x else 0 for x in pd.notnull(exp_predOr.code)]
exp_predOr["y_pred"] = y_pred_mask
exp_predOr

In [None]:
conf_matrix = confusion_matrix(exp_predOr.y_true, exp_predOr.y_pred)
print(conf_matrix)

predOR_recall = recall_score(exp_predOr.y_true, exp_predOr.y_pred)    
print(predOR_recall)

predOR_precision = precision_score(exp_predOr.y_true, exp_predOr.y_pred)    
print(predOR_precision)

In [None]:
tn, fp, fn, tp = conf_matrix.ravel()

specificity = tn / (tn + fp)
specificity

In [None]:
# false positive rate

fpr = fp / (fp + tn)
fpr

In [None]:
fpr, tpr, matrix, fdr = performance(exp_predOr, "code", None, "code_experimental")
print(fpr)
print(tpr)
print(matrix)
print(fdr)

## Why did we combine a well-performing predictor (Phosphat) with a weaker one (MusiteDeep)?

**The idea is to obtain a predictor that labels as positive a greater number of the ids that already have experimental evidence.**

Confusion matrix whose i-th row and j-th column entry indicates the number of samples with true label being i-th class and predicted label being j-th class.

**Phosphat** confusion matrix is

$\begin{matrix}14497 & 21 \\ 3034 & 10103 \end{matrix}$


fpr 0.0014464802314368371

tpr 0.7690492502093325


**PredOr** confusion matrix is

$\begin{matrix}9386 & 5132 \\ 1113 & 12024 \end{matrix}$

fpr 0.3398539743766359

tpr 0.9117759001294055

## Experimental evidence and not predicted as phosphorylated

There are 1113 ids predicted as negative that have experimental evidence.

What are their scores?

How do the metrics change if they are included as positives by MusiteDeep?

In [None]:
ids_1113_mask = np.logical_and(exp_predOr.y_true == 1, exp_predOr.y_pred == 0)
ids_1113 = exp_predOr.loc[ids_1113_mask, 'arabidopsisis_codes'].values
ids_1113

In [None]:
# Vectorised membership test; the list comprehension rescanned `ids_1113`
# once for every row of `musite_prediction`.
ids_1113_mask = musite_prediction.protein_id_base.isin(ids_1113)
scores = musite_prediction.loc[ids_1113_mask, 'max_score'].values
sns.histplot(scores)

In [None]:
min(scores)

Are any of the `ids_1113` included in the Phosphat prediction?

In [None]:
[x for x in ids_1113 if x in phosphat_pred.code.values]

# How do metrics vary with different MusiteDeep thresholds?

In [None]:
experimental_file_path = path.join(base_path, "data/preproc/experimental_ids.zip")
experimental_data = pd.read_csv(experimental_file_path, sep = "\t")
experimental_data.index = experimental_data.code
experimental_data

In [None]:
musite_scores_file_path = musiteDeep_prediction_score_by_id_base_file_path.replace(".txt", ".zip")
musite_scores_data = pd.read_csv(musite_scores_file_path, sep = "\t")
musite_scores_data.index = musite_scores_data.protein_id_base
musite_scores_data = musite_scores_data[["protein_id_base", "max_score" ]]
musite_scores_data

In [None]:
phosphat_pred_file_path = path.join(base_path, "data/preproc/phosphat_prediction_2020.zip")
phosphat_pred =  pd.read_csv(phosphat_pred_file_path, sep="\t")
phosphat_pred.index = phosphat_pred.code
phosphat_pred

In [None]:
all_data = pd.DataFrame(all_arabidopsisis_codes, columns = ["arabidopsisis_codes"])
all_data.index = all_data.arabidopsisis_codes
experimental_data.columns = ["code_exp"]
all_data = all_data.join(experimental_data, how = "left")
musite_scores_data.columns = ["code_musite", "score_musite"]
all_data = all_data.join(musite_scores_data, how = "left")
phosphat_pred.columns = ["code_phosphat"]
all_data = all_data.join(phosphat_pred, how = "left")
all_data

In [None]:
len(all_arabidopsisis_codes) == all_data.shape[0]

In [None]:
y_true_mask = [1 if x else 0 for x in pd.notnull(all_data.code_exp)]
all_data["y_true"] = y_true_mask
all_data

In [None]:
# Sweep the MusiteDeep score threshold and record the metrics of the combined
# predictor (MusiteDeep score >= threshold OR present in Phosphat) at each step.
# The thresholds visited are i / threshold_count for i in
# [min_thr, threshold_count).
threshold_count = 100

# One entry per threshold, in sweep order; the plotting cells below read
# these lists.
pred_recalls = []
pred_precisions = []
pred_specificities = []
false_negatives = []
true_negatives = []
false_positives = []
true_positives = []
fprs = []

min_thr = 0
#min_thr = 50

for i in range(min_thr, threshold_count):
    threshold = i / threshold_count
    
    # A row is predicted positive when either predictor calls it positive.
    pred_mask = np.logical_or(all_data.score_musite >= threshold, pd.notnull(all_data.code_phosphat))    
    y_pred_values = [1 if x else 0 for x in pred_mask.values]
    # Overwritten on every iteration: after the loop `all_data["y_pred"]`
    # holds the prediction for the last threshold only.
    all_data["y_pred"] = y_pred_values

    pred_recall = recall_score(all_data.y_true, all_data.y_pred)    
    pred_recalls.append(pred_recall)

    pred_precision = precision_score(all_data.y_true, all_data.y_pred)    
    pred_precisions.append(pred_precision)

    conf_matrix = confusion_matrix(all_data.y_true, all_data.y_pred)
    # Unpacked as tn, fp, fn, tp from the flattened 2x2 matrix.
    tn, fp, fn, tp = conf_matrix.ravel()
    false_negatives.append(fn)
    false_positives.append(fp)
    true_negatives.append(tn)
    true_positives.append(tp)
    
    pred_specificity = tn / (tn + fp)
    pred_specificities.append(pred_specificity)

    # False positive rate; equals 1 - specificity.
    fpr = fp / (fp + tn)
    fprs.append(fpr)
    

In [None]:
# how many relevant items (positives) are selected?
sns.scatterplot(x = range(min_thr, threshold_count), y = pred_recalls, color = "steelblue")

# how many seleted items are relevant?
sns.scatterplot(x = range(min_thr, threshold_count), y = pred_precisions, color = "orange")

# tn / (tn + fp)
sns.scatterplot(x = range(min_thr, threshold_count), y = pred_specificities, color = "darkred")

plt.axvline(x = 85, ymin = 0, ymax = 1, color = "darkgrey")


In [None]:
max(false_negatives)

In [None]:
max(false_positives)

In [None]:
#false_positives_rel = false_positives / max(false_positives)
sns.scatterplot(x = range(min_thr, threshold_count), y = false_positives, color = "darkviolet")

sns.scatterplot(x = range(min_thr, threshold_count), y = true_positives, color = "silver")

sns.scatterplot(x = range(min_thr, threshold_count), y = true_negatives, color = "gold")

#false_negatives_rel = false_negatives / max(false_negatives)
sns.scatterplot(x = range(min_thr, threshold_count), y = false_negatives, color = "darkgreen")

plt.axvline(x = 85, ymin = 0, ymax = 1, color = "darkgrey")


In [None]:
sns.scatterplot(x = range(min_thr, threshold_count), y = fprs, color = "darkred")
plt.axvline(x = 85, ymin = 0, ymax = 1, color = "darkgrey")

In [None]:
score_min_1113 = min(scores)
print(score_min_1113)
