## Initial Setup

In [None]:
import os
import sys

# Root of the local checkout of the arabidopsis_phospho repository. The
# ARABIDOPSIS_PHOSPHO_BASE_PATH environment variable overrides the default so
# the notebook can run on another machine without editing this cell.
base_path = os.environ.get(
    "ARABIDOPSIS_PHOSPHO_BASE_PATH",
    "/media/paulati/Nuevo vol/paula/ingebi/2020/agustina_mazzella/github/arabidopsis_phospho",
)

# Refer to sys.path explicitly: the name `path` is bound to os.path by the
# imports cell, so `from sys import path` here would clobber it whenever this
# cell is re-run. The membership test keeps re-runs from adding duplicates.
if base_path not in sys.path:
    sys.path.append(base_path)

# Project-local helpers, importable thanks to the sys.path entry above.
from preparation import util

## Imports

In [None]:
from os import path, remove
import pandas as pd
import numpy as np
import re

## Arabidopsis all ids

In [None]:
# Read the table of all Arabidopsis ids (the file has no header row).
all_arabidopsisis_file_path = path.join(base_path, "data/preproc/all_ids.zip")
all_arabidopsisis = pd.read_csv(all_arabidopsisis_file_path, header=None)
print(all_arabidopsisis.shape)

# Distinct values of the first column; these are used as the reference id list.
all_arabidopsisis_codes = pd.unique(all_arabidopsisis.iloc[:, 0])
len(all_arabidopsisis_codes)

## Phosphat

In [None]:
# Load the comma-separated PhosPhAt table from the raw data folder and preview it.
phosphat_2020_file_path = path.join(base_path, "data/raw/HiconfPred_psite_20200624.zip")
data_phosphat = pd.read_csv(phosphat_2020_file_path, sep=",")
data_phosphat.head()                             

In [None]:
# Keep only the part of each code before its first "." (e.g. "X.1" -> "X").
data_phosphat["code_unified"] = data_phosphat["code"].map(
    lambda code: code.split(".")[0]
)
data_phosphat.head()

In [None]:
# Distinct unified codes present in the PhosPhAt table.
all_phosphat_ids = pd.unique(data_phosphat["code_unified"])
len(all_phosphat_ids)

In [None]:
# Keep the PhosPhAt ids that also appear in the Arabidopsis id list.
# Membership is tested against a set: `x in <numpy array>` scans the whole
# array for every id, which is needlessly slow for lists of this kind.
arabidopsis_codes_set = set(all_arabidopsisis_codes)
valid_phosphat_ids = [x for x in all_phosphat_ids if x in arabidopsis_codes_set]
len(valid_phosphat_ids)

In [None]:
# One-column table ("code") holding the PhosPhAt ids that are also Arabidopsis ids.
phosphat_pred = pd.DataFrame(valid_phosphat_ids, columns=["code"])
phosphat_pred_file_path = path.join(base_path, "data/preproc/phosphat_prediction_2020.csv")
# Writing the table to disk is disabled; uncomment the next line to (re)generate the file.
#phosphat_pred.to_csv(phosphat_pred_file_path, index = False, header = True)

## Musite Deep

### Input Preparation

In [None]:
# Paths of the protein FASTA file and of its gzip-compressed counterpart.
fasta_proteins_file_path = path.join(base_path, "data/raw/Araport11_genes.201606.pep.fasta")
fasta_proteins_file_path_gz = fasta_proteins_file_path + ".gz"

for shown_path in (fasta_proteins_file_path, fasta_proteins_file_path_gz):
    print(shown_path)

# Only call the extraction helper when the plain FASTA file is not on disk yet
# (presumably util.extractgz decompresses the .gz into the second path -- confirm).
if path.isfile(fasta_proteins_file_path):
    pass
else:
    util.extractgz(fasta_proteins_file_path_gz, fasta_proteins_file_path)

In [None]:
# Maximum number of accumulated sequences (FASTA records) per chunk file;
# despite the name, the split loop counts records, not text lines.
lines_per_file = 7000
# Template for the chunk file paths; "{}" is filled with the chunk number (starting at 1).
f_out_base_file_path = path.join(base_path,  "data/preproc/musitedeep/file{}_Araport11_genes.201606.pep.fasta")
f_out_base_file_path

Split the FASTA file into chunks of `lines_per_file` sequences each, to be processed by https://www.musite.net/

In [None]:
def _write_fasta_chunk(records, chunk_index):
    """Write one chunk of FASTA records to its numbered output file.

    records: list of strings, each holding a header line plus its sequence lines.
    chunk_index: number substituted into the `f_out_base_file_path` template.
    """
    print(len(records))
    out_file_path = f_out_base_file_path.format(chunk_index)
    print(out_file_path)
    with open(out_file_path, 'w') as f_out:
        f_out.writelines(records)


current_line = ''   # record being assembled: header line + its sequence lines
file_index = 1
output_lines = []   # complete records waiting to be written to the current chunk

with open(fasta_proteins_file_path, 'r') as f:
    for line in f:
        if line[0] == '>':
            # A new header closes the previous record (if there is one).
            if len(current_line) > 0:
                output_lines.append(current_line)
                # Flush as soon as the chunk is full. Flushing here, and not on
                # the following iteration, means no input line is skipped at
                # the chunk boundary.
                if len(output_lines) == lines_per_file:
                    _write_fasta_chunk(output_lines, file_index)
                    file_index += 1
                    output_lines = []
            current_line = line
        else:
            # Sequence line: keep it (newline included) with its record.
            current_line += line

# The last record has no following header to close it, so add it explicitly.
if len(current_line) > 0:
    output_lines.append(current_line)

# Save the last (possibly partial) chunk; nothing is written when it is empty.
if len(output_lines) > 0:
    _write_fasta_chunk(output_lines, file_index)

# The input file is closed by the `with` block above; delete the extracted
# FASTA file now that it has been split.
remove(fasta_proteins_file_path)

### Results analysis

Combine all results in a single file `musiteDeep_prediction_all_file_path`

In [None]:
# Number of per-chunk result files to combine (file1 ... fileN).
musiteDeep_prediction_files_count = 7
# Folder holding the MusiteDeep result files and the tables derived from them.
musite_results_base_path = path.join(base_path, 'data/results/musitedeep')
# Template for the per-chunk result files; "{}" is the chunk number.
musiteDeep_prediction_file_path = path.join(musite_results_base_path, 'file{}_Prediction_results.txt')
# Combined table with one row per prediction.
musiteDeep_prediction_all_file_path = path.join(musite_results_base_path, "Prediction_results_all.txt")
musiteDeep_prediction_all_scores_file_path = path.join(musite_results_base_path, "Prediction_results_all_scores.txt")
# Per-protein table that includes the max_score column.
musiteDeep_prediction_score_by_protein_file_path = path.join(musite_results_base_path, "Prediction_results_score_by_protein.txt")


# TODO
# set the variables used for reading so that they point to .zip files instead of .txt

In [None]:
class ScoreItem:
    """A single prediction laid out as one row of the scores table.

    The row has the shape
    [protein_id, Phosphothreonine, Phosphoserine, Phosphotyrosine]:
    the slot matching `prediction_type` holds `prediction_score` and the
    other two score slots are None.

    Raises KeyError when `prediction_type` is not one of the three
    supported types.
    """

    # Position of each supported prediction type inside the row.
    _SLOT_BY_TYPE = {
        'Phosphothreonine': 1,
        'Phosphoserine': 2,
        'Phosphotyrosine': 3,
    }

    def __init__(self, protein_id, prediction_type, prediction_score):
        slot = self._SLOT_BY_TYPE[prediction_type]
        row = [protein_id, None, None, None]
        row[slot] = prediction_score
        self.result_row = row

    def get_row(self):
        """Return the row built by the constructor."""
        return self.result_row

In [None]:
# Accumulators filled while reading every per-chunk result file:
score_rows = []    # one [protein_id, Thr, Ser, Tyr] row per parsed prediction
protein_ids = []   # first column of every data line, in reading order
to_discard = []    # header lines (starting with '>' or 'ID'); analyzed further below

row_index = 0      # not used by this cell; kept in case other cells read it

for i in range(1, musiteDeep_prediction_files_count + 1):

    musiteDeep_prediction_file_path_n = musiteDeep_prediction_file_path.format(i)

    # `with` closes the file even if a malformed line raises while parsing.
    with open(musiteDeep_prediction_file_path_n, 'r') as f:

        for line in f:
            if line[0] == '>' or line[0:2] == 'ID':
                # Not a prediction row: set aside for later inspection.
                to_discard.append(line)
                continue

            # Data rows are tab-separated. str.split always returns at least
            # one element, so no empty-list check is needed.
            parts = line.split('\t')

            protein_id = parts[0]
            protein_ids.append(protein_id)

            # column PTMscores (second-to-last field), formatted as "<type>:<score>"
            prediction = parts[len(parts) - 2]
            prediction_parts = prediction.split(':')

            if len(prediction_parts) > 1:
                prediction_type = prediction_parts[0]
                prediction_score = prediction_parts[1]
                score_item = ScoreItem(protein_id, prediction_type, prediction_score)
                score_rows.append(score_item.get_row())
            else:
                # No "<type>:<score>" pair found: show the field for inspection.
                print(prediction_parts)
    

In [None]:
# Column names follow the slot order of the rows returned by ScoreItem.get_row().
score_columns = ['protein_id', 'Phosphothreonine', 'Phosphoserine', 'Phosphotyrosine']
scores_df = pd.DataFrame(score_rows, columns=score_columns)

In [None]:
# (rows, columns) of the combined score table.
scores_df.shape

In [None]:
# Disabled: writes the combined table to `musiteDeep_prediction_all_file_path`, the file that
# the "Scores musiteDeep by protein" section reads back. Uncomment to regenerate it.
# scores_df.to_csv(musiteDeep_prediction_all_file_path, sep='\t', index = None)

Analyze `to_discard` data.

How many ids are in `to_discard` data?

In [None]:
to_discard_ids = []

for line in to_discard:
    if line[0] == '>':
        # The id is the first whitespace-delimited token after '>'. Taking only
        # the text before the first '|' is not enough: a header with a missing
        # pipe (e.g. ">AT5G38150.1 PLASTID MOVEMENT IMPAIRED protein (DUF827) | ...")
        # would otherwise yield the id with its description attached.
        header_tokens = line[1:].split('|')[0].split()
        if len(header_tokens) > 0:
            to_discard_ids.append(header_tokens[0])

to_discard_ids_unique = np.unique(to_discard_ids)

# Number of discarded lines, then number of distinct ids among the '>' headers.
print(len(to_discard))
print(len(to_discard_ids_unique))

How many of these have no prediction in `scores_df`?

(This cell takes too much time, run only when required)

In [None]:
# Disabled by default because it is slow: every `x not in scores_df.protein_id.values`
# test scans the whole array. Building `set(scores_df.protein_id.values)` once and
# testing against that set would give the same answer much faster.
#for x in to_discard_ids_unique:
#    if x not in scores_df.protein_id.values:
#        print(x)

Ids that are included in the results list but have no associated score:

AT1G33355.1

AT1G64633.1

AT2G07617.1

AT2G21105.1

AT2G29925.1

AT5G38150.1 PLASTID MOVEMENT IMPAIRED protein (DUF827)

ATMG00665.1

<code>
    
>AT1G33355.1 | hypothetical protein | Chr1:12089639-12089662 FORWARD LENGTH=7 | 201606
MRKVLEN
    
>AT1G64633.1 | hypothetical protein | Chr1:24019584-24019586 REVERSE LENGTH=1 | 201606
M

>AT2G07617.1 | hypothetical protein | Chr2:3262540-3262563 REVERSE LENGTH=7 | 201606
MKMDGLR
    
>AT2G21105.1 | hypothetical protein | Chr2:9048192-9048203 FORWARD LENGTH=3 | 201606
FKD
    
>AT2G29925.1 | hypothetical protein | Chr2:12755040-12755237 REVERSE LENGTH=24 | 201606
MILVKWQQLKELKVKIRIWVRVLQ

This one has a score, but its header is missing a pipe (|) after the id, so splitting the header on | does not isolate the id:
    
>AT5G38150.1 PLASTID MOVEMENT IMPAIRED protein (DUF827) | Chr5:15223113-15225192 REVERSE LENGTH=1740 | 201606
MLNRAMENSDMKRNSSTLLDLPVVKSSLVVEAIHMSRKKLGWYNESRRDSETVKARVEAG
LSEVKKSVEELALLIKRSNRSAGFQEKDMEVLKMEEKYAEVMRVLEVVKEEVSRVKLDVS
SVLIERVAAEEKVEELRFKTEGGLRLLESLKKEIEVANEEHLMVALGKIEALKGYKEIER
QREGKAIKVLDLLVERNKRIKNMLEEAERSKDIEIELFETSTDVEMLETQLKLFKKMERR
VQGRDSSSMSRSNRSFGRGKYSLSVLKEVTEGKKEELASVKVEIFRVMTVMDALRNEIIR
ARDETACLGKILREDDVKIEKLNSKILIEKSKLEVVSIAEERISSLAENFVGSLEKIKKS
RNAAKKEEFLFKEEKTVTKAETQKTKLDIDKKESELNSKLDELEKVKHTEALVLEKLESL
VEDMMESREMESEHCSTITISRFEYEYLSKHASQAEETAEKKVAAAAAWVEALKASTKSF
LMKTETLMRESEMTKAEEEREVFRMERSLSTKRLVEGEIQKIKRNSEAEGYISPKPVGKF
TPVQRGKPRRYSSVGTPTFFVIKKKKKVPRLAKFFSRRS  
    
>ATMG00665.1 | NADH dehydrogenase 5B | ChrM:190740-190761 REVERSE LENGTH=7 | 201606
DMMIGLG

    
</code>




## Scores musiteDeep by protein

In [None]:
# Read the combined prediction table (tab-separated, one row per prediction).
data = pd.read_csv(musiteDeep_prediction_all_file_path, sep='\t')

# Highest value of every score column for each protein.
scores = data.groupby('protein_id').max()

# Flatten the index back into a column and mark missing scores with -1.
protein_score_df = scores.reset_index().fillna(-1)

# protein_score_df.to_csv(musiteDeep_prediction_all_scores_file_path, sep = '\t', index = None)


In [None]:
# Best score per protein across the three score columns (positions 1 to 3, the
# columns right after protein_id). Selecting them with iloc avoids integer keys
# on a label-indexed row (`x[1]`), which rely on a deprecated positional
# fallback in pandas, and the column-wise max is vectorized.
protein_score_df["max_score"] = protein_score_df.iloc[:, 1:4].max(axis=1)
protein_score_df.head()

In [None]:
# Disabled: writes the per-protein table to `musiteDeep_prediction_score_by_protein_file_path`,
# the file that the next section reads back. Uncomment to regenerate it.
#protein_score_df.to_csv(musiteDeep_prediction_score_by_protein_file_path, sep='\t', index = None)

## Scores musiteDeep by base id

In [None]:
# Read the per-protein score table (tab-separated) from
# `musiteDeep_prediction_score_by_protein_file_path`.
score_by_protein = pd.read_csv(musiteDeep_prediction_score_by_protein_file_path, sep='\t')

print(score_by_protein.shape)

score_by_protein


In [None]:
def get_id_base(data):
    """Return the part of `data` that precedes its first '.'.

    A string without any '.' is returned unchanged, e.g.
    "AT1G01010.1" -> "AT1G01010" and "AT1G01010" -> "AT1G01010".
    """
    id_base, _separator, _rest = data.partition('.')
    return id_base

In [None]:
# Base id of every protein id (the text before the first "."). Mapping over the
# single column avoids building a row object per record as apply(axis=1) does.
score_by_protein["protein_id_base"] = score_by_protein["protein_id"].map(get_id_base)

In [None]:
# Shape and contents after adding the protein_id_base column.
print(score_by_protein.shape)
score_by_protein

Check that no `max_score` is missing (missing scores are encoded as -1):

In [None]:
# Number of rows whose max_score equals the -1 "missing" marker.
score_by_protein["max_score"].eq(-1).sum()

In [None]:
# max_score values grouped by base id; every summary below is computed from this grouping.
max_score_by_base = score_by_protein.groupby('protein_id_base')['max_score']

# One single-column table per statistic, indexed by protein_id_base.
max_values = max_score_by_base.max().to_frame("max_value")

min_values = max_score_by_base.min().to_frame("min_value")

mean_values = max_score_by_base.mean().to_frame("mean_value")

In [None]:
# Combine the three statistics into one table indexed by protein_id_base
# (outer joins on the index; column order: min_value, max_value, mean_value).
tmp1 = min_values.join(max_values, how="outer")
score_by_id_base = tmp1.join(mean_values, how="outer")

In [None]:
# Display the min / max / mean score table per base id.
score_by_id_base

In [None]:
# TODO: how many of the Arabidopsis ids have a prediction?


In [None]:
# Show the path of the combined prediction table.
musiteDeep_prediction_all_file_path

## Predictions

The value to be analyzed is the post-translational modification (PTM) score.