In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import random

from Bio import SeqIO

In [2]:
data_dir = '/novo/projects/departments/cdd/molecular_ai/mlbp/data/static_input_data'

In [3]:
path = f'{data_dir}/sbxw_fibrillation_peptide_waltzdb-train-val_serrano-test_randpadded10x.csv'
rand_padded_waltz_df = pd.read_csv(path,index_col=0)
print(rand_padded_waltz_df.shape)
filt_df = rand_padded_waltz_df[rand_padded_waltz_df['dataset']=='waltzdb'].copy(deep=True).reset_index(drop=True)
filt_df['res_value_bool'] = filt_df.apply(lambda x: x['mask'] if x.value_bool else '0'*len(x.sequence),axis=1)
print(filt_df.shape)
filt_df.head()

(14238, 7)
(13990, 8)


Unnamed: 0,sequence,value_bool,data_split,dataset,mask,core_sequence,len,res_value_bool
0,THSTVPIEG,0,val,waltzdb,1111110,STVPIE,9,0
1,TDSTSTVPIEQDR,0,val,waltzdb,111111000,STVPIE,13,0
2,CDGGQRCSTVPIEQ,0,val,waltzdb,1111110,STVPIE,14,0
3,NGPRSTVPIEHCETNKHEG,0,val,waltzdb,111111000000000,STVPIE,19,0
4,DNDKKRSTVPIEHC,0,val,waltzdb,11111100,STVPIE,14,0


In [4]:
def convert_agg_string(agg,total_len,verbose=False):
    agg = agg.strip()
    if verbose:
        print(agg)
    assert agg[0]=='[' and agg[-1]==']'
    res_ranges = agg[1:-1].split(',')
    aa_set = set()
    for res_range in res_ranges:
        start_aa,stop_aa = res_range.split('-')
        for aa in range(int(start_aa),int(stop_aa)+1):
            if aa > total_len:
                raise ValueError("Residue cannot be from sequence")
            aa_set.add(aa)
    if verbose:
        print(aa_set)
    agg_res = ''.join(['1' if aa+1 in aa_set else '0' for aa in range(total_len)])
    return agg_res

In [5]:
convert_agg_string('[1-3]',5,True)

[1-3]
{1, 2, 3}


'11100'

In [6]:
convert_agg_string('[1-3]',2,True)

[1-3]


ValueError: Residue cannot be from sequence

In [7]:
convert_agg_string('[1-5,10-12]',20,True)

[1-5,10-12]
{1, 2, 3, 4, 5, 10, 11, 12}


'11111000011100000000'

In [8]:
data_list = []
for record in SeqIO.parse("aggreprot/AggreProt_1917_NAR_Supplementary_File_S1.txt", "fasta"):
    # print(record.id,record.seq)
    seqid, seqname, agg_region = record.description.split(';')
    name = f'{seqname}__{seqid}'
    seq = str(record.seq)
    agg_res_labels = convert_agg_string(agg_region,len(seq))
    data_list.append([name,seq,agg_res_labels])
amypro_df = pd.DataFrame(data_list,columns=['name','sequence','res_value_bool'])
amypro_df['dataset'] = 'amypro27'
amypro_df['data_split'] = 'test'
amypro_df['value_bool'] = 1
amypro_df['len'] = amypro_df['sequence'].apply(len)
amypro_df['mask'] = 'not_available'
amypro_df.to_csv('tables/250211_amypro27.csv')
amypro_df

Unnamed: 0,name,sequence,res_value_bool,dataset,data_split,value_bool,len,mask
0,Prolactin__AP00001,LPICPGGAARCQVTLRDLFDRAVVLSHYIHNLSSEMFSEFDKRYTH...,0000001111111111111111111111111111000000001111...,amypro27,test,1,199,not_available
1,Alpha-s2-casein__AP00004,KNTMEHVSSSEESIISQETYKQEKNMAINPSKENLCSTFCKEVVRN...,0000000000000000000000000000000000000000000000...,amypro27,test,1,207,not_available
2,Serum_Amyloid_A-1__AP00005,RSFFSFLGEAFDGARDMWRAYSDMREANYIGSDKYFHARGNYDAAK...,1111111111111111111111111110000000000000000000...,amypro27,test,1,104,not_available
3,Medin__AP00018,RLDKQGNFNAWVAGSYGNDQWLQVDLGSSKEVTGIITQGARNFGSV...,0000000000000000000000000000000111111111111111...,amypro27,test,1,50,not_available
4,Apoliprotein__AP00020,TQQPQQDEMPSPTFLTQVKESLSSYWESAKTAAQNLYEKTYLPAVD...,0000000000000000000000000000000000000000000000...,amypro27,test,1,79,not_available
5,Odontogenic_ameloblast-associated_protein__AP...,APLIPQRLMSASNSNELLLNLNNGQLLPLQLQGPLNSWIPPFSGIL...,0000000000000000000000000000000000000000000000...,amypro27,test,1,264,not_available
6,Beta-lactoglobulin__AP00025,LIVTQTMKGLDIQKVAGTWYSLAMAASDISLLDAQSAPLRVYVEEL...,0000000000111111111100000000000000000000000000...,amypro27,test,1,162,not_available
7,Apomyoglobin__AP00030,GLSDGEWQQVLNVWGKVEADIAGHGQEVLIRLFTGHPETLEKFDKF...,1111111111111111111111111111100000000000000000...,amypro27,test,1,153,not_available
8,Chaplin_H__AP00039,DSGAQGAAVHSPGVLSGNVVQVPVHVPVNVCGNTISVIGLLNPAFG...,0000000000001111111111111111100111111111111111...,amypro27,test,1,52,not_available
9,Microcin_E492__AP00040,GETDPNTQLLNDLGNNMAWGAALGAPGGLGSAALGAAGGALQTVGQ...,0000000000000000000000000000000000000000000000...,amypro27,test,1,84,not_available


# Combine datasets

In [51]:
new_df = pd.concat([filt_df,amypro_df],axis=0)
path = f'{data_dir}/sbxw_fibrillation_peptide_waltzdb-train-val_amypro27-test_randpadded10x.csv'
print(path)
new_df.to_csv(path)
new_df

/novo/projects/departments/cdd/molecular_ai/mlbp/data/static_input_data/sbxw_fibrillation_peptide_waltzdb-train-val_amypro27-test_randpadded10x.csv


Unnamed: 0,sequence,value_bool,data_split,dataset,mask,core_sequence,len,res_value_bool,name
0,THSTVPIEG,0,val,waltzdb,001111110,STVPIE,9,000000000,
1,TDSTSTVPIEQDR,0,val,waltzdb,0000111111000,STVPIE,13,0000000000000,
2,CDGGQRCSTVPIEQ,0,val,waltzdb,00000001111110,STVPIE,14,00000000000000,
3,NGPRSTVPIEHCETNKHEG,0,val,waltzdb,0000111111000000000,STVPIE,19,0000000000000000000,
4,DNDKKRSTVPIEHC,0,val,waltzdb,00000011111100,STVPIE,14,00000000000000,
...,...,...,...,...,...,...,...,...,...
22,MGQEQDTPWILSTGHISTQKRQDGQQTPKLEHRNSTRLMGHCQKTM...,1,test,amypro27,not_available,,87,0000000000000000000000000000000000000000000000...,PB1_F2csv__AP00125
23,QAKEPCVESLVSQYFQTVTDYGKDLMEKVKSPELQAEAKSYFEKSK...,1,test,amypro27,not_available,,98,0000000000000000000000000000000000000000000000...,ApoAII__AP00127
24,FGIPCCPVHLKRLLIVVVVVVLIVVVIVGALLMGL,1,test,amypro27,not_available,,35,00000000111111111111111111111111110,Lung_Surfactant__AP00130
25,IGDDSGPVSANGNGASQYFGNSMTTGNMSPQMALIQGSFNKPCIAV...,1,test,amypro27,not_available,,105,0000000000111111111111111111111111111111110000...,RdIB_protein__AP00133


In [52]:
new_df.groupby('data_split').size()

data_split
test        27
train    11220
val       2770
dtype: int64

In [53]:
new_df.groupby('data_split').len.sum()

data_split
test       4433
train    189835
val       47038
Name: len, dtype: int64

In [54]:
new_df[new_df['data_split']=='train'].head(30)

Unnamed: 0,sequence,value_bool,data_split,dataset,mask,core_sequence,len,res_value_bool,name
2770,TGCNSGETGNSGFHPSKHCDQDSHH,0,train,waltzdb,111111000000000,SGFHPS,25,0,
2771,HQHESSGFHPSSCE,0,train,waltzdb,111111000,SGFHPS,14,0,
2772,KCTRHSGFHPSPNKT,0,train,waltzdb,1111110000,SGFHPS,15,0,
2773,STSGFHPSQN,0,train,waltzdb,11111100,SGFHPS,10,0,
2774,NHSGFHPSCRRHDH,0,train,waltzdb,111111000000,SGFHPS,14,0,
2775,GRQRTDGPSGFHPSCKHSEH,0,train,waltzdb,111111000000,SGFHPS,20,0,
2776,SSRCQESGFHPSND,0,train,waltzdb,11111100,SGFHPS,14,0,
2777,SGSGFHPSK,0,train,waltzdb,1111110,SGFHPS,9,0,
2778,RHETRRESGFHPSCSEGPN,0,train,waltzdb,111111000000,SGFHPS,19,0,
2779,RSNREERSGFHPSKGGQNDR,0,train,waltzdb,1111110000000,SGFHPS,20,0,


In [48]:
new_df[new_df['data_split']=='test']

Unnamed: 0,sequence,value_bool,data_split,dataset,mask,core_sequence,len,res_value_bool,name
0,LPICPGGAARCQVTLRDLFDRAVVLSHYIHNLSSEMFSEFDKRYTH...,1,test,amypro27,,,199,0000001111111111111111111111111111000000001111...,Prolactin__AP00001
1,KNTMEHVSSSEESIISQETYKQEKNMAINPSKENLCSTFCKEVVRN...,1,test,amypro27,,,207,0000000000000000000000000000000000000000000000...,Alpha-s2-casein__AP00004
2,RSFFSFLGEAFDGARDMWRAYSDMREANYIGSDKYFHARGNYDAAK...,1,test,amypro27,,,104,1111111111111111111111111110000000000000000000...,Serum_Amyloid_A-1__AP00005
3,RLDKQGNFNAWVAGSYGNDQWLQVDLGSSKEVTGIITQGARNFGSV...,1,test,amypro27,,,50,0000000000000000000000000000000111111111111111...,Medin__AP00018
4,TQQPQQDEMPSPTFLTQVKESLSSYWESAKTAAQNLYEKTYLPAVD...,1,test,amypro27,,,79,0000000000000000000000000000000000000000000000...,Apoliprotein__AP00020
5,APLIPQRLMSASNSNELLLNLNNGQLLPLQLQGPLNSWIPPFSGIL...,1,test,amypro27,,,264,0000000000000000000000000000000000000000000000...,Odontogenic_ameloblast-associated_protein__AP...
6,LIVTQTMKGLDIQKVAGTWYSLAMAASDISLLDAQSAPLRVYVEEL...,1,test,amypro27,,,162,0000000000111111111100000000000000000000000000...,Beta-lactoglobulin__AP00025
7,GLSDGEWQQVLNVWGKVEADIAGHGQEVLIRLFTGHPETLEKFDKF...,1,test,amypro27,,,153,1111111111111111111111111111100000000000000000...,Apomyoglobin__AP00030
8,DSGAQGAAVHSPGVLSGNVVQVPVHVPVNVCGNTISVIGLLNPAFG...,1,test,amypro27,,,52,0000000000001111111111111111100111111111111111...,Chaplin_H__AP00039
9,GETDPNTQLLNDLGNNMAWGAALGAPGGLGSAALGAAGGALQTVGQ...,1,test,amypro27,,,84,0000000000000000000000000000000000000000000000...,Microcin_E492__AP00040
