In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import random

In [7]:
data_dir = '/novo/projects/departments/cdd/molecular_ai/mlbp/data/static_input_data'

In [19]:
df = new_waltz_df
df

Unnamed: 0,sequence,value_bool,data_split
0,HGGGGN,False,train
1,SDKKEE,False,train
2,VATTRT,False,train
3,NGKSNF,False,train
4,NNNRGG,False,train
...,...,...,...
1394,RGFFYT,False,test
1395,LATVYP,False,test
1396,SIEDSV,False,test
1397,QTVIIE,True,test


In [11]:
path = f'{data_dir}/sbxw_fibrillation_peptide_waltzdb.csv'
waltz_df = pd.read_csv(path,index_col=0)
print(waltz_df.shape)
display(waltz_df.groupby('value_bool').size())
display(waltz_df.groupby(['data_split','value_bool']).size())
print(waltz_df.sequence.nunique())
waltz_df = waltz_df.drop(columns='data_split')
waltz_df.head()

(1399, 3)


value_bool
False    892
True     507
dtype: int64

data_split  value_bool
test        False         177
            True          100
train       False         715
            True          407
dtype: int64

1399


Unnamed: 0,sequence,value_bool
0,STVPIE,False
1,GVIWIA,True
2,LATVYA,False
3,NATAHQ,False
4,STVGIE,False


In [21]:
waltz_df = waltz_df.sample(frac=1,random_state=42).reset_index(drop=True)

# 80% train / 10% val / 10% test
n_train = round(len(waltz_df)*.80)
n_val = round(len(waltz_df)*.10)
n_test = len(waltz_df)-n_train-n_val
print(n_train,n_val,n_test)

train_df = waltz_df.iloc[0:n_train].copy(deep=True)
train_df['data_split'] = 'train'
val_df = waltz_df.iloc[n_train:n_train+n_val].copy(deep=True)
val_df['data_split'] = 'val'
test_df = waltz_df.iloc[n_train+n_val:].copy(deep=True)
test_df['data_split'] = 'test'
assert len(train_df)+len(val_df)+len(test_df)==len(waltz_df)

new_waltz_df = pd.concat([train_df,val_df,test_df],ignore_index=True)
print(new_waltz_df.sequence.nunique())
display(new_waltz_df.groupby(['data_split','value_bool']).size())
new_waltz_df.head()

1119 140 140
1399


data_split  value_bool
test        False          83
            True           57
train       False         720
            True          399
val         False          89
            True           51
dtype: int64

Unnamed: 0,sequence,value_bool,data_split
0,WIVIFF,True,train
1,FQKQQK,False,train
2,YQQYNP,False,train
3,FINYTN,False,train
4,GQQSYS,False,train


# waltz-residuelevel

In [25]:
new_waltz_df['res_value_bool'] = new_waltz_df.apply(lambda x: len(x.sequence)*str(int(x.value_bool)),axis=1)
new_waltz_df.to_csv(f'{data_dir}/sbxw_fibrillation_peptide_waltz-residuelevel.csv')
new_waltz_df.head()

Unnamed: 0,sequence,value_bool,data_split,res_value_bool
0,WIVIFF,True,train,111111
1,FQKQQK,False,train,0
2,YQQYNP,False,train,0
3,FINYTN,False,train,0
4,GQQSYS,False,train,0


In [27]:
new_waltz_df.sample(1)

Unnamed: 0,sequence,value_bool,data_split,res_value_bool
619,STKIIE,False,train,0


# Function for sampling recombined sequences

In [69]:
def sampleString(length):
    nonhydrophobic = 'CDEGHKNPQRST'
    return ''.join([random.choice(nonhydrophobic) for _ in range(length)])

def add_linker(seq,res_labels,max_linker_length):
    # add linker (always negative)
    linker_length = np.random.randint(max_linker_length)+1
    sel_seq = sampleString(linker_length)
    seq+=sel_seq
    res_labels+='0'*len(sel_seq)
    return seq,res_labels

def add_seq(seq,res_labels,new_seq_list,label):
    sel_seq = random.choice(new_seq_list)
    seq+=sel_seq
    res_labels+=label*len(sel_seq)
    return seq,res_labels

def sample_recombined_sequence(max_total_length,pos_seqs,neg_seqs,max_linker_length):
    seq = ''
    res_labels = ''

    if max_linker_length > 0 and bool(np.random.randint(2)):
        # add linker (always negative)
        seq,res_labels = add_linker(seq,res_labels,max_linker_length)

    max_total_length = np.random.randint(10,max_total_length)+1
    while len(seq) < max_total_length:
        if bool(np.random.randint(2)):
            # pos_seq
            seq,res_labels = add_seq(seq,res_labels,pos_seqs,'1')
        else:
            # neg seq
            seq,res_labels = add_seq(seq,res_labels,neg_seqs,'0')

        if max_linker_length > 0 and bool(np.random.randint(2)):
            # add linker
            seq,res_labels = add_linker(seq,res_labels,max_linker_length)  
    return seq,res_labels
            
    
random.seed(42)
seq,res_labels = sample_recombined_sequence(50,
                                           list(new_waltz_df[new_waltz_df['value_bool']==True].sequence),
                                           list(new_waltz_df[new_waltz_df['value_bool']==False].sequence),
                                           0)
print(seq,res_labels)

LATVYMCGVIGIGSIAATREPTKVLGTVYVNQFNLMSTVIDEQATVYV 111111111111111111000000111111111111000000111111


# waltz-residuelevel-recombined

In [79]:
n_total = 10000 # desired size of dataset
max_seq_length = 50 # max length of peptide (min is 10)
max_linker_length = 0 # no linker in this one

random.seed(42)
np.random.seed(42)

n_train = round(n_total*.80)
n_val = round(n_total*.10)
n_test = n_total-n_train-n_val
print(n_train,n_val,n_test)
n_examples = {
    'train':n_train,
    'val':n_val,
    'test':n_test
}

df_list = []
for data_split,group_df in new_waltz_df.groupby('data_split',sort=False):
    data_list = []
    for i in range(n_examples[data_split]):
        pos_seqs = list(group_df[group_df['value_bool']==True].sequence)
        neq_seqs = list(group_df[group_df['value_bool']==False].sequence)
        seq,res_labels = sample_recombined_sequence(max_seq_length,
                                           pos_seqs,
                                           neq_seqs,
                                           0)
        data_list.append([seq,res_labels])
    split_df = pd.DataFrame(data_list,columns=['sequence','res_value_bool'])
    split_df['data_split'] = data_split
    df_list.append(split_df)
recombine_new_waltz_df = pd.concat(df_list,ignore_index=True)
recombine_new_waltz_df.to_csv(f'{data_dir}/sbxw_fibrillation_peptide_waltz-residuelevel-recombine50-linker0.csv')
recombine_new_waltz_df

8000 1000 1000


Unnamed: 0,sequence,res_value_bool,data_split
0,LATVYMSTLLYEGVFNNQLATHYVNQFNLMSTVIDEKVEDLKNFGT...,1111110000000000000000001111110000000000000000...,train
1,MAAAQAPQGGYQVGFGNNGTFFINQSGFGN,000000000000000000111111000000,train
2,HFVWIASTMSITDVSIEDEYSNFSGSHLVEALEEYTVQIVPKQTFLVN,111111111111000000111111000000111111111111111111,train
3,LAGVYVASSSNYMIHFGNETVIIEMAAAQALQSSWGVQIVYC,111111111111111111111111000000000000111111,train
4,KLLEIASTVIIVGYYQNYQQFNPQAAIDWFLATVYG,000000111111000000000000000000000000,train
...,...,...,...
9995,VQIVYASTNTIEVIGIAQFNPQGGDFNKFHWPNGITDFNKFHLYQLEN,111111000000111111000000111111000000111111111111,test
9996,YVSGFHNIGNNSFAIRHFVQITYKFGTFSISTGIIE,000000000000111111111111000000000000,test
9997,KEEKPVKLVSSSLATVYFVQGVYKLTQRGF,000000000000111111000000000000,test
9998,GERGFFVQIVYASTNIFENSNNSNSWGMMGFGTFSI,000000111111000000000000000000000000,test


In [None]:
new_waltz_df.set_index('sequence').loc['LATVYM']

In [85]:
new_waltz_df.set_index('sequence').loc['STLLYE']

value_bool         False
data_split         train
res_value_bool    000000
Name: STLLYE, dtype: object

In [80]:
recombine_new_waltz_df.groupby('data_split').size()

data_split
test     1000
train    8000
val      1000
dtype: int64

# waltz-residuelevel-recombined-linker

In [81]:
n_total = 10000 # desired size of dataset
max_seq_length = 50 # max length of peptide (min is 10)
max_linker_length = 5 # linker allowed here

random.seed(42)
np.random.seed(42)

n_train = round(n_total*.80)
n_val = round(n_total*.10)
n_test = n_total-n_train-n_val
print(n_train,n_val,n_test)
n_examples = {
    'train':n_train,
    'val':n_val,
    'test':n_test
}

df_list = []
for data_split,group_df in new_waltz_df.groupby('data_split',sort=False):
    data_list = []
    for i in range(n_examples[data_split]):
        pos_seqs = list(group_df[group_df['value_bool']==True].sequence)
        neq_seqs = list(group_df[group_df['value_bool']==False].sequence)
        seq,res_labels = sample_recombined_sequence(max_seq_length,
                                           pos_seqs,
                                           neq_seqs,
                                           0)
        data_list.append([seq,res_labels])
    split_df = pd.DataFrame(data_list,columns=['sequence','res_value_bool'])
    split_df['data_split'] = data_split
    df_list.append(split_df)
recombine_new_waltz_df = pd.concat(df_list,ignore_index=True)
recombine_new_waltz_df.to_csv(f'{data_dir}/sbxw_fibrillation_peptide_waltz-residuelevel-recombine50-linker5.csv')
recombine_new_waltz_df

8000 1000 1000


Unnamed: 0,sequence,res_value_bool,data_split
0,LATVYMSTLLYEGVFNNQLATHYVNQFNLMSTVIDEKVEDLKNFGT...,1111110000000000000000001111110000000000000000...,train
1,MAAAQAPQGGYQVGFGNNGTFFINQSGFGN,000000000000000000111111000000,train
2,HFVWIASTMSITDVSIEDEYSNFSGSHLVEALEEYTVQIVPKQTFLVN,111111111111000000111111000000111111111111111111,train
3,LAGVYVASSSNYMIHFGNETVIIEMAAAQALQSSWGVQIVYC,111111111111111111111111000000000000111111,train
4,KLLEIASTVIIVGYYQNYQQFNPQAAIDWFLATVYG,000000111111000000000000000000000000,train
...,...,...,...
9995,VQIVYASTNTIEVIGIAQFNPQGGDFNKFHWPNGITDFNKFHLYQLEN,111111000000111111000000111111000000111111111111,test
9996,YVSGFHNIGNNSFAIRHFVQITYKFGTFSISTGIIE,000000000000111111111111000000000000,test
9997,KEEKPVKLVSSSLATVYFVQGVYKLTQRGF,000000000000111111000000000000,test
9998,GERGFFVQIVYASTNIFENSNNSNSWGMMGFGTFSI,000000111111000000000000000000000000,test
