In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import random
import math

from Bio import SeqIO

In [2]:
# Absolute path to the static input data directory.
# NOTE(review): not referenced by any cell visible in this part of the notebook;
# the tables below are read from the relative 'tables/' folder instead — confirm
# whether later cells use it.
data_dir = '/novo/projects/departments/cdd/molecular_ai/mlbp/data/static_input_data'

In [15]:
# Build a per-residue table: one output row per (protein, residue position).
path = 'tables/250211_amypro27.csv'
# Read the per-residue labels explicitly as text. They are strings of digit
# characters, so default dtype inference could turn a label that fits in an
# integer into a number, dropping leading zeros and breaking the
# per-character split below.
df = pd.read_csv(path,index_col=0,dtype={'res_value_bool': str})
df = df.drop(columns=['mask','data_split','value_bool'])
# explode() needs the same number of elements in every exploded column of a
# row, so fail early with the offending row labels if labels and sequence differ.
mismatched = df.index[df.res_value_bool.str.len() != df.sequence.str.len()]
assert mismatched.empty, f'res_value_bool/sequence length mismatch for rows: {list(mismatched)}'
# One int label, one amino-acid letter and one 0-based position per residue.
df['res_value_bool'] = [[int(y) for y in x] for x in df.res_value_bool]
df['res_aa'] = [list(x) for x in df.sequence]
df['res_idx'] = df.sequence.apply(lambda x: range(len(x)))
df = df.drop(columns='sequence')
# reset_index() keeps the original protein row label as an 'index' column.
res_df = df.explode(['res_idx','res_aa','res_value_bool']).reset_index()
res_df.to_csv('tables/250217_amypro27_residues.csv')
res_df

Unnamed: 0,index,name,res_value_bool,dataset,len,res_aa,res_idx
0,0,Prolactin__AP00001,0,amypro27,199,L,0
1,0,Prolactin__AP00001,0,amypro27,199,P,1
2,0,Prolactin__AP00001,0,amypro27,199,I,2
3,0,Prolactin__AP00001,0,amypro27,199,C,3
4,0,Prolactin__AP00001,0,amypro27,199,P,4
...,...,...,...,...,...,...,...
4428,26,Galectin-7__AP00139,1,amypro27,136,S,131
4429,26,Galectin-7__AP00139,1,amypro27,136,V,132
4430,26,Galectin-7__AP00139,1,amypro27,136,R,133
4431,26,Galectin-7__AP00139,1,amypro27,136,I,134


In [3]:
# Load the protein-level table, keeping only the identifying columns and the
# full sequence (mask, per-residue labels, split and protein-level label are dropped).
path = 'tables/250211_amypro27.csv'
df = pd.read_csv(path, index_col=0).drop(
    columns=['mask', 'res_value_bool', 'data_split', 'value_bool']
)
df

Unnamed: 0,name,sequence,dataset,len
0,Prolactin__AP00001,LPICPGGAARCQVTLRDLFDRAVVLSHYIHNLSSEMFSEFDKRYTH...,amypro27,199
1,Alpha-s2-casein__AP00004,KNTMEHVSSSEESIISQETYKQEKNMAINPSKENLCSTFCKEVVRN...,amypro27,207
2,Serum_Amyloid_A-1__AP00005,RSFFSFLGEAFDGARDMWRAYSDMREANYIGSDKYFHARGNYDAAK...,amypro27,104
3,Medin__AP00018,RLDKQGNFNAWVAGSYGNDQWLQVDLGSSKEVTGIITQGARNFGSV...,amypro27,50
4,Apoliprotein__AP00020,TQQPQQDEMPSPTFLTQVKESLSSYWESAKTAAQNLYEKTYLPAVD...,amypro27,79
5,Odontogenic_ameloblast-associated_protein__AP...,APLIPQRLMSASNSNELLLNLNNGQLLPLQLQGPLNSWIPPFSGIL...,amypro27,264
6,Beta-lactoglobulin__AP00025,LIVTQTMKGLDIQKVAGTWYSLAMAASDISLLDAQSAPLRVYVEEL...,amypro27,162
7,Apomyoglobin__AP00030,GLSDGEWQQVLNVWGKVEADIAGHGQEVLIRLFTGHPETLEKFDKF...,amypro27,153
8,Chaplin_H__AP00039,DSGAQGAAVHSPGVLSGNVVQVPVHVPVNVCGNTISVIGLLNPAFG...,amypro27,52
9,Microcin_E492__AP00040,GETDPNTQLLNDLGNNMAWGAALGAPGGLGSAALGAAGGALQTVGQ...,amypro27,84


In [4]:
from collections.abc import Iterable

def get_kmers(x,k=6):
    """Yield every length-``k`` window of ``x`` together with its start offset.

    Parameters
    ----------
    x : iterable supporting ``len()`` and slicing (e.g. ``str`` or ``list``)
        The sequence to slide the window over.
    k : int, default 6
        Window length; must be at least 1.

    Yields
    ------
    tuple
        ``(i, x[i:i+k])`` for each start offset ``i`` from 0 to ``len(x) - k``.
        Nothing is yielded when ``x`` is shorter than ``k``.

    Raises
    ------
    AssertionError
        If ``x`` is not an ``Iterable``.
    ValueError
        If ``k`` is smaller than 1.

    Notes
    -----
    This is a generator, so both checks run when iteration starts, not when
    the function is called.
    """
    assert isinstance(x, Iterable)
    # A non-positive k would otherwise yield empty or nonsensical slices
    # (k=0 produces len(x)+1 empty windows) instead of failing.
    if k < 1:
        raise ValueError(f'k must be >= 1, got {k}')
    for i in range(len(x)-k+1):
        kmer = x[i:i+k]
        yield i,kmer

# Quick check: list the (offset, 6-mer) pairs for a short 19-residue example.
list(get_kmers('LPICPGGAARCQVTLRDLF',6))

[(0, 'LPICPG'),
 (1, 'PICPGG'),
 (2, 'ICPGGA'),
 (3, 'CPGGAA'),
 (4, 'PGGAAR'),
 (5, 'GGAARC'),
 (6, 'GAARCQ'),
 (7, 'AARCQV'),
 (8, 'ARCQVT'),
 (9, 'RCQVTL'),
 (10, 'CQVTLR'),
 (11, 'QVTLRD'),
 (12, 'VTLRDL'),
 (13, 'TLRDLF')]

In [5]:
# Expand every protein row into one row per overlapping 6-mer of its sequence.
# Each output row keeps the parent row's columns, has 'sequence' replaced by
# the k-mer, and gains 'kmer_position' (0-based start offset in the parent).
row_list = []    # one plain dict per k-mer (cheaper than deep-copying a Series each time)
row_labels = []  # parent row label per k-mer, reused as the output index
for label,row in df.iterrows():
    parent = row.to_dict()
    # The inner loop uses its own variable name so it does not rebind the
    # outer loop variable.
    for position,kmer in get_kmers(row['sequence'],k=6):
        row_list.append({**parent,'sequence': kmer,'kmer_position': position})
        row_labels.append(label)
# The index repeats the parent row label, so k-mers can be traced back to df.
kmer_df = pd.DataFrame(row_list,index=row_labels)
kmer_df.to_csv('tables/250213_amypro27_kmers.csv')
kmer_df

Unnamed: 0,name,sequence,dataset,len,kmer_position
0,Prolactin__AP00001,LPICPG,amypro27,199,0
0,Prolactin__AP00001,PICPGG,amypro27,199,1
0,Prolactin__AP00001,ICPGGA,amypro27,199,2
0,Prolactin__AP00001,CPGGAA,amypro27,199,3
0,Prolactin__AP00001,PGGAAR,amypro27,199,4
...,...,...,...,...,...
26,Galectin-7__AP00139,DVQLDS,amypro27,136,126
26,Galectin-7__AP00139,VQLDSV,amypro27,136,127
26,Galectin-7__AP00139,QLDSVR,amypro27,136,128
26,Galectin-7__AP00139,LDSVRI,amypro27,136,129


In [6]:
# Display the protein-level frame again.
df

Unnamed: 0,name,sequence,dataset,len
0,Prolactin__AP00001,LPICPGGAARCQVTLRDLFDRAVVLSHYIHNLSSEMFSEFDKRYTH...,amypro27,199
1,Alpha-s2-casein__AP00004,KNTMEHVSSSEESIISQETYKQEKNMAINPSKENLCSTFCKEVVRN...,amypro27,207
2,Serum_Amyloid_A-1__AP00005,RSFFSFLGEAFDGARDMWRAYSDMREANYIGSDKYFHARGNYDAAK...,amypro27,104
3,Medin__AP00018,RLDKQGNFNAWVAGSYGNDQWLQVDLGSSKEVTGIITQGARNFGSV...,amypro27,50
4,Apoliprotein__AP00020,TQQPQQDEMPSPTFLTQVKESLSSYWESAKTAAQNLYEKTYLPAVD...,amypro27,79
5,Odontogenic_ameloblast-associated_protein__AP...,APLIPQRLMSASNSNELLLNLNNGQLLPLQLQGPLNSWIPPFSGIL...,amypro27,264
6,Beta-lactoglobulin__AP00025,LIVTQTMKGLDIQKVAGTWYSLAMAASDISLLDAQSAPLRVYVEEL...,amypro27,162
7,Apomyoglobin__AP00030,GLSDGEWQQVLNVWGKVEADIAGHGQEVLIRLFTGHPETLEKFDKF...,amypro27,153
8,Chaplin_H__AP00039,DSGAQGAAVHSPGVLSGNVVQVPVHVPVNVCGNTISVIGLLNPAFG...,amypro27,52
9,Microcin_E492__AP00040,GETDPNTQLLNDLGNNMAWGAALGAPGGLGSAALGAAGGALQTVGQ...,amypro27,84
