-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_preprocessing.py
149 lines (116 loc) · 5.83 KB
/
data_preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 1 14:43:43 2022
@author: mufassirin
"""
import os
from Bio import SeqIO
import numpy as np
import pickle
Features_DataPath = '/home/mufassirin/research/ss_predictor/dataset/My_Dataset/features/generated_features/test/CASP/npyForm'
Fasta_DataPath = '/home/mufassirin/research/ss_predictor/dataset/My_Dataset/features/test_sets/CASP/Fasta/'
HHM_DataPath = '/home/mufassirin/research/ss_predictor/dataset/My_Dataset/features/test_sets/CASP/HHM/'
PSSM_DataPath = '/home/mufassirin/research/ss_predictor/dataset/My_Dataset/features/test_sets/CASP/PSSM/'
PSP19_DataPath = '/home/mufassirin/research/ss_predictor/dataset/My_Dataset/features/test_sets/CASP/PSP19/'
Physico_DataPath = '/home/mufassirin/research/ss_predictor/dataset/My_Dataset/features/test_sets/CASP/7PCP/'
aa_index = { 'A' : -10, 'C' : -9, 'D' : -8, 'E' : -7, 'F' : -6, 'G' : -5,
'H' : -4, 'I' : -3, 'K' : -2, 'L':-1, 'M' : 1, 'N' : 2,
'P' : 3, 'Q' : 4, 'R' : 5, 'S' : 6, 'T' : 7, 'V' : 8,
'W' : 9, 'Y' : 10}
def get_protein_ids_with_datapath(search_path, file_format):
file_names = []
files_path = []
file_extension = file_format
if os.path.exists(search_path):
for (root, dirs, files) in os.walk(search_path):
for file in files:
if file.split('.')[-1] == file_extension: # e.g. file_extension='pdb', not '.pdb'
#file_names.append(file.rstrip('.' + file_extension))
file_names.append(file[:-(len(file_extension)+1)]) # keeping file name only excluding extension and dot sign
files_path.append(root) # full path except file name
return file_names
def extract_pssm(pssm_path,aa_dict):
np_pssm_temp=np.zeros((0, 21))
with open(pssm_path) as p:
lines = p.readlines()
p.close()
# print(lines)
#print (protein_name)
for line in lines:
split_line = line.split()
if (len(split_line) == 44) and (split_line[0] != '#'):
aa_symbol = split_line[1]
aa_number = aa_dict[aa_symbol]
pssm_temp = [-float(i) for i in split_line[2:22]]
pssm_temp.insert(0, aa_number)
vect_pssm_temp=np.array(pssm_temp)
vect_pssm_temp=vect_pssm_temp.reshape(1,21)
#print(vect_pssm_temp.shape)
#print(type(vect_pssm_temp))
np_pssm_temp = np.concatenate(( np_pssm_temp,vect_pssm_temp), axis=0)
#row=np_pssm_temp.shape[0]
return np_pssm_temp
def extract_hhm(hhm_path):
# for a single protein
hhm_scores_all_residues = []
seq_len = None
with open(hhm_path) as hhm_file:
lines = hhm_file.readlines()
# Preprocess header - find seq_len and hhm score matrix's start location
hhm_start_line_no = None
for c_line_no, current_line in enumerate(lines):
if current_line.startswith('LENG '):
seq_len = int(current_line.split()[1])
elif current_line.startswith('#\n') and lines[c_line_no+1].startswith('NULL ') and lines[c_line_no+2].startswith('HMM '):
# HHM matrix starts after 3 consecutive lines starts wtih word '#', 'NULL' and 'HMM' Newline and. Space after these words is required as these may appear in other sentences
# above these lines are meta information of HHm formation.
hhm_start_line_no = c_line_no + 5 # HHM matrix header in 5 lines; skip thsese.
break;
# now for each of the residues there are three consecutive lines, read first one, skip next two lines
if seq_len is not None and hhm_start_line_no is not None:
residue_line_no = hhm_start_line_no
for residue_no in range(seq_len):
residual_fields = lines[residue_line_no].replace('*', '0').split() # first row - score for each individual residues; # replacing score '*' by '0'
hhm_scores = [(-float(score))/1000 for score in residual_fields[2:22]] # 2:22 contains score for each amino acid (e.g. 20 standard amino acids)
residue_line_no += 1
residual_joint_fields = lines[residue_line_no].replace('*', '0').split() # 10 fields for pair score e.g. for M->M field
hhm_scores.extend([(-float(score))/1000 for score in residual_joint_fields])
hhm_scores = np.power(2, hhm_scores)
hhm_scores_all_residues.append(hhm_scores)
residue_line_no += 2 # skip blank line, move to next residue start line
return np.stack(hhm_scores_all_residues)
##main function
#protein_ids = np.loadtxt('/home/research/ss_predictor/CASP13.lst', dtype='str').tolist()
file_format = 'fasta'
protein_ids = get_protein_ids_with_datapath(Fasta_DataPath, file_format)
i = 0
for protein_name in protein_ids:
# Add sequence
features = {}
aa_seq_record = SeqIO.read(os.path.join(Fasta_DataPath,protein_name) +'.fasta', 'fasta')
seq = str(aa_seq_record.seq)
features['seq'] = seq
# Add PSSM
pssm_file = PSSM_DataPath + protein_name + ".pssm"
pssm = extract_pssm(pssm_file,aa_index)
features['pssm'] = pssm
# Add HHM
hhm_path = os.path.join(HHM_DataPath,protein_name) + '.hhm'
hhm_scores = extract_hhm(hhm_path)
features['hhm'] = hhm_scores
## Add Physicochemical encoding
myphysico = np.load(os.path.join(Physico_DataPath,protein_name) + '.npy')
assert myphysico.shape == (len(seq),7)
features['physico'] = myphysico
## Add psp19 encoding
mypsp19 = np.load(os.path.join(PSP19_DataPath,protein_name) + '.npy')
assert mypsp19.shape == (len(seq),19)
features['psp19'] = mypsp19
# save
pklfile = os.path.join(Features_DataPath,protein_name) +'.npy'
f = open(pklfile, 'wb')
pickle.dump(features,f)
f.close()
i = i+1
print(i, protein_name, 'completed')