In [1]:
import pandas as pd
import numpy as np
import os
from biopandas.pdb import PandasPdb as ppdb
from sklearn.model_selection import train_test_split

In [2]:
 # Lookup table: three-letter residue name -> integer class code (0-19).
 # The ordering appears to follow side-chain chemistry; MSE and MSO share
 # MET's code (presumably because they are modified methionine residues).
 refer_protein = dict(
     # aliphatic / small
     ALA=0, GLY=1, ILE=2, LEU=3, PRO=4, VAL=5,
     # aromatic
     PHE=6, TRP=7, TYR=8,
     # acidic
     ASP=9, GLU=10,
     # basic
     ARG=11, HIS=12, LYS=13,
     # hydroxyl
     SER=14, THR=15,
     # sulfur-containing, plus the two aliases of MET
     CYS=16, MET=17, MSE=17, MSO=17,
     # amide
     ASN=18, GLN=19,
 )

In [3]:
def extract_single_pdb(f):
    """Build a per-residue helix-label table for chain A of one PDB file.

    The file is read with biopandas and the DBREF/DBREF2, HELIX and SEQRES
    records are taken from its 'OTHERS' table. All slice offsets below index
    into the record's 'entry' text; this assumes the text starts right after
    the 6-character record name, i.e. entry index = PDB column - 7.

    Parameters
    ----------
    f : str
        Path of the PDB file to parse.

    Returns
    -------
    pandas.DataFrame or None
        One row per chain-A SEQRES residue with columns
        'acids' (three-letter residue name), 'helix' (1.0 inside a chain-A
        helix, 0.0 elsewhere) and 'acid_num' (code from ``refer_protein``).
        ``None`` when the file has neither a DBREF nor a DBREF2 record.

    Raises
    ------
    KeyError
        If a chain-A residue name is not a key of ``refer_protein``.
    ValueError
        If a residue-number field cannot be parsed as an integer.
    """
    others = ppdb().read_pdb(f).df['OTHERS']

    def entries(record_name):
        # 'entry' text of every record of the requested type
        return others[others['record_name'] == record_name]['entry']

    # Read the starting sequence number used to turn residue numbers into
    # 0-based positions.
    # NOTE(review): this is the *database* sequence start (dbseqBegin), taken
    # from the first DBREF record regardless of chain - confirm it matches
    # the numbering used by the HELIX records for the files in use.
    dbref = entries('DBREF')
    if len(dbref) == 0:
        dbref = entries('DBREF2')
        if len(dbref) == 0:
            return
        start_range = dbref[dbref.first_valid_index()][39:49]
    else:
        start_range = dbref[dbref.first_valid_index()][49:54]

    # Get the helix ranges, chain A only (initChainID is PDB column 20).
    chain_a_helices = [entry for entry in entries('HELIX') if entry[13:14] == 'A']
    # Parse the offset only when it is actually needed, as before.
    offset = int(start_range) if chain_a_helices else 0
    # initSeqNum is PDB columns 22-25 and endSeqNum is columns 34-37: both are
    # four characters wide. Reading only three characters silently dropped the
    # leading digit of residue numbers >= 1000.
    helix_ranges = [
        (int(entry[15:19]) - offset, int(entry[27:31]) - offset)
        for entry in chain_a_helices
    ]

    # Get the amino-acid sequence, chain A only (chainID is PDB column 12).
    final_str = []
    for entry in entries('SEQRES'):
        if entry[5:6] == 'A':
            # split() without a separator ignores repeated/leading blanks,
            # which would otherwise yield '' tokens and a KeyError below.
            final_str.extend(entry[13:].split())

    # Create the labels. Ranges are clipped to the sequence: a negative start
    # would otherwise wrap around to the end of the array, and an end past the
    # last residue would raise IndexError.
    n_residues = len(final_str)
    label = np.zeros(n_residues)
    for st, end in helix_ranges:
        first = max(st, 0)
        last = min(end, n_residues - 1)
        if first <= last:
            label[first:last + 1] = 1

    # Create the dataframe.
    df = pd.DataFrame({'acids': final_str, 'helix': label})
    df['acid_num'] = df['acids'].apply(lambda x: refer_protein[x])

    return df

def extract_data_from_pdb(sub_dir):
    """Parse every PDB file in ``./data/<sub_dir>`` with ``extract_single_pdb``.

    Parameters
    ----------
    sub_dir : str
        Name of the directory under ``./data`` that holds the PDB files.

    Returns
    -------
    list of pandas.DataFrame
        One frame per successfully parsed file, ordered by file name.
        Files for which ``extract_single_pdb`` returns ``None`` are left out,
        so callers can use every element as a DataFrame.
    """
    base_dir = os.path.join('./data', sub_dir)
    df_list = []
    # os.listdir returns names in arbitrary order; sort for reproducible output.
    for f in sorted(os.listdir(base_dir)):
        path = os.path.join(base_dir, f)
        # Skip sub-directories (e.g. notebook checkpoint folders).
        if not os.path.isfile(path):
            continue
        df = extract_single_pdb(path)
        if df is not None:
            df_list.append(df)

    return df_list

In [4]:
# Run the PDB extractor over the files stored in ./data/d1.
df_list = extract_data_from_pdb('d1')

In [20]:
# One-hot encode the residue code of every parsed protein.
# The encoding always has one column per residue class (0 .. max code), even
# for classes a given protein does not contain, so every frame has the same
# width and windows taken from different proteins are compatible.
n_acid_classes = max(refer_protein.values()) + 1

df2_list = []
for df in df_list:
    # extract_single_pdb returns None for files it cannot use; skip those.
    if df is None:
        continue
    hot_encode = (
        pd.get_dummies(df['acid_num'])
        .reindex(columns=range(n_acid_classes), fill_value=0)
        # uniform numeric 0/1 columns regardless of the pandas default dtype
        .astype('uint8')
    )
    df2 = df.drop(columns=['acids', 'acid_num'])
    df2 = pd.concat([df2, hot_encode], axis=1)
    df2_list.append(df2)

In [22]:
# df2_list[0]

In [43]:
from keras.layers import Dense
from keras.models import Sequential

In [44]:
# Dense network over a flattened residue window.
# The input width is presumably WINDOW_SIZE residues x N_ACID_CLASSES one-hot
# columns (= 200), and the output presumably holds one helix score per residue
# of the window - TODO confirm against the training cell.
WINDOW_SIZE = 10
N_ACID_CLASSES = 20

model = Sequential()
model.add(Dense(units=200, activation='relu', input_dim=WINDOW_SIZE * N_ACID_CLASSES))
model.add(Dense(units=20, activation='relu'))
# sigmoid instead of softmax: the per-residue helix labels are independent
# 0/1 values, whereas softmax forces the 10 outputs to sum to 1 so that at
# most one position could be predicted as helix with high probability.
model.add(Dense(units=WINDOW_SIZE, activation='sigmoid'))

In [None]:
# Slide a 10-row window over each encoded frame.
for df in df2_list:
    # get 10 rows at a time
    # The range stops at len(df)-10, so every slice below holds exactly 10
    # rows; frames with fewer than 10 rows produce no window at all.
    for i in range(0, len(df)-9):
        ip = df[i:i+10] 