In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import MACCSkeys
from rdkit.Chem import rdmolfiles
import itertools

In [2]:
# Load the table of sequences that the feature set is generated from.
# The first CSV column is used as the row index.
seqs = pd.read_csv(
    './datasets/methylated_sites_info.csv',
    index_col=0,
)

In [3]:
# Give every site a unique key of the form "<ACC_ID>_<MOD_RSD>" (stored on
# `seqs` itself), then keep only the sequence window and the label,
# indexed by that key.
seqs['ID'] = seqs['ACC_ID'] + '_' + seqs['MOD_RSD']
sequences = seqs.set_index('ID')[['SITE_+/-7_AA', 'METHYLATED']]

In [4]:
# Load the ProtDCal descriptor table; its first CSV column becomes the row
# index, which FeatureGen looks up one residue character at a time.
protdcal = pd.read_csv(
    filepath_or_buffer='./datasets/protdcal_features.csv',
    index_col=0,
)

In [5]:
# Create a method we can call to make our feature set for the set8 model
def FeatureGen(sequence, pdset):
    """Build the feature vector and matching column headers for one peptide.

    Three feature groups are concatenated, in this order:

    1. ProtDCal descriptors: for each column of ``pdset``, the sum of that
       column over every residue of ``sequence``.
    2. One-hot encoding: one 0/1 flag per (amino acid, position) pair, laid
       out amino-acid-major (all positions for 'K', then all for 'R', ...).
    3. MACCS keys: every bit of the RDKit MACCS fingerprint of the molecule
       built from ``sequence``.

    Parameters
    ----------
    sequence : str
        Peptide sequence, one character per residue. Every character must be
        a row label of ``pdset``.
    pdset : pandas.DataFrame
        Descriptor table whose index holds the residue characters and whose
        columns are the descriptors to sum.

    Returns
    -------
    values : list
        Feature values: summed descriptors, then int one-hot flags, then int
        MACCS bits.
    headers : list of str
        Column names aligned one-to-one with ``values``.

    Raises
    ------
    ValueError
        If ``sequence`` is empty or RDKit cannot build a molecule from it.
    KeyError
        If a residue character is not present in ``pdset``'s index.
    """
    if not sequence:
        raise ValueError('sequence must contain at least one residue')

    # FIRST: GENERATE PROTDCAL VALUES
    # Look descriptors up in the table that was passed in (not a notebook
    # global), so the function works with whichever table the caller supplies.
    per_residue = [pdset.loc[residue].tolist() for residue in sequence]
    # Column-wise sum across residues, accumulated left to right.
    values = [sum(column) for column in zip(*per_residue)]
    headers = pdset.columns.tolist()

    # SECOND: GENERATE ONE-HOT ENCODING
    aa = ['K', 'R', 'H', 'A', 'I', 'L', 'M', 'V', 'F', 'W', 'Y',
          'N', 'C', 'Q', 'S', 'T', 'D', 'E', 'G', 'P']   # possible amino acids
    # The column order (amino acid outer, position inner) is part of the
    # output layout and must stay stable for any consumer of the saved table.
    for amino_acid in aa:
        for position, observed in enumerate(sequence):
            headers.append('ONE-HOT_' + str(position) + '-' + amino_acid)
            values.append(1 if observed == amino_acid else 0)

    # THIRD: GENERATE MACCS KEYS
    mol = rdmolfiles.MolFromFASTA(sequence)
    # NOTE(review): RDKit parsers conventionally return None for input they
    # cannot parse -- confirm for MolFromFASTA. Failing here gives a clearer
    # message than the error GenMACCSKeys would raise on None.
    if mol is None:
        raise ValueError('RDKit could not build a molecule from sequence: '
                         + repr(sequence))
    fingerprint = MACCSkeys.GenMACCSKeys(mol)
    # ToBitString() yields the characters '0'/'1'; store them as ints so the
    # whole feature vector is numeric rather than a mix of numbers and strings.
    bits = [int(bit) for bit in fingerprint.ToBitString()]
    values.extend(bits)
    headers.extend(str(key) + '_maccs' for key in range(len(bits)))

    return values, headers

In [6]:
# Putting things together!
def RunFeats(sequences, protdcal):
    """Generate the feature matrix and label vector for a table of peptides.

    ``FeatureGen`` is called once per row on the value in the FIRST column of
    ``sequences`` (selected by position, not by name). The resulting rows are
    collected in a list and turned into a DataFrame in a single constructor
    call, instead of enlarging a DataFrame one row at a time.

    Parameters
    ----------
    sequences : pandas.DataFrame
        One row per site. The first column holds the peptide string and a
        column named 'METHYLATED' holds the label.
    protdcal : pandas.DataFrame
        Descriptor table forwarded unchanged to ``FeatureGen``.

    Returns
    -------
    features : pandas.DataFrame
        One row per input row, sharing ``sequences``' index; columns are the
        headers returned by ``FeatureGen`` for the first row.
    y : pandas.Series
        The 'METHYLATED' column of ``sequences``.

    Raises
    ------
    ValueError
        If ``sequences`` has no rows, or if a row yields a different number
        of features than the first row.
    """
    print('FEATURE GENERATION HAS COMMENCED !')

    total = len(sequences)
    if total == 0:
        # Without at least one row there are no column headers to build from.
        raise ValueError('sequences must contain at least one row')

    # Pull the peptide column out once rather than re-materialising
    # `sequences.values` on every iteration.
    peptides = sequences.iloc[:, 0].tolist()

    headers = None
    rows = []
    for done, peptide in enumerate(peptides, start=1):
        value, header = FeatureGen(peptide, protdcal)
        if headers is None:
            headers = header   # the first row defines the column layout
        elif len(value) != len(headers):
            # Row-wise assignment used to reject mismatched rows; keep that
            # strictness so ragged input is never silently padded.
            raise ValueError(
                'row ' + str(done - 1) + ' produced ' + str(len(value))
                + ' features, expected ' + str(len(headers)))
        rows.append(value)
        if done % 500 == 0:
            print('FEATURE GENERATION:', done, 'of', total, 'complete')

    # Build the frame once, carrying over the index of the input table.
    features = pd.DataFrame(rows, columns=headers, index=sequences.index)

    # Isolate the methylated condition from the sequences as our y value
    y = sequences['METHYLATED']

    return features, y

In [7]:
# Build the feature matrix (x) and labels (y) for every row of `sequences`.
s8_train_x, s8_train_y = RunFeats(sequences=sequences, protdcal=protdcal)

FEATURE GENERATION HAS COMMENCED !
FEATURE GENERATION: 500 of 4593 complete
FEATURE GENERATION: 1000 of 4593 complete
FEATURE GENERATION: 1500 of 4593 complete
FEATURE GENERATION: 2000 of 4593 complete
FEATURE GENERATION: 2500 of 4593 complete
FEATURE GENERATION: 3000 of 4593 complete
FEATURE GENERATION: 3500 of 4593 complete
FEATURE GENERATION: 4000 of 4593 complete
FEATURE GENERATION: 4500 of 4593 complete


In [8]:
# Attach the labels to the feature table as a 'METHYLATED' column (this
# modifies s8_train_x in place) and write the combined table to disk.
s8_train_x['METHYLATED'] = s8_train_y
s8_train_x.to_csv(path_or_buf='./feature_data.csv')