### Imports

In [13]:
from __future__ import print_function
import pandas as pd
import numpy as np
import os
import json

### Single File

In [9]:
# Load one example CSV and pull out its 'SS' column, which holds the
# per-row structural labels (presumably DSSP secondary-structure codes,
# going by the directory name -- TODO confirm).
df = pd.read_csv('../clean_dssp_csv/1i3n.csv')
ex_ss = df['SS']
# Shape of the label Series: one entry per row of the CSV.
print(ex_ss.shape)

(692,)


#### Structural Labels

In [10]:
# Peek at the first 15 raw labels (single-letter string codes).
print(ex_ss.values[:15])

['C' 'C' 'E' 'E' 'E' 'E' 'E' 'T' 'T' 'T' 'S' 'H' 'H' 'H' 'H']


#### Numerical Labels

In [17]:
# get encoding: JSON file mapping each label string to an integer code
with open('../Encodings/label_encoding.json') as f:
    enc = json.load(f)

# show the loaded mapping
print(enc)

{u'C': 0, u'B': 4, u'E': 2, u'G': 7, u'I': 5, u'H': 1, u'S': 6, u'T': 3}


In [18]:
# look up the integer code for each label, then pack the codes into an array
codes = [enc[label] for label in ex_ss.values]
num_labels = np.asarray(codes)

# preview the first 15 codes
print(num_labels[:15])

[0 0 2 2 2 2 2 3 3 3 6 1 1 1 1]


#### One Hot Labels

In [19]:
def construct_one_hot(labels):
    """
    One-hot encode structural labels against a fixed 8-letter alphabet.

    INPUT: labels is a single column DataFrame; a Series, list or array
           of label strings is accepted as well. Empty input is allowed.
    OUTPUT: data as a dummitized DataFrame -- one row per label, integer
            0/1 values, columns ['B', 'C', 'E', 'G', 'H', 'I', 'S', 'T']
    RAISES: ValueError if a label is not one of the eight column names
    """
    # initial variables
    cols = ['B', 'C', 'E', 'G', 'H', 'I', 'S', 'T']
    position = dict((c, i) for i, c in enumerate(cols))

    # flatten whatever was passed in (DataFrame, Series, list, ...) to 1-D
    flat = np.asarray(labels).reshape(-1)

    # column index of the "hot" entry for every row
    hot = []
    for l in flat:
        if l not in position:
            raise ValueError('unknown label %r; expected one of %s' % (l, cols))
        hot.append(position[l])
    hot = np.asarray(hot, dtype=int)

    # allocate the full matrix up front so that empty input still yields a
    # (0, 8) integer frame instead of failing in the DataFrame constructor
    data = np.zeros((len(hot), len(cols)), dtype=int)
    data[np.arange(len(hot)), hot] = 1

    return pd.DataFrame(data, columns=cols)

In [21]:
# Double brackets keep 'SS' as a one-column DataFrame, the input type that
# construct_one_hot documents.
one_hot_labels = construct_one_hot(df[['SS']])
one_hot_labels.head()

Unnamed: 0,B,C,E,G,H,I,S,T
0,0,1,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0
3,0,0,1,0,0,0,0,0
4,0,0,1,0,0,0,0,0


### On all files

In [2]:
# Take only the first os.walk() entry: the top directory itself and the files
# directly inside it (the sub-directory names are discarded).
topdir, _, files = next(os.walk('../clean_dssp_csv/'))

In [None]:
n = len(files)

# Make sure the destination exists: DataFrame.to_csv does not create
# missing directories and would otherwise fail on the first file.
out_dir = '../one_hot_labels/'
if not os.path.isdir(out_dir):
    os.makedirs(out_dir)

# NOTE: this loop rebinds the notebook-level `df` on every iteration.
for i, fi in enumerate(files):
    per = int((i+1)*100.0/n)

    # print progress ('\r' with end='' redraws the same output line)
    print('\r progress: '+str(per)+'%', end='')

    # get the data
    df = pd.read_csv(os.path.join(topdir, fi))

    # get labels (double brackets keep a one-column DataFrame)
    labels = df[['SS']]

    # one hot
    one_hot = construct_one_hot(labels)

    # write out under the same file name as the input
    one_hot.to_csv(os.path.join(out_dir, fi), index=False)