In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
data = pd.read_csv('../csv_data/cleaned_ord_5.csv')
data.head()

Unnamed: 0,pos,A-2,C-2,D-2,E-2,F-2,G-2,H-2,I-2,K-2,...,N2,P2,Q2,R2,S2,T2,V2,W2,Y2,-2
0,2,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [3]:
def get_aminos(df, i):
    """
    Given the dataframe and index, find the aminos that are "on" (ie 1)
    """
    ser = df.ix[i]
    
    aminos = []
    
    for i, v in enumerate(ser.values):
        if v==1:
            amino = ser.index.values[i]
            aminos.append(amino)
            
    return aminos

In [5]:
l = get_aminos(data, 0)
l

['--2', 'V-1', 'L0', 'S1', 'P2']

In [6]:
def make_prev_row(l):
    """
    Given positional aminos, ex:
    ['--2', 'V-1', 'L0', 'S1', 'P2']
    Reconstruct the missing previous seq
    
    Note: only works on position 2 of a protein
    """
    
    n = len(l)
    
    # get the letters without position
    aas = [v[0] for v in l]
    
    # now shift one
    aas = ['-'] + aas[:n-1]
    
    # generate the positions
    pos = map(str, range(-(n//2), (n//2)+1))
    
    # combine them
    r = [v+p for v, p in zip(aas, pos)]
    
    return r

In [7]:
r0 = make_prev_row(l)
r0

['--2', '--1', 'V0', 'L1', 'S2']

In [8]:
def construct_row(l, df):
    """
    Given positional aminos, ex:
    ['--2', '--1', 'V0', 'L1', 'S2']
    Reconstruct the dataframe row
    """
    
    cols = df.columns.values
    
    n = df.columns.shape[0]
    
    row = np.zeros(n)
    
    for aa in l:
        i = np.where(aa==cols) # finds the index that matches
        row[i] = 1
        
    # set the position to 1
    row[0] = 1
    
    # convert to simple ints
    row = np.asarray([int(v) for v in row])
        
    return row.reshape(1, -1)

In [10]:
full_r0 = construct_row(r0, data)
full_r0

array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]])

In [11]:
pd.DataFrame(full_r0, columns=data.columns.values)

Unnamed: 0,pos,A-2,C-2,D-2,E-2,F-2,G-2,H-2,I-2,K-2,...,N2,P2,Q2,R2,S2,T2,V2,W2,Y2,-2
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [189]:
def get_middle(df, i):
    return get_aminos(df, i)[2][0]

In [190]:
get_middle(data, 0)

'L'

### Now that we can do it for a single example, let's get the indices

In [59]:
def find_proteins(data):

    i = 1

    indices = []

    start_i = 0

    while i < data.shape[0]:
        curr_pos = data.ix[i]['pos']
        prev_pos = data.ix[i-1]['pos']

        # ie we've transitioned to a new start of a protein
        if prev_pos > curr_pos:
            print 'i:', i
            print 'prev:', prev_pos 
            print 'curr:', curr_pos
            ind = [start_i, i]
            indices.append(ind)
            start_i = i

        i += 1

    # dont forget to snag the last sequence!
    final = [start_i, data.shape[0]]
    indices.append(final)
    
    return indices

In [60]:
indices = find_proteins(data)

i: 140
prev: 141
curr: 2
i: 550
prev: 411
curr: 2
i: 695
prev: 146
curr: 2
i: 856
prev: 162
curr: 2
i: 1155
prev: 300
curr: 2
i: 2968
prev: 1814
curr: 2
i: 3314
prev: 347
curr: 2
i: 3721
prev: 408
curr: 2
i: 3883
prev: 163
curr: 2
i: 4035
prev: 153
curr: 2
i: 4310
prev: 276
curr: 2
i: 4372
prev: 63
curr: 2
i: 4535
prev: 164
curr: 2
i: 4817
prev: 283
curr: 2
i: 5270
prev: 454
curr: 2
i: 5434
prev: 165
curr: 2
i: 5846
prev: 413
curr: 2
i: 6040
prev: 195
curr: 2
i: 6332
prev: 293
curr: 2
i: 6727
prev: 396
curr: 2
i: 7216
prev: 490
curr: 2
i: 7531
prev: 316
curr: 2
i: 7872
prev: 342
curr: 2
i: 8110
prev: 239
curr: 2
i: 8593
prev: 484
curr: 2
i: 9188
prev: 596
curr: 2
i: 9252
prev: 65
curr: 2
i: 9943
prev: 692
curr: 2
i: 10041
prev: 99
curr: 2
i: 10534
prev: 494
curr: 2
i: 10694
prev: 161
curr: 2
i: 10949
prev: 256
curr: 2
i: 11195
prev: 247
curr: 2
i: 11490
prev: 296
curr: 2
i: 11902
prev: 413
curr: 2
i: 12131
prev: 230
curr: 2
i: 12498
prev: 368
curr: 2
i: 12727
prev: 230
curr: 2
i: 13123

In [61]:
print len(indices)
indices[:2]

347


[[0, 140], [140, 550]]

# Example of getting the first protein constructed

In [20]:
# fully captured first protein
protein_0 = data.values[indices[0][0]:indices[0][1]]
protein_0

array([[  2,   0,   0, ...,   0,   0,   0],
       [  3,   0,   0, ...,   0,   0,   0],
       [  4,   0,   0, ...,   0,   0,   0],
       ..., 
       [139,   0,   0, ...,   0,   0,   0],
       [140,   0,   0, ...,   0,   0,   1],
       [141,   0,   0, ...,   0,   0,   1]])

In [23]:
# now need to calculate the position 1 aa
protein_0_start = protein_0[0]
protein_0_start

array([2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0])

In [29]:
# will just change the first index of indices for each protein
protein0_row0 = construct_row(make_prev_row(get_aminos(data, indices[0][0])), data)
protein0_row0

array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]])

In [39]:
full_protein0 = np.concatenate((protein0_row0, protein_0))
print full_protein0.shape
full_protein0

(141, 106)


array([[  1,   0,   0, ...,   0,   0,   0],
       [  2,   0,   0, ...,   0,   0,   0],
       [  3,   0,   0, ...,   0,   0,   0],
       ..., 
       [139,   0,   0, ...,   0,   0,   0],
       [140,   0,   0, ...,   0,   0,   1],
       [141,   0,   0, ...,   0,   0,   1]])

In [51]:
protein0_df = pd.DataFrame(full_protein0[:, 1:], columns=data.columns[1:], index=full_protein0[:, 0])
protein0_df.head()

Unnamed: 0,A-2,C-2,D-2,E-2,F-2,G-2,H-2,I-2,K-2,L-2,...,N2,P2,Q2,R2,S2,T2,V2,W2,Y2,-2
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [57]:
protein0_df.to_csv('../proteins/protein0_data.csv')

In [58]:
saved_protein0 = pd.read_csv('../proteins/protein0_data.csv', index_col=0)
saved_protein0

Unnamed: 0,A-2,C-2,D-2,E-2,F-2,G-2,H-2,I-2,K-2,L-2,...,N2,P2,Q2,R2,S2,T2,V2,W2,Y2,-2
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
7,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
8,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
9,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [64]:
# now get the labels correct too
one_hot = pd.read_csv('../one_hot_labels.csv').drop(0)
one_hot.head()

Unnamed: 0,H,E,T,S,B,U
1,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0
5,1.0,0.0,0.0,0.0,0.0,0.0


In [65]:
protein0_labels = one_hot.values[indices[0][0]:indices[0][1]]
protein0_labels

array([[ 0.,  0.,  0.,  0.,  0.,  1.],
       [ 0.,  0.,  0.,  0.,  0.,  1.],
       [ 1.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0

In [63]:
one_hot.shape

(134815, 6)

In [66]:
protein0_labels.shape

(140, 6)

In [72]:
lr0 = one_hot.values[0].reshape(1, -1)
lr0

array([[ 0.,  0.,  0.,  0.,  0.,  1.]])

In [74]:
protein0_labels = np.concatenate((lr0, protein0_labels))

In [75]:
print protein0_labels.shape
print protein0_df.shape

(141, 6)
(141, 105)


In [80]:
protein0_labels_df = pd.DataFrame(protein0_labels, columns=one_hot.columns, index=xrange(1, 142))
protein0_labels_df.head()

Unnamed: 0,H,E,T,S,B,U
1,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,1.0
4,1.0,0.0,0.0,0.0,0.0,0.0
5,1.0,0.0,0.0,0.0,0.0,0.0


In [81]:
protein0_labels_df.to_csv('../proteins/protein0_labels.csv')

In [82]:
pd.read_csv('../proteins/protein0_labels.csv', index_col=0)

Unnamed: 0,H,E,T,S,B,U
1,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,1.0
4,1.0,0.0,0.0,0.0,0.0,0.0
5,1.0,0.0,0.0,0.0,0.0,0.0
6,1.0,0.0,0.0,0.0,0.0,0.0
7,1.0,0.0,0.0,0.0,0.0,0.0
8,1.0,0.0,0.0,0.0,0.0,0.0
9,1.0,0.0,0.0,0.0,0.0,0.0
10,1.0,0.0,0.0,0.0,0.0,0.0


# Now iterate over all of the indices and construct proteins and full DF

### DONT FORGET --- We need to add these rows to the labels data frame too

In [112]:
proteins = [np.asarray(data.values[s:e]) for s, e in indices]

In [124]:
len(proteins)

347

In [139]:
proteins[0][:10]

array([[ 2,  0,  0, ...,  0,  0,  0],
       [ 3,  0,  0, ...,  0,  0,  0],
       [ 4,  0,  0, ...,  0,  0,  0],
       ..., 
       [ 9,  0,  0, ...,  0,  0,  0],
       [10,  0,  0, ...,  0,  0,  0],
       [11,  0,  0, ...,  0,  0,  0]])

In [130]:
proteins[0].shape

(140, 106)

In [126]:
protein_labels = [np.asarray(one_hot.values[s:e]) for s, e in indices]

In [138]:
len(protein_labels)

347

In [135]:
protein_labels[0][:10]

array([[ 0.,  0.,  0.,  0.,  0.,  1.],
       [ 0.,  0.,  0.,  0.,  0.,  1.],
       [ 1.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.]])

In [129]:
protein_labels[0].shape

(140, 6)

In [131]:
proteins[0][:10]

array([[ 2,  0,  0, ...,  0,  0,  0],
       [ 3,  0,  0, ...,  0,  0,  0],
       [ 4,  0,  0, ...,  0,  0,  0],
       ..., 
       [ 9,  0,  0, ...,  0,  0,  0],
       [10,  0,  0, ...,  0,  0,  0],
       [11,  0,  0, ...,  0,  0,  0]])

In [140]:
construct_row(make_prev_row(get_aminos(data, indices[0][0])), data)

array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]])

In [183]:
get_aminos(data, 0)[2][0]

'L'

In [188]:
get_aminos(label_df, 1)[0]

'U'

In [157]:
protein_label0 = np.asarray([[0, 0, 0, 0, 0, 1]])

for i, (p, pl) in enumerate(zip(proteins, protein_labels)):
    
    # first construct the full protein
    protein_row0 = construct_row(make_prev_row(get_aminos(data, indices[i][0])), data)
    full_protein = np.concatenate((protein_row0, p))
    
    # then construct the protein label
    full_label = np.concatenate((protein_label0, pl))
    
    # construct dataframes with indexes
    protein_df = pd.DataFrame(full_protein[:, 1:], 
                              columns=data.columns[1:], 
                              index=full_protein[:, 0])
    
    actual_aa = 
    
    
    label_df = pd.DataFrame(full_label, 
                            columns=one_hot.columns, 
                            index=xrange(1, 1 + full_label.shape[0]))
    
    # construct filenames
    data_filename = '../proteins/protein' + str(i) + '_data.csv'
    label_filename = '../proteins/protein' + str(i) + '_labels.csv' 
    
    # finally, save
    protein_df.to_csv(data_filename)
    label_df.to_csv(label_filename)


In [158]:
last_protein = pd.read_csv('../proteins/protein346_data.csv', index_col=0)
last_protein.head()

Unnamed: 0,A-2,C-2,D-2,E-2,F-2,G-2,H-2,I-2,K-2,L-2,...,N2,P2,Q2,R2,S2,T2,V2,W2,Y2,-2
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [160]:
last_protein_labels = pd.read_csv('../proteins/protein346_labels.csv', index_col=0)
last_protein_labels.head()

Unnamed: 0,H,E,T,S,B,U
1,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,1.0,0.0,0.0
5,0.0,0.0,0.0,1.0,0.0,0.0


In [164]:
last_protein.shape

(499, 105)

In [165]:
last_protein_labels.shape

(499, 6)

In [169]:
# check that we still have the right breakdown
np.diff(indices[-1])+1

array([499])

In [172]:
last_protein.values[1:, :]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1]])

In [170]:
data.values[indices[-1][0]:indices[-1][1]]

array([[  2,   0,   0, ...,   0,   0,   0],
       [  3,   0,   0, ...,   0,   0,   0],
       [  4,   0,   0, ...,   0,   0,   0],
       ..., 
       [497,   0,   0, ...,   0,   0,   0],
       [498,   0,   0, ...,   0,   0,   1],
       [499,   0,   0, ...,   0,   0,   1]])

In [174]:
np.all(data.values[indices[-1][0]:indices[-1][1]][:, 1:] == last_protein.values[1:, :])

True

# Now concatenate all of the proteins together to make the fixed 5 file