**Load libraries**

In [1]:
import os, sys
from os.path import join, dirname, basename, exists, isdir

import numpy as np
import pandas as pd
from asra import asra_ordering


# Background
This notebook implements and tests the Adaptive Substituent Reordering Algorithm (ASRA) described in DOI: 10.1002/chem.201103811.

# Load data

In [2]:
# Read the tab-separated measurements table.
df = pd.read_csv('asra_data.tsv', sep='\t', skiprows=0)

# Extract the columns directly as NumPy arrays instead of appending to Python
# lists one row at a time. copy=True keeps the arrays independent of `df`,
# as the list-built originals were.
# X holds the two index columns side by side (presumably the residue indices
# at sites 215 and 217, going by the column names -- confirm against the data).
X = df[['idxat215', 'idxat217']].to_numpy(copy=True)
# y and sigma come from the 'eAverage' and 'eStd.' columns (presumably the mean
# and standard deviation of the measured property -- confirm against the data).
y = df['eAverage'].to_numpy(copy=True)
sigma = df['eStd.'].to_numpy(copy=True)

# Sanity check: X should be (n_rows, 2); y and sigma should be (n_rows,).
display(X.shape)
display(y.shape)
display(sigma.shape)

(95, 2)

(95,)

(95,)

## Set up encoders and decoders

In [3]:
# One-letter codes used as the alphabet for encoding; the position of each
# letter in this list is its integer index.
aa = [
    'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
    'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y',
]
idx = list(range(len(aa)))

# encoder maps letter -> index; decoder maps index -> letter.
encoder = {letter: code for code, letter in enumerate(aa)}
decoder = dict(enumerate(aa))


In [6]:
# Run ASRA on the loaded data, then print every row of the returned ordering
# translated to one-letter codes, followed by the returned Q array.
ordering, Q = asra_ordering(X, y, levels=20, sigma=sigma, w=1.0)
for row in ordering:
    letters = [decoder[code] for code in row]
    print(letters)
print(Q)

['K', 'Q']
['R', 'K']
['W', 'R']
['P', 'E']
['D', 'S']
['E', 'A']
['V', 'P']
['G', 'Y']
['A', 'W']
['H', 'T']
['I', 'H']
['Y', 'G']
['C', 'I']
['Q', 'C']
['L', 'F']
['T', 'M']
['N', 'V']
['S', 'N']
['M', 'D']
['F', 'L']
[[ -4.29530095  -4.31241206]
 [  3.1390699    2.85115565]
 [-10.51868483   9.64778507]
 [ -8.94709282  -7.54699874]
 [ 32.9477295    4.00286104]
 [ -5.16984252  -0.72974762]
 [ -1.94327226  -2.3661454 ]
 [ -1.19381279   1.95466822]
 [-15.40424494  -8.04463168]
 [  6.6229562   30.06786043]
 [ 15.4333719    4.77580976]
 [ 11.99478599   8.28311902]
 [-11.27804012  -3.97378972]
 [  4.6713264  -13.34431568]
 [-12.77174911  -7.76272991]
 [ 14.88445931  -6.52932923]
 [  7.42840974  -2.48316211]
 [ -6.83030188   7.47015864]
 [-11.5479679   -3.2422366 ]
 [  1.80326389  -3.26382188]]


### Test on more dimensions (**unfinished**)

In [18]:
# Test on simulated data with more than two mutation sites.

# Use a seeded local generator so a fresh-kernel re-run reproduces the same
# simulated data (the unseeded global generator gave different data each run).
SIM_SEED = 0
rng = np.random.RandomState(SIM_SEED)

# Draw 500 random 4-column rows of integers in [0, 5), then drop duplicate rows.
X_sim = rng.randint(5, size=(500, 4))
X_sim = np.unique(X_sim, axis=0)

# One uniformly distributed random response per remaining unique row.
y_sim = rng.uniform(low=0.5, high=13.3, size=(X_sim.shape[0],))

# The second return value is named explicitly rather than assigned to `_`,
# which would shadow IPython's last-output variable.
ordering, Q_sim = asra_ordering(X_sim, y_sim, levels=5)

# Translate the integer ordering to one-letter codes. Indexing with the whole
# integer array decodes every column at once, so no column count is hard-coded.
# The result is displayed explicitly; previously it was computed but discarded
# because it was not the cell's last expression.
ordering_letters = np.asarray(aa)[ordering]
display(ordering_letters)
X_sim.shape

(339, 4)

In [19]:
# Show the simulated response values (one entry per row of X_sim).
y_sim

array([10.29823081,  3.38493895,  2.57529609,  7.68926106,  8.96064931,
        8.8500369 ,  6.12292175,  3.00820965,  6.00628013,  9.05074317,
       10.5055813 , 13.01932882,  3.27607809, 11.33753769,  5.39593874,
        7.51206143,  5.84882507, 12.75978166,  8.5598264 ,  8.9019668 ,
        8.2897067 ,  5.76879892, 12.52846832,  7.1995339 ,  4.48849959,
        5.60762995,  0.81499721,  9.58634732, 11.80832185, 12.24407032,
       11.29179529,  5.95390663, 12.80207449,  6.27983676,  0.52052101,
       10.39143305,  2.42673838, 12.46225247,  1.99175028,  5.69980101,
        2.37421472,  4.60610478, 10.03989524, 13.23755159,  9.45089548,
        7.80218558,  2.53258294,  5.17622142,  4.74179645,  3.22632345,
       11.54505883,  2.2398063 ,  0.76059847,  3.69168724,  8.1586994 ,
        4.53073857,  6.08071988,  3.89888552,  6.80653613, 10.86183741,
        2.44660684,  3.37854203,  8.32827876,  7.97150027,  8.64407809,
        8.31028933, 10.08004848, 10.42977145,  2.65186184,  4.72