In [1]:
import pandas as pd
import numpy as np

In [2]:
raw_training_data = pd.DataFrame.from_csv("/workspace/notebooks_data/training_set.csv")

In [3]:
# Read and return the Saxon test set 
def fetch_testing_data():
    df = pd.DataFrame.from_csv("data/proteasomal_cleavage/s6_in_vivo_mhc_1_ligands_dataset.csv")
    df = df[["Sequences", "Activity"]]
    df.columns = ["sequence", "is_cleaved"]
    df.is_cleaved[df.is_cleaved == -1] = 0
    return df

raw_testing_data = fetch_testing_data()

In [4]:
from aa_props import seq_to_aa_props
from sklearn.preprocessing import MinMaxScaler

# We need to remove the Saxova samples from our training set
print "There are %d sample before removing the Saxova sequences." % (
    raw_training_data.shape[0])
raw_training_data = raw_training_data[raw_training_data.sequence.isin(
    raw_testing_data.sequence) == False]
print "There are %d sample after filtering out Saxova." % (
    raw_training_data.shape[0])

# Filter to check for selenocysteine (TODO) and an invalid "'"
seq_filter = lambda seq: seq.find("U") == -1 and seq.find("'") == -1
# Filter AA seqs and expand to AA features
training_X_y = [(seq_to_aa_props(seq), is_cleaved)
             for (i, seq, is_cleaved) in raw_training_data.itertuples()
             if seq_filter(seq)]

# Scale the data
scaler = MinMaxScaler()

training_X = pd.DataFrame(scaler.fit_transform([x for (x, y) in training_X_y]))
training_y = pd.DataFrame([y for (x, y) in training_X_y])

# We have 50 attributes for each AA. Get the scaling for each.
training_data_scale = scaler.scale_[0:50]

There are 49138 sample before removing the Saxova sequences.
There are 49116 sample after filtering out Saxova.


In [5]:
data_dir = "/workspace/notebooks_data/"
train_file = data_dir + "training_data.ll"

with open(train_file, 'w') as out:
    for (x, y) in zip(training_X.itertuples(), training_y.itertuples()):
        out.write("%s1 " % ('+' if y._1 == 1 else '-'))
        for (i, score) in enumerate(x):
            out.write("%d:%s " % (i + 1, score))
        out.write("\n")

In [6]:
findC = !train -C -s 0 $train_file
print "\n".join(findC)

Doing parameter search with 5-fold cross validation.
log2c= -47.00	rate=49.4757
log2c= -46.00	rate=49.4757
log2c= -45.00	rate=49.4757
log2c= -44.00	rate=49.4757
log2c= -43.00	rate=49.4757
log2c= -42.00	rate=49.4757
log2c= -41.00	rate=49.4757
log2c= -40.00	rate=49.4757
log2c= -39.00	rate=49.4777
log2c= -38.00	rate=49.4777
log2c= -37.00	rate=49.4798
log2c= -36.00	rate=49.4798
log2c= -35.00	rate=49.4798
log2c= -34.00	rate=49.4859
log2c= -33.00	rate=49.492
log2c= -32.00	rate=49.5022
log2c= -31.00	rate=49.5225
log2c= -30.00	rate=49.5571
log2c= -29.00	rate=49.6386
log2c= -28.00	rate=49.7567
log2c= -27.00	rate=50.1191
log2c= -26.00	rate=50.5854
log2c= -25.00	rate=51.9883
log2c= -24.00	rate=54.1975
log2c= -23.00	rate=59.1188
log2c= -22.00	rate=63.979
log2c= -21.00	rate=74.0089
log2c= -20.00	rate=75.2428
log2c= -19.00	rate=77.0855
log2c= -18.00	rate=77.3359
log2c= -17.00	rate=80.7912
log2c= -16.00	rate=80.8299
log2c= -15.00	rate=81.06
log2c= -14.00	rate=81.2066
log2c= -13.00	rate=81.4672
log2c=

In [8]:
bestC = findC[-1].split(" ")[3]
print "Using C=%s to create the model" % (bestC)
createModel = !train -c $bestC -s 0 $train_file
print "\n".join(createModel)

Using C=0.00390625 to create the model
iter  1 act 5.717e+01 pre 5.046e+01 delta 1.834e+00 f 1.330e+02 |g| 1.141e+02 CG   7
iter  2 act 1.209e-02 pre 1.209e-02 delta 1.834e+00 f 7.581e+01 |g| 2.550e+04 CG   1
iter  3 act 1.018e+01 pre 8.770e+00 delta 1.834e+00 f 7.580e+01 |g| 2.406e+01 CG  11
iter  4 act 4.721e-05 pre 4.721e-05 delta 1.834e+00 f 6.562e+01 |g| 1.394e+03 CG   1
iter  5 act 1.151e+00 pre 1.072e+00 delta 1.834e+00 f 6.562e+01 |g| 5.582e+00 CG  16
iter  6 act 3.102e-05 pre 3.102e-05 delta 1.834e+00 f 6.447e+01 |g| 1.063e+03 CG   1
iter  7 act 2.671e-02 pre 2.636e-02 delta 1.834e+00 f 6.447e+01 |g| 8.769e-01 CG  16
iter  8 act 5.240e-08 pre 5.240e-08 delta 1.834e+00 f 6.445e+01 |g| 4.333e+01 CG   1


TODO: use trainging_data_scale to generate C matrix for AA features