In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from collections import Counter


from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import matthews_corrcoef
#from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler

In [2]:
# Read both data files into memory as lists of raw lines (trailing newlines
# are kept here and stripped later when the lines are parsed).
with open('train.dat', 'r') as train_file:
    train = list(train_file)

with open('test.dat', 'r') as test_file:
    test = list(test_file)

In [3]:
# Each training line is "<integer label>\t<sequence>"; each test line holds
# only the sequence.
#   ytr -> integer labels of the training set
#   tr  -> training sequences (surrounding whitespace removed)
#   te  -> test sequences (surrounding whitespace removed)
ytr, tr = [], []
for line in train:
    fields = line.split('\t')
    ytr.append(int(fields[0]))
    tr.append(fields[1].strip())
te = [line.strip() for line in test]

In [4]:
def kmer(p, k=2):
    """Return every contiguous window of length ``k`` from ``p``, left to right.

    Parameters
    ----------
    p : sliceable sequence (a string in this notebook)
    k : window length (default 2)

    Returns
    -------
    list
        ``len(p) - k + 1`` windows; an empty list when ``p`` is shorter than ``k``.
    """
    windows = []
    for start in range(len(p) - k + 1):
        windows.append(p[start:start + k])
    return windows

def kmers(p, k=2):
    """Collect the windows of every length from 1 up to ``k - 1`` of ``p``.

    The upper bound is exclusive: ``kmers(p, k=3)`` yields all 1-mers followed
    by all 2-mers, and the default ``k=2`` yields only the single elements.
    Shorter windows come first; within one length they run left to right.
    """
    return [piece for size in range(1, k) for piece in kmer(p, k=size)]

In [5]:
# Replace every sequence with its list of k-mer features (k=3 -> 1-mers and
# 2-mers, see kmers()).
# NOTE: tr/te are overwritten with their own transformation, so this cell must
# not be run twice without first re-running the parsing cell above.
tr = [kmers(seq, 3) for seq in tr]
te = [kmers(seq, 3) for seq in te]

In [6]:
# Build the feature dictionary: every distinct k-mer seen in the training or
# test data gets the next free integer id, which is its column index. Each
# peptide is then represented by a vector with one entry per dictionary key
# (436 for this data), holding how often that k-mer occurs in the peptide.
# A dense vector stores every entry, including the many zeros; a sparse
# representation stores only the non-zero counts and is much cheaper.

mp = {}  # k-mer -> column id
for peptide in tr + te:
    for feature in peptide:
        # len(mp) is evaluated before the insert, so ids are 0, 1, 2, ...
        mp.setdefault(feature, len(mp))

len(mp)

436

In [7]:
# For each peptide we count how often each feature occurs. This version is
# dense (zeros are stored too); sparse() below keeps only the non-zero counts.

# transforming peptide to vector
def dense(p, vocab=None):
    """Return the dense feature-count vector of one peptide.

    Parameters
    ----------
    p : iterable of features (k-mers) of a single peptide
    vocab : dict mapping feature -> column index, optional
        Defaults to the notebook-level dictionary ``mp``. Passing it
        explicitly removes the dependency on global state.

    Returns
    -------
    numpy.ndarray
        Float array of length ``len(vocab)``; entry ``vocab[f]`` is the number
        of times feature ``f`` occurs in ``p``.

    Raises
    ------
    KeyError
        If ``p`` contains a feature that is not in ``vocab``.
    """
    if vocab is None:
        vocab = mp  # fall back to the dictionary built earlier in the notebook
    x = np.zeros(len(vocab))
    for e in p:
        x[vocab[e]] += 1
    return x

# Sanity check: dense count vector of the first training peptide.
dense(p=tr[0])

array([2., 2., 2., 3., 1., 1., 2., 1., 1., 1., 2., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [8]:
# Sparse dataset built from all the peptides. A CSR matrix stores only the
# non-zero counts; zeros are implicit.
def sparse(ds, vocab=None):
    """Build a CSR feature-count matrix with one row per peptide.

    Parameters
    ----------
    ds : sized sequence of peptides, each an iterable of features (k-mers)
    vocab : dict mapping feature -> column index, optional
        Defaults to the notebook-level dictionary ``mp``. Passing it
        explicitly removes the dependency on global state.

    Returns
    -------
    scipy.sparse.csr_matrix
        Shape ``(len(ds), len(vocab))`` with float values; entry ``(i, j)`` is
        how often the feature with column id ``j`` occurs in peptide ``i``.
        Within a row, entries follow ``Counter.most_common()`` order, so column
        indices are not necessarily sorted.

    Raises
    ------
    KeyError
        If a peptide contains a feature that is not in ``vocab``.
    """
    if vocab is None:
        vocab = mp  # fall back to the dictionary built earlier in the notebook
    nrows = len(ds)  # number of rows
    ptr = np.zeros(nrows + 1, dtype=int)  # ptr[i]:ptr[i+1] delimits row i
    ind = []  # column index of every stored value
    val = []  # the stored (non-zero) counts
    # One pass: each peptide is counted once and the row pointer is derived
    # from the number of values stored so far.
    for i, p in enumerate(ds):
        for e, c in Counter(p).most_common():
            ind.append(vocab[e])
            val.append(c)
        ptr[i + 1] = len(ind)
    return csr_matrix(
        (np.asarray(val, dtype=float), np.asarray(ind, dtype=int), ptr),
        shape=(nrows, len(vocab)),
    )

In [9]:
# CSR count matrices for the training and test peptides; both use the same
# feature dictionary, so their columns line up.
sptr = sparse(ds=tr)
spte = sparse(ds=te)

In [10]:
# Number of training rows. Read it from the matrix shape instead of
# materialising the whole dense array just to take its length.
sptr.shape[0]

1566

In [13]:
# Dense arrays handed to the resampler and the classifier.
# NOTE(review): only the first 783 labelled rows are used for fitting. This
# looks like a leftover from the disabled hold-out experiment, which evaluated
# on sptr[782:1565] / ytr[782:1565]; now that X_test is the real test set,
# consider fitting on all labelled rows. TODO confirm.
X_test = spte.toarray()
X_train = sptr[:783].toarray()
y_train = ytr[:783]

In [14]:
#clf = MLPClassifier(random_state=1, max_iter=100).fit(X_train, y_train) # X_train, y_train
#predic = clf.predict(X_test) # X_test
#clf.score(X_test, y_test) # X_test, y_test
#mcc = matthews_corrcoef(y_test, predic) # y_test, predic
#print(mcc)

In [18]:
# Balance the classes by randomly re-sampling the minority class, then fit
# the MLP on the balanced data and predict the test set.
# random_state is now fixed on the sampler as well as on the classifier:
# without it the resampled training set changed on every run, so the
# predictions were not reproducible.
oversample = RandomOverSampler(sampling_strategy='minority', random_state=1)
sptr_over, ytr_over = oversample.fit_resample(X_train, y_train)
clf_over = MLPClassifier(random_state=1, max_iter=100).fit(sptr_over, ytr_over)
predic_over = clf_over.predict(X_test)
# Disabled hold-out evaluation (needs a labelled y_test):
#mcc_over = matthews_corrcoef(y_test, predic_over)
#print(mcc_over)

In [19]:
# Reuse the predictions computed in the previous cell rather than running the
# same fitted model over X_test a second time; the result is identical.
perceptronPredictions = predic_over

In [20]:
# Write one predicted label per line (no trailing newline after the last one).
with open("test.txt", 'w') as out_file:
    out_file.write("\n".join(str(label) for label in perceptronPredictions))