# Microarray Classifying

In [None]:
# To make sure all of the correct libraries are installed, import each module and print the version number
# Check versions of the libraries
import sys
import scipy
import numpy
import matplotlib
import pandas
import sklearn
import skrebate

# Report the interpreter version, then the version of every imported library.
# Each label (name plus colon) is left-justified to 12 characters so that the
# version strings start in the same column.
python_label = 'Python:'
print(f'{python_label:<12}{sys.version}')
for module in (scipy, numpy, matplotlib, pandas, sklearn, skrebate):
    label = module.__name__ + ':'
    print(f'{label:<12}{module.__version__}')

In [None]:
# Import, change module names
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys
import time
from sklearn.pipeline import make_pipeline
from skrebate import ReliefF
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split, LeaveOneOut, KFold, StratifiedKFold
from sklearn.linear_model import Lasso
from sklearn import preprocessing
from sklearn.svm import LinearSVC

In [None]:
# Dataset            Disease          Samples  Features  Classes
# Alon (1999)        Colon Cancer      62       2000     2
# Burczynski (2006)  Crohn’s Disease  127      22283     3
# Golub (1999)       Leukemia          72       7129     2

# Alon, U., Barkai, N., Notterman, D. A., Gish, K., Ybarra, S., Mack, D., & Levine, A. J. (1999). 
# Broad patterns of gene expression revealed by clustering analysis of tumor and normal colon tissues probed by oligonucleotide arrays. 
# Proceedings of the National Academy of Sciences, 96(12), 6745-6750.

# Burczynski, M. E., Peterson, R. L., Twine, N. C., Zuberek, K. A., Brodeur, B. J., Casciotti, L., ... & Spinelli, W. (2006). 
# Molecular classification of Crohn’s disease and ulcerative colitis patients using transcriptional profiles in peripheral blood mononuclear cells. 
# The journal of molecular diagnostics, 8(1), 51-61.

# Golub, T. R., Slonim, D. K., Tamayo, P., Huard, C., Gaasenbeek, M., Mesirov, J. P., ... & Bloomfield, C. D. (1999). 
# Molecular classification of cancer: class discovery and class prediction by gene expression monitoring. 
# Science, 286(5439), 531-537

# Run configuration.
#   name: dataset to load, one of 'alon', 'burczynski' or 'golub'.
#   type: feature-selection method; 'lasso' selects the Lasso features, any
#         other value selects the ReliefF ranking in a later cell.
# NOTE(review): `type` shadows the Python builtin; it is kept because later
# cells read this exact name.
name = "alon"
type = "lasso"

# Read the expression matrix and the class labels; neither CSV has a header row.
features = pd.read_csv(f'data/{name}_inputs.csv', header=None)
labels = pd.read_csv(f'data/{name}_outputs.csv', header=None)

In [None]:
# Replace missing values with 0 and continue with the raw NumPy matrix.
features = features.fillna(0).values
print(features)

# Flatten the label column to 1-D and subtract 1 from every label
# (presumably the CSV labels are 1-based -- confirm against the data files),
# then store them as integers.
labels = (labels.values.ravel() - 1).astype(int)
print(labels)

In [None]:
# Rescale every feature column with a min-max scaler: learn the per-column
# ranges from the full matrix first, then apply them to that same matrix.
min_max_scaler = preprocessing.MinMaxScaler()
min_max_scaler.fit(features)
features = min_max_scaler.transform(features)

## Lasso

In [None]:
# Fit an L1-regularised linear model on all samples; a feature counts as
# selected when its coefficient is non-zero.
lasso = Lasso(alpha=0.001)
lasso.fit(features, labels)

# np.where returns a tuple with one index array per dimension. Taking element
# [0] yields a flat 1-D array of column positions, the same form the selection
# cell builds from the saved file. Wrapping the tuple in np.asarray instead
# would give a 2-D (1, k) array, and indexing columns with it
# (`x[:, indexes]`) would produce a 3-D result.
indexes = np.where(lasso.coef_ != 0)[0]

# Persist the coefficients so the selection cell can reload them.
np.savetxt('features/' + name + '_lasso.txt', lasso.coef_)

## Relief

In [None]:
# Score every feature with ReliefF (default settings) on all samples and save
# the per-feature importances for the selection cell.
fs = ReliefF()
fs.fit(features, labels)
relieff_path = f'features/{name}_relieff.txt'
np.savetxt(relieff_path, fs.feature_importances_)

In [None]:
# Choose the feature columns used by the classifier.
#   type == 'lasso': every feature whose saved Lasso coefficient is non-zero.
#   any other value: the same NUMBER of features, taken from the top of the
#                    saved ReliefF ranking (highest importance first).
# The Lasso coefficients are needed in both cases, so they are loaded once.
lasso_coefs = np.loadtxt('features/' + name + '_lasso.txt')
indexes = np.where(lasso_coefs != 0)[0]

if type == 'lasso':
    gains = lasso_coefs
else:
    n_selected = indexes.shape[0]
    gains = np.loadtxt('features/' + name + '_relieff.txt')
    # Reverse the ascending argsort first and then take the first n_selected
    # entries. Slicing with [-n_selected:] would return ALL features when
    # n_selected is 0, because -0 == 0 and [0:] is the whole array.
    indexes = gains.argsort()[::-1][:n_selected]

In [None]:
# Leave-one-out evaluation of a linear SVM on the selected feature columns.
# Each iteration trains on all samples but one and scores the held-out sample,
# so every entry of `scores` is the accuracy on a single sample.
# NOTE(review): the min-max scaling and the Lasso/ReliefF feature selection in
# the earlier cells were fitted on ALL samples, including the one held out
# here, so the reported score is likely optimistic.
scores = []
loo = LeaveOneOut()
startTime = time.time()

for train_index, test_index in loo.split(features):
    # Restrict to the selected columns and cast to float32, as before.
    X_train = features[train_index][:, indexes].astype('float32')
    X_test = features[test_index][:, indexes].astype('float32')
    Y_train, Y_test = labels[train_index], labels[test_index]

    # A fresh classifier per fold; random_state fixed for reproducibility.
    clf = LinearSVC(random_state=0)
    clf.fit(X_train, Y_train)
    scores.append(clf.score(X_test, Y_test))

endTime = time.time()

In [None]:
# Save the mean leave-one-out accuracy and the elapsed wall-clock time of the
# evaluation loop. The `with` block closes the file on exit.
results_path = f'results/{name}_svm_{type}.txt'
with open(results_path, 'w') as out:
    out.write(f'Score: {np.average(scores)!s}\n')
    out.write(f'Time: {endTime - startTime!s}')

In [None]:
# Echo the same summary (mean accuracy and elapsed seconds) to the output.
print(f'Score: {np.average(scores)!s}')
print(f'Time: {endTime - startTime!s}')