# Logistic Regression

The data used here is the *gene expression cancer RNA-Seq* data set from the UCI Machine Learning Repository: https://archive.ics.uci.edu/ml/datasets/gene+expression+cancer+RNA-Seq

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the expression table and drop its first column so that only the
# gene_* feature columns remain (presumably the dropped column is a sample
# identifier -- confirm against the CSV).
expression_df = pd.read_csv('../datasets/pancan-rna-seq/data.csv').iloc[:, 1:]
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() only echoed a stray "None".
expression_df.info()
# Downstream cells slice `data` as a plain NumPy array (rows = samples).
data = expression_df.values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 801 entries, 0 to 800
Columns: 20531 entries, gene_0 to gene_20530
dtypes: float64(20531)
memory usage: 125.5 MB
None


In [3]:
# Load the class labels and drop the first column, keeping only the single
# label column.
labels_df = pd.read_csv('../datasets/pancan-rna-seq/labels.csv').iloc[:, 1:]
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() only echoed a stray "None".
labels_df.info()
# pre-processing data by encoding string labels to integers
label2id = {
    'BRCA' : 0,
    'COAD' : 1,
    'KIRC' : 2,
    'LUAD' : 3,
    'PRAD' : 4
}
# A label that is missing from label2id raises KeyError here, which surfaces
# unexpected classes instead of silently mis-encoding them.
labels = np.array([label2id[label] for label in labels_df.values.flatten()])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 801 entries, 0 to 800
Data columns (total 1 columns):
Class    801 non-null object
dtypes: object(1)
memory usage: 6.3+ KB
None


In [4]:
class LogisticRegression:
    """Multi-class linear classifier scored on class-mean projections.

    For every class ``k`` the constructor stores the mean training vector
    ``f_avg[k]``. A sample ``x`` is described, per class, by its projection
    onto that mean; the class score is the dot product of the weight vector
    with that projection, and the predicted class is the one with the highest
    score (equivalently, the highest softmax probability).

    Predictions are row indices into ``f_avg`` (ordered like ``np.unique(Y)``),
    so they coincide with the labels only when the labels are ``0..K-1``.
    """

    def __init__(self, X, Y, N, seed=None):
        """Compute the class means, initialise the weights and train.

        Args:
            X: 2-D array of training samples, one row per sample.
            Y: 1-D array of integer class labels aligned with the rows of X.
            N: number of passes over the training data.
            seed: optional seed for the weight initialisation. ``None`` (the
                default) draws from NumPy's global RNG, as before.
        """
        self.N = N
        # feature averages for respective labels
        self.f_avg = np.array([X[Y == label].mean(axis=0) for label in np.unique(Y)])
        # size of weights array must be equal to the number of features
        if seed is None:
            self.weights = np.random.random([1, X.shape[1]])
        else:
            self.weights = np.random.RandomState(seed).random_sample([1, X.shape[1]])
        self.bias = 0
        self.train(X, Y)

    def features(self, x):
        """Return the projection of sample ``x`` onto every class mean.

        Row ``k`` of the result is ``(x . f_k) / (f_k . f_k) * f_k``.
        A class mean that is all zeros gives a 0/0 coefficient (nan).
        """
        # Only the squared norm of each mean is needed, so compute it row-wise
        # instead of building the full K x K Gram matrix and taking its diagonal.
        sq_norms = (self.f_avg * self.f_avg).sum(axis=1)
        coeffs = x.dot(self.f_avg.T) / sq_norms
        return self.f_avg * np.reshape(coeffs, (-1, 1))

    def train(self, X, Y):
        """Run ``self.N`` mistake-driven passes over ``(X, Y)``, updating in place."""
        for _ in range(self.N):
            # process each data point
            for i in range(X.shape[0]):
                Yi_hat = int(self.predict(X[i, :]))
                # check if incorrect
                if Y[i] != Yi_hat:
                    # NOTE(review): the update scales the raw sample by its
                    # integer class id, so a misclassified sample of class 0
                    # changes nothing. This looks like the binary (+1/-1)
                    # perceptron rule rather than a softmax gradient --
                    # confirm intent before relying on the learned weights.
                    self.weights += Y[i] * X[i, :]
                    # NOTE(review): bias is accumulated here but no method of
                    # this class reads it.
                    self.bias += Y[i]

    def predict_batch(self, X):
        """Return an array with the predicted class index for each row of ``X``."""
        return np.array([self.predict(x) for x in X])

    def _scores(self, x):
        """Return the 1-D vector of raw class scores ``w . f(x, y)``."""
        return self.weights.dot(self.features(x).T).ravel()

    def predict_proba(self, x):
        """Return the softmax probabilities over classes for sample ``x``."""
        scores = self._scores(x)
        # Softmax is invariant to shifting all scores; subtracting the maximum
        # keeps every exponent <= 0 so exp() cannot overflow.
        shifted = np.exp(scores - scores.max())
        return shifted / shifted.sum()

    def predict(self, x):
        """Return the index of the highest-scoring class for sample ``x``."""
        # decision rule: argmax_y[exp(w.f(x,y)^T) / sum_y(exp(w.f(x,y)^T))]
        # exp() and the shared denominator are strictly increasing in the
        # score, so the argmax of the raw scores is the same class. Dividing
        # the scores by their sum (the previous overflow workaround) reversed
        # the ordering whenever that sum was negative.
        return np.argmax(self._scores(x))

    def calculate_accuracy(self, X_test, Y_test):
        """Return the fraction of rows in ``X_test`` predicted as ``Y_test``."""
        Y_pred = self.predict_batch(X_test)
        return np.mean(np.equal(Y_pred, Y_test), dtype=float)

In [6]:
%%time
# 75-25% split
train_len = 600
log_reg = LogisticRegression(data[:train_len], labels[:train_len], 1)
print("Accuracy: ", log_reg.calculate_accuracy(data[train_len:], labels[train_len:]))

Accuracy:  0.9800995024875622
CPU times: user 1.52 s, sys: 345 ms, total: 1.86 s
Wall time: 1.05 s
