In [0]:
#Linear Classification / Logistic Regression

In [0]:
#importing lib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [0]:
class LinearClassification(object):
    
    def __init__(self):
        #defining hyperparams
        self.learning_rate = 0.0001
        self.batch_size = 200
        self.no_of_iter = 1000
        #videcu za ovaj
        self.reg = 0.000001
        
    
    #Input NOTE: X - matrix of data, can be used on images or numerical data (N x D)
    #          N - Number of samples, D - Number of features
    #          In case you use images make sure that X.shape[0] represent NUMBER of samples
    #          y - labels (Nx1)
    def fit(self, X, y):
        self.X_train = X
        self.y_train = y
        
        #0 notation - so we add + 1 to max value from y
        self.no_of_classes = np.max(y) + 1
        
        #defining hyperparams
        # W - matrix of weights (No_of_classes x No_of_features)
        self.W = np.random.rand(self.no_of_classes, self.X_train.shape[1]) * 0.001
        
        self.W, loss_history = self.SGD(self.W, self.X_train, self.y_train, self.learning_rate, self.batch_size, self.no_of_iter, self.reg)
        
        return loss_history
    
    #STOCHASTIC GRADIENT DESCENT
    #Inputs: W - weights that we are trying to update
    #        X - feautere of training set
    #        y - wanted labels
    #        learning_rate - how fast it is going to find good parameters
    #        batch_size - how big PART of training set algo is using per iter
    #        no_of_iter -  how many times it is going to run
    #        reg - regularization
    #
    #Outputs: W_updated - updated weights matrix acording to loss function used
    #         loss_history for verbose reptresentation of our loss computation
    def SGD(self, W, X, y, learning_rate, batch_size, no_of_iter, reg):
        W_updated = W
        
        no_of_train = X.shape[0]
        #It is not necessities, but we can define loss_hitory to be sure that algo is working good
        loss_history = []
        
        for i in range(no_of_iter):
            batch_inx = np.random.choice(no_of_train, batch_size, replace=True)
            #creting smallers train sets to fit in our SGD
            X_batch = X[batch_inx,:]
            y_batch = y[batch_inx]
            
            
            loss, grad = self.SVM_classfier(W_updated, X_batch, y_batch, reg)
            loss_history.append(loss)
            #Update W:
            W_updated = W_updated - (learning_rate * grad)
            
        return W_updated, loss_history
            
    #Inputs: W - current weights
    #        X - training set features
    #        y - training set labels
    #        reg - regularization strenght
    #
    #Outputs: gradient_W - values to updated starting W
    #         loss - to see if we are updaing in good direction
    def SVM_classfier(self, W, X, y, reg):
        
        no_of_classes = np.max(y) + 1
        #creating matrix with zeros, same shape as starting weights
        
        gradient_W = np.zeros(W.shape)
        
        loss = 0.0 
        for i in range(X.shape[0]):
            #First we need to multiply weights and x for particular sample
            #need to transpose to long vector current sample
            scores = W.dot(X[i, :].T)
            #we are getting values for currect class
            correct_class = scores[y[i]]
            for j in range(no_of_classes):
                if j == y[i]:
                    continue
                # This is simple formula for SVM
                current_class_margin = scores[j] - correct_class + 1 #one is 
                if current_class_margin > 0:
                    loss +=  current_class_margin
                
                    gradient_W[y[i]:1, :] -= X[i, :] #This is where we are creating gradient for CURRECT class
                    gradient_W[j:1, :] += X[y[i], :]
        
        #average over number of train samples
        loss /= X.shape[0]
        gradient_W /= X.shape[0]
        
        loss += 0.5 * reg * np.sum(W * W)
        
        gradient_W += reg*W
    
        return loss, gradient_W
    
    #Predict function
    #Input: X - test set 
    #
    #Output: predict - list of classes
    def predict(self, X):
        pred = []
        for i in range(X.shape[0]):
            pred.append(np.argmax(np.dot(self.W,X[i, :].T)))
        return pred

In [0]:
#to check how much did algo predict right
def accuracy(y_tes, y_pred):
    correct = 0
    for i in range(len(y_pred)):
        if(y_tes[i] == y_pred[i]):
            correct += 1
    return (correct/len(y_tes))*100

In [0]:
def run():
    # Importing the dataset
    dataset = pd.read_csv('breastCancer.csv')
    dataset.replace('?', 0, inplace=True)
    dataset = dataset.applymap(np.int64)
    X = dataset.iloc[:, 1:-1].values    
    y = dataset.iloc[:, -1].values
    #handling labels column
    y_new = []
    for i in range(len(y)):
        if y[i] == 2:
            y_new.append(0)
        else:
            y_new.append(1)
    y_new = np.array(y_new)

    
    # Splitting the dataset into the Training set and Test set
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y_new, test_size = 0.25, random_state = 0)
    

    # Feature Scaling
#     from sklearn.preprocessing import StandardScaler
#     sc = StandardScaler()
#     X_train = sc.fit_transform(X_train)
#     X_test = sc.transform(X_test)

   
    classifier = LinearClassification()
    loss_history = classifier.fit(X_train, y_train)
    
    y_pred = classifier.predict(X_test)
    
    #Sklearn test
    from sklearn.linear_model import LogisticRegression
    reg = LogisticRegression(random_state=0)
    reg.fit(X_train, y_train)
    
    y_pred_sk = reg.predict(X_test)

# Uncomment if you want to print out losses
#     for i in range(len(loss_history)):
#         print(loss_history[i])
    
    print("My algorithm on this dataset: ",accuracy(y_test, y_pred), "%")
    print("Sklearn Logistic regression score: ",accuracy(y_test, y_pred_sk),"%")

In [0]:
run()

My algorithm on this dataset:  65.71428571428571 %
Sklearn Logistic regression score:  96.57142857142857 %
