In [1]:
from scipy.io import arff
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import numpy as np

arff_file_path = './Rice_Cammeo_Osmancik.arff'

data, meta = arff.loadarff(arff_file_path)

df = pd.DataFrame(data)

df = df.sample(frac=1).reset_index(drop=True)

scaler = MinMaxScaler()

column_to_be_normalized = ['Area', 'Perimeter','Major_Axis_Length','Minor_Axis_Length','Convex_Area']

scaler.fit(df[column_to_be_normalized])

df[column_to_be_normalized] = scaler.transform(df[column_to_be_normalized])

features = df.drop("Class", axis=1)
label = df["Class"]


X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.16, random_state=42)


class LogisticRegressionGD:
    def __init__(self, data, labels,learning_rate):
        self.weights = np.random.rand(data.shape[1])
        self.data = data
        self.labels = labels
        self.learning_rate = learning_rate
        self.N = len(data)
        self.threshold = 0.00004

    def fit(self):
        prev_weights = None
        while not self.termination_condition(self.weights,prev_weights):
            prev_weights = self.weights + np.zeros(self.weights.shape[0])
            self.predict()
            print(self.calculate_error(self.weights))
            
    def calculate_error(self,weights):
        sum = 0
        for i in range(self.N):
            sum = sum + np.log(1 + np.exp(-self.labels[i]*(weights @ self.data[i])))
        sum =  (1/self.N)*sum
        return sum

        
    def termination_condition(self,curr_weights, prev_weights = None):
           if (prev_weights is not None) and abs(self.calculate_error(curr_weights) - self.calculate_error(prev_weights))<self.threshold:
                return True
           return False
           
           
    def predict(self):
        gradient = 0
        for i in range(self.N):
            gradient = gradient + (self.labels[i]*self.data[i])/(1 + np.exp(self.labels[i]*(self.weights @ self.data[i])))
        gradient =  (-1/self.N)*gradient
        direction = -gradient
        self.weights = self.weights + self.learning_rate*direction


class LogisticRegressionSGD:
    def __init__(self, data, labels,learning_rate):
        self.weights = np.random.rand(data.shape[1])
        self.data = data
        self.labels = labels
        self.learning_rate = learning_rate
        self.N = len(data)
        self.threshold = 0.00004
        self.error_datas = list()

    def fit(self):
        prev_weights = None
        while True:
            random_index = None
            for round in range(self.N):
                random_index = np.random.randint(0, self.N)
                prev_weights = self.weights + np.zeros(self.weights.shape[0])
                self.predict(random_index)
                random_index = np.random.randint(0, self.N)
                self.error_datas.self.calculate_error(self.weights,random_index))
            if random_index is not None and self.termination_condition(self.weights,prev_weights,random_index):
                break
                    
    def calculate_error(self,weights,random_index):
        return np.log(1 + np.exp(-self.labels[random_index]*(weights @ self.data[random_index])))

        
    def termination_condition(self,curr_weights,prev_weights,random_index):
           if (prev_weights is not None) and abs(self.calculate_error(curr_weights,random_index) - self.calculate_error(prev_weights,random_index))<self.threshold:
                return True
           return False
           
           
    def predict(self,random_index):
        direction = (self.labels[random_index]*self.data[random_index])/(1 + np.exp(self.labels[random_index]*(self.weights @ self.data[random_index])))
        self.weights = self.weights + self.learning_rate*direction

In [2]:
def main():
    tranformed_y_train = [-1 if x == b'Cammeo' else 1 for x in y_train.values]
    logistic_regression = LogisticRegressionSGD(X_train.values, tranformed_y_train, 0.4)
    logistic_regression.fit()

if __name__ == "__main__":
    main()

3.5418517539128973
0.17006794087738508
0.3621467183117195
0.6499890086606545
0.4718803892523167
0.7638084055526646
1.0555993426913313
0.5904610513345813
0.8759670758152495
0.5927277631915423
0.8064473865866946
0.42779690339195703
1.3841283561091666
1.7074946157311308
0.2260359263189191
0.47381243895513325
0.36994321211819925
0.6263111803338806
0.817308598059244
0.40652615713778195
0.32305762485401707
1.4091487632408843
0.21067261039292215
0.22723234634516362
0.1567189978347841
1.1014956631116324
1.213884862623534
0.26085932218702357
0.7724763848340779
0.870222183792893
0.6127961636472598
0.6915842283302277
0.6291044601360138
0.44130055284304903
0.5776181634863011
0.8283120008279032
0.8921789739121321
1.162620941027251
0.2596106891022271
0.3723065782454947
0.37985688983504284
0.6974396465368298
0.2754506428674715
0.6364593051263622
0.29803411787056117
0.5364738320241457
0.46800375398948924
0.6549364263741829
0.47001577521206067
0.863367201075578
0.583069143654722
0.7777956160159151
0.80