## Used libraries

In [1]:
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np 
import pandas as pd
from abc import ABC, abstractmethod

# 1. Preprocessing Data :

In [2]:
import warnings
warnings.filterwarnings(action="ignore")

In [3]:
mnist = fetch_openml('mnist_784')

## 1.1 Convert data to Data Frame:


In [4]:
df = pd.DataFrame().from_dict(mnist.data)

## 1.2 Filter data with target 0 , 1:

In [5]:
df['target'] = mnist.target.astype('int32')
df = df.loc[ df['target'].isin( [0 , 1]) ]

## 1.3 Remove columns that have same value:

In [6]:
for col in df.columns:
    if (df[col] == 0).all() :
        df.drop(col , axis= 1 , inplace=True)

##1.4 Split data

In [7]:
X = df.drop(columns=['target'])
y = df['target']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

## 1.5 standariza Data :

In [9]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

# 2. Optimizers:

In [10]:
class Optimizer:
    def __init__(self ,eta = 0.01 , epsilon = 1e-7 , beta = 0.99, beta2= 0.99 ):
        self.eta = eta
        self.epsilon = epsilon
        self.beta = beta
        self.beta2 = beta2
            
    def gradient_descent(self , w , b , dw , db) :
        w = w - self.eta * dw
        b = b - self.eta * db 
        return w , b      
    
    def RMS_prop(self , prev_vdw , prev_vdb, dw , db , w , b ):

        current_vdw = self.beta * prev_vdw + (1 - self.beta) * ( dw ** 2 ) 
        current_vdb = self.beta * prev_vdb + (1 - self.beta) * ( db ** 2 )

        w = w - self.eta * (dw / ( np.sqrt(current_vdw) + self.epsilon ))
        b = b - self.eta * (db / ( np.sqrt(current_vdb) + self.epsilon ))
        
        return w , b , current_vdw , current_vdb
    
    def momentum(self , prev_vdw , prev_vdb, dw , db , w , b ):
        
        current_vdw = self.beta * prev_vdw + (1 - self.beta) * dw 
        current_vdb = self.beta * prev_vdb + (1 - self.beta) * db 
        
        w = w - self.eta * dw 
        b = b - self.eta * db 
        
        return w , b , current_vdw , current_vdb
    
    def Adam(self ,vdw_momentum , vdb_momentum , vdw_rms , vdb_rms , w , b , dw, db, t):
        vdw_momentum = self.beta * vdw_momentum + (1 - self.beta) * dw
        vdb_momentum = self.beta * vdb_momentum + (1 - self.beta) * db

        vdw_rms = self.beta2 * vdw_rms + (1 - self.beta2) * (dw ** 2)
        vdb_rms = self.beta2 * vdb_rms + (1 - self.beta2) * (db ** 2)

        w = w - self.eta * (vdw_momentum / (np.sqrt(vdw_rms) + self.epsilon))
        b = b - self.eta * (vdb_momentum / (np.sqrt(vdb_rms) + self.epsilon))
        
        return w, b, vdw_momentum, vdb_momentum, vdw_rms, vdb_rms

# 3. Logistic Regression Implementation:

In [11]:
class LogisticRegression:
    def __init__(self , eta = 0.01 , epoches = 100 , tolerance = 1e-7, lambda_ = 5):
        self.lambda_ = lambda_
        self.eta = eta
        self.epoches = epoches
        self.tolerance = tolerance
    
    def sigmoid(self , z):
        return 1 / ( 1 + np.exp(-z)) 
    
    
    def calculate_dw_logistic(self , X , y ,  y_predicted, w):
        length = len(y) 
        return (1 / length) * ( np.dot(y_predicted - y , X) + ( np.where(w > 0 , self.lambda_ , - self.lambda_ ) ) ) 
    
    def calculate_db_logistic(self , y ,  y_predicted ):
        return np.mean( y_predicted - y)
    

    def yPredict(self ,X , w , b) :
        return np.dot(w , X.T) + b
    
    def identifyClassWithThreshold(self ,y_predict , threshold = 0.5):
        for i in range(len(y_predict)):
            if y_predict[i] >= threshold :
                y_predict[i] = 1
            else:
                y_predict[i] = 0

    
    def accuracy(self , y , y_predict):
        true = 0 
        for (actual , predicted ) in zip(y, y_predict):
            if actual == predicted:
                true += 1
        return ( true / len(y) )   * 100 
    
    def costWithCrossEntropy(self , y , y_predicted):
        length = len(y) 
        first_term  =  np.dot(-y , np.log(y_predicted) )
        second_term =  np.dot((1 - y) , np.log(1 - y_predicted))
        result = (1 / length) * (first_term - second_term)
        return result 
    
    def fit(self, X , y , batch_size = 1 , algo_optimizer = "GD"):

        # reset model on re-fit 
        self.__init__(self.eta,self.epoches,self.tolerance,self.lambda_)

        # intializa the weights randomly
        w = np.random.rand(X.shape[1]) 
        b = np.random.rand(1) 
        length = len(X)
        n = len(X[0]) # number of features 
        
        prev_vdw_momentum =  prev_vdb_momentum = prev_vdb_RMS = prev_vdw_RMS = 0
        for epoch in range(self.epoches):
            last_index = 0 

            # batching
            for last_index in range(0 , length , batch_size ):

                # splitting                 
                x_batch = X[last_index : min(last_index + batch_size , length) ]
                y_batch = y[last_index : min(last_index + batch_size , length) ]
                
                
                # calculate net value
                z = self.yPredict(x_batch , w , b)
            
                # plugin activation function to net_value to get predicted value
                y_predicted = self.sigmoid(z) 
                
                
                
                # calculating cost with cross entropy
                error = self.costWithCrossEntropy(y_batch , y_predicted)
                
                
                if error <= self.tolerance:
                    break

            
                # update the weights 
                dw =  self.calculate_dw_logistic(x_batch , y_batch , y_predicted, w)
                db =  self.calculate_db_logistic( y_batch , y_predicted)
                optimizer = Optimizer(eta=self.eta)


                # L1 Reg 
                prev_vdw_RMS = 0
                prev_vdb_RMS = 0
                prev_vdw_momentum = 0
                prev_vdb_momentum = 0
                vdw_momentum      = 0
                vdb_momentum      = 0
                vdw_rms           = 0
                vdb_rms           = 0
                # optimizers switch
                if algo_optimizer == "GD":
                    w,b = optimizer.gradient_descent(w, b, dw, db)

                elif algo_optimizer == "RMSProp":
                    w, b, prev_vdw_RMS, prev_vdb_RMS = optimizer.momentum(prev_vdw_RMS , prev_vdb_RMS , dw , db ,w , b)

                elif algo_optimizer == "momentum":
                    w, b, prev_vdw_momentum, prev_vdb_momentum = optimizer.RMS_prop(prev_vdw_momentum , prev_vdb_momentum , dw , db ,w , b)

                elif algo_optimizer == "Adam":
                    w, b, vdw_momentum, vdb_momentum, vdw_rms, vdb_rms = optimizer.Adam(vdw_momentum ,vdb_momentum , vdw_rms , vdb_rms ,w , b, dw, db, batch_size)

                else:
                    print(f"Optimizer  : {algo_optimizer} optimizer ,  not found !")
                    return np.zeros(n) , 0
            # print(f"Epoch :{epoch} ----------> error :{error}  with learning rate : {self.eta}")
            
        return w , b    
            
        
        

# 4. Testing and report

## 4.1 Constants 

In [12]:
optimizers = [ "Adam" , "RMSProp" , "momentum" , "GD" ]
batches    = [32  , 64 , 128 , 256 , 512]
l1_lambdas = [0   , 5  , 10 , 15] 
etas       = [0.1 , 0.01 , 0.001 , 0.0001 ]

## 4.2 Scaling and preparing test set  

In [13]:
scaler = StandardScaler()
X_test = scaler.fit_transform(X_test)

## 4.3 Try diffrent values form lambda term ( lasso regularization )

In [14]:
# testing diffrent values for lambda ( Lasso regulariation )
for l1_lambda in l1_lambdas : 
    
    clf = LogisticRegression( eta = 0.01 , epoches = 100 , tolerance = 1e-7 , lambda_ = l1_lambda )
    
    # Mini-batch gradient descent ( of batch size 32 )
    w , b = clf.fit( X_train , y_train , 32 , algo_optimizer = "GD" )

    # prediction 
    z = clf.yPredict(X_test , w , b)
    y_predicted = clf.sigmoid(z) 

    # classification 
    clf.identifyClassWithThreshold(y_predicted)
    
    # calculate accuracy 
    print(f"using Gradient descent optimizer , with regularization term = {l1_lambda} , model accuracy is = " , clf.accuracy(y_test , y_predicted))

using Gradient descent optimizer , with regularization term = 0 , model accuracy is =  99.61049610496106
using Gradient descent optimizer , with regularization term = 5 , model accuracy is =  98.62648626486265
using Gradient descent optimizer , with regularization term = 10 , model accuracy is =  96.9249692496925
using Gradient descent optimizer , with regularization term = 15 , model accuracy is =  60.311603116031165


## 4.4  Try diffrent batch sizes ( mini-batchs )

In [15]:
for batch in batches : 
    for optimizer in optimizers : 
        # Logistic Regression model 
        clf = LogisticRegression( eta = 0.01 , epoches = 100 , tolerance = 1e-7 , lambda_ = 0 )
    
        # diffrent Mini-batchs with diffrent optimizers  
        w , b = clf.fit( X_train , y_train , batch , algo_optimizer = optimizer )

        # prediction 
        z = clf.yPredict(X_test , w , b)
        y_predicted = clf.sigmoid(z) 

        # classification 
        clf.identifyClassWithThreshold(y_predicted)
        
        # calculate accuracy 
        print(f"using {optimizer} optimizer , with batch size {batch} , model accuracy is = " , clf.accuracy(y_test , y_predicted))
    print("-------------------------------")

using Adam optimizer , with batch size 32 , model accuracy is =  96.74046740467405
using RMSProp optimizer , with batch size 32 , model accuracy is =  99.73349733497335
using momentum optimizer , with batch size 32 , model accuracy is =  96.65846658466585
using GD optimizer , with batch size 32 , model accuracy is =  99.50799507995079
-------------------------------
using Adam optimizer , with batch size 64 , model accuracy is =  97.51947519475195
using RMSProp optimizer , with batch size 64 , model accuracy is =  99.65149651496516
using momentum optimizer , with batch size 64 , model accuracy is =  97.43747437474374
using GD optimizer , with batch size 64 , model accuracy is =  99.58999589995899
-------------------------------
using Adam optimizer , with batch size 128 , model accuracy is =  98.1959819598196
using RMSProp optimizer , with batch size 128 , model accuracy is =  99.73349733497335
using momentum optimizer , with batch size 128 , model accuracy is =  98.29848298482985
usin

# 5. Final Conclusion 

1. Based on the results, it can be concluded that different optimizers and batch sizes can have a significant impact on the model's accuracy. RMSProp optimizer consistently achieved the highest accuracy across all batch sizes, while Adam optimizer showed the second-highest accuracy. On the other hand, GD optimizer consistently performed the worst. Increasing batch size generally resulted in better accuracy, except in the case of momentum optimizer where the accuracy decreased as batch size increased. In summary, choosing the right optimizer and batch size is crucial to achieving high accuracy in machine learning models.

2. for our problem based on this results and conclusion we may use the RMSProp 
