In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [2]:
#------------------> Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#------------------> Load Iris dataset
iris = load_iris()
X = iris.data
y = iris.target

In [3]:
X

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [4]:
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [5]:
#------------------> Normalize the features
scaler = StandardScaler()
X_normalized = scaler.fit_transform(X)



In [6]:
X_normalized

array([[-9.00681170e-01,  1.01900435e+00, -1.34022653e+00,
        -1.31544430e+00],
       [-1.14301691e+00, -1.31979479e-01, -1.34022653e+00,
        -1.31544430e+00],
       [-1.38535265e+00,  3.28414053e-01, -1.39706395e+00,
        -1.31544430e+00],
       [-1.50652052e+00,  9.82172869e-02, -1.28338910e+00,
        -1.31544430e+00],
       [-1.02184904e+00,  1.24920112e+00, -1.34022653e+00,
        -1.31544430e+00],
       [-5.37177559e-01,  1.93979142e+00, -1.16971425e+00,
        -1.05217993e+00],
       [-1.50652052e+00,  7.88807586e-01, -1.34022653e+00,
        -1.18381211e+00],
       [-1.02184904e+00,  7.88807586e-01, -1.28338910e+00,
        -1.31544430e+00],
       [-1.74885626e+00, -3.62176246e-01, -1.34022653e+00,
        -1.31544430e+00],
       [-1.14301691e+00,  9.82172869e-02, -1.28338910e+00,
        -1.44707648e+00],
       [-5.37177559e-01,  1.47939788e+00, -1.28338910e+00,
        -1.31544430e+00],
       [-1.26418478e+00,  7.88807586e-01, -1.22655167e+00,
      

In [7]:
#------------------> Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_normalized, y, test_size=0.2, random_state=42)

#------------------> Convert data to PyTorch tensors and move to device
X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train, dtype=torch.long).to(device)
y_test_tensor = torch.tensor(y_test, dtype=torch.long).to(device)



In [8]:
#------------------> Define the neural network architecture
input_size = X_train.shape[1]
hidden_size = 10
output_size = len(set(y_train))

In [9]:
#------------------> Define the neural network
fc1 = nn.Linear(input_size, hidden_size)  # Fully connected layer from input to hidde
relu = nn.ReLU()  # ReLU activation function
fc2 = nn.Linear(hidden_size, output_size)  # Fully connected layer from hidden to output


In [10]:
#------------------> Move neural network to device
fc1.to(device)
relu.to(device)
fc2.to(device)

Linear(in_features=10, out_features=3, bias=True)

In [11]:
#------------------> Define loss function and optimizer
criterion = nn.CrossEntropyLoss()  # Cross Entropy loss for classification
optimizer_SGD = optim.SGD([fc1.weight, fc1.bias, fc2.weight, fc2.bias], lr=0.01)  # Stochastic Gradient Descent


In [12]:
# adamOptim = torch.optim.Adam([fc1.weight, fc1.bias, fc2.weight, fc2.bias],lr=0.02,betas=(0.9, 0.999), eps=1e-8)

In [19]:

#-------------------> Training loop
num_epochs = 1000
for epoch in range(num_epochs):
    #----------------> Forward pass
    hidden = relu(fc1(X_traiwn_tensor))  # Input to hidden layer with ReLU activation
    outputs = fc2(hidden)  # Hidden layer to output
    loss = criterion(outputs, y_train_tensor)

    #-----------> Backward pass and optimization
    optimizer_SGD.zero_grad()  # Clear gradients
    # print(optimizer_SGD)
    loss.backward()  # Compute gradients
    # print(loss)
    optimizer_SGD.step()  # Update weights
    # print(optimizer_SGD)


    #---------------> Print loss every 100 epochs
    if (epoch+1) % 100 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')


NameError: name 'X_traiwn_tensor' is not defined

In [20]:
fc1.weight

Parameter containing:
tensor([[ 0.0558, -0.1347,  0.3227,  0.1435],
        [ 0.1206, -0.3489,  0.1239,  0.5727],
        [ 0.0912, -0.0464, -0.0016, -0.7948],
        [-0.5531,  0.1056, -0.5028, -0.0900],
        [ 0.4863, -0.4063,  0.7889,  0.9482],
        [-0.5914, -0.6224,  0.2575,  0.2730],
        [ 0.3640,  0.1728, -0.2416,  0.4699],
        [ 0.2282, -0.1675,  0.0057,  0.0733],
        [-0.6657,  0.5517, -0.5165, -0.6346],
        [-0.1498, -0.5063,  0.3379,  0.6436]], requires_grad=True)

In [15]:
fc2.weight

Parameter containing:
tensor([[-0.2846, -0.1780,  0.3474,  0.5393, -0.5002, -0.4210, -0.2588,  0.1280,
          0.7057, -0.8651],
        [-0.3300,  0.0605,  0.4406, -0.1506, -0.2211,  0.2108, -0.0326,  0.1693,
         -0.7058,  0.0865],
        [-0.0774,  0.5234, -0.8631, -0.2062,  0.9830, -0.2399,  0.0827,  0.0274,
         -0.3126,  0.1271]], requires_grad=True)

In [16]:
fc1.bias

Parameter containing:
tensor([-0.2095, -0.2486,  0.8468,  0.2256, -0.1906, -0.1482,  0.0852, -0.2115,
         0.0784,  0.5841], requires_grad=True)

In [17]:
fc2.bias

Parameter containing:
tensor([-0.1683,  0.7760, -0.0737], requires_grad=True)

In [18]:
#---------------> Evaluate the model on the test set
with torch.no_grad():
    hidden = relu(fc1(X_test_tensor))  # Input to hidden layer with ReLU activation
    outputs = fc2(hidden)  # Hidden layer to output
    _, predicted = torch.max(outputs, 1)
    accuracy = (predicted == y_test_tensor).sum().item() / len(y_test_tensor)
    print(f'Test Accuracy: {accuracy:.4f}')

Test Accuracy: 0.9333


In [225]:
# Optimizers : Accuracy
# Adadelta : 0.90
# Rprop : 0.9333
# SGD : 0.9333


# Optimizers

In [226]:
params = [torch.randn(10, requires_grad=True)]

#### Adadelta:
##### Advantages:
    1. It does not require manual tuning of learning rates.
    2. It adapts learning rates per parameter.
    3. It has the ability to continue learning with large gradient updates.
##### Disadvantages:
    1. Computationally expensive due to maintaining per-parameter state.
    2. May require more memory compared to simpler optimizers.


In [94]:
#---------------->  Adaptive Delta optimizer
adadelta_params = {
    'lr': float,       # learning rate (default: 1.0)
    'rho': float,      # decay rate (default: 0.9)
    'eps': float,      # term added to the denominator to improve numerical stability (default: 1e-6)
    'weight_decay': float,  # weight decay (L2 penalty) (default: 0)
}
adadelta_optimizer = optim.Adadelta(params, lr=1.0, rho=0.9, eps=1e-6, weight_decay=0)


#### Adagrad:
##### Advantages:
    1. Automatically adapts learning rates based on the frequency of updates.
    2. Suitable for sparse data since it allows individual learning rates per parameter.
##### Disadvantages:
    1. Learning rates may become too small over time, causing premature convergence.
    2. Accumulates squared gradients in the denominator, potentially leading to numerical instabilities.


In [105]:
#-----------------> Adagrad optimizer
adagrad_params = {
    'lr': float,       # learning rate (default: 0.01)
    'lr_decay': float,  # learning rate decay (default: 0)
    'weight_decay': float,  # weight decay (L2 penalty) (default: 0)
    'initial_accumulator_value': float,  # starting value for the accumulators (default: 0)
}
adagrad_optimizer = optim.Adagrad(params, lr=0.01, lr_decay=0, weight_decay=0, initial_accumulator_value=0)


#### Adam:
##### Advantages:
    1. Combines the advantages of Adagrad and RMSprop.
    2. Efficient and effective for a wide range of problems.
    3. Maintains separate learning rates per parameter.
##### Disadvantages:
    1. May converge to suboptimal solutions on certain problems.
    2. Requires careful tuning of hyperparameters for optimal performance.


In [96]:
#-----------------------> Adaptive Moment Estimation optimizer
adam_params = {
    'lr': float,       # learning rate (default: 0.001)
    'betas': tuple,    # coefficients used for computing running averages of gradient and its square (default: (0.9, 0.999))
    'eps': float,      # term added to the denominator to improve numerical stability (default: 1e-8)
    'weight_decay': float,  # weight decay (L2 penalty) (default: 0)
    'amsgrad': bool,   # whether to use the AMSGrad variant of this algorithm (default: False)
}
adam_optimizer = optim.Adam(params, lr=0.001, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, amsgrad=False)


#### AdamW:
##### Advantages:
    1. Corrects the weight decay implementation issue present in Adam.
    2. Suitable for large-scale training of deep neural networks.
##### Disadvantages:
    1. Similar to Adam, may require careful hyperparameter tuning.


In [97]:
#-------------------------> Adam with Weight Decay optimizer
adamw_params = {
    'lr': float,       # learning rate (default: 0.001)
    'betas': tuple,    # coefficients used for computing running averages of gradient and its square (default: (0.9, 0.999))
    'eps': float,      # term added to the denominator to improve numerical stability (default: 1e-8)
    'weight_decay': float,  # weight decay (L2 penalty) (default: 0)
    'amsgrad': bool,   # whether to use the AMSGrad variant of this algorithm (default: False)
}
adamw_optimizer = optim.AdamW(params, lr=0.001, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, amsgrad=False)


#### Adamax:
##### Advantages:
    1. Simpler update rule compared to Adam.
    2. Can converge faster than Adam on some problems.
##### Disadvantages:
    1. Limited theoretical understanding compared to Adam.


In [98]:
#--------------------> Adam with Infinity Norm optimizer
adamax_params = {
    'lr': float,       # learning rate (default: 0.002)
    'betas': tuple,    # coefficients used for computing running averages of gradient and its square (default: (0.9, 0.999))
    'eps': float,      # term added to the denominator to improve numerical stability (default: 1e-8)
    'weight_decay': float,  # weight decay (L2 penalty) (default: 0)
}



adamax_optimizer = optim.Adamax(params, lr=0.002, betas=(0.9, 0.999), eps=1e-8, weight_decay=0)


#### SparseAdam:
##### Advantages:
    1. Designed for sparse gradients, making it efficient for sparse data.
##### Disadvantages:
    1. May not perform optimally on dense data.


In [99]:
#--------------------> Sparse Adam optimizer
sparseadam_params = {
    'lr': float,       # learning rate (default: 0.001)
    'betas': tuple,    # coefficients used for computing running averages of gradient and its square (default: (0.9, 0.999))
    'eps': float,      # term added to the denominator to improve numerical stability (default: 1e-8)
}


sparseadam_optimizer = optim.SparseAdam(params, lr=0.001, betas=(0.9, 0.999), eps=1e-8)


#### ASGD:
##### Advantages:
    1. Suitable for large-scale optimization problems.
    2. Provides a good trade-off between convergence speed and memory usage.
##### Disadvantages:
    1. May require careful tuning of hyperparameters.


In [107]:
#------------------> Averaged Stochastic Gradient Descent (ASGD) optimizer
asgd_params = {
    'lr': float,       # learning rate (default: 0.01)
    'lambd': float,    # decay term (default: 1e-4)
    'alpha': float,    # power for eta update (default: 0.75)
    't0': float,       # point at which to start averaging (default: 1000)
    'weight_decay': float,  # weight decay (L2 penalty) (default: 0)
}


asgd_optimizer = optim.ASGD(params, lr=0.01, lambd=1e-4, alpha=0.75, t0=1000, weight_decay=0)


#### LBFGS:
##### Advantages:
    1. Efficient optimization algorithm for problems with large numbers of parameters.
    2. No learning rate to tune.
##### Disadvantages:
    1. Requires more memory compared to gradient descent-based methods.
    2. May not scale well to extremely large datasets.


In [101]:
#------------> Limited-memory BFGS optimizer
lbfgs_params = {
    'lr': float,       # learning rate (default: 1)
    'max_iter': int,   # maximal number of iterations (default: 20)
    'max_eval': int,   # maximal number of function evaluations (default: max_iter * 1.25)
    'tolerance_grad': float,  # termination tolerance on first order optimality (default: 1e-5)
    'tolerance_change': float,  # termination tolerance on function value/parameter changes (default: 1e-9)
    'history_size': int,  # update history size (default: 100)
    'line_search_fn': str,  # the line search function to use (default: None)
}


lbfgs_optimizer = optim.LBFGS(params, lr=1, max_iter=20, max_eval=None, tolerance_grad=1e-5, tolerance_change=1e-9, history_size=100, line_search_fn=None)


#### RMSprop:
##### Advantages:
    1. Mitigates the Adagrad's diminishing learning rates issue.
    2. Suitable for non-stationary objectives.
##### Disadvantages:
    1. Requires manual tuning of hyperparameters.


In [102]:
#---------------------> Root Mean Square Propagation optimizer
rmsprop_params = {
    'lr': float,       # learning rate (default: 0.01)
    'alpha': float,    # smoothing constant (default: 0.99)
    'eps': float,      # term added to the denominator to improve numerical stability (default: 1e-8)
    'weight_decay': float,  # weight decay (L2 penalty) (default: 0)
    'momentum': float, # momentum factor (default: 0)
    'centered': bool,  # whether to compute the centered RMSProp (default: False)
}


rmsprop_optimizer = optim.RMSprop(params, lr=0.01, alpha=0.99, eps=1e-8, weight_decay=0, momentum=0, centered=False)


#### Rprop:
##### Advantages:
    1. Robust to the choice of initial learning rates.
    2. Suitable for problems with sparse gradients.
##### Disadvantages:
    1. May require careful tuning of hyperparameters.
    2. Less commonly used compared to other optimizers.


In [103]:
#-----------------------> Resilient Backpropagation optimizer
rprop_params = {
    'lr': float,       # learning rate (default: 0.01)
    'etas': tuple,     # pair of (etaminus, etaplis), that are multiplicative increase and decrease factors (default: (0.5, 1.2))
    'step_sizes': tuple,  # pair of minimal and maximal allowed step sizes (default: (1e-6, 50))
}


rprop_optimizer = optim.Rprop(params, lr=0.01, etas=(0.5, 1.2), step_sizes=(1e-6, 50))


#### SGD:
##### Advantages:
    1. Simple and easy to implement.
    2. Can be effective with proper tuning and momentum.
##### Disadvantages:
    1. Prone to getting stuck in local minima or saddle points.
    2. Requires careful tuning of learning rates and momentum.

In [104]:
#-----------------> Stochastic Gradient Descent (SGD) optimizer
sgd_params = {
    'lr': float,       # learning rate (default: 0.01)
    'momentum': float, # momentum factor (default: 0)
    'dampening': float,  # dampening for momentum (default: 0)
    'weight_decay': float,  # weight decay (L2 penalty) (default: 0)
    'nesterov': bool,  # enables Nesterov momentum (default: False)
}


sgd_optimizer = optim.SGD(params, lr=0.01, momentum=0, dampening=0, weight_decay=0, nesterov=False)


# Cunstome Optimizers

In [120]:
import torch
import torch.nn as nn
from torch.optim import Optimizer
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler



In [134]:
# Define the custom optimizer
class CustomOptimizer(Optimizer):
    def __init__(self, params, lr=0.01):
        defaults = dict(lr=lr)
        super(CustomOptimizer, self).__init__(params, defaults)

    def step(self, closure=None):
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                p.data.add_(-group['lr'], grad)

In [128]:


# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load Iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# Normalize the features
scaler = StandardScaler()
X_normalized = scaler.fit_transform(X)

# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_normalized, y, test_size=0.2, random_state=42)


In [129]:

# Convert data to PyTorch tensors and move to device
X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train, dtype=torch.long).to(device)
y_test_tensor = torch.tensor(y_test, dtype=torch.long).to(device)


In [130]:

# Define the neural network architecture
input_size = X_train.shape[1]
hidden_size = 10
output_size = len(set(y_train))

# Define the neural network
fc1 = nn.Linear(input_size, hidden_size)
relu = nn.ReLU()
fc2 = nn.Linear(hidden_size, output_size)


In [131]:

# Move neural network to device
fc1.to(device)
relu.to(device)
fc2.to(device)

# Define loss function
criterion = nn.CrossEntropyLoss()

# Define custom optimizer
optimizer_custom = CustomOptimizer([{'params': fc1.parameters()}, {'params': fc2.parameters()}], lr=0.01)


In [132]:

# Training loop
num_epochs = 1000
for epoch in range(num_epochs):
    # Forward pass
    hidden = relu(fc1(X_train_tensor))
    outputs = fc2(hidden)
    loss = criterion(outputs, y_train_tensor)

    # Backward pass and optimization
    optimizer_custom.zero_grad()
    loss.backward()
    optimizer_custom.step()

    # Print loss every 100 epochs
    if (epoch+1) % 100 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')


Epoch [100/1000], Loss: 0.9264
Epoch [200/1000], Loss: 0.7235
Epoch [300/1000], Loss: 0.6036
Epoch [400/1000], Loss: 0.5266
Epoch [500/1000], Loss: 0.4732
Epoch [600/1000], Loss: 0.4341
Epoch [700/1000], Loss: 0.4045
Epoch [800/1000], Loss: 0.3805
Epoch [900/1000], Loss: 0.3606
Epoch [1000/1000], Loss: 0.3426


In [133]:

# Evaluate the model on the test set
with torch.no_grad():
    hidden = relu(fc1(X_test_tensor))
    outputs = fc2(hidden)
    _, predicted = torch.max(outputs, 1)
    accuracy = (predicted == y_test_tensor).sum().item() / len(y_test_tensor)
    print(f'Test Accuracy: {accuracy:.4f}')

Test Accuracy: 0.9333
