In [8]:
import torch
import numpy as np

In [3]:
pip install torch

Collecting torch
  Obtaining dependency information for torch from https://files.pythonhosted.org/packages/59/1f/4975d1ab3ed2244053876321ef65bc02935daed67da76c6e7d65900772a3/torch-2.2.1-cp311-cp311-win_amd64.whl.metadata
  Downloading torch-2.2.1-cp311-cp311-win_amd64.whl.metadata (26 kB)
Collecting typing-extensions>=4.8.0 (from torch)
  Obtaining dependency information for typing-extensions>=4.8.0 from https://files.pythonhosted.org/packages/f9/de/dc04a3ea60b22624b51c703a84bbe0184abcd1d0b9bc8074b5d6b7ab90bb/typing_extensions-4.10.0-py3-none-any.whl.metadata
  Downloading typing_extensions-4.10.0-py3-none-any.whl.metadata (3.0 kB)
Collecting fsspec (from torch)
  Obtaining dependency information for fsspec from https://files.pythonhosted.org/packages/ad/30/2281c062222dc39328843bd1ddd30ff3005ef8e30b2fd09c4d2792766061/fsspec-2024.2.0-py3-none-any.whl.metadata
  Downloading fsspec-2024.2.0-py3-none-any.whl.metadata (6.8 kB)
Downloading torch-2.2.1-cp311-cp311-win_amd64.whl (198.6 MB)
   


[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
lst = [[1,2,3], [4,5,6]]
tensor = torch.tensor(lst)

In [6]:
tensor

tensor([[1, 2, 3],
        [4, 5, 6]])

In [10]:
np_array = np.array([[1,2,3], [4,5,6]])

In [11]:
np_tensor = torch.from_numpy(np_array)
np_tensor

tensor([[1, 2, 3],
        [4, 5, 6]], dtype=torch.int32)

In [12]:
tensor.shape

torch.Size([2, 3])

In [13]:
tensor.dtype

torch.int64

In [15]:
# displays which device the tensor is loaded on, such as a CPU or GPU
tensor.device

device(type='cpu')

In [16]:
tensor+np_tensor

tensor([[ 2,  4,  6],
        [ 8, 10, 12]])

In [17]:
tensor*np_tensor

tensor([[ 1,  4,  9],
        [16, 25, 36]])

# First neural network

In [18]:
import torch.nn as nn

In [27]:
# creat tensor
input_tensor = torch.tensor([0.3471, 0.4547, -0.2356])

# creat first linear layer
linear_layer = nn.Linear(in_features = 3, out_features =2)

# pass input through linear layer
output = linear_layer(input_tensor)
output

# networks with only linear layers are called "fully connected networks".

tensor([ 0.4950, -0.2503], grad_fn=<ViewBackward0>)

In [25]:
linear_layer.weight

Parameter containing:
tensor([[ 0.4593, -0.0739, -0.4664],
        [ 0.0051, -0.5005,  0.1787]], requires_grad=True)

In [26]:
linear_layer.bias

Parameter containing:
tensor([-0.1629,  0.4813], requires_grad=True)

In [28]:
# weights and biases are initialized randomly, 
# later these weights and biases are tuned

# Sequential

In [29]:
model = nn.Sequential(
    nn.Linear(10,18),
    nn.Linear(18,20),
    nn.Linear(20,5)
)

In [35]:
arr =np.random.random(10)
r = arr.astype('float32')
input_tensor2 = torch.from_numpy(r)
input_tensor2

tensor([0.7042, 0.1507, 0.2062, 0.5406, 0.9572, 0.5336, 0.0259, 0.4324, 0.3293,
        0.0463])

In [37]:
output2 = model(input_tensor2)
output2

tensor([ 0.1005, -0.0660, -0.1944,  0.2997,  0.2181], grad_fn=<ViewBackward0>)

# Sigmoid function

In [38]:
input_tensor4 = torch.tensor([[6.0]])
sigmoid = nn.Sigmoid()
output = sigmoid(input_tensor4)
output

tensor([[0.9975]])

In [39]:
# the last step in a neural network when performing binary classification
# sigmoid as the last step in a network of only linear layers is equivalent to a logistic regression 
model = nn.Sequential(
    nn.Linear(6,4),
    nn.Linear(4,1),
    nn.Sigmoid()
)

# Softmax - multiclass classification

In [41]:
input_tensor5 = torch.tensor([[4.3, 6.1, 2.3]])
# used to the last dimensio = -1
probabilities = nn.Softmax(dim=-1)
output = probabilities(input_tensor5)
output

tensor([[0.1392, 0.8420, 0.0188]])

# Forward pass

In [44]:
arr =np.random.random(30)
arr = arr.reshape(5,6)
r = arr.astype('float32')
input_tensor6 = torch.from_numpy(r)
input_tensor6

tensor([[0.6611, 0.4581, 0.7533, 0.7126, 0.1199, 0.9594],
        [0.1514, 0.8072, 0.9929, 0.0569, 0.2109, 0.7071],
        [0.7837, 0.7540, 0.5261, 0.0243, 0.7320, 0.5334],
        [0.3689, 0.7743, 0.9581, 0.8208, 0.4180, 0.0111],
        [0.4173, 0.4929, 0.6583, 0.4849, 0.2056, 0.6849]])

### Binary classification

In [45]:
model = nn.Sequential(
    nn.Linear(6,4), # first argumment is equal the number of feauter in one row
    nn.Linear(4,1),
    nn.Sigmoid()
)
output = model(input_tensor6)
output
# The output of our binary classification is a single probability between zero and one for each of our five samples.

tensor([[0.3473],
        [0.3450],
        [0.3050],
        [0.3790],
        [0.3638]], grad_fn=<SigmoidBackward0>)

###  Multi-class classification

In [48]:
# we are predicting three classes: mammal, bird or reptile
n_classes = 3
model = nn.Sequential(
    nn.Linear(6,4), # first argumment is equal the number of feauter in one row
    nn.Linear(4,n_classes),
    nn.Softmax(dim = -1) # dim equals minus one to indicate the five samples have the same 
                         # last dimension as the last linear layer's output
)
output = model(input_tensor6)
output.shape

torch.Size([5, 3])

In [49]:
output

tensor([[0.3957, 0.2238, 0.3804],
        [0.3707, 0.2751, 0.3542],
        [0.3623, 0.2467, 0.3910],
        [0.3973, 0.2253, 0.3774],
        [0.3811, 0.2501, 0.3688]], grad_fn=<SoftmaxBackward0>)

### Regression

In [51]:
model = nn.Sequential(
    nn.Linear(6,4), 
    nn.Linear(4,1)
)
output = model(input_tensor6)
output

tensor([[-0.8555],
        [-0.8856],
        [-0.8127],
        [-0.7990],
        [-0.8147]], grad_fn=<AddmmBackward0>)

# Loss functions

### one hot encoding

In [52]:
one_hot_array = np.array([1,0,0])

In [53]:
import torch.nn.functional as F
F.one_hot(torch.tensor(0), num_classes = 3) # the correct class 0

tensor([1, 0, 0])

In [54]:
F.one_hot(torch.tensor(1), num_classes = 3)

tensor([0, 1, 0])

In [55]:
F.one_hot(torch.tensor(2), num_classes = 3)

tensor([0, 0, 1])

### Cross entropy loss in PyTorch

In [59]:
# this is the most used loss function for classification problems
from torch.nn import CrossEntropyLoss
score = torch.tensor([[-0.1221, 0.1059]])
ohe = torch.tensor([[1, 0]])
criterion = CrossEntropyLoss()
criterion(score.double(), ohe.double())
# this casts the tensors to a specific float data type that is accepted by the CrossEntropyLoss() function

tensor(0.8136, dtype=torch.float64)

# Backpropagation

In [60]:
arr =np.random.random(30)
arr = arr.reshape(5,6)
r = arr.astype('float32')
input_tensor6 = torch.from_numpy(r)
input_tensor6

tensor([[0.1646, 0.0916, 0.8040, 0.8046, 0.8735, 0.8920],
        [0.1848, 0.4480, 0.5242, 0.7147, 0.1553, 0.1821],
        [0.8431, 0.7805, 0.7956, 0.2143, 0.4482, 0.1153],
        [0.1009, 0.2607, 0.9396, 0.9960, 0.4097, 0.8597],
        [0.4722, 0.1675, 0.2877, 0.9116, 0.6867, 0.2121]])

In [68]:
arr =np.random.random(5)
arr = arr.reshape(5,1)
arr = arr.astype('float32')
one_hot = torch.from_numpy(arr)
one_hot

tensor([[0.3116],
        [0.5787],
        [0.1133],
        [0.7600],
        [0.1388]])

In [69]:
model = nn.Sequential(
    nn.Linear(6,4), 
    nn.Linear(4,8),
    nn.Linear(8,1)
)
prediction = model(input_tensor6)
prediction

tensor([[-0.0286],
        [-0.0004],
        [ 0.0612],
        [-0.0442],
        [ 0.0433]], grad_fn=<AddmmBackward0>)

In [72]:
criterion = CrossEntropyLoss()
loss = criterion(prediction.double(), one_hot.double())
loss

tensor(-0., dtype=torch.float64, grad_fn=<DivBackward1>)

In [73]:
loss.backward()

In [74]:
model[0].weight.grad, model[0].bias.grad

(tensor([[0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.]]),
 tensor([0., 0., 0., 0.]))

In [75]:
model[1].weight.grad, model[1].bias.grad

(tensor([[0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.]]),
 tensor([0., 0., 0., 0.]))

In [76]:
model[2].weight.grad, model[2].bias.grad

(tensor([[0., 0., 0., 0., 0., 0., 0., 0.]]), tensor([0., 0., 0., 0.]))

# Updating model parameters

### manual

In [None]:
# learning rate
lr = 0.001

weight = model[0].weight
weight_grad = model[0].weight.grad
weight = weight - lr*weight_grad

bias = model[0].bias
bias_grad = model[0].bias.grad
bias = bias - lr*wbias_grad

# Gradient descent

In [83]:
# stochastic gradient descent (SGD)
import torch.optim as optim

optimizer = optim.SGD(model.parameters(), lr = 0.001)
optimizer.step()

# Mean Square Error Loss - for linear

In [None]:
def mean_square_loss(prediction,target):
    return np.mean((prediction - target)**2)

# All together

In [None]:
dataset = TensorDataset(torch.tensor(features).float(), torch.tensor(target).float())
dataloader = DataLoader(dataset, batch_size = 4, shuffle = True)

In [None]:
model = nn.Sequential(
    nn.Linear(4,2), 
    nn.Linear(2,1))

In [None]:
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr = 0.001)

In [None]:
# Loop over the number of epochs and the dataloader
for i in range(num_epochs):
  for data in dataloader:
    # Set the gradients to zero
    optimizer.zero_grad()
    # Run a forward pass
    feature, target = data
    prediction = model(feature)    
    # Calculate the loss
    loss = criterion(prediction, target)    
    # Compute the gradients
    loss.backward()
    # Update the model's parameters
    optimizer.step()
show_results(model, dataloader)

# Data import

In [89]:
import pandas as pd
water = pd.read_csv('water_potability.csv')
water

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,0.587349,0.577747,0.386298,0.568199,0.647347,0.292985,0.654522,0.795029,0.630115,0
1,0.643654,0.441300,0.314381,0.439304,0.514545,0.356685,0.377248,0.202914,0.520358,0
2,0.388934,0.470876,0.506122,0.524364,0.561537,0.142913,0.249922,0.401487,0.219973,0
3,0.725820,0.715942,0.506141,0.521683,0.751819,0.148683,0.467200,0.658678,0.242428,0
4,0.610517,0.532588,0.237701,0.270288,0.495155,0.494792,0.409721,0.469762,0.585049,0
...,...,...,...,...,...,...,...,...,...,...
2006,0.636224,0.580511,0.277748,0.418063,0.522486,0.342184,0.310364,0.402799,0.627156,1
2007,0.470143,0.548826,0.301347,0.538273,0.498565,0.231359,0.565061,0.175889,0.395061,1
2008,0.817826,0.087434,0.656389,0.670774,0.369089,0.431872,0.563265,0.285745,0.578674,1
2009,0.424187,0.464092,0.459656,0.541633,0.615572,0.388360,0.397780,0.449156,0.440004,1


In [90]:
water['Potability'].unique()

array([0, 1], dtype=int64)

In [91]:
features = water.iloc[:,:-1]

In [None]:
features2 = torch.tensor(water[['ph', 'Sulfate', 'Conductivity', 'Organic_carbon']].to_numpy()).float()

In [93]:
X = features.to_numpy()
X

array([[0.58734916, 0.57774671, 0.38629788, ..., 0.65452157, 0.79502934,
        0.63011476],
       [0.64365393, 0.44130035, 0.31438058, ..., 0.37724796, 0.20291434,
        0.52035803],
       [0.38893354, 0.47087564, 0.50612238, ..., 0.24992171, 0.40148717,
        0.21997295],
       ...,
       [0.81782618, 0.08743355, 0.65638906, ..., 0.56326524, 0.28574454,
        0.5786739 ],
       [0.42418706, 0.4640915 , 0.45965606, ..., 0.39778031, 0.44915584,
        0.44000443],
       [0.32242529, 0.49289123, 0.84140928, ..., 0.47142165, 0.50345848,
        0.59186714]])

In [94]:
target = water.iloc[:,-1]

In [95]:
y = target.to_numpy()
y

array([0, 0, 0, ..., 1, 1, 1], dtype=int64)

In [96]:
from torch.utils.data import TensorDataset

dataset = TensorDataset(torch.tensor(X).float(), torch.tensor(y).float())

In [101]:
sample = dataset[0]
sample

(tensor([0.5873, 0.5777, 0.3863, 0.5682, 0.6473, 0.2930, 0.6545, 0.7950, 0.6301]),
 tensor(0.))

In [98]:
input_sample, label_sample = sample

In [100]:
print('input_sample: ', input_sample )
print('label_sample: ', label_sample )

input_sample:  tensor([0.5873, 0.5777, 0.3863, 0.5682, 0.6473, 0.2930, 0.6545, 0.7950, 0.6301])
label_sample:  tensor(0.)


In [103]:
from torch.utils.data import DataLoader
dataloader = DataLoader(dataset, batch_size = 4, shuffle = True)

In [106]:
X, y = next(iter(dataloader))

In [107]:
print(X, y)

tensor([[0.4885, 0.3538, 0.3841, 0.2644, 0.5916, 0.5093, 0.3412, 0.6081, 0.3409],
        [0.3383, 0.8178, 0.2702, 0.4767, 0.3908, 0.6669, 0.6088, 0.6153, 0.3230],
        [0.5358, 0.4525, 0.5166, 0.3675, 0.3708, 0.4085, 0.4760, 0.7192, 0.6174],
        [0.5864, 0.3366, 0.3193, 0.8570, 0.5684, 0.3239, 0.4658, 0.2294, 0.5414]]) tensor([0., 0., 1., 1.])


In [104]:
for batch_imputs, batch_labels in dataloader:
    print('batch_imputs: ', batch_imputs )
    print('batch_labels: ', batch_labels )

batch_imputs:  tensor([[0.5458, 0.5327, 0.3753, 0.5310, 0.5801, 0.4037, 0.5946, 0.3452, 0.4286],
        [0.4933, 0.6228, 0.4057, 0.3848, 0.5441, 0.4681, 0.4624, 0.5957, 0.4412],
        [0.5064, 0.3044, 0.6585, 0.3652, 0.4266, 0.3523, 0.4485, 0.4769, 0.4823],
        [0.4173, 0.5069, 0.3956, 0.3640, 0.5947, 0.3791, 0.3616, 0.4440, 0.4504]])
batch_labels:  tensor([1., 0., 1., 1.])
batch_imputs:  tensor([[0.4092, 0.4576, 0.7789, 0.2549, 0.8059, 0.2355, 0.5283, 0.4421, 0.8043],
        [0.4644, 0.5019, 0.2418, 0.4237, 0.7339, 0.1980, 0.3853, 0.3797, 0.8245],
        [0.4537, 0.4257, 0.2886, 0.7418, 0.5007, 0.4804, 0.1845, 0.4901, 0.5662],
        [0.3859, 0.4600, 0.3529, 0.7537, 0.2286, 0.5733, 0.6556, 0.3534, 0.4407]])
batch_labels:  tensor([1., 1., 0., 1.])
batch_imputs:  tensor([[0.3399, 0.3599, 0.2349, 0.5807, 0.6088, 0.6047, 0.4900, 0.5447, 0.5444],
        [0.4275, 0.6258, 0.4340, 0.4530, 0.4810, 0.4134, 0.4973, 0.5412, 0.4442],
        [0.5285, 0.4705, 0.5626, 0.3684, 0.5830, 0.42

batch_imputs:  tensor([[0.4701, 0.5488, 0.3013, 0.5383, 0.4986, 0.2314, 0.5651, 0.1759, 0.3951],
        [0.3356, 0.4642, 0.5784, 0.5202, 0.4680, 0.3463, 0.3399, 0.6635, 0.4828],
        [0.3948, 0.4640, 0.3268, 0.3206, 0.5229, 0.2427, 0.6188, 0.4418, 0.4203],
        [0.8235, 0.4898, 0.6928, 0.6365, 0.3039, 0.4745, 0.7026, 0.6318, 0.4582]])
batch_labels:  tensor([1., 0., 0., 0.])
batch_imputs:  tensor([[0.4943, 0.6029, 0.3458, 0.4830, 0.6900, 0.6064, 0.5282, 0.5140, 0.6159],
        [0.6486, 0.5000, 0.2912, 0.7430, 0.4907, 0.3680, 0.1208, 0.4062, 0.7934],
        [0.2904, 0.5247, 0.2279, 0.4722, 0.6984, 0.3644, 0.8314, 0.5882, 0.6006],
        [0.6300, 0.4951, 0.2314, 0.3558, 0.5523, 0.2098, 0.6019, 0.4579, 0.5779]])
batch_labels:  tensor([1., 1., 0., 0.])
batch_imputs:  tensor([[0.3266, 0.7225, 0.3652, 0.3106, 0.7102, 0.3797, 0.6276, 0.6085, 0.3382],
        [0.4907, 0.5991, 0.2875, 0.4307, 0.5967, 0.5280, 0.5484, 0.5374, 0.3235],
        [0.8183, 0.4619, 0.2124, 0.5689, 0.5579, 0.10

batch_imputs:  tensor([[0.5067, 0.1921, 0.4328, 0.4100, 0.4567, 0.3930, 0.4613, 0.4779, 0.4480],
        [0.5824, 0.5243, 0.4378, 0.5527, 0.7388, 0.3880, 0.4290, 0.3750, 0.4144],
        [0.5869, 0.5256, 0.4084, 0.6778, 0.5803, 0.1644, 0.3961, 0.4771, 0.4050],
        [0.2087, 0.5126, 0.7981, 0.5479, 0.5558, 0.4089, 0.5374, 0.3985, 0.5547]])
batch_labels:  tensor([0., 0., 1., 0.])
batch_imputs:  tensor([[0.5765, 0.5225, 0.4775, 0.5003, 0.6112, 0.5872, 0.5620, 0.4995, 0.5628],
        [0.5258, 0.6843, 0.6477, 0.2680, 0.5972, 0.4560, 0.5606, 0.5144, 0.3834],
        [0.5655, 0.4370, 0.2103, 0.4148, 0.5144, 0.8429, 0.6232, 0.5026, 0.3430],
        [0.4936, 0.5409, 0.3839, 0.5015, 0.4992, 0.4364, 0.5500, 0.4939, 0.2957]])
batch_labels:  tensor([0., 0., 0., 0.])
batch_imputs:  tensor([[0.3631, 0.4846, 0.4368, 0.4793, 0.5916, 0.2569, 0.5532, 0.5955, 0.3447],
        [0.4126, 0.8581, 0.2657, 0.4470, 0.6658, 0.8949, 0.4409, 0.7037, 0.4383],
        [0.2466, 0.3420, 0.4703, 0.3095, 0.7379, 0.33

batch_labels:  tensor([0., 0., 0., 1.])
batch_imputs:  tensor([[0.3734, 0.3992, 0.4785, 0.5688, 0.7018, 0.3629, 0.4790, 0.6339, 0.3781],
        [0.4629, 0.4812, 0.3409, 0.4247, 0.7151, 0.4616, 0.1505, 0.6496, 0.5592],
        [0.6252, 0.5017, 0.2010, 0.5521, 0.5788, 0.5679, 0.4068, 0.6832, 0.1263],
        [0.6812, 0.6310, 0.3983, 0.3404, 0.5264, 0.3500, 0.5961, 0.5784, 0.5666]])
batch_labels:  tensor([0., 1., 1., 1.])
batch_imputs:  tensor([[0.3171, 0.4599, 0.4925, 0.4576, 0.6414, 0.3246, 0.4750, 0.5718, 0.6874],
        [0.6245, 0.5047, 0.3094, 0.6433, 0.7128, 0.5170, 0.6175, 0.4558, 0.4645],
        [0.4546, 0.6233, 0.6150, 0.6442, 0.7286, 0.2227, 0.4231, 0.7671, 0.6228],
        [0.5010, 0.4835, 0.4434, 0.4445, 0.5404, 0.3946, 0.4299, 0.6055, 0.6067]])
batch_labels:  tensor([0., 0., 0., 1.])
batch_imputs:  tensor([[0.6066, 0.4639, 0.3609, 0.2159, 0.6736, 0.3311, 0.3144, 0.5035, 0.3843],
        [0.4930, 0.3963, 0.3534, 0.3260, 0.7080, 0.5640, 0.5364, 0.4856, 0.6838],
        [0.60

batch_imputs:  tensor([[0.5536, 0.6637, 0.2571, 0.5412, 0.5954, 0.5511, 0.3895, 0.3554, 0.7441],
        [0.1870, 0.4619, 0.2065, 0.6178, 0.5788, 0.4458, 0.6564, 0.6855, 0.4044],
        [0.4637, 0.2774, 0.4509, 0.4969, 0.4722, 0.3138, 0.4772, 0.5694, 0.4142],
        [0.4349, 0.2356, 0.3408, 0.4858, 0.4861, 0.6435, 0.4790, 0.4273, 0.7683]])
batch_labels:  tensor([1., 0., 0., 0.])
batch_imputs:  tensor([[0.5332, 0.4081, 0.1516, 0.6966, 0.8729, 0.1889, 0.3551, 0.7269, 0.4897],
        [0.4193, 0.5024, 0.2816, 0.3661, 0.4914, 0.2452, 0.3946, 0.3061, 0.5680],
        [0.5728, 0.5544, 0.4213, 0.5217, 0.6176, 0.4253, 0.2988, 0.5095, 0.2943],
        [0.5752, 0.6017, 0.4212, 0.5107, 0.6574, 0.4716, 0.4501, 0.8214, 0.6936]])
batch_labels:  tensor([1., 1., 0., 1.])
batch_imputs:  tensor([[0.5616, 0.6185, 0.1999, 0.6903, 0.6774, 0.4535, 0.5124, 0.6161, 0.1659],
        [0.4797, 0.4468, 0.3801, 0.4275, 0.5595, 0.3547, 0.5767, 0.5708, 0.2396],
        [0.6134, 0.5218, 0.2622, 0.6169, 0.6471, 0.43

# Full example based on Water dataset

In [108]:
# Load the different columns into two PyTorch tensors
features = torch.tensor(water[['ph', 'Sulfate', 'Conductivity', 'Organic_carbon']].to_numpy()).float()
target = torch.tensor(water['Potability'].to_numpy()).float()

# Create a dataset from the two generated tensors
dataset = TensorDataset(features, target)

# Create a dataloader using the above dataset
dataloader = DataLoader(dataset, shuffle=True, batch_size=2)
x, y = next(iter(dataloader))

# Create a model using the nn.Sequential API
model = nn.Sequential(nn.Linear(4,8),
    nn.Linear(8,1))
output = model(features)
print(output)

tensor([[-0.0848],
        [-0.0390],
        [-0.1087],
        ...,
        [ 0.0297],
        [-0.0790],
        [-0.0654]], grad_fn=<AddmmBackward0>)


# Sigmoid
- the gradients are always low and approach zero for low and high values of x. This behavior is called saturation
- gradient will be so small that it can prevent the weight from changing or updating. This phenomenon is called vanishing gradients

# ReLU
- or positive inputs, the output of the function is equal to the input. For strictly negative outputs, the output of the function is equal to zero.

In [110]:
# Create a ReLU function with PyTorch
relu_pytorch = nn.ReLU()

# Apply your ReLU function on x, and calculate gradients
x = torch.tensor(-1.0, requires_grad=True)
y = relu_pytorch(x)
y.backward()

# Print the gradient of the ReLU function for x
gradient = y
print(gradient)

tensor(0., grad_fn=<ReluBackward0>)


# Leaky ReLU
- For positive inputs, it behaves similarly to the ReLU function. For negative inputs, however, it multiplies them by a small coefficient (defaulted to 0.01 in PyTorch). 
- By doing this, the leaky ReLU function has non-null gradients for negative inputs. 

In [None]:
leaky_relu = nn.LeakyReLU(negative_slope = 0.05)
# the negative_slope parameter indicates the coefficient by which negative inputs are multiplied. 

# Counting the number of parameters

In [111]:
model = nn.Sequential(
    nn.Linear(8,4), # 4 neurons * (8+1 bias) parameters = 36
    nn.Linear(4,2)) # 2 neurons * (4+1 bias) parameters = 10
# total = 46 parameters

In [113]:
# .numel() - returns the number of elements in the tensor
total = 0
for parameter in model.parameters():
    total += parameter.numel()
total

46

In [114]:
def calculate_capacity(model):
  total = 0
  for p in model.parameters():
    total += p.numel()
  return total

# Updating weights with SGD

In [115]:
sgd = optim.SGD(model.parameters(), lr = 0.001, momentum = 0.95)
# lr = learning rate - controls the tep size
# Typical learning rate values range from 0.10, to 0.001
# momentum - controls the inertia of the optimizer (bezwładność)
# The momentum keeps the step size large when previous steps were also large, even if the current gradient is small. 
# Momentum usually ranges from 0.85 to 0.99.

# Layer initialization and transfer learning

In [116]:
layer = nn.Linear(64,128)
print('min: ', layer.weight.min(), 'max: ', layer.weight.max())

min:  tensor(-0.1249, grad_fn=<MinBackward1>) max:  tensor(0.1250, grad_fn=<MaxBackward1>)


### Layer initialization usuing uniform distribution

In [119]:
custom_layer = nn.init.uniform_(layer.weight)
print('min: ', custom_layer.weight.min(), 'max: ', custom_layer.weight.max())

AttributeError: 'Parameter' object has no attribute 'weight'

# Transfer learning 
- transfer learning consists in taking a model that was trained on a first task and reuse for a second task
- we can load the weights from the first model and use them as a starting point to train on this new dataset

# Fine tuning - dostrajanie
- we load weights from a previously trained model, but train the model with a smaller learning rate
- We can even train part of a network, if we decide some of the network layers do not need to be trained and choose to freeze them

In [None]:
model = nn.Sequential(
    nn.Linear(64,128),
    nn.Linear(128,256)) 

# Freeze the parameters of the first two layers of this model.
for name, param in model.named_parameters():    
  
    # Check if the parameters belong to the first layer
    if name == '0.weight' or name == '0.bias':
      
        # Freeze the parameters
        param.requires_grad = False
  
    # Check if the parameters belong to the second layer
    if name == '1.weight' or name == '1.bias':
      
        # Freeze the parameters
        param.requires_grad = False

# Evaluating model performance

### Calculating training loss

In [None]:
training_loss = 0
for i, data in enumerate(trainloader, 0):
    # run the forward pass
    .
    .
    # calculate the loss
    loss = criterion(outputs,labels)
    # calculate gradient
    .
    .
    # calculate an sum the loss
    training_loss +=loss.item()
epoch_loss = training_loss /len(trainloader)

### Calculating validation loss

In [None]:
validation_loss = 0
model.eval() # to put the model in evaluation mode, because some layers behave differently at training vs validation
with torch.no_grad(): # we will not be performing gradient calculation in this epoch
    for i, data in enumerate(validationloader, 0):
        # run the forward pass
        outputs = model(data[0])
        .
        # calculate the loss
        loss = criterion(outputs,labels)
        validation_loss +=loss.item()
epoch_loss = validation_loss /len(validationloader)
model.train() # in a mood for training again

### Calculating accuracy with torchmetrics

In [None]:
import torchmetrics

metric = torchmetrics.Accuracy(task="multiclass", num_classes=3)

for i, data in enumerate(dataloader, 0):
    features,labels = data
    outputs = model(features)
    # calculate accuracy over the batch
    acc= metric(outputs, labels.argmax(dim=-1)) # The output variable here would be the probabilities 
    # returned by the softmax function. If the labels contain one-hot encoded classes, 
    # we'll need the argmax function to obtain numbers instead of one-hot vectors.
# calculate total accuracy over the whole epoch
acc = metric.compute()
print(f'Accuraccy of all data: {acc}')
# reset the metric for the next epoch
metric.reset()

# Fighting overfitting

### Reasons:
- a small dataset, 
- a model with too much capacity, 
- large values of weights.

### How to fight:
- reduce the model size
- add a new type of layer called dropout
- use weight decay to force the parameters to remain small
- get more data or use data augmentation.

In [None]:
# "Regularization" using a dropout layer
model = nn.Sequential(
    nn.Linear(8,4),
    nn.ReLU(),
    nn.Dropout(p=0.5)) # dropout layers are added after activation functions

In [None]:
# Regularization with weight decay
optimizer = optim.SGD(model.parameters(), lr = 0.001, weight_decay = 0.0001) # value between (0,1)
# This regularization term is proportional to the current value of the weight, 
# and it is subtracted from the gradient during backpropagation

In [None]:
# Data augmentation
# Data augmentation is commonly applied to image data, which can be rotated and scaled, 
# so that different views of the same face become available as "new" data points.

# Improving model performance

### Step 1: overfit the training set

In [None]:
#it is recommended to overfit a single data point
features,labels = next(iter(trainloader))
for i in range(1000):
    outputs = model(features)
    loss = criterion(outputs, labels)    
    loss.backward()
    optimizer.step()
# Overfitting a single data point should give us an accuracy of one and a loss close to zero
# We can then overfit the whole training set.

### Step 2: reduce overfitting

### Step 3: fine-tune hyperparameters

In [None]:
# grid search over the hyperparameters
for factor in range(2,6):
    lr = 10 ** - factor
    
factor = np.random.uniform(2,6)
lr = 10 ** - factor

In [None]:
values = []
for idx in range(10):
    # Randomly sample a learning rate factor between 2 and 4
    factor = np.random.uniform(2,4)
    lr = 10 ** -factor
    
    # Randomly select a momentum between 0.85 and 0.99
    momentum = np.random.uniform(0.85,0.99)
    
    values.append((lr, momentum))