In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename)) 

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
import  torch

# Tensors
PyTorch is, at its core, a library for processing tensors. A tensor is a number, a vector, a matrix, or any n-dimensional array.

Just like NumPy arrays, tensors have a
data type and a shape. In fact, a PyTorch tensor on the CPU can be converted to and from a NumPy
ndarray (see `torch.from_numpy` and `.numpy()` later in this notebook). Tensors typically contain floats, but they can also hold
integers or booleans; PyTorch tensors cannot carry strings.

Tensors can have any number of dimensions and different lengths along each dimension. We can inspect the length along each dimension using the `.shape` property of a tensor.

NOTE that it's not possible to create tensors with an improper shape

In [None]:
t1=torch.tensor(4.)
t1

`4.` is shorthand for `4.0`. It tells PyTorch that you want to create a floating-point number.

In [None]:
#vector
t2=torch.tensor([1.,2,3,4])
t2

All the elements of a tensor have the same data type.

In [None]:
#matrix

t3=torch.tensor([[5.,6],[3,4],[88,9]])
t3

In [None]:
#3-d array....gonna give you a cuboid kinda structure
t4=torch.tensor([[[34,55,66],[33,45,6]],[[33,9,8],[22,11,82]]])
t4

# Tensor operations and gradients

In [None]:
#creating tensors
x=torch.tensor(3.)
w=torch.tensor(4.,requires_grad=True)
b=torch.tensor(5.,requires_grad=True)
x,w,b

In [None]:
#arithmetic operation
y=w*x+b
y

PyTorch can automatically compute the derivative of `y` with respect to the tensors that have `requires_grad` set to `True`. This feature is called autograd (automatic differentiation).

In [None]:
#computing derivatives
y.backward()

In [None]:
#display gradients
print('dy/dx:',x.grad)
print('dy/dw:',w.grad)
print('dy/db:',b.grad)

# Tensor functions

In [None]:
#create a tensor with a fixed value for every element
t6=torch.full((3,2),33)
t6

In [None]:
#concatenate two tensors with compatible shapes
t7=torch.cat((t3,t6))
t7

In [None]:
#compute the sin of each element
t8=torch.sin(t7)
t8

In [None]:
#change the tensor shape
t9=t8.reshape(3,2,2)
t9

In [None]:
x=np.array([[1,2],[2,5],[5.,6]])
x

In [None]:
#convert the numpy array to a PyTorch tensor 
y=torch.from_numpy(x)
y

In [None]:
#convert a torch to a numpy array 
z=y.numpy()
z


Reasons why we need PyTorch even though NumPy already provides data structures and utilities for working with multi-dimensional numeric data:
1. Autograd - the ability to automatically compute gradients for tensor operations is essential for training deep learning models.
1. GPU support - while working with massive datasets and large models, PyTorch tensor operations can be performed efficiently using a Graphics Processing Unit (GPU). Computations that might take hours can be completed within minutes using GPUs.

# WORKING WITH MNIST HANDWRITTEN DATABASE

In [None]:
import torch
import torchvision
from torchvision.datasets import MNIST

In [None]:
#downloading training dataset
dataset=MNIST(root='data/',download=True)

In [None]:
len(dataset)

In [None]:
test_dataset=MNIST(root='data/',train=False)
test_dataset

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline 
#without this the jupyter will show the graphs as a popups#

In [None]:
image,label= dataset[0]
plt.imshow(image,cmap='gray')
print('label:',label)

it's evident that it can be challenging to recognize the images with human eye.
PyTorch doesn't know how to work with images.We need to convert the images into tensors. We can do this by specifying a transform while creating our dataset
we can see it is 28x28 pixels image

In [None]:
import torchvision.transforms as transforms

In [None]:
# MNIST dataset (images and labels)
dataset = MNIST(root='data/', 
                train=True,
                transform=transforms.ToTensor())

In [None]:
img_tensor, label = dataset[0]
print(img_tensor.shape, label)

let's look at the sample values inside the tensor

In [None]:
print(img_tensor[0,10:15,10:15])
print(torch.max(img_tensor),torch.min(img_tensor))

The values range from 0 to 1, with 0 representing black, 1 representing white, and the values in between representing different shades of grey. We can plot the tensor as an image using `plt.imshow`.

In [None]:
plt.imshow(img_tensor[0,10:15,10:15],cmap='gray')

**SPLITTING DATA INTO TRAIN VALIDATION AND TEST SET**

In [None]:
from torch.utils.data import random_split
train_ds, val_ds=random_split(dataset,[50000,10000])

It's essential to choose a random sample for creating a validation set. The training data is often sorted by the target labels, i.e. images of 0s followed by images of 1s, and so on. If we created a validation set from the last 20% of the images, it would consist only of 8s and 9s. Such a split would not produce a useful model.
we'll use a batch size of 128

In [None]:
from torch.utils.data import DataLoader
batch_size=128
train_loader = DataLoader(train_ds, batch_size, shuffle=True)
val_loader = DataLoader(val_ds, batch_size)

**Training model**
A logistic regression model is almost identical to a linear regression model. It contains weight and bias matrices, and the output is obtained by simple matrix operations (`pred = x @ w.t() + b`), where `@` is matrix multiplication.

we will use nn.linear to create the model instead of manually creating and initializing matrices 

Each 1x28x28 image tensor is flattened into a vector of size 784 (28*28).

In [None]:
import torch.nn as nn
input_size=28*28
num_classes=10
model=nn.Linear(input_size,num_classes)

Let's extend the `nn.Module` class from PyTorch to define a custom model.

In [None]:
class MnistModel(nn.Module):
    """Logistic-regression model: one linear layer applied to flattened images."""

    def __init__(self):
        super().__init__()
        # The weights and bias both live inside this single nn.Linear layer.
        self.linear = nn.Linear(input_size, num_classes)

    def forward(self, xb):
        """Flatten the batch to rows of 784 values and return the linear layer's output."""
        flat = xb.reshape(-1, 784)  # -1 lets PyTorch infer the batch dimension
        return self.linear(flat)
    
model = MnistModel()
        
    

Inside the `__init__` constructor method, we instantiate the weights and biases using `nn.Linear`. And inside the `forward` method, which is invoked when we pass a batch of inputs to the model, we flatten the input tensor and pass it into `self.linear`.

`xb.reshape(-1, 28*28)` indicates to PyTorch that we want a *view* of the `xb` tensor with two dimensions. The length along the 2nd dimension is 28\*28 (i.e., 784). One argument to `.reshape` can be set to `-1` (in this case, the first dimension) to let PyTorch figure it out automatically based on the shape of the original tensor.

Note that the model no longer has `.weight` and `.bias` attributes (as they are now inside the `.linear` attribute), but it does have a `.parameters` method that returns a list containing the weights and bias.

In [None]:
for images,labels in train_loader:
    outputs=model(images)
    break

In [None]:
print(outputs.shape)
print(outputs[:2].data)

For each of the 128 images in the batch, we get 10 outputs. To be read as probabilities, each row's elements must lie between 0 and 1 and add up to 1, which is not the case here. So we apply the softmax function.

In [None]:
import torch.nn.functional  as F

In [None]:
#Apply softmax
probs=F.softmax(outputs,dim=1)
print('sample prob',probs[:2].data)
print(torch.sum(probs[0]).item())

In [None]:
# Imports
import torch

import torchvision
import torch.nn as nn
import matplotlib.pyplot as plt
import torch.nn.functional as F
import torchvision.transforms as transforms
from torchvision.datasets import MNIST
from torch.utils.data import random_split
from torch.utils.data import DataLoader
# Hyperparmeters
batch_size = 128
learning_rate = 0.001

# Other constants
input_size = 28*28
num_classes = 10

In [None]:
# Download dataset
dataset = MNIST(root='data/', train=True, transform=transforms.ToTensor(), download=True)

# Training validation & test dataset
train_ds, val_ds = random_split(dataset, [50000, 10000])
test_ds = MNIST(root='data/', train=False, transform=transforms.ToTensor())

# Dataloaders
train_loader = DataLoader(train_ds, batch_size, shuffle=True)
val_loader = DataLoader(val_ds, batch_size*2)
test_loader = DataLoader(test_ds, batch_size*2)
image, label = train_ds[0]
plt.imshow(image[0], cmap='gray')
print('Label:', label)

In [None]:
class MnistModel(nn.Module):
    """Logistic-regression classifier with the per-batch and per-epoch hooks used by fit() and evaluate()."""

    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(input_size, num_classes)

    def forward(self, xb):
        """Flatten each input to one row and return the linear layer's raw scores."""
        # Use the layer's own input width instead of a hard-coded 784 so that
        # forward() always agrees with the size the layer was constructed with.
        xb = xb.reshape(-1, self.linear.in_features)
        return self.linear(xb)

    def training_step(self, batch):
        """Return the cross-entropy loss for one (images, labels) batch."""
        images, labels = batch
        out = self(images)                  # Generate predictions
        return F.cross_entropy(out, labels) # Calculate loss

    def validation_step(self, batch):
        """Return a dict with the detached loss and accuracy for one (images, labels) batch."""
        images, labels = batch
        out = self(images)                    # Generate predictions
        loss = F.cross_entropy(out, labels)   # Calculate loss
        acc = accuracy(out, labels)           # Calculate accuracy
        return {'val_loss': loss.detach(), 'val_acc': acc.detach()}

    def validation_epoch_end(self, outputs):
        """Average the per-batch results from validation_step into plain Python floats.

        Note: this is an unweighted mean over batches, so a smaller final batch
        counts as much as a full one.
        """
        batch_losses = [x['val_loss'] for x in outputs]
        epoch_loss = torch.stack(batch_losses).mean()   # Combine losses
        batch_accs = [x['val_acc'] for x in outputs]
        epoch_acc = torch.stack(batch_accs).mean()      # Combine accuracies
        return {'val_loss': epoch_loss.item(), 'val_acc': epoch_acc.item()}

    def epoch_end(self, epoch, result):
        """Print the validation loss and accuracy recorded for `epoch`."""
        print("Epoch [{}], val_loss: {:.4f}, val_acc: {:.4f}".format(epoch, result['val_loss'], result['val_acc']))
    
model = MnistModel()



##Training
def accuracy(outputs, labels):
    """Return, as a 0-d tensor, the fraction of rows whose highest-scoring column equals the label."""
    preds = torch.max(outputs, dim=1).indices        # index of the max score in each row
    num_correct = torch.sum(preds == labels).item()
    return torch.tensor(num_correct / len(preds))
def evaluate(model, val_loader):
    """Run `model.validation_step` on every batch of `val_loader` and combine the results.

    Returns whatever `model.validation_epoch_end` returns for the list of
    per-batch results.
    """
    # Validation never calls backward(), so disable autograd here to avoid
    # building computation graphs that would only be thrown away.
    with torch.no_grad():
        outputs = [model.validation_step(batch) for batch in val_loader]
    return model.validation_epoch_end(outputs)

def fit(epochs, lr, model, train_loader, val_loader, opt_func=torch.optim.SGD):
    """Train `model` for `epochs` passes over `train_loader`, validating after each pass.

    The optimizer is created as `opt_func(model.parameters(), lr)`. After every
    epoch the result of `evaluate(model, val_loader)` is reported through
    `model.epoch_end` and appended to the returned list.
    """
    optimizer = opt_func(model.parameters(), lr)
    history = []
    for epoch in range(epochs):
        # Training phase
        for batch in train_loader:
            model.training_step(batch).backward()
            optimizer.step()
            optimizer.zero_grad()  # clear gradients so they do not accumulate into the next batch
        # Validation phase
        epoch_result = evaluate(model, val_loader)
        model.epoch_end(epoch, epoch_result)
        history.append(epoch_result)
    return history

In [None]:
evaluate(model, val_loader)

In [None]:
history = fit(5, 0.001, model, train_loader, val_loader)

In [None]:
accuracies = [r['val_acc'] for r in history]
plt.plot(accuracies, '-x')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.title('Accuracy vs. No. of epochs');

In [None]:
#Prediction
def predict_image(img, model):
    """Return the index of the highest-scoring class that `model` assigns to a single image."""
    batch = img.unsqueeze(0)                         # add a leading batch dimension of size 1
    scores = model(batch)
    best = torch.max(scores, dim=1).indices
    return best[0].item()
img, label = test_ds[919]
plt.imshow(img[0], cmap='gray')
print('Label:', label, ', Predicted:', predict_image(img, model))

# tensorflow

In [None]:
import tensorflow as tf
# tf.version is a module object; tf.__version__ is the installed version string.
print(tf.__version__)

In [None]:
scalar=tf.constant([[69,5],[6,7],[33,4]])
scalar.ndim
print(scalar)

In [None]:
scalar

In [None]:
# tf.variable_creator_scope() expects a creator function and returns a context manager,
# so calling it with 3 created nothing useful. A mutable tensor is created with tf.Variable.
scalarr=tf.Variable(3)

In [None]:

#creating random tensor
random =tf.random.Generator.from_seed(33)
#set seed for reproducibiltiy
random.normal(shape=(33,2))
#normal outputs random values from a normal distribution


attributes of tensors

datatype of every element

number of dimensions

shape of tensor

elements along the 0 axis

elements along the last axis

total number of elements in our tensor


Indexing tensors

In [None]:
somelist=[1,23,45,6]
somelist[0:]

In [None]:
#getting the first two elements of each 

In [None]:

m=tf.constant([[45.,4],[5,6]])


In [None]:
tf.math.reduce_variance(m)
tf.math.reduce_max(m)

In [None]:
tf.argmin(m)

In [None]:
tf.one_hot(somelist,depth=2)

# Architecture of a neural network regression model
* input layer shape-same shape as the no. of features
* hidden layer-problem specific
* neurons per hidden layer-problem specific
* output layer shape- same shape as desired predicition shape
* hidden activation-usually reLu(rectified linear unit)
* output activation-
* loss function-MAE,MSE
* optimizer-SGD,Adam

In [None]:
tf.constant(3)

# regression with neural networks in tensorflow

In [None]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt


# Create features (using tensors)
X = tf.constant([-7.0, -4.0, -1.0, 2.0, 5.0, 8.0, 11.0, 14.0],dtype=tf.float32)

# Create labels (using tensors)
y = tf.constant([3.0, 6.0, 9.0, 12.0, 15.0, 18.0, 21.0, 24.0],dtype=tf.float32)

# Visualize it
plt.scatter(X, y);

In [None]:
#set random seed
tf.random.set_seed(42)

#create a model using the sequential API
model= tf.keras.Sequential([tf.keras.layers.Dense(1)])
#compile the model
model.compile(loss=tf.keras.losses.mae,optimizer=tf.keras.optimizers.SGD(),metrics=['mae'])
#fit the model
model.fit(tf.expand_dims(X, axis=-1), y, epochs=1000,verbose=0)

In [None]:
X,y

In [None]:
y1=tf.constant([12,4,0,2,9])

In [None]:
model.predict([y1])

improving the model
* increase the number of hidden units (also called neurons) within each of the hidden layers, or change the activation function of each layer
* change the optimization function, or perhaps the learning rate of the optimization function
* fit the model for more epochs or on more data

In [None]:
X1=np.arange(-1000,1000,4)
y1=np.arange(-990,1010,4)

In [None]:
yy=X1+10==y1

splitting data in training set, validation set and  test set
here we only split in train and test set

In [None]:
xtrain=X1[:400]
ytrain=y1[:400]

xtest=X1[400:]
ytest=y1[400:]


In [None]:
plt.figure(figsize=(10, 7))
# Plot training data in blue
plt.scatter(xtrain, ytrain, c='b', label='Training data')
# Plot test data in green
plt.scatter(xtest, ytest, c='g', label='Testing data')
# Show the legend
plt.legend();

In [None]:
# Set random seed
tf.random.set_seed(42)

# Replicate model_1 and add an extra layer
model_2 = tf.keras.Sequential([
  tf.keras.layers.Dense(50),
  tf.keras.layers.Dense(1) # add a second layer
])

# Compile the model
model_2.compile(loss=tf.keras.losses.mae,
                optimizer=tf.keras.optimizers.Adam(),
                metrics=['mae'])

# Fit the model
model_2.fit(tf.expand_dims(xtrain, axis=-1), ytrain, epochs=100, verbose=0) # set verbose to 0 for less output
     


In [None]:
# NOTE(review): these predictions come from `model` (the single-Dense model fitted earlier),
# not from the `model_2` trained in the previous cell - confirm this is intended.
y_preds=model.predict(xtest)

In [None]:
def plot_predictions(train_data=xtrain,
                     train_labels=ytrain,
                     test_data=xtest,
                     test_labels=ytest,
                     predictions=y_preds):
  """
  Scatter-plot the training data, the test data and the predictions on one figure.

  The default arguments are bound to the notebook globals once, when this cell
  is executed; rebinding those globals later does not change the defaults.
  """
  # (x values, y values, colour, legend label) for each series, in drawing order
  series = (
      (train_data, train_labels, "b", "Training data"),
      (test_data, test_labels, "g", "Testing data"),
      (test_data, predictions, "r", "Predictions"),  # predictions were made on the test data
  )
  plt.figure(figsize=(10, 7))
  for xs, ys, colour, name in series:
    plt.scatter(xs, ys, c=colour, label=name)
  plt.legend()
     

In [None]:
 plot_predictions(train_data=xtrain, 
                     train_labels=ytrain, 
                     test_data=xtest, 
                     test_labels=ytest, 
                     predictions=y_preds)
  

Evaluating model'
the cycle of building a model, fitting of model,evaluate the model, tweak the model and again fit it

In [None]:
model.evaluate(xtest,ytest)

In [None]:
model.summary()

# Classification with neural networks 
Typical architecture of a classification neural network
The word typical is on purpose.

Because the architecture of a classification neural network can widely vary depending on the problem you're working on.

However, there are some fundamentals all deep neural networks contain:

An input layer.
Some hidden layers.
An output layer.


In [None]:
from sklearn.datasets import make_circles
#make 1000eg
n_samples=1000
#create circles
x,y=make_circles(n_samples,noise=0.03,random_state=42)

In [None]:
import pandas as pd
df=pd.DataFrame({'X0':x[:,0],'X1':x[:,1],'label':y})
df

In [None]:
import matplotlib.pyplot as plt
plt.scatter(x[:,0],x[:,1],c=y,cmap=plt.cm.RdYlBu)

**Steps in modelling**

Now we know what data we have as well as the input and output shapes, let's see how we'd build a neural network to model it.

In TensorFlow, there are typically 3 fundamental steps to creating and training a model.

Creating a model - piece together the layers of a neural network yourself (using the functional or sequential API) or import a previously built model (known as transfer learning).

Compiling a model - defining how a model's performance should be measured (loss/metrics) as well as defining how it should improve (optimizer).

Fitting a model - letting the model try to find patterns in the data (how does X get to y).

In [None]:
import tensorflow as tf
tf.random.set_seed(42)

# 1. Create the model
model_1= tf.keras.Sequential([
  tf.keras.layers.Dense(100,activation='relu'),
  # Sigmoid keeps the output in (0, 1): BinaryCrossentropy() defaults to
  # from_logits=False, so it expects probabilities rather than raw scores,
  # and the 'accuracy' metric thresholds the output at 0.5.
  tf.keras.layers.Dense(1,activation='sigmoid')
])

# 2. Compile the model
model_1.compile(loss=tf.keras.losses.BinaryCrossentropy(), # binary since we are working with 2 classes (0 & 1)
                optimizer=tf.keras.optimizers.SGD(),
                metrics=['accuracy'])

# 3. Fit the model
model_1.fit(x, y, epochs=300,verbose=0)
      

In [None]:
model_1.evaluate(x,y)

In [None]:
preds=model_1.predict(x)

In [None]:
# Set random seed
tf.random.set_seed(42)

# Create a model
model_2 = tf.keras.Sequential([
  tf.keras.layers.Dense(4, activation=tf.keras.activations.relu), # hidden layer 1, ReLU activation
  tf.keras.layers.Dense(4, activation=tf.keras.activations.relu), # hidden layer 2, ReLU activation
  tf.keras.layers.Dense(1, activation=tf.keras.activations.sigmoid) # ouput layer, sigmoid activation
])

# Compile the model
model_2.compile(loss=tf.keras.losses.binary_crossentropy,
                optimizer=tf.keras.optimizers.Adam(),
                metrics=['accuracy'])

# Fit the model
history = model_2.fit(x, y, epochs=100, verbose=0)
     

In [None]:
y3=model_2.predict(x)

In [None]:
import numpy as np

def plot_decision_boundary(model, X, y):
  """Plot the regions `model` predicts over a 100x100 grid spanning X, with the points of X drawn on top.

  X is indexed as X[:, 0] and X[:, 1] (two feature columns); y is used only to
  colour the scattered points.
  """
  # Grid limits: the data range padded by 0.1 on every side
  margin = 0.1
  x_lo, x_hi = X[:, 0].min() - margin, X[:, 0].max() + margin
  y_lo, y_hi = X[:, 1].min() - margin, X[:, 1].max() + margin
  grid_x, grid_y = np.meshgrid(np.linspace(x_lo, x_hi, 100),
                               np.linspace(y_lo, y_hi, 100))

  # One (x, y) row per grid point - these are the inputs we predict on
  grid_points = np.column_stack([grid_x.ravel(), grid_y.ravel()])
  raw_pred = model.predict(grid_points)

  # More than one output unit means multi-class; otherwise treat as binary
  is_multiclass = model.output_shape[-1] > 1
  if is_multiclass:
    print("doing multiclass classification...")
    # Pick the highest-scoring class per point and reshape back onto the grid
    grid_labels = np.argmax(raw_pred, axis=1).reshape(grid_x.shape)
  else:
    print("doing binary classifcation...")
    # Round the single output to 0 or 1 and reshape back onto the grid
    grid_labels = np.round(np.max(raw_pred, axis=1)).reshape(grid_x.shape)

  # Filled contours for the predicted regions, then the real data points on top
  plt.contourf(grid_x, grid_y, grid_labels, cmap=plt.cm.RdYlBu, alpha=0.7)
  plt.scatter(X[:, 0], X[:, 1], c=y, s=40, cmap=plt.cm.RdYlBu)
  plt.xlim(grid_x.min(), grid_x.max())
  plt.ylim(grid_y.min(), grid_y.max())
     

In [None]:
plot_decision_boundary(model_2,x,y)

# Activation functions

In [None]:
# Create a toy tensor (similar to the data we pass into our model)
A = tf.cast(tf.range(-10, 10), tf.float32)
A
    

In [None]:
# Visualize our toy tensor
plt.plot(A);

now let's recreate the sigmoid function and see what it does to our data
We can also use the predefined sigmoid function at `tf.keras.activations.sigmoid`.

Sigmoid takes a real value as input and outputs another value between 0 and 1. It’s easy to work with and has all the nice properties of activation functions: it’s non-linear, continuously differentiable, monotonic, and has a fixed output range.

Pros

It is nonlinear in nature. Combinations of this function are also nonlinear!
It will give an analog activation unlike step function.
It has a smooth gradient too.
It’s good for a classifier.
The output of the activation function is always going to be in range (0,1) compared to (-inf, inf) of linear function. So we have our activations bound in a range. Nice, it won’t blow up the activations then.
Cons

Towards either end of the sigmoid function, the Y values tend to respond very less to changes in X.
It gives rise to a problem of “vanishing gradients”.
Its output isn’t zero centered. It makes the gradient updates go too far in different directions. 0 < output < 1, and it makes optimization harder.
Sigmoids saturate and kill gradients.
The network refuses to learn further or is drastically slow ( depending on use case and until gradient /computation gets hit by floating point value limits ).

In [None]:
def sigmoid(x):
    """Return 1 / (1 + e^(-x)), computed element-wise."""
    exp_neg = tf.exp(-x)
    return 1 / (1 + exp_neg)

sigmoid(A)

lets plot it

In [None]:
plt.plot(sigmoid(A))

a non linear line!

lets check relu function(turns all negatives to zero and positive numbers stay the same)

A recent invention which stands for Rectified Linear Units. The formula is deceptively simple: max(0,z)
. Despite its name and appearance, it’s not linear and provides the same benefits as Sigmoid (i.e. the ability to learn nonlinear functions), but with better performance.

In [None]:
def relu(x):
    """Return max(0, x) element-wise: negatives become zero, positives are unchanged."""
    floor = 0
    return tf.maximum(floor, x)

relu(A)

In [None]:
plt.plot(relu(A))

what about linear activation function

In [None]:
tf.keras.activations.linear(A)

In [None]:
A==tf.keras.activations.linear(A)

Tanh squashes a real-valued number to the range [-1, 1]. It’s non-linear. But unlike Sigmoid, its output is zero-centered. Therefore, in practice the tanh non-linearity is always preferred to the sigmoid nonlinearity.

In [None]:
def tanh(x):
    """Return the hyperbolic tangent (e^x - e^-x) / (e^x + e^-x), computed element-wise."""
    pos, neg = tf.exp(x), tf.exp(-x)
    # The whole difference must be divided by the sum; without the parentheses
    # only e^-x was divided, which gave e^x - e^-x / (e^x + e^-x) instead of tanh.
    return (pos - neg) / (pos + neg)

tanh(A)

In [None]:
plt.plot(tanh(A))

Okay, so it makes sense now the model doesn't really learn anything when using only linear activation functions, because the linear activation function doesn't change our input data in anyway.

Whereas, with our non-linear functions, our data gets manipulated. A neural network uses these kinds of transformations at a large scale to draw patterns between its inputs and outputs.

# Multiclass classification with neural network

In [None]:
import tensorflow as tf
from tensorflow.keras.datasets import fashion_mnist

# The data has already been sorted into training and test sets for us
(train_data, train_labels), (test_data, test_labels) = fashion_mnist.load_data()

In [None]:
train_data[1]


Okay, 60,000 training examples each with shape (28, 28) and a label each as well as 10,000 test examples of shape (28, 28).

But these are just numbers, let's visualize.

In [None]:
plt.imshow(train_data[22])

In [None]:
class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
               'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']

# How many classes are there (this'll be our output shape)?
len(class_names)

In [None]:
# Plot multiple random images of fashion MNIST
import random
plt.figure(figsize=(7, 7))
for i in range(4):
  ax = plt.subplot(2, 2, i + 1)
  rand_index = random.choice(range(len(train_data)))
  plt.imshow(train_data[rand_index], cmap=plt.cm.binary)
  plt.title(class_names[train_labels[rand_index]])
  plt.axis(False)
     

The input shape will have to deal with 28x28 tensors (the height and width of our images).
We're actually going to squash the input into a tensor (vector) of shape (784).
The output shape will have to be 10 because we need our model to predict for 10 different classes.

We'll also change the activation parameter of our output layer to be "softmax" instead of "sigmoid". As we'll see, the "softmax" activation function outputs a series of values between 0 & 1 (the same shape as the output shape), which together add up to ~1. The index with the highest value is predicted by the model to be the most likely class.



We'll need to change our loss function from a binary loss function to a multiclass loss function.
More specifically, since our labels are in integer form, 

we'll use tf.keras.losses.SparseCategoricalCrossentropy(), if our labels were one-hot encoded (e.g. they looked something like [0, 0, 1, 0, 0...]), we'd use tf.keras.losses.CategoricalCrossentropy().


We'll also use the validation_data parameter when calling the fit() function. This will give us an idea of how the model performs on the test set during training.

In [None]:
import tensorflow as tf
#set random seed
tf.random.set_seed(42)
#build the model
model_m=tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=(28,28)),
    tf.keras.layers.Dense(4,activation='relu'),
    tf.keras.layers.Dense(4,activation='relu'),
    tf.keras.layers.Dense(10,activation='softmax')
])
#had to reshape 28x28 to 784, and output shape is 10, activation is softmax
#compile the model
model_m.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(),
               optimizer=tf.keras.optimizers.Adam(),
               metrics=['accuracy'])
non_norm_history=model_m.fit(train_data,train_labels,epochs=50,validation_data=(test_data,test_labels),verbose=0)

In [None]:
#check the shapes of our model 
model_m.summary()

In [None]:
train_data.min()

Right now, the data we have isn't between 0 and 1; in other words, it's not normalized (which is why we used the non_norm_history variable when calling fit()). Its pixel values are between 0 and 255.

We can get these values between 0 and 1 by dividing the entire array by the maximum: 255.0 (dividing by a float also converts to a float).

Doing so will result in all of our data being between 0 and 1 (known as scaling or normalization).

In [None]:
train_data=train_data/train_data.max()
test_data=test_data/test_data.max()
train_data.max(),test_data.max()

yummers!!! now our data is between 0 and 1. let's use the above model with the normalized data

In [None]:
import tensorflow as tf
#set random seed
tf.random.set_seed(42)
#build the model
model_m2=tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=(28,28)),
    tf.keras.layers.Dense(4,activation='relu'),
    tf.keras.layers.Dense(4,activation='relu'),
    tf.keras.layers.Dense(10,activation='softmax')
])
#had to reshape 28x28 to 784, and output shape is 10, activation is softmax
#compile the model
model_m2.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(),
               optimizer=tf.keras.optimizers.Adam(),
               metrics=['accuracy'])
norm_history=model_m2.fit(train_data,train_labels,epochs=50,validation_data=(test_data,test_labels),verbose=0)

In [None]:
model_m2.predict(test_data)

In [None]:
pd.DataFrame(non_norm_history.history).plot(title='Non-normalized Data')
pd.DataFrame(norm_history.history).plot(title='Normalized data')

let's find the ideal  learning rate 

In [None]:
tf.random.set_seed(42)
model_m3=tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=(28,28)),
    tf.keras.layers.Dense(4,activation='relu'),
    tf.keras.layers.Dense(4,activation='relu'),
    tf.keras.layers.Dense(10,activation='softmax')
])
model_m3.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                 optimizer=tf.keras.optimizers.Adam(),
                 metrics=["accuracy"])
lr_scheduler=tf.keras.callbacks.LearningRateScheduler(lambda epoch:1e-3*10**(epoch/20))
find_lr_history = model_m3.fit(train_data,
                               train_labels,
                               epochs=40, # model already doing pretty good with current LR, probably don't need 100 epochs
                               validation_data=(test_data, test_labels),
                               callbacks=[lr_scheduler],verbose=1)

In [None]:
# Plot the learning rate decay curve
import numpy as np
import matplotlib.pyplot as plt
lrs = 1e-3 * (10**(np.arange(40)/20))
plt.semilogx(lrs, find_lr_history.history["loss"]) # want the x-axis to be log-scale
plt.xlabel("Learning rate")
plt.ylabel("Loss")
plt.title("Finding the ideal learning rate");
     

Now we've got a model trained with a close-to-ideal learning rate and performing pretty well, we've got a couple of options.

We could:

Evaluate its performance using other classification metrics (such as a confusion matrix or classification report).
Assess some of its predictions (through visualizations).
Improve its accuracy (by training it for longer or changing the architecture).
Save and export it for use in an application.
Let's go through the first two options.

First we'll create a confusion matrix to visualize its predictions across the different classes.

In [None]:
# Set random seed
tf.random.set_seed(42)

# Create the model
model_m4 = tf.keras.Sequential([
  tf.keras.layers.Flatten(input_shape=(28, 28)), # input layer (we had to reshape 28x28 to 784)
  tf.keras.layers.Dense(4, activation="relu"),
  tf.keras.layers.Dense(4, activation="relu"),
  tf.keras.layers.Dense(10, activation="softmax") # output shape is 10, activation is softmax
])

# Compile the model
model_m4.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                 optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), # ideal learning rate (same as default)
                 metrics=["accuracy"])

# Fit the model
history = model_m4.fit(train_data,
                       train_labels,
                       epochs=100,
                       validation_data=(test_data, test_labels),verbose=0)

In [None]:
y_probs=model_m4.predict(test_data)

In [None]:
# See the predicted class number and label for the first example
y_probs[0].argmax(), class_names[y_probs[0].argmax()]

In [None]:
y_preds=y_probs.argmax(axis=1)

In [None]:
y_preds[:100]

In [None]:
# Check out the non-prettified confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_true=test_labels,
                 y_pred=y_preds)
     

In [None]:
import random

# Create a function for plotting a random image along with its prediction
def plot_random_image(model, images, true_labels, classes):
  """Picks a random image, plots it and labels it with a predicted and truth label.

  Args:
    model: a trained model (trained on data similar to what's in images).
    images: a set of random images (in tensor form).
    true_labels: array of ground truth labels for images.
    classes: array of class names for images.

  Returns:
    None. Draws the chosen image on the current matplotlib figure, with an
    x-axis label showing the predicted class, its confidence and the true class
    (green when they match, red otherwise).
  """
  # Pick a valid index: randrange excludes the upper bound, whereas
  # randint(0, len(images)) could return len(images) and raise an IndexError.
  i = random.randrange(len(images))

  # Create predictions and targets
  target_image = images[i]
  pred_probs = model.predict(target_image.reshape(1, 28, 28)) # have to reshape to get into right size for model
  pred_label = classes[pred_probs.argmax()]
  true_label = classes[true_labels[i]]

  # Plot the target image
  plt.imshow(target_image, cmap=plt.cm.binary)

  # Change the color of the titles depending on if the prediction is right or wrong
  if pred_label == true_label:
    color = "green"
  else:
    color = "red"

  # Add xlabel information (prediction/true label)
  plt.xlabel("Pred: {} {:2.0f}% (True: {})".format(pred_label,
                                                   100*tf.reduce_max(pred_probs),
                                                   true_label),
             color=color) # set the color to green or red
     

    
  
import random

# Create a function for plotting a random image along with its prediction
# NOTE(review): this re-defines plot_random_image, which is already defined earlier
# in the notebook; the later definition is the one that takes effect.
def plot_random_image(model, images, true_labels, classes):
  """Picks a random image, plots it and labels it with a predicted and truth label.

  Args:
    model: a trained model (trained on data similar to what's in images).
    images: a set of random images (in tensor form).
    true_labels: array of ground truth labels for images.
    classes: array of class names for images.

  Returns:
    None. Draws the chosen image on the current matplotlib figure, with an
    x-axis label showing the predicted class, its confidence and the true class
    (green when they match, red otherwise).
  """
  # Pick a valid index: randrange excludes the upper bound, whereas
  # randint(0, len(images)) could return len(images) and raise an IndexError.
  i = random.randrange(len(images))

  # Create predictions and targets
  target_image = images[i]
  pred_probs = model.predict(target_image.reshape(1, 28, 28)) # have to reshape to get into right size for model
  pred_label = classes[pred_probs.argmax()]
  true_label = classes[true_labels[i]]

  # Plot the target image
  plt.imshow(target_image, cmap=plt.cm.binary)

  # Change the color of the titles depending on if the prediction is right or wrong
  if pred_label == true_label:
    color = "green"
  else:
    color = "red"

  # Add xlabel information (prediction/true label)
  plt.xlabel("Pred: {} {:2.0f}% (True: {})".format(pred_label,
                                                   100*tf.reduce_max(pred_probs),
                                                   true_label),
             color=color) # set the color to green or red
     

     

In [None]:
class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
               'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']

In [None]:
plot_random_image(model_m4,images=test_data,true_labels=test_labels,classes=class_names)

A model learns by updating and improving its weight matrices and bias values every epoch (in our case, when we call the fit() function).

It does so by comparing the patterns its learned between the data and labels to the actual labels.

If the current patterns (weight matrices and bias values) don't result in a desirable decrease in the loss function (higher loss means worse predictions), the optimizer tries to steer the model to update its patterns in the right way (using the real labels as a reference).

This process of using the real labels as a reference to improve the model's predictions is called backpropagation.

In other words, data and labels pass through a model (forward pass) and it attempts to learn the relationship between the data and labels.

And if this learned relationship isn't close to the actual relationship or it could be improved, the model does so by going back through itself (backward pass) and tweaking its weights matrices and bias values to better represent the data.