In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import matplotlib.pyplot as plt # plotting (used as plt.scatter below)
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch # tensors and automatic differentiation

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it
for folder, _, names in os.walk('/kaggle/input'):
    full_paths = [os.path.join(folder, name) for name in names]
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


Plotting a quadratic function:

In [None]:
def quadrtc_fn(x):
    """Evaluate the fixed quadratic 3x^2 + 2x + 1 at x."""
    squared_term = 3 * x**2
    linear_term = 2 * x
    return squared_term + linear_term + 1

# Draw the quadratic; the second argument is its formula as a LaTeX string (presumably used as the plot title).
# NOTE(review): plot_function is not defined in the notebook as shown - confirm an earlier cell or import provides it.
plot_function(quadrtc_fn, "$3x^2 + 2x + 1$")

Imagine we don't know the coefficients of `quadrtc_fn()` above and are instead trying to recover them from data. This is the general idea of training a model.

Let's see what we can achieve:

In [1]:
# General quadratic: the coefficients are arguments instead of being hard-coded
def quad(a, b, c, x):
    """Evaluate a*x^2 + b*x + c at x."""
    quadratic_part = a * x**2
    linear_part = b * x
    return quadratic_part + linear_part + c

Now let's call the `quad(a, b, c, x)` function with some example values for the parameters:

In [None]:
# Evaluate 3x^2 + 2x + 1 (a=3, b=2, c=1) at x = 1.5
quad(3,2,1, 1.5)

In [None]:
from functools import partial # helps make quadratic functions

def mk_quad(a,b,c):
    """Build a one-argument quadratic by fixing quad's coefficients a, b and c.

    The returned partial only needs x: mk_quad(a, b, c)(x) is quad(a, b, c, x).
    """
    fixed_quad = partial(quad, a, b, c)
    return fixed_quad

# Fix a=3, b=2, c=1 once; the resulting function only needs x
f_example = mk_quad(3,2,1)
f_example(1.5) # -> only x is passed in, because the others (a,b,c) are already fixed.


In [None]:
# Fit a function by hand and eye - manually changing the parameters a, b, c
from numpy.random import normal, seed, uniform

np.random.seed(42) # seed NumPy's global random generator so we always get the same random numbers

def noise(x, scale):
    """Return normally distributed random numbers shaped like x.

    Args:
        x: any object with a ``.shape`` attribute; only its shape is used.
        scale: standard deviation of the noise (the mean is numpy's default, 0).

    Returns:
        A NumPy array of shape ``x.shape``.
    """
    # the keyword must be spelled `scale`; numpy rejects unknown keywords with a TypeError
    return normal(scale=scale, size=x.shape)

def add_noise(x, mult, add):
    """Perturb x with multiplicative noise (scale mult) and additive noise (scale add)."""
    # the multiplicative factor is drawn first, then the additive term
    factor = 1 + noise(x, mult)
    scaled = x * factor
    return scaled + noise(x, add)


Plotting some noisy data that follows the shape of the quadratic produced by `mk_quad`:

In [None]:
# Generate noisy samples of the example quadratic and plot them
# x: 20 evenly spaced values from -2 to 2; [:, None] adds a trailing axis, giving shape (20, 1)
x = torch.linspace(-2, 2, steps=20)[:, None]
# y: f_example(x) with multiplicative noise (scale 0.3) and additive noise (scale 1.5)
y = add_noise(f_example(x), 0.3, 1.5)
plt.scatter(x,y)

Now, we have some data.
We will try to reconstruct the original quadratic equation. How do we do that?
_Let's fuck around, and find out_

In [None]:
# interact with the parameters to change them
from ipywidgets import interact

@interact(a=1.5, b=1.5, c=1.5)
def plot_quad(a, b, c):
    """Scatter the data and overlay the quadratic with coefficients a, b, c (each starting at 1.5)."""
    plt.scatter(x,y)
    candidate = mk_quad(a, b, c)
    plot_function(candidate, ylim=(-3, 12))

NOW WE AUTOMATE:

The first step:
We need to know, if we increase (or decrease) the value of a parameter, does it get better or worse?

Introducing the loss functions. These will help with automatically updating the parameters.

A loss function tells us how far the function's predictions, with its current parameters, are from the actual values. A lower loss means better predictions.

In [None]:
# mean squared error (mse) - a simple and common loss function
def mse(predictions, actuals):
    """Return the mean of the squared differences between predictions and actuals."""
    errors = predictions - actuals
    return (errors**2).mean()


In [None]:
# plot the quadratic function with manual parameters, showing the current loss in the title
@interact(a=1.5, b=1.5, c=1.5)
def plot_quad(a, b, c):
    """Scatter the data, overlay the quadratic for (a, b, c) and show its MSE.

    Args:
        a, b, c: coefficients of the candidate quadratic; each starts at 1.5.
    """
    f = mk_quad(a, b, c)
    plt.scatter(x, y)  # matplotlib's pyplot is referred to as `plt` everywhere else in this notebook
    loss = mse(f(x), y)  # how far the candidate's predictions are from the data
    plot_function(f, ylim=(-3, 12), title=f"MSE: {loss:.2f}")

Finally, Automating.
The key thing to remember is we want to know, `does the loss get better when we increase or decrease the input?`.

Another way to see if the loss function is getting better, _other than the manual way of changing the parameters(input),_ is by calculating its **derivative**.

A derivative is a function that tells us, _if we increase the input, does the output increase or decrease, and by how much_. That is called the slope or the gradient.

Pytorch calculates the derivative automatically.

In [None]:
# Automate the search of parameters for better loss.

# takes the coefficients of a quadratic and returns the mse between its predictions and the actuals
def quad_mse(params):
    """Return the loss (MSE against y) of the quadratic whose coefficients are params, evaluated on x."""
    predictions = mk_quad(*params)(x)
    return mse(predictions, y)

# loss of the initial guess a = b = c = 1.5
quad_mse([1.5, 1.5, 1.5])
# returns: tensor(11.4648, dtype=torch.float64) - a single-value (0-dimensional) tensor


In pytorch, everything is a tensor.
- 1D tensor - `[1, 2, 3]` - lists and vectors of numbers
- 2D tensor - rectangles and tables of numbers
- 3D tensors - Layers of tables of numbers
- etc

In [None]:
# put all three coefficients (a, b, c) in a single 1D (rank-1) tensor
abc = torch.tensor([1.5, 1.5, 1.5])
abc.requires_grad_() # tell pytorch to calculate the gradient with respect to these numbers whenever we use them in a calculation

# requires_grad_() is the last expression in the cell, so the notebook displays the tensor it returns

In [None]:
# use abc in a calculation: the loss for the current parameters
loss = quad_mse(abc)
loss
# displays a tensor with a grad_fn (gradient function)

In [None]:
# calculate the gradient of the loss with respect to abc; the result is stored in abc.grad
loss.backward()

In [None]:
# access the calculated gradient: one slope per parameter (a, b, c)
abc.grad

In [None]:
# each entry of abc.grad is the slope of the loss with respect to one parameter.
# Now we reduce the loss by moving every parameter a small step (0.01 x slope) against its slope

with torch.no_grad(): # the update itself must not be tracked for gradient calculation
    abc -= abc.grad*0.01 # update the parameters (not the gradients)
    loss = quad_mse(abc) # recalculate the loss with the updated parameters

print(f"loss={loss:.2f}") # outputs 10.59

Now we have updated the parameters for one cycle. Let's automate the whole thing.

In [None]:
# Optimization using gradient descent:
# repeat the "measure loss -> calculate gradient -> step the parameters" cycle five times
for i in range(5):
    # backward() ADDS to abc.grad instead of overwriting it, so clear whatever a
    # previous backward() call left behind; otherwise each step would use the sum
    # of all gradients computed so far rather than the current one.
    if abc.grad is not None:
        abc.grad.zero_()
    loss = quad_mse(abc)
    loss.backward()
    with torch.no_grad():  # the parameter update must not be tracked by autograd
        abc -= abc.grad*0.01

    print(f"step={i}; loss={loss:.2f}")

This is a basic principle of optimizers in deep learning and ML.
This principle of optimization is called _gradient descent_, because we calculate the gradient and then take a step downhill (a descent) along it.

And that is it. _*How to find parameters for the model*_ 

We need one more thing. _What is the mathematical function, that we are finding these parameters for?_ We can't just use quadratics, right?

Enter the **Rectified Linear Function**

In [None]:
# Rectified Linear Unit (ReLU): the building block of an infinitely flexible function
def rectified_linear(m,b,x):
    """Evaluate the line m*x + b and replace every negative result with 0."""
    line = m*x + b
    return torch.clip(line, 0.)  # lower bound of 0, no upper bound

In [None]:
# plot it with slope m=1 and intercept b=1 fixed
plot_function(partial(rectified_linear, 1, 1)) # x is left unbound, presumably for plot_function to supply

Let's make it interactive:

In [None]:
@interact(m=1.5, b=1.5)
def plot_relu(m, b): # plot rectified linear unit
    """Plot the rectified linear function for the chosen slope m and intercept b.

    Args:
        m: slope of the linear part (starts at 1.5).
        b: intercept of the linear part (starts at 1.5).
    """
    # bind the interactive values, not constants, so changing m or b changes the plot
    plot_function(partial(rectified_linear, m, b), ylim=(-1,4))

Add two ReLU together.

In [None]:
def double_relu(m1, b1, m2, b2, x):
    """Return the sum of two rectified linear functions evaluated at x.

    Args:
        m1, b1: slope and intercept of the first rectified linear function.
        m2, b2: slope and intercept of the second rectified linear function.
        x: the input to evaluate at. It is an explicit last parameter, as in
            rectified_linear, so that partial(double_relu, m1, b1, m2, b2)
            is a one-argument function of x instead of reading a global x.
    """
    return rectified_linear(m1, b1, x) + rectified_linear(m2, b2, x)

In [None]:
# interactive plot of the two summed ReLUs
@interact(m1=1.5, b1=1.5, m2=1.5, b2=1.5)
def plot_double_relu(m1,b1,m2,b2):
    """Plot double_relu with the four chosen parameters bound (each starts at 1.5)."""
    summed = partial(double_relu, m1, b1, m2, b2)
    plot_function(summed, ylim=(-1,6))

Now, we can do the same with as many ReLUs as we like, not just two.

We use gradient descent to get the parameters.

This is the foundational principle from which all of deep learning is derived.

Everything else is just how to make it faster and make it need less data.

Consider the analogy of how to draw an owl. _You first draw two circles and then draw the rest of the damn owl._

In the context of deep learning, Jeremy Howard states that: _"When you have ReLUs getting added together, and gradient descent to optimize the parameters, and samples of inputs and outputs that you want, the computer draws the owl."_

## Matrix Multiplication
The first Case of Optimization and making it easier is **Matrix Multiplication**

Matrix multiplication is the most critical mathematical operation in basically all of deep learning. It is basically multiplying things together and adding them up.
If there are any negative values in the result of the matrix multiplication, we replace them with zero.

- GPUs are good at matrix multiplication because they have tensor cores, which do nothing but multiply small (four-by-four) matrices.

## Practice
- It is time to take on the **Titanic** competition on [Kaggle.com](https://www.kaggle.com/competitions/titanic/data?select=train.csv)


### Next UP
[Getting started with NLP for absolute beginners](https://www.kaggle.com/code/jhoward/getting-started-with-nlp-for-absolute-beginners)