# Implement linear regression on tensorflow with gradient tape

In [1]:
import tensorflow as tf
import numpy as np
import os
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

In [2]:
DATASETS = "datasets/"

# Read dataset

In [3]:
with open(DATASETS+os.sep+"winequality-red.csv", "r") as file:
    raw_data = file.read()

# Remove columns

In [4]:
raw_dataset = raw_data.split("\n")[1:]
columns = raw_data.split("\n")[0].split(",")

N_ROWS = len(raw_dataset)
N_FEATURES = len(columns)

In [5]:
columns

['fixed acidity',
 'volatile acidity',
 'citric acid',
 'residual sugar',
 'chlorides',
 'free sulfur dioxide',
 'total sulfur dioxide',
 'density',
 'pH',
 'sulphates',
 'alcohol',
 'quality']

In [6]:
print("ROWS: ", N_ROWS)
print("Features: ", N_FEATURES)

ROWS:  1599
Features:  12


In [7]:
dataset = np.zeros((N_ROWS, N_FEATURES))

i = 0
for row in raw_dataset:
    j = 0
    for feature in row.split(","):
        dataset[i][j] = float(feature)
        j+=1
    i+=1

In [8]:
dataset[0].shape

(12,)

In [9]:
dataset[:, 7:]

array([[ 0.9978 ,  3.51   ,  0.56   ,  9.4    ,  5.     ],
       [ 0.9968 ,  3.2    ,  0.68   ,  9.8    ,  5.     ],
       [ 0.997  ,  3.26   ,  0.65   ,  9.8    ,  5.     ],
       ...,
       [ 0.99574,  3.42   ,  0.75   , 11.     ,  6.     ],
       [ 0.99547,  3.57   ,  0.71   , 10.2    ,  5.     ],
       [ 0.99549,  3.39   ,  0.66   , 11.     ,  6.     ]])

# Split dataset into features and target

In [10]:
N_SEQ_FEATURES = 10

In [11]:
x_train = tf.constant(dataset[:, :N_SEQ_FEATURES], dtype="float32")
y_train = tf.constant(dataset[:, -1], dtype="float32")

Metal device set to: Apple M1 Max

systemMemory: 64.00 GB
maxCacheSize: 24.00 GB



2022-10-22 19:26:52.771245: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-10-22 19:26:52.771403: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


# Split dataset into train and validation

into 70/30

- Shuffle dataset
- Select N for Train and N for Validation base on the proportio 70/20/10

In [12]:
shuffled_dataset = np.random.shuffle(dataset)
N_train = int(dataset.shape[0]*0.7)+1
N_val = int(dataset.shape[0]*0.2)+1
N_test = dataset.shape[0]-N_train-N_val

In [13]:
print("Train rows: ", N_train)
print("Validation rows: ", N_val)
print("Test rows: ", N_test)

Train rows:  1120
Validation rows:  320
Test rows:  159


# Split data 
Into 
- Train
- Val
- Test 

And X for features and y for the target

In [14]:
train_X = dataset[:N_train, :N_SEQ_FEATURES]
train_y = dataset[:N_train, -1]

val_X = dataset[:N_val, :N_SEQ_FEATURES]
val_y = dataset[:N_val, -1]

test_X = dataset[:N_test, :N_SEQ_FEATURES]
test_y = dataset[:N_test, -1]

# First model Baseline AVG 

In [15]:
baseline_prediction = train_y.mean()

print("Pred for baseline: ", baseline_prediction)

Pred for baseline:  5.632142857142857


# Metric

In [16]:
((val_y - baseline_prediction)**2).sum()/val_y.shape[0]

0.6701626275510203

# Scale data

In [17]:
np.array([4,4,4,4])/np.array([2, 2, 2, 2])

array([2., 2., 2., 2.])

In [18]:
train_X.max(axis=0).shape

(10,)

In [19]:
train_X.max(axis=0)

array([ 15.6    ,   1.58   ,   0.79   ,  15.5    ,   0.611  ,  72.     ,
       289.     ,   1.00369,   4.01   ,   1.98   ])

In [20]:
train_X.min(axis=0)

array([4.7    , 0.12   , 0.     , 0.9    , 0.012  , 1.     , 6.     ,
       0.99007, 2.88   , 0.37   ])

In [21]:
train_max = train_X.max(axis=0)
train_min = train_X.min(axis=0)

Q_factor = 100

train_X -= train_min
train_X /= (train_max-train_min) * Q_factor

val_X -= train_min
val_X /= (train_max-train_min) * Q_factor

test_X -= train_min
test_X /= (train_max-train_min) * Q_factor

# First model Regresion

Y = W.X + b

In [22]:
W_N_DIMS = train_X.shape[1]
B_N_DIMS = train_X.shape[0]

In [23]:
W = tf.Variable(np.random.ranf((1, W_N_DIMS)), dtype='float32')
b = tf.Variable(np.random.ranf((B_N_DIMS, 1)), dtype='float32')

In [24]:
W

<tf.Variable 'Variable:0' shape=(1, 10) dtype=float32, numpy=
array([[0.84212846, 0.3948212 , 0.64913833, 0.29011744, 0.505506  ,
        0.8147143 , 0.3300597 , 0.29872987, 0.47734326, 0.688361  ]],
      dtype=float32)>

In [25]:
b

<tf.Variable 'Variable:0' shape=(1120, 1) dtype=float32, numpy=
array([[0.7455349 ],
       [0.26478243],
       [0.39630336],
       ...,
       [0.45274028],
       [0.3571946 ],
       [0.6819259 ]], dtype=float32)>

# Inputs

In [26]:
train_x_tensor = tf.constant(train_X, dtype='float32')
train_y_tensor = tf.constant(train_y, dtype='float32')

val_x_tensor = tf.constant(val_X, dtype='float32')
val_y_tensor = tf.constant(val_y, dtype='float32')

test_x_tensor = tf.constant(test_X, dtype='float32')
test_y_tensor = tf.constant(test_y, dtype='float32')

W (1, DIM) * X (N, DIM)

# Try random values first

In [27]:
y = tf.matmul(W, tf.transpose(train_x_tensor)) + b

In [28]:
tf.math.reduce_mean(tf.pow(tf.subtract(y, train_y_tensor), 2))

<tf.Tensor: shape=(), dtype=float32, numpy=27.74398>

# Try with gradien tape to fix the weights

Problems I ran into

- I got none because I have declared W and b as constants!!! derivative is 0!
- Im getting nan an inf!!?
    - Standarizing values too small result in some of them being too big I had to adjust that (max-min) by Q=100
- Mac tensorflow cant work with float16!

In [29]:
with tf.GradientTape() as tape:
    y = tf.matmul(train_x_tensor, tf.transpose(W)) + b
    loss = tf.math.reduce_mean(tf.pow(tf.subtract(y, train_y_tensor), 2))

gradient_loss_w_b = tape.gradient(loss, [W, b])

W [1, 11]
train_x_tensor[1120, 11]

W*train_x_tensor [1120, 1]

b [1120, 1]

# Training loop

In [30]:
train_x_tensor[0]

<tf.Tensor: shape=(10,), dtype=float32, numpy=
array([-4.3158811e-03, -8.2727749e-04,  0.0000000e+00, -6.1686028e-04,
       -2.0335267e-04, -1.4086490e-04, -2.1202162e-04, -1.2592443e+00,
       -2.5711842e-02, -2.3123941e-03], dtype=float32)>

In [31]:
W[0]

<tf.Tensor: shape=(10,), dtype=float32, numpy=
array([0.84212846, 0.3948212 , 0.64913833, 0.29011744, 0.505506  ,
       0.8147143 , 0.3300597 , 0.29872987, 0.47734326, 0.688361  ],
      dtype=float32)>

In [32]:
b[0]

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.7455349], dtype=float32)>

In [33]:
#init params
W = tf.Variable(np.random.ranf((1, W_N_DIMS)), dtype='float32')
b = tf.Variable(np.random.ranf((B_N_DIMS, 1)), dtype='float32')

#init epsilon
epsilon = tf.constant(0.01, dtype='float32')

for epoc in tqdm(range(5000)):
    # Feed-forward pass
    with tf.GradientTape() as tape:
        y = tf.matmul(train_x_tensor, tf.transpose(W)) + b
        loss = tf.reduce_mean(tf.square(train_y_tensor-y))

    if epoc%1000==0:
        print("TRAIN Loss: ", loss)
        print(f"First sample training set prediction {y[0]} - real value {train_y_tensor[0]}")

    #backward - pass
    w_grad, b_grad = tape.gradient(loss, [W, b])

    W.assign_sub(epsilon*w_grad)
    b.assign_sub(epsilon*b_grad)

  0%|          | 0/5000 [00:00<?, ?it/s]

TRAIN Loss:  tf.Tensor(27.51944, shape=(), dtype=float32)
First sample training set prediction [0.4466024] - real value 6.0
TRAIN Loss:  tf.Tensor(19.419973, shape=(), dtype=float32)
First sample training set prediction [6.664195] - real value 6.0
TRAIN Loss:  tf.Tensor(18.736078, shape=(), dtype=float32)
First sample training set prediction [6.661855] - real value 6.0
TRAIN Loss:  tf.Tensor(18.077152, shape=(), dtype=float32)
First sample training set prediction [6.6449947] - real value 6.0
TRAIN Loss:  tf.Tensor(17.442245, shape=(), dtype=float32)
First sample training set prediction [6.6283207] - real value 6.0


# Batch approach is faster to converge

as it is able to adjust weights faster

In [34]:
#init params
BATCH_SIZE = 32
W = tf.Variable(np.random.ranf((1, W_N_DIMS)), dtype='float32')
b = tf.Variable(np.random.ranf((BATCH_SIZE, 1)), dtype='float32')
MAX_EPOCHS = 5000

#init epsilon
epsilon = tf.constant(0.01, dtype='float32')

for epoc in tqdm(range(MAX_EPOCHS)):
    
    # Feed-forward pass
    for batch in range(0, B_N_DIMS, BATCH_SIZE):
        with tf.GradientTape() as tape:
            y = tf.matmul(train_x_tensor[batch:batch+BATCH_SIZE], tf.transpose(W)) + b
            loss = tf.reduce_mean(tf.square(train_y_tensor[batch:batch+BATCH_SIZE]-y))
        
        #backward - pass
        w_grad, b_grad = tape.gradient(loss, [W, b])

        W.assign_sub(epsilon*w_grad)
        b.assign_sub(epsilon*b_grad)

    if epoc%1000==0:
        print(f"advance: {np.round(epoc/MAX_EPOCHS, 2)*100}%")
        val_loss = 0
        for batch in range(0, val_x_tensor.shape[0], BATCH_SIZE):
            y_val = tf.matmul(val_x_tensor[batch:batch+BATCH_SIZE], tf.transpose(W)) + b
            val_loss += tf.reduce_mean(tf.square(val_y_tensor[batch:batch+BATCH_SIZE]-y_val))
        print("TRAIN Loss: ", loss)
        print("VAL loss: ", val_loss/(int(val_x_tensor.shape[0]/BATCH_SIZE)+1))
        print(f"First sample training set prediction {y[0]} - real value {train_y_tensor[0]}")

  0%|          | 0/5000 [00:00<?, ?it/s]

advance: 0.0%
TRAIN Loss:  tf.Tensor(23.179642, shape=(), dtype=float32)
VAL loss:  tf.Tensor(15.773583, shape=(), dtype=float32)
First sample training set prediction [0.62152916] - real value 6.0
advance: 20.0%
TRAIN Loss:  tf.Tensor(0.5936506, shape=(), dtype=float32)
VAL loss:  tf.Tensor(0.6071366, shape=(), dtype=float32)
First sample training set prediction [5.6435504] - real value 6.0
advance: 40.0%
TRAIN Loss:  tf.Tensor(0.5936593, shape=(), dtype=float32)
VAL loss:  tf.Tensor(0.6071228, shape=(), dtype=float32)
First sample training set prediction [5.6434584] - real value 6.0
advance: 60.0%
TRAIN Loss:  tf.Tensor(0.59366196, shape=(), dtype=float32)
VAL loss:  tf.Tensor(0.60710907, shape=(), dtype=float32)
First sample training set prediction [5.6433983] - real value 6.0
advance: 80.0%
TRAIN Loss:  tf.Tensor(0.5936788, shape=(), dtype=float32)
VAL loss:  tf.Tensor(0.6070962, shape=(), dtype=float32)
First sample training set prediction [5.643373] - real value 6.0


# Batch with momemtum

# With RMS to avoid getting stuck in a local minima around [0.557] en val

In [35]:
#init params
BATCH_SIZE = 32
MAX_EPOCHS = 5000

W = tf.Variable(np.random.ranf((1, W_N_DIMS)), dtype='float32')
b = tf.Variable(np.random.ranf((BATCH_SIZE, 1)), dtype='float32')

#RMS
velocity_w = tf.Variable(np.zeros((1, W_N_DIMS)), dtype='float32')
velocity_b = tf.Variable(np.zeros((BATCH_SIZE, 1)), dtype='float32')
past_velocity_w = tf.Variable(np.zeros((1, W_N_DIMS)), dtype='float32')
past_velocity_b = tf.Variable(np.zeros((BATCH_SIZE, 1)), dtype='float32')
momentum = tf.constant(0.1, dtype='float32')

#init epsilon
epsilon = tf.constant(0.01, dtype='float32')

for epoc in tqdm(range(MAX_EPOCHS)):
    # Feed-forward pass
    for batch in range(0, B_N_DIMS, BATCH_SIZE):
        with tf.GradientTape() as tape:
            # print(f"Range: {batch} , {batch+BATCH_SIZE}")
            y = tf.matmul(train_x_tensor[batch:batch+BATCH_SIZE], tf.transpose(W)) + b
            loss = tf.reduce_mean(tf.square(train_y_tensor-y))
 
        #backward - pass
        w_grad, b_grad = tape.gradient(loss, [W, b])

        velocity_w.assign(past_velocity_w*momentum-epsilon*w_grad)
        velocity_b.assign(past_velocity_b*momentum-epsilon*b_grad)

        W.assign_add(velocity_w*momentum-epsilon*w_grad)
        b.assign_add(velocity_b*momentum-epsilon*b_grad)

        past_velocity_w.assign(velocity_w)
        past_velocity_b.assign(velocity_b)

    if epoc%1000==0:
        print(f"advance: {np.round(epoc/MAX_EPOCHS, 2)}%")
        val_loss = 0
        for batch in range(0, val_x_tensor.shape[0], BATCH_SIZE):
            y_val = tf.matmul(val_x_tensor[batch:batch+BATCH_SIZE], tf.transpose(W)) + b
            val_loss += tf.reduce_mean(tf.square(val_y_tensor[batch:batch+BATCH_SIZE]-y_val))
        print("TRAIN Loss: ", loss)
        print("VAL loss: ", val_loss/(int(val_x_tensor.shape[0]/BATCH_SIZE)+1))
        print(f"First sample training set prediction {y[0]} - real value {train_y_tensor[0]}")

  0%|          | 0/5000 [00:00<?, ?it/s]

advance: 0.0%
TRAIN Loss:  tf.Tensor(26.068678, shape=(), dtype=float32)
VAL loss:  tf.Tensor(15.618469, shape=(), dtype=float32)
First sample training set prediction [0.8675574] - real value 6.0
advance: 0.2%
TRAIN Loss:  tf.Tensor(0.6522009, shape=(), dtype=float32)
VAL loss:  tf.Tensor(0.60857123, shape=(), dtype=float32)
First sample training set prediction [5.6283174] - real value 6.0
advance: 0.4%
TRAIN Loss:  tf.Tensor(0.65220076, shape=(), dtype=float32)
VAL loss:  tf.Tensor(0.6085788, shape=(), dtype=float32)
First sample training set prediction [5.628383] - real value 6.0
advance: 0.6%
TRAIN Loss:  tf.Tensor(0.65220064, shape=(), dtype=float32)
VAL loss:  tf.Tensor(0.6085862, shape=(), dtype=float32)
First sample training set prediction [5.628398] - real value 6.0
advance: 0.8%
TRAIN Loss:  tf.Tensor(0.6522005, shape=(), dtype=float32)
VAL loss:  tf.Tensor(0.6085936, shape=(), dtype=float32)
First sample training set prediction [5.628371] - real value 6.0
