# Segment 3 Extra Lab

## Let's make a deeper neural network

In [1]:
# imports - now including pytorch

import os
from dotenv import load_dotenv
from huggingface_hub import login
import numpy as np
from tqdm import tqdm
import pickle
from evaluator import evaluate
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer
import chromadb
from torch.optim.lr_scheduler import CosineAnnealingLR

In [2]:
# Load the pickled train and test splits
# Side note: this dataset is larger than the one used before (about twice as large)
# pickle.load can execute arbitrary code - only point these paths at files you trust

with open('../train.pkl', 'rb') as train_file:
    train = pickle.load(train_file)

with open('../test.pkl', 'rb') as test_file:
    test = pickle.load(test_file)

In [3]:
# Number of items in the training split loaded from train.pkl
len(train)

400000

In [4]:
# environment: read .env (override=True lets it replace variables already set),
# then make sure both keys exist in os.environ

load_dotenv(override=True)
for key_name in ('OPENAI_API_KEY', 'HF_TOKEN'):
    # Keep the current value when present; otherwise fall back to the placeholder string
    os.environ[key_name] = os.getenv(key_name, 'your-key-if-not-using-env')
DB = "../segment4/products_vectorstore"

In [5]:
# Log in to HuggingFace
# If you don't have a HuggingFace account, you can set one up for free at www.huggingface.co
# And then add the HF_TOKEN to your .env file as explained in the project README

# HF_TOKEN is always present in os.environ at this point: the environment cell writes either
# the real value or the placeholder string, so this lookup cannot raise KeyError.
hf_token = os.environ['HF_TOKEN']
# add_to_git_credential=False: the token is not handed to the git credential helper
login(token=hf_token, add_to_git_credential=False)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [6]:
# Open the on-disk Chroma store located at DB
client = chromadb.PersistentClient(path=DB)
# Sentence-embedding model.
# NOTE(review): the name `model` is rebound to the neural network in the cell that builds
# NewNeuralNetwork, and the same encoder is loaded again as `encoder` in the inference cell,
# so this instance is not used by any later cell in this notebook.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [7]:
# Name of the Chroma collection that holds the product embeddings
collection_name = "products"
# Fetch that collection; get_or_create_collection creates it if it does not exist yet,
# in which case the queries below would return no rows.
collection = client.get_or_create_collection(collection_name)

In [8]:
# Pull every stored record: embedding vectors, raw documents and per-item metadata
result = collection.get(include=['embeddings', 'documents', 'metadatas'])

documents = result['documents']
# Embeddings become a single NumPy array; the training target is each record's 'price' metadata
vectors = np.array(result['embeddings'])
prices = [item_metadata['price'] for item_metadata in result['metadatas']]

In [9]:
# Seed the RNGs so shuffling, dropout and weight initialisation are repeatable
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed(42)

# Convert data to PyTorch tensors; unsqueeze(1) turns the prices into a column
# so the targets have the same trailing dimension as the model's single output
X_train_tensor = torch.FloatTensor(vectors)
y_train_tensor = torch.FloatTensor(prices).unsqueeze(1)

# Split the data into training and validation sets (1% held out for validation)
X_train, X_val, y_train, y_val = train_test_split(X_train_tensor, y_train_tensor, test_size=0.01, random_state=42)

# Log-transform the prices: log(price + 1)
y_train_log = torch.log(y_train + 1)
y_val_log = torch.log(y_val + 1)

# Normalize log prices using statistics from the training split only,
# and apply the same statistics to the validation split
y_mean = y_train_log.mean()
y_std = y_train_log.std()
y_train_norm = (y_train_log - y_mean) / y_std
y_val_norm = (y_val_log - y_mean) / y_std

# Create the loader.
# The targets must be the normalized log prices: validation loss and inference both
# interpret the model output in that space (they invert it with exp(out * y_std + y_mean) - 1).
# The previous version paired X_train with the raw y_train here.
train_dataset = TensorDataset(X_train, y_train_norm)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [10]:
class NewNeuralNetwork(nn.Module):
    """Deep regressor: an input projection, a stack of residual blocks, and a scalar head.

    Args:
        input_size: Width of each input feature vector.
        num_layers: Nominal depth; ``num_layers - 2`` residual blocks are created.
        hidden_size: Width of every hidden representation.
        dropout_prob: Dropout probability used in the input stage and in each block.
    """

    def __init__(self, input_size, num_layers=10, hidden_size=4096, dropout_prob=0.2):
        super().__init__()

        # Input stage: project up to the hidden width, normalise, activate, regularise
        self.input_layer = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.LayerNorm(hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout_prob),
        )

        # ResidualBlock is resolved when the network is instantiated, so its
        # definition only has to exist by then (it lives in a later cell).
        self.residual_blocks = nn.ModuleList(
            [ResidualBlock(hidden_size, dropout_prob) for _ in range(num_layers - 2)]
        )

        # Head: one scalar prediction per input row
        self.output_layer = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Run ``x`` through the input stage, every residual block in order, then the head."""
        hidden = self.input_layer(x)
        for residual_block in self.residual_blocks:
            hidden = residual_block(hidden)
        return self.output_layer(hidden)

In [11]:
class ResidualBlock(nn.Module):
    """Two Linear+LayerNorm stages with a skip connection around them.

    Args:
        hidden_size: Width of the block's input and output.
        dropout_prob: Dropout probability applied after the first activation.
    """

    def __init__(self, hidden_size, dropout_prob):
        super().__init__()
        self.block = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.LayerNorm(hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout_prob),
            nn.Linear(hidden_size, hidden_size),
            nn.LayerNorm(hidden_size),
        )
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return ``relu(block(x) + x)``."""
        transformed = self.block(x)
        # Skip connection: add the untouched input back before the final activation
        return self.relu(transformed + x)

In [12]:
# Build the network; its input width is the embedding dimension of the training vectors
model = NewNeuralNetwork(X_train.shape[1])
total_params = sum(p.numel() for p in model.parameters())
print("Total parameters:", total_params)

# Pick the best available accelerator: CUDA first, then Apple MPS, then CPU.
# (Previously only MPS was considered, so CUDA machines silently trained on CPU.)
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
model.to(device)

# L1 loss on the normalized log-price targets
loss_function = nn.L1Loss()
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.01)
# Cosine decay of the learning rate towards eta_min over T_max scheduler steps
# (the training loop steps the scheduler once per epoch)
scheduler = CosineAnnealingLR(optimizer, T_max=10, eta_min=0)

# Training batches pair the embeddings with the normalized log-price targets
train_dataset = TensorDataset(X_train, y_train_norm)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

Total parameters: 270221313


In [13]:
# Epoch numbers are 1-based and inclusive: this runs epochs EPOCH_START..EPOCH_END
EPOCH_START = 1
EPOCH_END = 5

# The validation tensors never change, so move them to the device once instead of every epoch
X_val_device = X_val.to(device)
y_val_device = y_val.to(device)
y_val_norm_device = y_val_norm.to(device)

for epoch in range(EPOCH_START, EPOCH_END+1):
    model.train()
    train_losses = []
    
    for batch_X, batch_y in tqdm(train_loader):
        batch_X = batch_X.to(device)
        batch_y = batch_y.to(device)
        
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = loss_function(outputs, batch_y)
        loss.backward()
        
        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        
        optimizer.step()
        train_losses.append(loss.item())
    
    # Validation (whole validation split in a single forward pass)
    model.eval()
    with torch.no_grad():
        val_outputs = model(X_val_device)
        val_loss = loss_function(val_outputs, y_val_norm_device)
        
        # Convert back to original scale for meaningful metrics:
        # undo the normalization, then undo log(price + 1)
        val_outputs_orig = torch.exp(val_outputs * y_std + y_mean) - 1
        mae = torch.abs(val_outputs_orig - y_val_device).mean()
    
    avg_train_loss = np.mean(train_losses)
    # `epoch` is already 1-based, so print it as-is (epoch+1 produced "Epoch [6/5]")
    print(f'Epoch [{epoch}/{EPOCH_END}]')
    print(f'Train Loss: {avg_train_loss:.4f}, Val Loss: {val_loss.item():.4f}')
    print(f'Val MAE (original scale): ${mae.item():.2f}')
    # Reported before scheduler.step(), i.e. the rate that was used during this epoch
    print(f'Learning rate: {scheduler.get_last_lr()[0]:.6f}')

    # torch.save(model.state_dict(), f'models/nnn-{epoch}.pth')
    
    # Learning rate scheduling
    scheduler.step()

100%|██████████| 6188/6188 [15:40<00:00,  6.58it/s]


Epoch [2/5]
Train Loss: 0.6568, Val Loss: 0.4961
Val MAE (original scale): $98.00
Learning rate: 0.001000


100%|██████████| 6188/6188 [15:56<00:00,  6.47it/s]


Epoch [3/5]
Train Loss: 0.4744, Val Loss: 0.4649
Val MAE (original scale): $90.33
Learning rate: 0.000976


100%|██████████| 6188/6188 [15:42<00:00,  6.57it/s]


Epoch [4/5]
Train Loss: 0.4404, Val Loss: 0.4498
Val MAE (original scale): $86.95
Learning rate: 0.000905


100%|██████████| 6188/6188 [15:43<00:00,  6.56it/s]


Epoch [5/5]
Train Loss: 0.4132, Val Loss: 0.4353
Val MAE (original scale): $85.60
Learning rate: 0.000794


100%|██████████| 6188/6188 [15:45<00:00,  6.55it/s]


Epoch [6/5]
Train Loss: 0.3872, Val Loss: 0.4248
Val MAE (original scale): $82.73
Learning rate: 0.000655


In [14]:
encoder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

def new_neural_network(item):
    """Predict a price for ``item`` with the trained network.

    ``item.text`` is embedded with ``encoder``, passed through ``model``, and the
    output is mapped back from normalized log space via ``exp(out * y_std + y_mean) - 1``.
    The returned value is never negative.
    """
    model.eval()
    with torch.no_grad():
        embedding = torch.FloatTensor(encoder.encode(item.text)).to(device)
        normalized_log_price = model(embedding)[0]
        price = (torch.exp(normalized_log_price * y_std + y_mean) - 1).item()
    # Clamp: the inverse transform can dip below zero for very small predictions
    return max(0, price)

In [15]:
# Spot-check: predicted price for a single test item
new_neural_network(test[1])

155.8573760986328

In [16]:
# Run the evaluate helper imported from the evaluator module over the test items,
# using new_neural_network as the predictor (the helper's internals are not shown in this notebook)
evaluate(new_neural_network, test)

  0%|          | 0/200 [00:00<?, ?it/s]

[92m$62 [93m$69 [92m$26 [93m$148 [92m$2 [92m$25 [92m$9 [92m$52 [91m$416 [93m$45 [92m$9 [93m$158 [92m$11 [93m$52 [91m$172 [92m$33 [92m$27 [93m$66 [92m$8 [92m$2 [92m$64 [91m$93 [92m$28 [93m$63 [92m$6 [92m$37 [93m$48 [91m$120 [92m$29 [91m$87 [93m$82 [93m$71 [92m$9 [93m$69 [92m$9 [92m$20 [92m$29 [93m$157 [92m$27 [91m$108 [92m$4 [92m$8 [92m$17 [91m$250 [93m$49 [93m$125 [92m$7 [92m$20 [92m$62 [93m$92 [92m$1 [92m$24 [92m$25 [91m$268 [91m$176 [92m$23 [91m$259 [92m$34 [92m$2 [93m$60 [91m$115 [92m$21 [92m$33 [92m$25 [92m$110 [91m$85 [93m$50 [92m$8 [93m$66 [91m$83 [92m$1 [93m$93 [92m$68 [93m$51 [91m$111 [91m$101 [93m$85 [92m$21 [92m$31 [91m$103 [91m$704 [92m$16 [91m$191 [93m$125 [92m$8 [92m$2 [92m$4 [93m$102 [93m$46 [92m$92 [92m$3 [92m$41 [93m$87 [93m$79 [91m$284 [92m$103 [93m$45 [92m$47 [92m$11 [93m$46 [91m$270 [92m$27 [93m$49 [91m$128 [93m$80 [92m$7 [92m$0 [92m$0 [91m$149 [92m$2 [92m$3