In [1]:
import os
import numpy as np 
from tqdm import tqdm
from sklearn.feature_extraction.text import HashingVectorizer
import torch 
import torch.nn as nn
import torch.optim as optim 
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from torch.optim.lr_scheduler import CosineAnnealingLR
import pickle

from testing import Tester
from evaluator import evaluate

In [2]:
with open("train.pkl", "rb") as f: 
    train_ds = pickle.load(f)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
with open("test.pkl", "rb") as f: 
    test_ds = pickle.load(f)

In [4]:
### Documents for X / Price for y

y = np.array([float(item.price) for item in train_ds])
documents = [str(item).split("= $")[0].replace("<", "").strip() for item in train_ds]

In [5]:
# Use the HashingVectorizer for a Bag of Words model

np.random.seed(42)
vectorizer = HashingVectorizer(n_features=5000, stop_words="english", binary=True)
X = vectorizer.fit_transform(documents)

In [6]:
class NeuralNetwork(nn.Module): 
    def __init__(self, input_size): 
        super(NeuralNetwork, self).__init__()
        self.layer1 = nn.Linear(input_size, 128)
        self.layer2 = nn.Linear(128, 64)
        self.layer3 = nn.Linear(64, 64)
        self.layer4 = nn.Linear(64, 64)
        self.layer5 = nn.Linear(64, 64) 
        self.layer6 = nn.Linear(64, 64)
        self.layer7 = nn.Linear(64, 64)
        self.layer8 = nn.Linear(64, 1)
        self.relu = nn.ReLU()

    def forward(self, x): 
        output1 = self.relu(self.layer1(x))
        output2 = self.relu(self.layer2(output1))
        output3 = self.relu(self.layer3(output2))
        output4 = self.relu(self.layer4(output3))
        output5 = self.relu(self.layer5(output4)) 
        output6 = self.relu(self.layer6(output5)) 
        output7 = self.relu(self.layer7(output6)) 
        output8 = self.layer8(output7)

        return output8

In [7]:
### Convert data to PyTorch tensors
X_train_tensor = torch.FloatTensor(X.toarray())
y_train_tensor = torch.FloatTensor(y).unsqueeze(1)

### Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X_train_tensor, 
    y_train_tensor, 
    test_size=0.01, 
    random_state=42
)

### Create the loader (modified the original batch_size 64)
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)

### Initialize the model
input_size = X_train_tensor.shape[1]
model = NeuralNetwork(input_size)

In [8]:
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"Number of trainable parameters: {trainable_params:,}")

Number of trainable parameters: 669,249


In [9]:
## Define loss function and optimizer 

loss_function = nn.MSELoss() 
optimizer = optim.Adam(model.parameters(), lr=0.001)

EPOCHS = 3

for epoch in range(EPOCHS): 
    model.train()
    for batch_X, batch_y in tqdm(train_loader): 
        optimizer.zero_grad()

        ### forward pass, loss calculation, backward pass, optimizer
        outputs = model(batch_X)
        loss = loss_function(outputs, batch_y)
        loss.backward()
        optimizer.step()

    model.eval()
    with torch.no_grad(): 
        val_outputs = model(X_val)
        val_loss = loss_function(val_outputs, y_val)

    print(f'Epoch [{epoch+1}/{EPOCHS}], Train Loss: {loss.item():.3f}, Val Loss: {val_loss.item():.3f}')

100%|█████████████████████████████████████████████████████████████████████████| 1161/1161 [00:05<00:00, 197.43it/s]


Epoch [1/3], Train Loss: 39517.398, Val Loss: 19185.752


100%|█████████████████████████████████████████████████████████████████████████| 1161/1161 [00:05<00:00, 214.18it/s]


Epoch [2/3], Train Loss: 40145.402, Val Loss: 18731.053


100%|█████████████████████████████████████████████████████████████████████████| 1161/1161 [00:05<00:00, 225.37it/s]


Epoch [3/3], Train Loss: 12280.417, Val Loss: 17233.447


In [10]:
def neural_network(item): 
    model.eval()
    with torch.no_grad(): 
        vector = vectorizer.transform([item])
        vector = torch.FloatTensor(vector.toarray())
        result = model(vector)[0].item()

    return max(0, result)

In [11]:
test_docs = [str(item).split("= $")[0].replace("<", "").strip() for item in test_ds]
test_prices = [item.price for item in test_ds]
test_titles = [item.title for item in test_ds]

In [12]:
test_items = [
    {
    "item": str(item).split("= $")[0].replace("<", "").strip(), 
    "price": item.price, 
    "title": item.title
    }
    for item in test_ds
]

In [None]:
from testing_for_neural_network import TesterForNeuralNetwork

TesterForNeuralNetwork.test(neural_network, test_items)

## Save the neural network model

In [15]:
torch.save(model.state_dict(), "models/neural_network_pricer_model.pt")

import joblib
joblib.dump(vectorizer, "vectorizer.joblib")

['vectorizer.joblib']

In [16]:
print(type(vectorizer))

<class 'sklearn.feature_extraction.text.HashingVectorizer'>
