In [1]:
import json
import os
import random

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from dataset import ChatDataset
from helpers import bag_of_words, tokenize, stem
from model import NeuralNet

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the product catalogue (UTF-8 encoded JSON) into `products`.
with open('./datasets/products.json', encoding="utf-8") as catalogue:
    products = json.load(catalogue)

In [3]:
# Build the training corpus from the product catalogue.
#   all_words : usable tokens collected from every pattern (duplicates are kept here)
#   tags      : one label per product -- the product title itself
#   xy        : (tokenized pattern, tag) training pairs
all_words = []
tags = []
xy = []
for product in products:
    tag = product['title']
    tags.append(tag)

    # The full title is always the first pattern; two-word windows over its tokens follow.
    patterns = [tag]
    product_title_words = tokenize(tag)
    n_words = len(product_title_words)
    # A one-token title has no word pair. Without this guard the window start would
    # become -1, wrap around, and produce a bogus "word word" pattern.
    if n_words >= 2:
        for i in range(0, n_words, 2):
            # With an odd token count the last window would run past the end,
            # so it is pulled back one token (overlapping the previous window).
            start = min(i, n_words - 2)
            patterns.append(product_title_words[start] + " " + product_title_words[start + 1])
    print(patterns)

    for pattern in patterns:
        w = tokenize(pattern)
        # Skip patterns whose tokens, joined together, are purely numeric.
        if ''.join(w).isnumeric():
            continue
        # Only non-numeric tokens longer than one character go into the vocabulary.
        for token in w:
            if not token.isnumeric() and len(token) > 1:
                all_words.append(token)
        xy.append((w, tag))

['Fjallraven - Foldsack No. 1 Backpack, Fits 15 Laptops', 'Fjallraven Foldsack', 'No 1', 'Backpack Fits', '15 Laptops']
['Mens Casual Premium Slim Fit T-Shirts ', 'Mens Casual', 'Premium Slim', 'Fit T', 'T Shirts']
['Mens Cotton Jacket', 'Mens Cotton', 'Cotton Jacket']
['Mens Casual Slim Fit', 'Mens Casual', 'Slim Fit']
["John Hardy Women's Legends Naga Gold & Silver Dragon Station Chain Bracelet", 'John Hardy', 'Women s', 'Legends Naga', 'Gold Silver', 'Dragon Station', 'Chain Bracelet']
['Solid Gold Petite Micropave ', 'Solid Gold', 'Petite Micropave']
['White Gold Plated Princess', 'White Gold', 'Plated Princess']
['Pierced Owl Rose Gold Plated Stainless Steel Double', 'Pierced Owl', 'Rose Gold', 'Plated Stainless', 'Steel Double']
['WD 2TB Elements Portable External Hard Drive - USB 3.0 ', 'WD 2TB', 'Elements Portable', 'External Hard', 'Drive USB', '3 0']
['SanDisk SSD PLUS 1TB Internal SSD - SATA III 6 Gb/s', 'SanDisk SSD', 'PLUS 1TB', 'Internal SSD', 'SATA III', '6 Gb', 'Gb s']


In [4]:
# Debug dump of the corpus built so far: vocabulary, tags, all training pairs,
# and finally only the pairs whose tag equals 'Products'.
# NOTE(review): tags are product titles, so the last list is presumably empty -- confirm.
products_pairs = [pair for pair in xy if pair[1] == 'Products']
for dumped in (all_words, tags, xy, products_pairs):
    print(dumped, "\n\n")

['Fjallraven', 'Foldsack', 'No', 'Backpack', 'Fits', 'Laptops', 'Fjallraven', 'Foldsack', 'No', 'Backpack', 'Fits', 'Laptops', 'Mens', 'Casual', 'Premium', 'Slim', 'Fit', 'Shirts', 'Mens', 'Casual', 'Premium', 'Slim', 'Fit', 'Shirts', 'Mens', 'Cotton', 'Jacket', 'Mens', 'Cotton', 'Cotton', 'Jacket', 'Mens', 'Casual', 'Slim', 'Fit', 'Mens', 'Casual', 'Slim', 'Fit', 'John', 'Hardy', 'Women', 'Legends', 'Naga', 'Gold', 'Silver', 'Dragon', 'Station', 'Chain', 'Bracelet', 'John', 'Hardy', 'Women', 'Legends', 'Naga', 'Gold', 'Silver', 'Dragon', 'Station', 'Chain', 'Bracelet', 'Solid', 'Gold', 'Petite', 'Micropave', 'Solid', 'Gold', 'Petite', 'Micropave', 'White', 'Gold', 'Plated', 'Princess', 'White', 'Gold', 'Plated', 'Princess', 'Pierced', 'Owl', 'Rose', 'Gold', 'Plated', 'Stainless', 'Steel', 'Double', 'Pierced', 'Owl', 'Rose', 'Gold', 'Plated', 'Stainless', 'Steel', 'Double', 'WD', '2TB', 'Elements', 'Portable', 'External', 'Hard', 'Drive', 'USB', 'WD', '2TB', 'Elements', 'Portable', 'Ex

In [5]:
# Punctuation tokens that must never enter the vocabulary.
ignore_words = ['?', '.', '!']
# Stem every remaining token, dropping ignored, one-character and numeric ones;
# the set removes duplicates and sorting gives a stable feature order.
all_words = sorted({
    stem(w)
    for w in all_words
    if not (w in ignore_words or len(w) <= 1 or w.isnumeric())
})
# Unique, sorted labels: a label's position in this list is its class index.
tags = sorted(set(tags))

In [6]:
# Summarise the corpus: number of training pairs, labels and vocabulary entries.
n_patterns, n_tags, n_words = len(xy), len(tags), len(all_words)
print(n_patterns, "patterns")
print(n_tags, "tags:", tags)
print(n_words, "unique stemmed words:", all_words)

110 patterns
20 tags: ['Acer SB220Q bi 21.5 inches Full HD (1920 x 1080) IPS Ultra-Thin', "BIYLACLESEN Women's 3-in-1 Snowboard Jacket Winter Coats", 'DANVOUY Womens T Shirt Casual Cotton Short', 'Fjallraven - Foldsack No. 1 Backpack, Fits 15 Laptops', "John Hardy Women's Legends Naga Gold & Silver Dragon Station Chain Bracelet", "Lock and Love Women's Removable Hooded Faux Leather Moto Biker Jacket", "MBJ Women's Solid Short Sleeve Boat Neck V ", 'Mens Casual Premium Slim Fit T-Shirts ', 'Mens Casual Slim Fit', 'Mens Cotton Jacket', "Opna Women's Short Sleeve Moisture", 'Pierced Owl Rose Gold Plated Stainless Steel Double', 'Rain Jacket Women Windbreaker Striped Climbing Raincoats', 'Samsung 49-Inch CHG90 144Hz Curved Gaming Monitor (LC49HG90DMNXZA) – Super Ultrawide Screen QLED ', 'SanDisk SSD PLUS 1TB Internal SSD - SATA III 6 Gb/s', 'Silicon Power 256GB SSD 3D NAND A55 SLC Cache Performance Boost SATA III 2.5', 'Solid Gold Petite Micropave ', 'WD 2TB Elements Portable External Hard

In [7]:
# Turn every (tokens, tag) pair into a feature vector and a class index.
# Features come from bag_of_words over the vocabulary; the class index is the
# tag's position in the sorted `tags` list.
X_train = [bag_of_words(tokens, all_words) for tokens, _ in xy]
y_train = [tags.index(label_name) for _, label_name in xy]

In [8]:
# Convert the Python lists of features and labels into NumPy arrays.
X_train, y_train = np.array(X_train), np.array(y_train)

In [9]:
# Training configuration.
# Seed every RNG in use so that a fresh Restart & Run All repeats the same run;
# the model is created and the loader is shuffled in later cells.
# (Some GPU kernels can still be non-deterministic.)
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

num_epochs = 1000
batch_size = 12
learning_rate = 0.001
input_size = len(X_train[0])  # length of one feature vector
hidden_size = 24
output_size = len(tags)       # one output class per distinct tag
print(input_size, output_size)

113 20


In [10]:
# Wrap the arrays in the project dataset and serve them in shuffled mini-batches,
# loading in the main process (num_workers=0).
dataset = ChatDataset(X_train, y_train)
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=0)

In [11]:
# Train on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [12]:
# Instantiate the classifier with the configured layer sizes and move it to the training device.
model = NeuralNet(input_size, hidden_size, output_size)
model = model.to(device)

In [13]:
# Adam updates the model parameters; cross-entropy scores the outputs against class indices.
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

In [14]:
for epoch in range(num_epochs):
    for batch_inputs, batch_targets in train_loader:
        # Move the mini-batch to the training device; labels are cast to long
        # because the loss takes integer class indices (not one-hot vectors).
        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device=device, dtype=torch.long)

        # Clear old gradients, run the forward pass, then back-propagate and update.
        optimizer.zero_grad()
        loss = criterion(model(batch_inputs), batch_targets)
        loss.backward()
        optimizer.step()

    # Every 100th epoch, report the loss of that epoch's last mini-batch.
    if (epoch + 1) % 100 != 0:
        continue
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [100/1000], Loss: 0.9006
Epoch [200/1000], Loss: 0.0047
Epoch [300/1000], Loss: 0.0117
Epoch [400/1000], Loss: 0.0004
Epoch [500/1000], Loss: 0.0003
Epoch [600/1000], Loss: 0.0002
Epoch [700/1000], Loss: 0.0004
Epoch [800/1000], Loss: 0.2061
Epoch [900/1000], Loss: 0.0001
Epoch [1000/1000], Loss: 0.0002


In [15]:
# Report the loss of the very last mini-batch processed during training.
final_loss_line = f'final loss: {loss.item():.4f}'
print(final_loss_line)

final loss: 0.0002


In [16]:
# Bundle the trained weights with the layer sizes, vocabulary and labels
# into a single checkpoint dictionary.
data = {
    "model_state": model.state_dict(),
    "input_size": input_size,
    "hidden_size": hidden_size,
    "output_size": output_size,
    "all_words": all_words,
    "tags": tags,
}

FILE = "./models/product.pth"
# torch.save does not create missing parent folders; make sure the target
# directory exists so a finished training run is not lost to a missing folder.
os.makedirs(os.path.dirname(FILE), exist_ok=True)
torch.save(data, FILE)

print(f'training complete. file saved to {FILE}')

training complete. file saved to ./models/product.pth
