In [1]:
!pip install wget
!pip install transformers
!pip install ray

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [2]:
import ast
import gzip
import html
import json
import logging
import os
import sys
import tarfile
from functools import partial

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import wget
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler
from transformers import BertModel, AdamW, BertTokenizer

In [3]:
class ProductRanker(nn.Module):
    """Frozen ``bert-base-uncased`` encoder followed by a small trainable MLP.

    The MLP maps a 768-dimensional vector to a single score through two
    hidden layers of width ``l1`` and ``l2``.

    Args:
        l1: Width of the first hidden layer of the scoring head.
        l2: Width of the second hidden layer of the scoring head.
    """

    def __init__(self, l1=32, l2=256):
        super().__init__()

        self.bert = BertModel.from_pretrained('bert-base-uncased')

        # Scoring head: 768 -> l1 -> l2 -> 1 with ReLU between the layers.
        head_layers = [
            nn.Linear(768, l1),
            nn.ReLU(),
            nn.Linear(l1, l2),
            nn.ReLU(),
            nn.Linear(l2, 1),
        ]
        self.ranker = nn.Sequential(*head_layers)

        # The encoder is used as a fixed feature extractor: no gradients for it.
        for encoder_weight in self.bert.parameters():
            encoder_weight.requires_grad_(False)

    def forward(self, input_ids, attention_mask):
        """Return one score per sequence in the batch.

        Args:
            input_ids: Token ids passed to the encoder as ``input_ids``.
            attention_mask: Mask passed to the encoder as ``attention_mask``.

        Returns:
            The output of the scoring head applied to the encoder's first
            output at sequence position 0 (presumably the [CLS] token).
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        first_position = encoded[0][:, 0, :]
        return self.ranker(first_position)

In [4]:
# Constants and shared objects used by the training / validation cells below.
config = {
    'l1': 256,
    'l2': 8,
    'lr': 0.05557941010651975,
    'batch_size': 4,
}
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = ProductRanker(config["l1"], config["l2"])
model = model.to(device)
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=config["lr"],
    betas=(0.9, 0.999),
    eps=1e-08,
    weight_decay=0,
    amsgrad=False,
)

# Working directories: raw/filtered data and training artefacts (log, checkpoints).
if not os.path.exists("./data"):
    os.makedirs("./data")
if not os.path.exists("./outputs"):
    os.makedirs("./outputs")

# Log to a file (overwritten on each run) and mirror every record to stdout.
logging.basicConfig(
    filename='./outputs/training.log',
    filemode='w',
    format='%(asctime)s %(message)s',
    level=logging.DEBUG,
    force=True,
)
logger = logging.getLogger()
logger.addHandler(logging.StreamHandler(sys.stdout))

  return torch._C._cuda_getDeviceCount() > 0
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
def downloadData(url, path):
    """Download ``url`` with ``wget`` and store the result at ``path``.

    Returns:
        Whatever ``wget.download`` returns for this download.
    """
    saved_to = wget.download(url, out=path)
    return saved_to

In [6]:
def downloadData(url, path):
    """Download ``url`` with ``wget`` and store the result at ``path``.

    NOTE(review): this cell repeats the ``downloadData`` definition of the
    previous cell; this later definition is the one that stays bound.

    Returns:
        Whatever ``wget.download`` returns for this download.
    """
    download_result = wget.download(url, out=path)
    return download_result

In [7]:
def loadJson(filename):
    """Load a gzip-compressed file holding one JSON-like record per line.

    Each non-empty line is parsed on its own, trying in order:

    1. strict JSON (``json.loads``);
    2. a Python literal (``ast.literal_eval``), which handles single-quoted
       dict reprs, including values that contain apostrophes;
    3. the legacy fallback of replacing every single quote with a double
       quote and parsing the result as JSON.

    The first parser that succeeds wins. Lines that no parser accepts are
    skipped silently, so one malformed record never aborts the load.

    Args:
        filename: Path to the ``.gz`` file.

    Returns:
        list: The parsed records, in file order.
    """
    def _legacy_parse(text):
        # Previous behaviour: blanket quote replacement, then strict JSON.
        return json.loads(text.replace("'", '"'))

    parsers = (json.loads, ast.literal_eval, _legacy_parse)
    # json.JSONDecodeError is a ValueError; the remaining types are what
    # ast.literal_eval may raise on malformed input.
    parse_errors = (ValueError, SyntaxError, TypeError, MemoryError, RecursionError)

    data = []
    with gzip.open(filename, "rt", encoding="utf-8") as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                continue
            for parse in parsers:
                try:
                    record = parse(stripped)
                except parse_errors:
                    continue
                data.append(record)
                break
    return data

In [8]:
def filterJson(data, extrema_dict):
    """Keep usable entries and min-max normalise their sales rank.

    For every entry accepted by ``isGoodJsonEntry`` the first key of its
    ``salesRank`` dict selects the category; the rank is rescaled with that
    category's ``[min, max]`` pair from ``extrema_dict`` and the description
    is HTML-unescaped.

    Args:
        data: Iterable of raw metadata dicts.
        extrema_dict: Mapping ``category -> [min_rank, max_rank]``.

    Returns:
        list[dict]: Dicts with keys ``'description'`` and ``'salesRank'``.
        When a category's minimum equals its maximum (for example a category
        with a single product) the normalised rank is ``0.0`` rather than a
        division by zero.

    Raises:
        KeyError: If an entry's category is missing from ``extrema_dict``.
    """
    filtered = []
    for entry in data:
        if not isGoodJsonEntry(entry):
            continue

        sales_rank_key = next(iter(entry['salesRank'].keys()))
        sales_rank_value = entry['salesRank'][sales_rank_key]
        lowest = extrema_dict[sales_rank_key][0]
        highest = extrema_dict[sales_rank_key][1]

        # A degenerate range carries no ranking information; map it to 0.0.
        span = highest - lowest
        normalized = (sales_rank_value - lowest) / span if span != 0 else 0.0

        filtered.append({
            'description': html.unescape(entry['description']),
            'salesRank': normalized,
        })

    return filtered

In [9]:
def getExtremaDict(data):
    """Collect the smallest and largest sales rank seen per category.

    Only entries accepted by ``isGoodJsonEntry`` are considered; the category
    of an entry is the first key of its ``salesRank`` dict.

    Args:
        data: Iterable of raw metadata dicts.

    Returns:
        dict: Mapping ``category -> [min_rank, max_rank]``.
    """
    extrema = {}
    for entry in data:
        if isGoodJsonEntry(entry):
            category = next(iter(entry['salesRank'].keys()))
            rank = entry['salesRank'][category]
            # First sighting of a category seeds both bounds with its rank.
            bounds = extrema.setdefault(category, [rank, rank])
            bounds[0] = min(bounds[0], rank)
            bounds[1] = max(bounds[1], rank)
    return extrema

In [10]:
def isGoodJsonEntry(object):
    """Tell whether an entry has non-empty ``description`` and ``salesRank``.

    Args:
        object: A metadata dict.

    Returns:
        bool: ``True`` only if both keys are present and neither value has
        length zero.
    """
    for required_field in ('description', 'salesRank'):
        if required_field not in object.keys():
            return False
        if len(object[required_field]) == 0:
            return False
    return True

In [11]:
def safeJson(filtered, filename):
    """Write ``filtered`` to ``filename`` as JSON Lines (one object per line).

    The file is opened in text mode, so lines are terminated with ``"\\n"``
    and the platform newline translation is left to the file object. Writing
    ``os.linesep`` here would produce ``\\r\\r\\n`` on Windows.

    Args:
        filtered: Iterable of JSON-serialisable objects.
        filename: Destination path; an existing file is overwritten.
    """
    with open(filename, 'w') as f:
        for record in filtered:
            json.dump(record, f)
            f.write("\n")
    

In [12]:
def getJSON(data_dir="./data"):
    """Make ``filtered_meta_Electronics.json`` available inside ``data_dir``.

    If ``filtered_meta_Electronics.tar.gz`` already exists in ``data_dir`` it
    is extracted there and nothing else happens. Otherwise the raw
    ``meta_Electronics.json.gz`` dump is downloaded (unless already present),
    parsed, filtered/normalised and written out as JSON Lines.

    Args:
        data_dir: Directory holding the raw, archived and filtered files.
    """
    filename = os.path.join(data_dir, "meta_Electronics.json.gz")
    filename_filtered = os.path.join(data_dir, "filtered_meta_Electronics.tar.gz")
    filename_destination = os.path.join(data_dir, "filtered_meta_Electronics.json")

    if os.path.exists(filename_filtered):
        # Context manager closes the archive even if extraction fails.
        with tarfile.open(filename_filtered, "r:gz") as archive:
            archive.extractall(data_dir)
        return

    if not os.path.exists(filename):
        # The download target directory must exist before wget writes to it.
        os.makedirs(data_dir, exist_ok=True)
        url = "http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/meta_Electronics.json.gz"
        downloadData(url, filename)

    data = loadJson(filename)
    extrema_dict = getExtremaDict(data)
    filtered = filterJson(data, extrema_dict)
    safeJson(filtered, filename_destination)

Train

In [13]:
def dataPreparation(data_dir="./data", n_train=100, n_val=20, seed=None):
    """Sample, tokenize and split the filtered product descriptions.

    Reads ``filtered_meta_Electronics.json`` (JSON Lines) from ``data_dir``,
    draws ``n_train + n_val`` random rows, tokenizes each description with the
    ``bert-base-uncased`` tokenizer (truncated and padded to 512 tokens) and
    randomly splits the result.

    Args:
        data_dir: Directory containing the filtered JSON Lines file.
        n_train: Number of samples in the training split.
        n_val: Number of samples in the validation split.
        seed: Optional integer making both the row sampling and the split
            reproducible. ``None`` keeps the unseeded behaviour.

    Returns:
        The two subsets ``(train_set, val_set)`` produced by
        ``torch.utils.data.random_split``; every item is
        ``[tokenizer_output, salesRank]``.
    """
    path = os.path.join(data_dir, "filtered_meta_Electronics.json")
    print(path)
    df = pd.read_json(path, lines=True).sample(n=n_train + n_val, random_state=seed)
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    tokenized_set = []
    for _, row in df.iterrows():
        # padding="max_length" replaces the deprecated pad_to_max_length=True.
        encoded = tokenizer.encode_plus(row['description'], truncation=True, return_tensors="pt",
                                        max_length=512, padding="max_length")
        tokenized_set.append([encoded, row['salesRank']])

    split_sizes = [n_train, n_val]
    if seed is None:
        return torch.utils.data.random_split(tokenized_set, split_sizes)
    split_generator = torch.Generator().manual_seed(seed)
    return torch.utils.data.random_split(tokenized_set, split_sizes, generator=split_generator)

In [14]:
def createDataloader(dataset, config, shuffle=True):
    """Build a DataLoader over all ordered pairs of distinct dataset items.

    Every item is expected to be indexable as ``(features, rank)``. For each
    ordered pair ``(a, b)`` with ``a`` and ``b`` at different positions the
    label is ``1.0`` if ``a``'s rank is greater, ``0.5`` if the ranks are
    equal and ``0.0`` otherwise.

    Args:
        dataset: Re-iterable collection of ``(features, rank)`` items.
        config: Mapping providing ``"batch_size"``.
        shuffle: Forwarded to the DataLoader.

    Returns:
        torch.utils.data.DataLoader yielding ``[[features_a, features_b], label]``.
    """
    def _pair_label(rank_a, rank_b):
        if rank_a == rank_b:
            return 0.5
        return 1.0 if rank_a > rank_b else 0.0

    pairs = [
        [[first[0], second[0]], _pair_label(first[1], second[1])]
        for pos_a, first in enumerate(dataset)
        for pos_b, second in enumerate(dataset)
        if pos_a != pos_b
    ]
    batch_size = int(config["batch_size"])
    return torch.utils.data.DataLoader(pairs, batch_size=batch_size, num_workers=2, shuffle=shuffle)

In [15]:
def train(model, loss_fn, optimizer, dataloader, epoch, saving=False, tune=False):
    """Run one epoch of pairwise ranking training.

    For each batch both documents of a pair are scored by ``model``; the
    sigmoid of the score difference is compared with the pair label using
    ``loss_fn``. The mean batch loss is logged through the module-level
    ``logger``; tensors are moved to the module-level ``device``.

    Args:
        model: Callable module taking ``(input_ids, attention_mask)``.
        loss_fn: Loss applied to ``sigmoid(score_1 - score_2)`` and the labels.
        optimizer: Optimizer updating ``model``'s parameters.
        dataloader: Sized iterable of ``(X, y)`` batches where ``X[0]`` and
            ``X[1]`` are mappings with ``'input_ids'`` and ``'attention_mask'``.
        epoch: Zero-based epoch index (used for logging and file names).
        saving: If true, write a checkpoint to ``./outputs/model_<epoch+1>.pth``.
        tune: If true, also write a Ray Tune checkpoint for this epoch.
    """
    train_loss = 0.0
    loss = None  # last batch loss; stays None when the dataloader is empty

    for X, y in dataloader:
        # Inputs are assumed to be (batch, 1, seq_len), as produced by
        # dataPreparation plus default collation. squeeze(1) removes only that
        # singleton axis; a bare squeeze() would also drop the batch axis
        # whenever a batch holds a single pair.
        input_ids_1 = X[0]['input_ids'].squeeze(1).to(device)
        attention_mask_1 = X[0]['attention_mask'].squeeze(1).to(device)
        input_ids_2 = X[1]['input_ids'].squeeze(1).to(device)
        attention_mask_2 = X[1]['attention_mask'].squeeze(1).to(device)

        optimizer.zero_grad()
        out1 = model(input_ids_1, attention_mask_1)
        out2 = model(input_ids_2, attention_mask_2)
        # (batch, 1) -> (batch,), keeping the batch axis for size-1 batches.
        diff = torch.sigmoid((out1 - out2).squeeze(-1))
        loss = loss_fn(diff, y.float().to(device))

        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    num_batches = len(dataloader)
    mean_loss = train_loss / num_batches if num_batches else float('nan')
    logger.info("[%d] train loss: %.10f" % (epoch + 1, mean_loss))

    if saving:
        path = './outputs/model_' + str(epoch+1) + '.pth'
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': loss,
        }, path)
        print("Model saved..")

    if tune:
        # The boolean ``tune`` parameter shadows the module-level ``ray.tune``
        # import inside this function, so the module is imported under
        # another name here.
        from ray import tune as ray_tune
        with ray_tune.checkpoint_dir(epoch) as checkpoint_dir:
            path = os.path.join(checkpoint_dir, "checkpoint")
            torch.save((model.state_dict(), optimizer.state_dict()), path)

    print("Training finished!")


In [16]:
def val(model, loss_fn, dataloader, epoch, tune=False):
    """Evaluate the pairwise ranking loss without updating the model.

    The returned, logged and (optionally) reported value is the mean of the
    per-batch losses, so it is comparable across batch sizes and with the
    training loss logged by ``train``. Tensors are moved to the module-level
    ``device``; the result is logged through the module-level ``logger``.

    Args:
        model: Callable module taking ``(input_ids, attention_mask)``.
        loss_fn: Loss applied to ``sigmoid(score_1 - score_2)`` and the labels.
        dataloader: Iterable of ``(X, y)`` batches where ``X[0]`` and ``X[1]``
            are mappings with ``'input_ids'`` and ``'attention_mask'``.
        epoch: Zero-based epoch index (used for logging only).
        tune: If true, report the loss to Ray Tune.

    Returns:
        float: Mean validation loss over the batches (``nan`` if there are none).
    """
    val_loss = 0.0
    val_steps = 0
    with torch.no_grad():
        for X, y in dataloader:
            # Inputs are assumed to be (batch, 1, seq_len); squeeze(1) keeps
            # the batch axis even when a batch contains a single pair.
            input_ids_1 = X[0]['input_ids'].squeeze(1).to(device)
            attention_mask_1 = X[0]['attention_mask'].squeeze(1).to(device)
            input_ids_2 = X[1]['input_ids'].squeeze(1).to(device)
            attention_mask_2 = X[1]['attention_mask'].squeeze(1).to(device)
            out1 = model(input_ids_1, attention_mask_1)
            out2 = model(input_ids_2, attention_mask_2)
            diff = torch.sigmoid((out1 - out2).squeeze(-1))
            loss = loss_fn(diff, y.float().to(device))
            val_loss += loss.item()
            val_steps += 1

    mean_loss = val_loss / val_steps if val_steps else float('nan')

    if tune:
        # The boolean ``tune`` parameter shadows the module-level ``ray.tune``
        # import inside this function, so the module is imported under
        # another name here.
        from ray import tune as ray_tune
        ray_tune.report(loss=mean_loss)

    logger.info("[%d] val loss: %.10f" % (epoch + 1, mean_loss))
    return mean_loss

In [17]:
def train_tune(config, data_dir="./data"):
    """Trainable for Ray Tune: one training epoch followed by one validation pass.

    Builds a fresh ``ProductRanker``, loss and Adam optimizer from ``config``,
    prepares the data found in ``data_dir`` and runs ``train`` and ``val``
    for epoch 0 with their Ray Tune hooks enabled.

    Args:
        config: Mapping with the keys ``"l1"``, ``"l2"``, ``"lr"`` and
            ``"batch_size"``.
        data_dir: Directory containing the filtered data file.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = ProductRanker(config["l1"],config["l2"]).to(device)
    loss_fn = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=config["lr"], betas=(0.9, 0.999), eps=1e-08, weight_decay=0,
                                amsgrad=False)

    train_set, val_set = dataPreparation(data_dir)
    dataloader_train = createDataloader(train_set, config)
    dataloader_val = createDataloader(val_set, config)

    # train() expects (model, loss_fn, optimizer, dataloader, epoch); leaving
    # out loss_fn shifts every positional argument and raises a TypeError.
    train(model, loss_fn, optimizer, dataloader_train, 0, tune=True)
    val(model, loss_fn, dataloader_val, 0, tune=True)


Main

In [18]:
def tune_hyperparameters():
    """Search layer widths, learning rate and batch size with Ray Tune.

    Samples 10 configurations (``l1`` and ``l2`` as powers of two from 4 to
    256, a log-uniform learning rate in [1e-4, 1e-1) and a batch size from
    {2, 4, 8, 16}), schedules them with ASHA on the reported ``loss`` and
    prints the configuration and final loss of the best trial.
    """
    search_space = {
        "l1": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)),
        "l2": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)),
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([2, 4, 8, 16]),
    }
    asha = ASHAScheduler(metric="loss", mode="min", max_t=4, grace_period=1, reduction_factor=2)
    cli_reporter = CLIReporter(metric_columns=["loss", "training_iteration"])

    # Absolute path: the trainable receives it as its data_dir argument.
    trainable = partial(train_tune, data_dir=os.path.abspath("./data"))
    analysis = tune.run(
        trainable,
        resources_per_trial={"cpu": 12, "gpu": 1},
        config=search_space,
        num_samples=10,
        scheduler=asha,
        progress_reporter=cli_reporter,
    )

    winner = analysis.get_best_trial("loss", "min", "last")
    print(f"Best trial config: {winner.config}")
    print(f"Best trial final validation loss: {winner.last_result['loss']}")

In [19]:
def tune_lr():
    """Search only the learning rate with Ray Tune, other settings fixed.

    Uses ``l1=256``, ``l2=8`` and ``batch_size=4``, samples 5 log-uniform
    learning rates in [1e-4, 1e-1), schedules the trials with ASHA on the
    reported ``loss`` (results stored under ``./ray_results``) and prints the
    configuration and final loss of the best trial.
    """
    search_space = {
        'l1': 256,
        'l2': 8,
        'lr': tune.loguniform(1e-4, 1e-1),
        'batch_size': 4,
    }
    asha = ASHAScheduler(metric="loss", mode="min", max_t=5, grace_period=1, reduction_factor=2)
    cli_reporter = CLIReporter(metric_columns=["loss", "training_iteration"])

    # Absolute path: the trainable receives it as its data_dir argument.
    trainable = partial(train_tune, data_dir=os.path.abspath("./data"))
    analysis = tune.run(
        trainable,
        resources_per_trial={"cpu": 12, "gpu": 1},
        local_dir="./ray_results",
        config=search_space,
        num_samples=5,
        scheduler=asha,
        progress_reporter=cli_reporter,
    )

    winner = analysis.get_best_trial("loss", "min", "last")
    print(f"Best trial config: {winner.config}")
    print(f"Best trial final validation loss: {winner.last_result['loss']}")
    # Looked up but not used any further by this function.
    best_checkpoint = analysis.get_best_checkpoint(winner, metric="loss", mode="min")

In [20]:
def train_model(epochs, config, from_checkpoint=False, path=None):
    """Train a fresh ``ProductRanker`` built from ``config`` for ``epochs`` epochs.

    The previous body forwarded ``(epochs, config, data_dir, from_checkpoint,
    path)`` straight to ``train``, which expects ``(model, loss_fn, optimizer,
    dataloader, epoch)`` and therefore could not run. This version builds the
    model, loss and optimizer the same way ``train_tune`` does, optionally
    restores a checkpoint and then calls ``train`` once per epoch.

    Args:
        epochs: Number of training epochs.
        config: Mapping with the keys ``"l1"``, ``"l2"``, ``"lr"`` and
            ``"batch_size"``.
        from_checkpoint: If true, restore model and optimizer state from ``path``.
        path: Checkpoint file with ``'model_state_dict'`` and
            ``'optimizer_state_dict'`` entries (the format ``train`` writes
            when ``saving=True``).

    Returns:
        The trained model.

    Raises:
        ValueError: If ``from_checkpoint`` is true but ``path`` is ``None``.
    """
    if from_checkpoint and path is None:
        raise ValueError("path must be given when from_checkpoint is True")

    data_dir = os.path.abspath("./data")
    run_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    run_model = ProductRanker(config["l1"], config["l2"]).to(run_device)
    run_loss_fn = nn.BCELoss()
    run_optimizer = torch.optim.Adam(run_model.parameters(), lr=config["lr"], betas=(0.9, 0.999), eps=1e-08,
                                     weight_decay=0, amsgrad=False)

    if from_checkpoint:
        # map_location lets a checkpoint saved on GPU be restored on CPU.
        checkpoint = torch.load(path, map_location=run_device)
        run_model.load_state_dict(checkpoint['model_state_dict'])
        run_optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    train_set, _ = dataPreparation(data_dir)
    dataloader_train = createDataloader(train_set, config)

    for epoch in range(epochs):
        train(run_model, run_loss_fn, run_optimizer, dataloader_train, epoch)

    return run_model

In [21]:
def main(epochs, train_bool=True, validate_bool=True, data_dir="./data", saving=False, load=False):
    """Prepare the data and run the train / validate loop on the global model.

    Uses the module-level ``model``, ``loss_fn``, ``optimizer``, ``config``,
    ``device`` and ``logger``.

    Args:
        epochs: Number of epochs to run.
        train_bool: Run the training step in every epoch.
        validate_bool: Run the validation step in every epoch.
        data_dir: Directory used for downloading and reading the data.
        saving: Forwarded to ``train``; if true a checkpoint is written after
            every epoch.
        load: If true, before each epoch the checkpoint
            ``./outputs/model_<epoch+1>.pth`` is loaded into the model and
            optimizer when it exists; otherwise a message is logged.
    """
    # Honour data_dir: it was previously accepted but never passed on.
    getJSON(data_dir)
    train_set, val_set = dataPreparation(data_dir)
    dataloader_train = createDataloader(train_set, config)
    dataloader_val = createDataloader(val_set, config)

    for epoch in range(epochs):
        if load:
            current_model_filename = './outputs/model_' + str(epoch+1) + '.pth'
            if os.path.exists(current_model_filename):
                # map_location lets a checkpoint saved on GPU be restored on CPU.
                checkpoint = torch.load(current_model_filename, map_location=device)
                model.load_state_dict(checkpoint['model_state_dict'])
                optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
            else:
                logger.info(f"model: {current_model_filename} doesnt exist")

        logger.info(f"Epoch {epoch + 1}\n-------------------------------")
        if train_bool:
            logger.info("Starting training loop...")
            train(model, loss_fn, optimizer, dataloader_train, epoch, saving=saving, tune=False)

        if validate_bool:
            logger.info("Starting validation loop...")
            val(model, loss_fn, dataloader_val, epoch, tune=False)

In [22]:
# Run 5 epochs, restoring a per-epoch checkpoint when one exists and saving
# a new checkpoint after each training epoch.
main(epochs=5, saving=True, load=True)

here
100% [..................................................] 186594679 / 186594679./data/filtered_meta_Electronics.json
Starting new HTTPS connection (1): huggingface.co:443
https://huggingface.co:443 "HEAD /bert-base-uncased/resolve/main/vocab.txt HTTP/1.1" 200 0
model: ./outputs/model_1.pth doesnt exist
Epoch 1
-------------------------------
Starting training loop...




KeyboardInterrupt: 