In [None]:
!pip install wget
!pip install transformers
!pip install ray
!pip install scikit-learn

In [None]:
import ast
import csv
import gzip
import html
import json
import logging
import os
import sys
import tarfile
from functools import partial
from random import sample

import gensim
import gensim.downloader as api
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import wget
from gensim import utils
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler
from sklearn.metrics import mean_squared_error
from transformers import BertModel, AdamW, BertTokenizer

In [None]:
class ProductRanker(nn.Module):
    """Scores a tokenized text with a frozen BERT encoder and a small MLP head.

    Args:
        l1: Width of the first hidden layer of the ranking head.
        l2: Width of the second hidden layer of the ranking head.
    """

    def __init__(self, l1=128, l2=32):
        super().__init__()

        # Pretrained encoder; every encoder weight is frozen so that only the
        # ranking head below receives gradient updates.
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        for encoder_param in self.bert.parameters():
            encoder_param.requires_grad = False

        # Head: two (Linear -> BatchNorm -> ReLU -> Dropout) stages followed by
        # a single-unit output layer. The layer order is part of the
        # state_dict layout and must not change.
        head_layers = [
            nn.Linear(768, l1),
            nn.BatchNorm1d(l1, affine=False),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(l1, l2),
            nn.BatchNorm1d(l2, affine=False),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(l2, 1),
        ]
        self.ranker = nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask):
        """Return the head's raw output for each input sequence.

        Args:
            input_ids: Token ids passed to BERT as ``input_ids``.
            attention_mask: Mask passed to BERT as ``attention_mask``.

        Returns:
            The output of the ranking head applied to the position-0 slice of
            the encoder's first output (the [CLS] slot).
        """
        encoder_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_embedding = encoder_outputs[0][:, 0, :]
        return self.ranker(cls_embedding)

In [None]:
# Constants: compute device, default hyperparameters and the shared
# model / loss / optimizer objects used by the cells below.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
config = {'l1': 128, 'l2': 32, 'lr': 0.0005, 'batch_size': 100}
model = ProductRanker(config["l1"], config["l2"]).to(device)
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=config["lr"],
    betas=(0.9, 0.999),
    eps=1e-08,
    weight_decay=0,
    amsgrad=False,
)
# Cosine similarity along dim 0, i.e. between two 1-D vectors.
cos = torch.nn.CosineSimilarity(dim=0)

# Working directories must exist before the log file is opened below.
for required_dir in ("./data", "./outputs"):
    if not os.path.exists(required_dir):
        os.makedirs(required_dir)

# Root logger writes to ./outputs/training.log and is mirrored to stdout.
logging.basicConfig(filename='./outputs/training.log', filemode='w', format='%(asctime)s %(message)s', level=logging.DEBUG, force=True)
logger = logging.getLogger()
logger.addHandler(logging.StreamHandler(sys.stdout))

In [None]:
def downloadData(url, path):
    """Download ``url`` to ``path`` via ``wget.download`` and return its result."""
    downloaded = wget.download(url, out=path)
    return downloaded

In [None]:
def _parseMetadataLine(text):
    """Parse one metadata line, trying the least lossy strategy first.

    Order: strict JSON, then a Python literal (``ast.literal_eval``), then the
    legacy fallback of swapping single quotes for double quotes before JSON
    parsing. Raises ``json.JSONDecodeError`` when every strategy fails.
    """
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass
    try:
        # literal_eval only evaluates literals, so it is safe on file content.
        return ast.literal_eval(text)
    except (ValueError, SyntaxError, MemoryError, RecursionError):
        pass
    # Last resort: handles lines that mix single quotes with JSON keywords
    # such as true/false/null.
    return json.loads(text.replace("'", '"'))


def loadJson(filename):
    """Read a gzip-compressed file holding one record per line.

    The previous implementation replaced every single quote with a double
    quote before parsing, which corrupted or silently dropped any record whose
    text contained an apostrophe or an embedded double quote. Lines are now
    parsed by ``_parseMetadataLine``.

    Args:
        filename: Path to the ``.gz`` file, read in text mode.

    Returns:
        A list with one parsed object per parseable line; blank and
        unparseable lines are skipped.
    """
    data = []
    with gzip.open(filename, "rt") as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                continue
            try:
                data.append(_parseMetadataLine(stripped))
            except json.JSONDecodeError:
                # Best effort, as before: skip lines that cannot be parsed.
                continue
    return data

In [None]:
def filterJson(data, extrema_dict):
    """Keep usable entries and min-max normalize their sales rank.

    Args:
        data: Iterable of parsed metadata entries.
        extrema_dict: Maps a sales-rank key to ``[lowest, highest]`` as built
            by ``getExtremaDict``.

    Returns:
        A list of ``{'salesRank': float, 'description': str}`` dicts, one per
        entry accepted by ``isGoodJsonEntry``. The rank is scaled against the
        extrema of the entry's first sales-rank key (0.5 when both extrema are
        equal) and the description is HTML-unescaped.
    """
    filtered = []
    for entry in data:
        if isGoodJsonEntry(entry):
            # Only the first sales-rank key of an entry is considered.
            category = next(iter(entry['salesRank'].keys()))
            rank = entry['salesRank'][category]
            lowest = extrema_dict[category][0]
            highest = extrema_dict[category][1]

            if lowest == highest:
                normalized_rank = 0.5
            else:
                normalized_rank = (rank - lowest) / (highest - lowest)

            filtered.append({
                'salesRank': normalized_rank,
                'description': html.unescape(entry['description']),
            })

    return filtered

In [None]:
def getExtremaDict(data):
    """Collect the lowest and highest sales rank seen for each sales-rank key.

    Entries rejected by ``isGoodJsonEntry`` are ignored, and only the first
    sales-rank key of each entry is used.

    Returns:
        A dict mapping each key to a two-item list ``[lowest, highest]``.
    """
    extrema = {}
    for entry in data:
        if not isGoodJsonEntry(entry):
            continue

        category = next(iter(entry['salesRank'].keys()))
        rank = entry['salesRank'][category]

        bounds = extrema.get(category)
        if bounds is None:
            # First sighting: the value is both the minimum and the maximum.
            extrema[category] = [rank, rank]
        elif bounds[0] > rank:
            bounds[0] = rank
        elif bounds[1] < rank:
            bounds[1] = rank

    return extrema

In [None]:
def isGoodJsonEntry(entry):
    """Return True when ``entry`` has non-empty 'description' and 'salesRank' values."""
    for required_field in ('description', 'salesRank'):
        if required_field not in entry.keys() or len(entry[required_field]) == 0:
            return False
    return True

In [None]:
def safeJson(filtered, filename):
    """Write ``filtered`` to ``filename`` as JSON Lines (one object per line).

    Args:
        filtered: Iterable of JSON-serializable objects.
        filename: Destination path; an existing file is overwritten.
    """
    with open(filename, 'w') as f:
        for record in filtered:
            json.dump(record, f)
            # The file is open in text mode, which already translates "\n" to
            # the platform line ending; writing os.linesep here would produce
            # "\r\r\n" on Windows.
            f.write("\n")

In [None]:
def getJSON(data_dir="./data", test_size=5625):
    """Build the filtered train/test JSON Lines files if they do not exist yet.

    Downloads ``meta_Electronics.json.gz`` into ``data_dir`` when missing,
    parses it, normalizes the sales ranks, randomly splits the result and
    writes ``filtered_meta_Electronics_train.json`` and
    ``filtered_meta_Electronics_test.json``. Returns immediately when both
    output files are already present.

    Args:
        data_dir: Directory holding the raw download and the generated files.
        test_size: Number of entries placed in the test split; the rest go to
            the train split.

    Raises:
        ValueError: If fewer than ``test_size`` usable entries are available.
    """
    filename_meta = os.path.join(data_dir, "meta_Electronics.json.gz")
    filename_destination_meta_train = os.path.join(data_dir, "filtered_meta_Electronics_train.json")
    filename_destination_meta_test = os.path.join(data_dir, "filtered_meta_Electronics_test.json")

    url_meta = "http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/meta_Electronics.json.gz"

    if os.path.exists(filename_destination_meta_train) and os.path.exists(filename_destination_meta_test):
        return

    # A custom data_dir may not exist yet; the download and the writes need it.
    os.makedirs(data_dir, exist_ok=True)

    if not os.path.exists(filename_meta):
        downloadData(url_meta, filename_meta)

    data = loadJson(filename_meta)
    extrema_dict = getExtremaDict(data)
    filtered = filterJson(data, extrema_dict)

    if len(filtered) < test_size:
        raise ValueError(
            "Only %d usable entries available, cannot hold out %d for testing"
            % (len(filtered), test_size)
        )

    train_set, test_set = torch.utils.data.random_split(
        filtered, [len(filtered) - test_size, test_size]
    )
    safeJson(train_set, filename_destination_meta_train)
    safeJson(test_set, filename_destination_meta_test)


getJSON()

In [None]:
def loadGoogleWord2Vec():
    """Load the "word2vec-google-news-300" model through gensim's downloader API."""
    pretrained_name = "word2vec-google-news-300"
    return api.load(pretrained_name)

In [None]:
def averagePollingDocument(model_word2vec, data_dir="./data"):
    """Average-pool word vectors for every description in the test split.

    Reads ``filtered_meta_Electronics_test.json`` from ``data_dir``. Each
    description is tokenized with ``utils.simple_preprocess``; the vectors of
    the tokens known to ``model_word2vec`` are summed and divided by the total
    token count (unknown tokens included, as before).

    Args:
        model_word2vec: Mapping-like object returning a vector for a word and
            raising ``KeyError`` for unknown words.
        data_dir: Directory containing the test split file.

    Returns:
        ``(documents, words)`` where ``documents`` maps each DataFrame row
        index to its pooled vector and ``words`` lists every token that was
        found in the model.
    """
    path = os.path.join(data_dir, "filtered_meta_Electronics_test.json")
    df = pd.read_json(path, lines=True)
    word_set = set()

    # Vector dimensionality is probed with a word assumed to be in the model.
    len_vec = len(model_word2vec["hello"])

    documents = dict()
    for index, row in df.iterrows():
        tokens = utils.simple_preprocess(row['description'])
        pooled = torch.zeros(len_vec)
        for word in tokens:
            try:
                word_vector = model_word2vec[word]
            except KeyError:
                continue
            pooled = pooled.add(torch.tensor(word_vector))
            word_set.add(word)
        # A description without tokens previously yielded 0/0 = NaN, which
        # breaks the similarity sort downstream; keep it as a zero vector.
        documents[index] = pooled.div(max(len(tokens), 1))

    return documents, list(word_set)

In [None]:
def averagePollingQuery(query, model_word2vec):
    """Average-pool the word vectors of a free-text query.

    The query is tokenized with ``utils.simple_preprocess``; vectors of tokens
    known to ``model_word2vec`` are summed and divided by the total token
    count (unknown tokens included).

    Args:
        query: Query string.
        model_word2vec: Mapping-like object returning a vector for a word and
            raising ``KeyError`` for unknown words.

    Returns:
        A 1-D tensor with the pooled query vector; all zeros when the query
        has no tokens.
    """
    tokens = utils.simple_preprocess(query)
    # Vector dimensionality is probed with a word assumed to be in the model.
    len_vec = len(model_word2vec["hello"])

    avg_query = torch.zeros(len_vec)
    for word in tokens:
        try:
            word_vector = model_word2vec[word]
        except KeyError:
            continue
        # Plain accumulation. The former `avg_query += avg_query.add(...)`
        # doubled the running sum on every step, weighting early words
        # exponentially more than later ones.
        avg_query = avg_query.add(torch.tensor(word_vector))

    # Guard against 0/0 = NaN for a query with no tokens.
    return avg_query / max(len(tokens), 1)

In [None]:
def createRandomQuery(word_set, x=5):
    """Return ``x`` distinct words drawn at random from ``word_set``, space-separated."""
    chosen_words = sample(word_set, x)
    return ' '.join(chosen_words)

In [None]:
def getMostSimilair(avg_query, documents, x=100):
    """Return the keys of the ``x`` documents most similar to ``avg_query``.

    Similarity is computed with the module-level ``cos``; keys are returned in
    order of decreasing similarity.
    """
    scored = []
    for doc_index, doc_vector in documents.items():
        scored.append((cos(avg_query, doc_vector), doc_index))

    # Ascending sort on the negated score puts the best match first.
    scored.sort(key=lambda pair: -pair[0].item())

    return [doc_index for _, doc_index in scored[:x]]

In [None]:
def dataPreparation(data_dir="./data", train_size=150, val_size=75):
    """Sample, tokenize and split descriptions from the train file.

    Draws ``train_size + val_size`` random rows from
    ``filtered_meta_Electronics_train.json``, tokenizes each description with
    the ``bert-base-uncased`` tokenizer (truncated and padded to 512 tokens)
    and randomly splits the result.

    Args:
        data_dir: Directory containing the train split file.
        train_size: Number of samples in the returned training subset.
        val_size: Number of samples in the returned validation subset.

    Returns:
        The ``(train, validation)`` subsets from
        ``torch.utils.data.random_split``; each item is
        ``[encoding, salesRank]``.
    """
    path = os.path.join(data_dir, "filtered_meta_Electronics_train.json")
    df = pd.read_json(path, lines=True).sample(n=train_size + val_size)
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    tokenized_set = []
    for index, row in df.iterrows():
        # padding="max_length" is the supported spelling of the deprecated
        # pad_to_max_length=True flag.
        encoding = tokenizer.encode_plus(row['description'], truncation=True, return_tensors="pt",
                                         max_length=512, padding="max_length")
        tokenized_set.append([encoding, row['salesRank']])
    return torch.utils.data.random_split(tokenized_set, [train_size, val_size])

In [None]:
def createDataloader(dataset, config, shuffle=True, test=False):
    """Wrap ``dataset`` in a DataLoader of labelled samples.

    Args:
        dataset: Sequence of ``[features, score]`` items.
        config: Dict whose ``"batch_size"`` entry sets the batch size.
        shuffle: Forwarded to the DataLoader.
        test: When True every item becomes ``[[features], score]``. Otherwise
            every ordered pair of distinct items becomes
            ``[[features_a, features_b], label]`` with label 1.0 if the first
            score is larger, 0.5 if both are equal and 0.0 otherwise.

    Returns:
        A ``torch.utils.data.DataLoader`` over the labelled samples.
    """
    if test:
        labeled_set = [[[doc[0]], doc[1]] for doc in dataset]
    else:
        labeled_set = []
        for i, first in enumerate(dataset):
            for j, second in enumerate(dataset):
                if i == j:
                    continue
                if first[1] == second[1]:
                    label = 0.5
                elif first[1] > second[1]:
                    label = 1.0
                else:
                    label = 0.0
                labeled_set.append([[first[0], second[0]], label])

    return torch.utils.data.DataLoader(
        labeled_set,
        batch_size=int(config["batch_size"]),
        num_workers=2,
        shuffle=shuffle,
    )

In [None]:
def createDataloaderTest(most_similair, data_dir="./data"):
    """Build an unshuffled single-document DataLoader for the retrieved rows.

    ``most_similair`` holds row indices of the *test* split file (that is the
    file ``averagePollingDocument`` indexes), so the rows are read from
    ``filtered_meta_Electronics_test.json``. The previous version looked the
    same indices up in the train split and therefore tokenized unrelated
    documents.

    Args:
        most_similair: Iterable of row indices to keep.
        data_dir: Directory containing the test split file.

    Returns:
        The DataLoader produced by ``createDataloader(..., test=True,
        shuffle=False)`` using the module-level ``config``.
    """
    path = os.path.join(data_dir, "filtered_meta_Electronics_test.json")
    df = pd.read_json(path, lines=True)
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    # Set lookup keeps the scan over the file linear.
    wanted_indices = set(most_similair)

    tokenized_set = []
    # Rows are visited in file order, so the resulting batch follows the
    # DataFrame index order rather than the similarity ranking.
    for index, row in df.iterrows():
        if index in wanted_indices:
            # padding="max_length" is the supported spelling of the deprecated
            # pad_to_max_length=True flag.
            encoding = tokenizer.encode_plus(row['description'], truncation=True, return_tensors="pt",
                                             max_length=512, padding="max_length")
            tokenized_set.append([encoding, row['salesRank']])

    return createDataloader(tokenized_set, config, test=True, shuffle=False)

In [None]:
def train(model, loss_fn, optimizer, dataloader, epoch, saving=False, tune=False):
    """Run one training epoch on pairwise samples.

    For every batch both documents of a pair are scored, the sigmoid of the
    score difference is compared with the pair label through ``loss_fn`` and
    the optimizer takes one step. Mean loss and accuracy are logged.

    Args:
        model: Module called as ``model.forward(input_ids, attention_mask)``.
        loss_fn: Loss taking ``(prediction, target)``.
        optimizer: Optimizer stepping the model parameters.
        dataloader: Iterable of ``(X, y)`` where ``X[0]`` and ``X[1]`` are
            mappings with 'input_ids' and 'attention_mask' tensors.
        epoch: Zero-based epoch index (used for logging and file names).
        saving: When True, write a checkpoint to ``./outputs/model_<n>.pth``.
        tune: When True, also write a Ray Tune checkpoint.
    """
    train_loss = 0.0
    acc_tp_tn = 0
    acc_total = 0
    loss = None

    model.train()

    for X, y in dataloader:
        # squeeze(1) removes only the per-sample singleton axis (assumes
        # tensors shaped (batch, 1, seq_len) — TODO confirm); a bare squeeze()
        # would also drop the batch axis when a batch holds a single pair.
        input_ids_1 = X[0]['input_ids'].squeeze(1).to(device)
        attention_mask_1 = X[0]['attention_mask'].squeeze(1).to(device)
        input_ids_2 = X[1]['input_ids'].squeeze(1).to(device)
        attention_mask_2 = X[1]['attention_mask'].squeeze(1).to(device)
        targets = y.float().to(device)

        optimizer.zero_grad()
        out1 = model.forward(input_ids_1, attention_mask_1)
        out2 = model.forward(input_ids_2, attention_mask_2)
        diff = torch.sigmoid((out1 - out2).squeeze(-1))
        loss = loss_fn(diff, targets)

        # A pair counts as correct when the predicted order matches a 0/1
        # label; tied pairs (label 0.5) are never counted as correct.
        with torch.no_grad():
            first_wins = diff >= 0.5
            hits = (first_wins & (targets == 1)) | (~first_wins & (targets == 0))
            acc_tp_tn += int(hits.sum().item())
            acc_total += targets.numel()

        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # max(..., 1) avoids a ZeroDivisionError on an empty dataloader.
    logger.info("[%d] train loss: %.10f" % (epoch + 1, train_loss / max(len(dataloader), 1)))
    logger.info("[%d] train accuracy: %.10f" % (epoch + 1, acc_tp_tn / max(acc_total, 1)))

    if saving:
        path = './outputs/model_' + str(epoch+1) + '.pth'
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': loss,
        }, path)
        logger.info("Model saved..")

    if tune:
        # The boolean `tune` parameter shadows the module-level Ray import, so
        # the module is re-imported under another name here.
        # NOTE(review): checkpoint_dir is Ray's legacy function API — confirm
        # it is available in the installed Ray version.
        from ray import tune as ray_tune
        with ray_tune.checkpoint_dir(epoch) as checkpoint_dir:
            path = os.path.join(checkpoint_dir, "checkpoint")
            torch.save((model.state_dict(), optimizer.state_dict()), path)

    logger.info("Training finished!")


In [None]:
def val(model, loss_fn, dataloader, epoch, tune=False):
    """Evaluate the model on pairwise samples and log mean loss and accuracy.

    Args:
        model: Module called as ``model.forward(input_ids, attention_mask)``.
        loss_fn: Loss taking ``(prediction, target)``.
        dataloader: Iterable of ``(X, y)`` where ``X[0]`` and ``X[1]`` are
            mappings with 'input_ids' and 'attention_mask' tensors.
        epoch: Zero-based epoch index (used for logging).
        tune: When True, report the mean validation loss to Ray Tune.

    Returns:
        None.
    """
    val_loss = 0.0
    acc_tp_tn = 0
    acc_total = 0

    model.eval()

    with torch.no_grad():
        for X, y in dataloader:
            # squeeze(1) removes only the per-sample singleton axis (assumes
            # tensors shaped (batch, 1, seq_len) — TODO confirm); a bare
            # squeeze() would also drop the batch axis for a batch of one.
            input_ids_1 = X[0]['input_ids'].squeeze(1).to(device)
            attention_mask_1 = X[0]['attention_mask'].squeeze(1).to(device)
            input_ids_2 = X[1]['input_ids'].squeeze(1).to(device)
            attention_mask_2 = X[1]['attention_mask'].squeeze(1).to(device)
            targets = y.float().to(device)

            out1 = model.forward(input_ids_1, attention_mask_1)
            out2 = model.forward(input_ids_2, attention_mask_2)
            diff = torch.sigmoid((out1 - out2).squeeze(-1))
            val_loss += loss_fn(diff, targets).item()

            # Tied pairs (label 0.5) are never counted as correct.
            first_wins = diff >= 0.5
            hits = (first_wins & (targets == 1)) | (~first_wins & (targets == 0))
            acc_tp_tn += int(hits.sum().item())
            acc_total += targets.numel()

    # max(..., 1) avoids a ZeroDivisionError on an empty dataloader.
    mean_loss = val_loss / max(len(dataloader), 1)
    accuracy = acc_tp_tn / max(acc_total, 1)

    if tune:
        # The boolean `tune` parameter shadows the module-level Ray import, so
        # the module is re-imported under another name here. The mean loss is
        # reported: the per-batch sum grows with the number of batches and so
        # is not comparable between trials using different batch sizes.
        from ray import tune as ray_tune
        ray_tune.report(loss=mean_loss)

    logger.info("[%d] val loss: %.10f" % (epoch+1, mean_loss))
    logger.info("[%d] val acc: %.10f" % (epoch + 1, accuracy))
    return

In [None]:
def train_tune(config, data_dir="./data"):
    """Ray Tune trainable: train and validate one epoch with ``config``.

    Builds a fresh ``ProductRanker``, BCE loss and Adam optimizer from
    ``config`` ('l1', 'l2', 'lr', 'batch_size'), prepares the data found in
    ``data_dir`` and runs ``train`` followed by ``val`` for epoch 0 with Tune
    checkpointing and reporting enabled.

    Args:
        config: Hyperparameter dict supplied by Ray Tune.
        data_dir: Directory containing the prepared train split.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = ProductRanker(config["l1"], config["l2"]).to(device)
    loss_fn = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=config["lr"], betas=(0.9, 0.999), eps=1e-08, weight_decay=0,
                                 amsgrad=False)

    train_set, val_set = dataPreparation(data_dir)
    dataloader_train = createDataloader(train_set, config)
    dataloader_val = createDataloader(val_set, config)

    # loss_fn was missing from this call, which shifted every positional
    # argument by one and made train() fail immediately.
    train(model, loss_fn, optimizer, dataloader_train, 0, tune=True)
    val(model, loss_fn, dataloader_val, 0, tune=True)


In [None]:
def tune_hyperparameters():
    """Search layer widths, learning rate and batch size with Ray Tune.

    Runs 10 samples of ``train_tune`` under an ASHA scheduler that minimizes
    the reported "loss", then prints the best trial's config and final loss.
    """
    data_dir = os.path.abspath("./data")

    # Layer widths are powers of two from 2**2 to 2**8; the learning rate is
    # drawn log-uniformly.
    search_space = {
        "l1": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)),
        "l2": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)),
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([2, 4, 8, 16]),
    }
    asha = ASHAScheduler(
        metric="loss",
        mode="min",
        max_t=4,
        grace_period=1,
        reduction_factor=2,
    )
    cli_reporter = CLIReporter(metric_columns=["loss", "training_iteration"])
    trainable = partial(train_tune, data_dir=data_dir)

    result = tune.run(
        trainable,
        resources_per_trial={"cpu": 12, "gpu": 1},
        config=search_space,
        num_samples=10,
        scheduler=asha,
        progress_reporter=cli_reporter,
    )

    best_trial = result.get_best_trial("loss", "min", "last")
    print("Best trial config: {}".format(best_trial.config))
    final_loss = best_trial.last_result["loss"]
    print("Best trial final validation loss: {}".format(final_loss))

In [None]:
def tune_lr():
    """Search only the learning rate with Ray Tune; other settings are fixed.

    Runs 5 samples of ``train_tune`` (l1=256, l2=8, batch_size=4) under an
    ASHA scheduler that minimizes the reported "loss", storing results in
    ``./ray_results``, then prints the best trial's config and final loss.
    """
    data_dir = os.path.abspath("./data")

    search_space = {
        'l1': 256,
        'l2': 8,
        'lr': tune.loguniform(1e-4, 1e-1),
        'batch_size': 4,
    }
    asha = ASHAScheduler(
        metric="loss",
        mode="min",
        max_t=5,
        grace_period=1,
        reduction_factor=2,
    )
    cli_reporter = CLIReporter(metric_columns=["loss", "training_iteration"])
    trainable = partial(train_tune, data_dir=data_dir)

    result = tune.run(
        trainable,
        resources_per_trial={"cpu": 12, "gpu": 1},
        local_dir="./ray_results",
        config=search_space,
        num_samples=5,
        scheduler=asha,
        progress_reporter=cli_reporter,
    )

    best_trial = result.get_best_trial("loss", "min", "last")
    print("Best trial config: {}".format(best_trial.config))
    final_loss = best_trial.last_result["loss"]
    print("Best trial final validation loss: {}".format(final_loss))
    # The checkpoint is looked up but not used or returned by this function.
    best_checkpoint = result.get_best_checkpoint(best_trial, metric="loss", mode="min")

In [None]:
def train_model(epochs, config, from_checkpoint=False, path=None):
    """Train the module-level model for ``epochs`` epochs.

    The previous body called ``train(epochs, config, data_dir,
    from_checkpoint, path)``, which does not match ``train``'s signature and
    failed on the first statement. NOTE(review): the implementation below is
    a best-effort reading of the intended behavior — confirm before relying
    on it.

    Args:
        epochs: Number of epochs to run.
        config: Dict providing at least "batch_size" for the DataLoader. The
            module-level ``model`` and ``optimizer`` are reused as they are,
            so other entries have no effect here.
        from_checkpoint: When True, restore model and optimizer state from
            ``path`` before training.
        path: Checkpoint file written by ``train(..., saving=True)``.

    Raises:
        ValueError: If ``from_checkpoint`` is True but ``path`` is None.
    """
    data_dir = os.path.abspath("./data")

    if from_checkpoint:
        if path is None:
            raise ValueError("path must be given when from_checkpoint is True")
        checkpoint = torch.load(path, map_location=device)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    train_set, _ = dataPreparation(data_dir)
    dataloader_train = createDataloader(train_set, config)

    for epoch in range(epochs):
        train(model, loss_fn, optimizer, dataloader_train, epoch)

In [None]:
def MRR(y, rankings):
    """Return the mean of ``y[i] / rankings[i]`` over all positions of ``y``.

    NOTE(review): despite the name this is a rank-discounted mean of the
    values in ``y`` rather than the textbook mean reciprocal rank — confirm
    that this is the intended metric.

    Args:
        y: Indexable sequence of values with a length.
        rankings: Mapping from position ``i`` to its non-zero rank.

    Returns:
        The mean described above, or 0.0 when ``y`` is empty.
    """
    count = len(y)
    if count == 0:
        # Nothing to average; avoids a ZeroDivisionError.
        return 0.0

    total = 0
    for i in range(count):
        total += y[i] / rankings[i]
    return total / count
    

In [None]:
def predict(dataloader, model_filename):
    """Score each batch with a saved model and print baseline vs. predicted MRR.

    Restores the module-level ``model`` and ``optimizer`` from
    ``model_filename``. For every batch the documents are ranked by model
    output (highest first) and ``MRR`` is printed for that ranking and for
    the baseline that keeps the batch order.

    Args:
        dataloader: Iterable of ``(X, y)`` where ``X[0]`` is a mapping with
            'input_ids' and 'attention_mask' tensors.
        model_filename: Checkpoint written by ``train(..., saving=True)``.
    """
    # map_location lets a checkpoint saved on GPU load on a CPU-only machine.
    checkpoint = torch.load(model_filename, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    model.eval()

    for X, y in dataloader:
        with torch.no_grad():
            # squeeze(1) removes only the per-sample singleton axis (assumes
            # tensors shaped (batch, 1, seq_len) — TODO confirm); reshape(-1)
            # keeps the scores 1-D even for a batch of one.
            input_ids = X[0]['input_ids'].squeeze(1).to(device)
            attention_mask = X[0]['attention_mask'].squeeze(1).to(device)
            scores = model.forward(input_ids, attention_mask).reshape(-1).cpu().numpy()

        # Built fresh for every batch so entries of a previous, larger batch
        # cannot leak into this ranking.
        score_by_position = dict(enumerate(scores))
        ordered_positions = sorted(score_by_position, key=score_by_position.get, reverse=True)
        return_ranks = {position: rank for rank, position in enumerate(ordered_positions, 1)}

        # Baseline: rank equals position in the batch, sized to the batch
        # instead of a fixed 100 entries.
        baseline = {position: position + 1 for position in range(len(y))}

        print('baseline: ', MRR(y, baseline))
        print('prediction: ', MRR(y, return_ranks))

In [None]:
def training_loop(epochs, train_bool=True, validate_bool=True, data_dir="./data", saving=False, load=False):
    """Run training and/or validation for a number of epochs.

    Args:
        epochs: Number of epochs.
        train_bool: Run ``train`` in every epoch.
        validate_bool: Run ``val`` in every epoch.
        data_dir: Directory containing the prepared train split (previously
            accepted but ignored).
        saving: Forwarded to ``train`` to write a checkpoint per epoch.
        load: When True, restore ``./outputs/model_<epoch>.pth`` into the
            module-level model and optimizer before each epoch, if present.
    """
    if not (train_bool or validate_bool):
        # Nothing to run, so skip the costly sampling and tokenization.
        return

    train_set, val_set = dataPreparation(data_dir)
    dataloader_train = createDataloader(train_set, config)
    dataloader_val = createDataloader(val_set, config)

    for epoch in range(epochs):
        if load:
            current_model_filename = './outputs/model_' + str(epoch+1) + '.pth'
            if os.path.exists(current_model_filename):
                # map_location lets a GPU checkpoint load on a CPU-only machine.
                checkpoint = torch.load(current_model_filename, map_location=device)
                model.load_state_dict(checkpoint['model_state_dict'])
                optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
            else:
                logger.info(f"model: {current_model_filename} doesnt exist")

        logger.info(f"Epoch {epoch + 1}\n-------------------------------")
        if train_bool:
            logger.info("Starting training loop...")
            train(model, loss_fn, optimizer, dataloader_train, epoch, saving=saving, tune=False)

        if validate_bool:
            logger.info("Starting validation loop...")
            val(model, loss_fn, dataloader_val, epoch, tune=False)

In [None]:
# Load the pretrained "word2vec-google-news-300" vectors once; later cells reuse them.
model_word2vec = loadGoogleWord2Vec()

In [None]:
# documents: pooled word2vec vector per row index of the test split file;
# words: every token of those descriptions that the word2vec model knows.
documents, words = averagePollingDocument(model_word2vec)

In [None]:
def testing_loop(query, data_dir="./data", model_filename='./outputs/model_15.pth'):
    """Retrieve the documents most similar to ``query`` and evaluate the ranker.

    The query is pooled with the module-level ``model_word2vec``, matched
    against the module-level ``documents`` and the retrieved rows are passed
    to ``predict``.

    Args:
        query: Free-text query.
        data_dir: Directory holding the split files (previously accepted but
            ignored).
        model_filename: Checkpoint passed to ``predict``.
    """
    avg_query = averagePollingQuery(query, model_word2vec)
    most_similair = getMostSimilair(avg_query, documents)
    dataloader = createDataloaderTest(most_similair, data_dir)
    print('query: ', query)
    predict(dataloader, model_filename)


# Evaluate ten random queries built from words known to the word2vec model.
for _ in range(10):
    testing_loop(createRandomQuery(words))

In [None]:
#training_loop(5, train_bool=False, validate_bool=False, saving=False, load=False)