In [1]:
import gradio as gr
import pandas as pd
import numpy as np
import json
import io 
import sys
from PIL import Image
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.faiss import FAISS

  from .autonotebook import tqdm as notebook_tqdm


In [36]:
import torch
import torch.nn as nn
from tqdm.auto import trange

from cornac.models.recommender import Recommender
from cornac.utils.common import scale
from cornac.exception import ScoreException

from cornac.models.recommender import ANNMixin, MEASURE_DOT


# Utility

## BiVAE

In [37]:
class BiVAECF(Recommender):
    """Bilateral Variational AutoEncoder for Collaborative Filtering.

    Variant of the Cornac recommender that keeps separate encoder structures,
    batch sizes and learning rates for the user side and the item side.

    NOTE(review): ``fit`` passes ``user_batch_size``/``item_batch_size`` to
    ``BiVAE`` and ``user_batch_size``/``item_batch_size``/``user_learn_rate``/
    ``item_learn_rate``/``plot_loss`` to ``learn``. The ``BiVAE`` and ``learn``
    objects in scope must accept those keywords -- confirm which definitions
    this class is meant to run against.

    Parameters
    ----------
    name: string, optional, default: 'BiVAECF'
        The name of the recommender model.
    k: int, optional, default: 10
        The dimension of the stochastic user ``theta'' and item ``beta'' factors.
    user_encoder_structure: list, default: [20]
        Hidden layer sizes of the user encoder. ``fit`` prepends the number of
        items, e.g. [20] becomes [num_items, 20].
    item_encoder_structure: list, default: [20]
        Hidden layer sizes of the item encoder. ``fit`` prepends the number of
        users, e.g. [20] becomes [num_users, 20].
    act_fn: str, default: 'tanh'
        Name of the activation function used between hidden layers of the auto-encoder.
        Supported functions: ['sigmoid', 'tanh', 'elu', 'relu', 'relu6']
    likelihood: str, default: 'pois'
        The likelihood function used for modeling the observations.
        Supported choices:
        bern: Bernoulli likelihood
        gaus: Gaussian likelihood
        pois: Poisson likelihood
    n_epochs: int, optional, default: 100
        The number of training epochs.
    user_batch_size: int, optional, default: 100
        The batch size forwarded for the user side.
    item_batch_size: int, optional, default: 100
        The batch size forwarded for the item side.
    user_learning_rate: float, optional, default: 0.001
        The learning rate forwarded for the user side.
    item_learning_rate: float, optional, default: 0.001
        The learning rate forwarded for the item side.
    beta_kl: float, optional, default: 1.0
        The weight of the KL terms as in beta-VAE.
    cap_priors: dict, optional, default: {"user":False, "item":False}
        When {"user":True, "item":True}, CAP priors are used (see BiVAE paper for details),
        otherwise the standard Normal is used as a Prior over the user and item latent variables.
    trainable: boolean, optional, default: True
        When False, the model is not trained and Cornac assumes that the model is already
        pre-trained.
    verbose: boolean, optional, default: False
        When True, some running logs are displayed.
    seed: int, optional, default: None
        Random seed for parameters initialization.
    use_gpu: boolean, optional, default: True
        If True and your system supports CUDA then training is performed on GPUs.
    plot_loss: boolean, optional, default: False
        Forwarded unchanged to ``learn``; presumably toggles plotting of the
        training loss -- TODO confirm against the ``learn`` in use.

    References
    ----------
    * Quoc-Tuan Truong, Aghiles Salah, Hady W. Lauw. " Bilateral Variational Autoencoder for Collaborative Filtering."
    ACM International Conference on Web Search and Data Mining (WSDM). 2021.
    """

    def __init__(
        self,
        name="BiVAECF",
        k=10,
        user_encoder_structure=[20],
        item_encoder_structure = [20],
        act_fn="tanh",
        likelihood="pois",
        n_epochs=100,
        user_batch_size=100,
        item_batch_size = 100,
        user_learning_rate=0.001,
        item_learning_rate=0.001,
        beta_kl=1.0,
        cap_priors={"user": False, "item": False},
        trainable=True,
        verbose=False,
        seed=None,
        use_gpu=True,
        plot_loss = False,
    ):
        Recommender.__init__(self, name=name, trainable=trainable, verbose=verbose)
        # The list/dict defaults above are shared between instances; they are
        # only read (never mutated) by this class.
        self.k = k
        self.user_encoder_structure = user_encoder_structure
        self.item_encoder_structure = item_encoder_structure
        self.act_fn = act_fn
        self.likelihood = likelihood
        self.user_batch_size = user_batch_size
        self.item_batch_size = item_batch_size
        self.n_epochs = n_epochs
        self.user_learning_rate = user_learning_rate
        self.item_learning_rate = item_learning_rate
        self.beta_kl = beta_kl
        self.cap_priors = cap_priors
        self.seed = seed
        self.use_gpu = use_gpu
        self.plot_loss = plot_loss

    def fit(self, train_set, val_set=None):
        """Fit the model to observations.
        Parameters
        ----------
        train_set: :obj:`cornac.data.Dataset`, required
            User-Item preference data as well as additional modalities.
        val_set: :obj:`cornac.data.Dataset`, optional, default: None
            User-Item preference data for model selection purposes (e.g., early stopping).
        Returns
        -------
        self : object
        Raises
        ------
        ValueError
            If a CAP prior is enabled for a side whose features are missing
            from ``train_set``.
        """
        Recommender.fit(self, train_set, val_set)

        self.device = (
            torch.device("cuda:0")
            if (self.use_gpu and torch.cuda.is_available())
            else torch.device("cpu")
        )

        if self.trainable:
            feature_dim = {"user": None, "item": None}
            if self.cap_priors.get("user", False):
                if train_set.user_feature is None:
                    raise ValueError(
                        "CAP priors for users is set to True but no user features are provided"
                    )
                else:
                    feature_dim["user"] = train_set.user_feature.feature_dim

            if self.cap_priors.get("item", False):
                if train_set.item_feature is None:
                    raise ValueError(
                        "CAP priors for items is set to True but no item features are provided"
                    )
                else:
                    feature_dim["item"] = train_set.item_feature.feature_dim

            if self.seed is not None:
                torch.manual_seed(self.seed)
                torch.cuda.manual_seed(self.seed)

            # Build the network only once. The attribute checked here must be
            # the one assigned below ("bivae"); checking a different name would
            # silently discard an existing model on every call to fit().
            if not hasattr(self, "bivae"):
                num_items = train_set.matrix.shape[1]
                num_users = train_set.matrix.shape[0]
                self.bivae = BiVAE(
                    k=self.k,
                    # The user encoder consumes a row over items, the item
                    # encoder a row over users.
                    user_encoder_structure=[num_items] + self.user_encoder_structure,
                    item_encoder_structure=[num_users] + self.item_encoder_structure,
                    act_fn=self.act_fn,
                    likelihood=self.likelihood,
                    cap_priors=self.cap_priors,
                    feature_dim=feature_dim,
                    user_batch_size=self.user_batch_size,
                    item_batch_size = self.item_batch_size
                ).to(self.device)

            learn(
                self.bivae,
                # Use the argument directly so training does not depend on the
                # base class storing the dataset on the instance.
                train_set,
                n_epochs=self.n_epochs,
                user_batch_size=self.user_batch_size,
                # Each side gets its own configured batch size.
                item_batch_size = self.item_batch_size,
                user_learn_rate=self.user_learning_rate,
                item_learn_rate = self.item_learning_rate,
                beta_kl=self.beta_kl,
                verbose=self.verbose,
                device=self.device,
                plot_loss = self.plot_loss,
            )

        elif self.verbose:
            print("%s is trained already (trainable = False)" % (self.name))

        return self

    def score(self, user_idx, item_idx=None):
        """Predict the scores/ratings of a user for an item.

        NOTE(review): this method reads ``self.train_set``; confirm the
        installed Cornac version still stores the dataset on the model.

        Parameters
        ----------
        user_idx: int, required
            The index of the user for whom to perform score prediction.
        item_idx: int, optional, default: None
            The index of the item for which to perform score prediction.
            If None, scores for all known items will be returned.
        Returns
        -------
        res : A scalar or a Numpy array
            Relative scores that the user gives to the item or to all known items
        Raises
        ------
        ScoreException
            If the user (or the requested item) is unknown to the training set.
        """

        if item_idx is None:
            if self.train_set.is_unk_user(user_idx):
                raise ScoreException(
                    "Can't make score prediction for (user_id=%d)" % user_idx
                )

            # Score with the posterior means rather than sampled factors.
            theta_mu_u = self.bivae.mu_theta[user_idx].view(1, -1)
            beta = self.bivae.mu_beta
            known_item_scores = (
                self.bivae.decode_user(theta_mu_u, beta).cpu().numpy().ravel()
            )

            return known_item_scores
        else:
            if self.train_set.is_unk_user(user_idx) or self.train_set.is_unk_item(
                item_idx
            ):
                raise ScoreException(
                    "Can't make score prediction for (user_id=%d, item_id=%d)"
                    % (user_idx, item_idx)
                )

            theta_mu_u = self.bivae.mu_theta[user_idx].view(1, -1)
            beta_i = self.bivae.mu_beta[item_idx].view(1, -1)
            pred = self.bivae.decode_user(theta_mu_u, beta_i).cpu().numpy().ravel()

            # Only the single-item path rescales the output to the rating range.
            pred = scale(
                pred, self.train_set.min_rating, self.train_set.max_rating, 0.0, 1.0
            )

            return pred

# Cornac 1.18 BiVAE

## BiVAE model and training loop (`bivae`)

In [38]:
# Copyright 2018 The Cornac Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

import itertools as it

import numpy as np
import torch
import torch.nn as nn
from tqdm.auto import trange


# Small constant added to the arguments of torch.log() in BiVAE.loss so the
# logarithm is never taken of exactly zero.
EPS = 1e-10

# Lookup from the `act_fn` name accepted by BiVAE to an activation module.
# Each entry is a single module instance created here, so every lookup of the
# same name returns the same object.
ACT = {
    "sigmoid": nn.Sigmoid(),
    "tanh": nn.Tanh(),
    "elu": nn.ELU(),
    "relu": nn.ReLU(),
    "relu6": nn.ReLU6(),
}


class BiVAE(nn.Module):
    """Two-sided variational auto-encoder with one encoder per side.

    The user encoder maps a row over items to a ``k``-dimensional Gaussian
    posterior (mean and std); the item encoder does the same for a row over
    users. Reconstruction is ``sigmoid(theta @ beta.T)``.

    Besides the trainable layers, the module keeps four plain tensors (not
    registered as parameters or buffers): ``theta``/``beta`` hold the latest
    sampled factors and ``mu_theta``/``mu_beta`` the latest posterior means.
    They are filled in by the training routine and moved by :meth:`to`.

    Parameters
    ----------
    k: int
        Dimension of the latent factors.
    user_encoder_structure: list of int
        Layer sizes of the user encoder; element 0 is the input size (number
        of items).
    item_encoder_structure: list of int
        Layer sizes of the item encoder; element 0 is the input size (number
        of users).
    act_fn: str
        Key into ``ACT`` selecting the activation between hidden layers.
    likelihood: str
        One of 'bern', 'gaus', 'pois'; validated when :meth:`loss` is called.
    cap_priors: dict
        ``{"user": bool, "item": bool}``; when a side is enabled a linear
        prior encoder is created for it.
    feature_dim: dict
        Input size of the prior encoder for each enabled side.
    batch_size: int
        Accepted for interface compatibility; not used by this class.

    Raises
    ------
    ValueError
        If ``act_fn`` is not a key of ``ACT``.
    """

    def __init__(
        self,
        k,
        user_encoder_structure,
        item_encoder_structure,
        act_fn,
        likelihood,
        cap_priors,
        feature_dim,
        batch_size,
    ):
        super(BiVAE, self).__init__()

        # NOTE: the order of the random initialisations below is kept as-is so
        # that results for a given seed do not change.
        self.mu_theta = torch.zeros((item_encoder_structure[0], k))  # n_users*k
        self.mu_beta = torch.zeros((user_encoder_structure[0], k))  # n_items*k

        self.theta = torch.randn(item_encoder_structure[0], k) * 0.01
        self.beta = torch.randn(user_encoder_structure[0], k) * 0.01
        # Only theta is re-initialised in place here; beta keeps the small
        # normal initialisation from the line above.
        torch.nn.init.kaiming_uniform_(self.theta, a=np.sqrt(5))

        self.likelihood = likelihood
        self.act_fn = ACT.get(act_fn, None)
        if self.act_fn is None:
            raise ValueError("Supported act_fn: {}".format(ACT.keys()))

        self.cap_priors = cap_priors
        if self.cap_priors.get("user", False):
            self.user_prior_encoder = nn.Linear(feature_dim.get("user"), k)
        if self.cap_priors.get("item", False):
            self.item_prior_encoder = nn.Linear(feature_dim.get("item"), k)

        # User Encoder
        self.user_encoder = nn.Sequential()
        for i in range(len(user_encoder_structure) - 1):
            self.user_encoder.add_module(
                "fc{}".format(i),
                nn.Linear(user_encoder_structure[i], user_encoder_structure[i + 1]),
            )
            self.user_encoder.add_module("act{}".format(i), self.act_fn)
        self.user_mu = nn.Linear(user_encoder_structure[-1], k)  # mu
        self.user_std = nn.Linear(user_encoder_structure[-1], k)

        # Item Encoder
        self.item_encoder = nn.Sequential()
        for i in range(len(item_encoder_structure) - 1):
            self.item_encoder.add_module(
                "fc{}".format(i),
                nn.Linear(item_encoder_structure[i], item_encoder_structure[i + 1]),
            )
            self.item_encoder.add_module("act{}".format(i), self.act_fn)
        self.item_mu = nn.Linear(item_encoder_structure[-1], k)  # mu
        self.item_std = nn.Linear(item_encoder_structure[-1], k)

    def to(self, device):
        """Move the module and its four unregistered factor tensors to ``device``."""
        # These tensors are plain attributes, so nn.Module.to() would not move them.
        self.beta = self.beta.to(device=device)
        self.theta = self.theta.to(device=device)
        self.mu_beta = self.mu_beta.to(device=device)
        self.mu_theta = self.mu_theta.to(device=device)
        return super(BiVAE, self).to(device)

    def encode_user_prior(self, x):
        """Return the prior mean for users from their features ``x``."""
        h = self.user_prior_encoder(x)
        return h

    def encode_item_prior(self, x):
        """Return the prior mean for items from their features ``x``."""
        h = self.item_prior_encoder(x)
        return h

    def encode_user(self, x):
        """Return ``(mu, std)`` of the user posterior; std is squashed into (0, 1)."""
        h = self.user_encoder(x)
        return self.user_mu(h), torch.sigmoid(self.user_std(h))

    def encode_item(self, x):
        """Return ``(mu, std)`` of the item posterior; std is squashed into (0, 1)."""
        h = self.item_encoder(x)
        return self.item_mu(h), torch.sigmoid(self.item_std(h))

    def decode_user(self, theta, beta):
        """Return ``sigmoid(theta @ beta.T)``: one row of scores per row of ``theta``."""
        h = theta.mm(beta.t())
        return torch.sigmoid(h)

    def decode_item(self, theta, beta):
        """Return ``sigmoid(beta @ theta.T)``: one row of scores per row of ``beta``."""
        h = beta.mm(theta.t())
        return torch.sigmoid(h)

    def reparameterize(self, mu, std):
        """Draw ``mu + eps * std`` with ``eps`` standard normal (same shape as ``mu``)."""
        eps = torch.randn_like(mu)
        return mu + eps * std

    def forward(self, x, user=True, beta=None, theta=None):
        """Encode ``x``, sample factors and reconstruct.

        With ``user=True`` the batch is encoded with the user encoder and
        decoded against ``beta``; otherwise the item encoder is used and the
        batch is decoded against ``theta``.

        Returns
        -------
        tuple
            ``(sampled_factors, reconstruction, mu, std)``.
        """

        if user:
            mu, std = self.encode_user(x)
            theta = self.reparameterize(mu, std)
            return theta, self.decode_user(theta, beta), mu, std
        else:
            mu, std = self.encode_item(x)
            beta = self.reparameterize(mu, std)
            return beta, self.decode_item(theta, beta), mu, std

    def loss(self, x, x_, mu, mu_prior, std, kl_beta):
        """Return the mean over the batch of ``kl_beta * KL - log-likelihood``.

        Parameters
        ----------
        x: tensor
            Observed batch.
        x_: tensor
            Reconstruction of ``x``.
        mu, std: tensor
            Posterior mean and standard deviation.
        mu_prior: tensor or float
            Prior mean (0.0 for the standard normal prior).
        kl_beta: float
            Weight of the KL term.

        Raises
        ------
        ValueError
            If ``self.likelihood`` is not one of 'bern', 'gaus', 'pois'.
        """
        # Likelihood. The entries are callables so that only the selected
        # likelihood is actually computed.
        ll_choices = {
            "bern": lambda: x * torch.log(x_ + EPS)
            + (1 - x) * torch.log(1 - x_ + EPS),
            "gaus": lambda: -((x - x_) ** 2),
            "pois": lambda: x * torch.log(x_ + EPS) - x_,
        }

        ll_fn = ll_choices.get(self.likelihood, None)
        if ll_fn is None:
            raise ValueError("Supported likelihoods: {}".format(ll_choices.keys()))

        ll = torch.sum(ll_fn(), dim=1)

        # KL term between N(mu, std^2) and N(mu_prior, 1), summed over factors.
        kld = -0.5 * (1 + 2.0 * torch.log(std) - (mu - mu_prior).pow(2) - std.pow(2))
        kld = torch.sum(kld, dim=1)

        return torch.mean(kl_beta * kld - ll)


def _dense_batch(sparse_rows, dtype, device):
    """Convert a slice of a SciPy sparse matrix into a dense torch tensor."""
    # `.toarray()` is the documented dense conversion; the `.A` shorthand is
    # not available on sparse objects in newer SciPy releases.
    return torch.tensor(sparse_rows.toarray(), dtype=dtype, device=device)


def learn(
    bivae,
    train_set,
    n_epochs,
    batch_size,
    learn_rate,
    beta_kl,
    verbose,
    device=torch.device("cpu"),
    dtype=torch.float32,
):
    """Train ``bivae`` by alternating item-side and user-side updates.

    Each epoch first sweeps all item batches (updating the item encoder while
    decoding against the stored ``bivae.theta``), then all user batches
    (updating the user encoder while decoding against the stored
    ``bivae.beta``). After every optimizer step the batch is re-encoded and the
    sampled factors / posterior means are written back into ``bivae.beta`` /
    ``bivae.mu_beta`` (resp. ``theta`` / ``mu_theta``). When training ends, the
    posterior means of all items and users are recomputed once more.

    Parameters
    ----------
    bivae: BiVAE
        The model to train (modified in place).
    train_set: object
        Must expose ``matrix`` (SciPy sparse, users x items), ``item_iter`` and
        ``user_iter``; with CAP priors also ``user_feature`` / ``item_feature``
        and ``num_users`` / ``num_items``.
    n_epochs: int
        Number of epochs.
    batch_size: int
        Batch size used for both the item and the user sweeps.
    learn_rate: float
        Learning rate of both Adam optimizers.
    beta_kl: float
        Weight of the KL term in the loss.
    verbose: bool
        When True, a progress bar with running losses is shown.
    device: torch.device
        Device on which the batches are created.
    dtype: torch.dtype
        Dtype of the batch tensors.

    Returns
    -------
    bivae: BiVAE
        The same model instance that was passed in.
    """
    user_params = it.chain(
        bivae.user_encoder.parameters(),
        bivae.user_mu.parameters(),
        bivae.user_std.parameters(),
    )

    item_params = it.chain(
        bivae.item_encoder.parameters(),
        bivae.item_mu.parameters(),
        bivae.item_std.parameters(),
    )

    if bivae.cap_priors.get("user", False):
        user_params = it.chain(user_params, bivae.user_prior_encoder.parameters())
        user_features = train_set.user_feature.features[: train_set.num_users]

    if bivae.cap_priors.get("item", False):
        item_params = it.chain(item_params, bivae.item_prior_encoder.parameters())
        item_features = train_set.item_feature.features[: train_set.num_items]

    u_optimizer = torch.optim.Adam(params=user_params, lr=learn_rate)
    i_optimizer = torch.optim.Adam(params=item_params, lr=learn_rate)

    x = train_set.matrix.copy()
    x.data = np.ones_like(x.data)  # Binarize data
    tx = x.transpose()  # items x users view for the item-side sweep

    progress_bar = trange(1, n_epochs + 1, disable=not verbose)
    for _ in progress_bar:
        # item side
        i_sum_loss = 0.0
        i_count = 0
        for i_ids in train_set.item_iter(batch_size, shuffle=False):
            i_batch = _dense_batch(tx[i_ids, :], dtype, device)

            # Reconstructed batch
            beta, i_batch_, i_mu, i_std = bivae(i_batch, user=False, theta=bivae.theta)

            i_mu_prior = 0.0  # zero mean for standard normal prior if not CAP prior
            if bivae.cap_priors.get("item", False):
                i_batch_f = item_features[i_ids]
                i_batch_f = torch.tensor(i_batch_f, dtype=dtype, device=device)
                i_mu_prior = bivae.encode_item_prior(i_batch_f)

            i_loss = bivae.loss(i_batch, i_batch_, i_mu, i_mu_prior, i_std, beta_kl)
            i_optimizer.zero_grad()
            i_loss.backward()
            i_optimizer.step()

            i_sum_loss += i_loss.item()
            i_count += len(i_batch)

            # Re-encode with the updated weights; this pass is bookkeeping
            # only, so no autograd graph is needed.
            with torch.no_grad():
                beta, _, i_mu, _ = bivae(i_batch, user=False, theta=bivae.theta)

            bivae.beta.data[i_ids] = beta.data
            bivae.mu_beta.data[i_ids] = i_mu.data

        # user side
        u_sum_loss = 0.0
        u_count = 0
        for u_ids in train_set.user_iter(batch_size, shuffle=False):
            u_batch = _dense_batch(x[u_ids, :], dtype, device)

            # Reconstructed batch
            theta, u_batch_, u_mu, u_std = bivae(u_batch, user=True, beta=bivae.beta)

            u_mu_prior = 0.0  # zero mean for standard normal prior if not CAP prior
            if bivae.cap_priors.get("user", False):
                u_batch_f = user_features[u_ids]
                u_batch_f = torch.tensor(u_batch_f, dtype=dtype, device=device)
                u_mu_prior = bivae.encode_user_prior(u_batch_f)

            u_loss = bivae.loss(u_batch, u_batch_, u_mu, u_mu_prior, u_std, beta_kl)
            u_optimizer.zero_grad()
            u_loss.backward()
            u_optimizer.step()

            u_sum_loss += u_loss.item()
            u_count += len(u_batch)

            with torch.no_grad():
                theta, _, u_mu, _ = bivae(u_batch, user=True, beta=bivae.beta)
            bivae.theta.data[u_ids] = theta.data
            bivae.mu_theta.data[u_ids] = u_mu.data

            progress_bar.set_postfix(
                loss_i=(i_sum_loss / i_count), loss_u=(u_sum_loss / (u_count))
            )

    # Final pass: refresh the posterior means with the fully trained encoders.
    with torch.no_grad():
        # infer mu_beta
        for i_ids in train_set.item_iter(batch_size, shuffle=False):
            i_batch = _dense_batch(tx[i_ids, :], dtype, device)

            beta, _, i_mu, _ = bivae(i_batch, user=False, theta=bivae.theta)
            bivae.mu_beta.data[i_ids] = i_mu.data

        # infer mu_theta
        for u_ids in train_set.user_iter(batch_size, shuffle=False):
            u_batch = _dense_batch(x[u_ids, :], dtype, device)

            theta, _, u_mu, _ = bivae(u_batch, user=True, beta=bivae.beta)
            bivae.mu_theta.data[u_ids] = u_mu.data

    return bivae

## BiVAECF recommender

In [39]:
class BiVAECF(Recommender, ANNMixin):
    """Bilateral Variational AutoEncoder for Collaborative Filtering.

    Parameters
    ----------
    name: string, optional, default: 'BiVAECF'
        The name of the recommender model.

    k: int, optional, default: 10
        The dimension of the stochastic user ``theta'' and item ``beta'' factors.

    user_encoder_structure: list, default: [20]
        Hidden layer sizes of the user encoder; ``fit`` prepends the number of
        items, so [20] becomes [num_items, 20].

    item_encoder_structure: list, default: [20]
        Hidden layer sizes of the item encoder; ``fit`` prepends the number of
        users, so [20] becomes [num_users, 20].

    act_fn: str, default: 'tanh'
        Name of the activation function used between hidden layers of the auto-encoder.
        Supported functions: ['sigmoid', 'tanh', 'elu', 'relu', 'relu6']

    likelihood: str, default: 'pois'
        The likelihood function used for modeling the observations.
        Supported choices:

        bern: Bernoulli likelihood
        gaus: Gaussian likelihood
        pois: Poisson likelihood

    n_epochs: int, optional, default: 100
        The number of training epochs.

    batch_size: int, optional, default: 100
        The batch size.

    learning_rate: float, optional, default: 0.001
        The learning rate for Adam.

    beta_kl: float, optional, default: 1.0
        The weight of the KL terms as in beta-VAE.

    cap_priors: dict, optional, default: {"user":False, "item":False}
        When {"user":True, "item":True}, CAP priors are used (see BiVAE paper for details),
        otherwise the standard Normal is used as a Prior over the user and item latent variables.

    trainable: boolean, optional, default: True
        When False, the model is not trained and Cornac assumes that the model is already
        pre-trained.

    verbose: boolean, optional, default: False
        When True, some running logs are displayed.

    seed: int, optional, default: None
        Random seed for parameters initialization.

    use_gpu: boolean, optional, default: True
        If True and your system supports CUDA then training is performed on GPUs.

    References
    ----------
    * Quoc-Tuan Truong, Aghiles Salah, Hady W. Lauw. " Bilateral Variational Autoencoder for Collaborative Filtering."
    ACM International Conference on Web Search and Data Mining (WSDM). 2021.
    """

    def __init__(
        self,
        name="BiVAECF",
        k=10,
        user_encoder_structure=[20],
        item_encoder_structure = [20],
        act_fn="tanh",
        likelihood="pois",
        n_epochs=100,
        batch_size=100,
        learning_rate=0.001,
        beta_kl=1.0,
        cap_priors={"user": False, "item": False},
        trainable=True,
        verbose=False,
        seed=None,
        use_gpu=True,
    ):
        Recommender.__init__(self, name=name, trainable=trainable, verbose=verbose)

        # Architecture of the two encoders.
        self.k = k
        self.user_encoder_structure = user_encoder_structure
        self.item_encoder_structure = item_encoder_structure
        self.act_fn = act_fn

        # Objective.
        self.likelihood = likelihood
        self.beta_kl = beta_kl
        self.cap_priors = cap_priors

        # Optimisation settings.
        self.n_epochs = n_epochs
        self.batch_size = batch_size
        self.learning_rate = learning_rate

        # Runtime settings.
        self.seed = seed
        self.use_gpu = use_gpu


    def fit(self, train_set, val_set=None):
        """Train the model on ``train_set``.

        Parameters
        ----------
        train_set: :obj:`cornac.data.Dataset`, required
            User-Item preference data as well as additional modalities.

        val_set: :obj:`cornac.data.Dataset`, optional, default: None
            User-Item preference data for model selection purposes (e.g., early stopping).

        Returns
        -------
        self : object

        Raises
        ------
        ValueError
            If a CAP prior is enabled for a side whose features are missing
            from ``train_set``.
        """
        Recommender.fit(self, train_set, val_set)

        import torch
        # from .bivae import BiVAE, learn
        cuda_requested_and_present = self.use_gpu and torch.cuda.is_available()
        if cuda_requested_and_present:
            self.device = torch.device("cuda:0")
        else:
            self.device = torch.device("cpu")

        # Nothing to train: optionally report it and stop here.
        if not self.trainable:
            if self.verbose:
                print("%s is trained already (trainable = False)" % (self.name))
            return self

        # Feature sizes for the CAP prior encoders; a side stays None when
        # its CAP prior is disabled.
        missing_feature_msg = {
            "user": "CAP priors for users is set to True but no user features are provided",
            "item": "CAP priors for items is set to True but no item features are provided",
        }
        feature_dim = {"user": None, "item": None}
        for side in ("user", "item"):
            if not self.cap_priors.get(side, False):
                continue
            side_feature = getattr(train_set, side + "_feature")
            if side_feature is None:
                raise ValueError(missing_feature_msg[side])
            feature_dim[side] = side_feature.feature_dim

        if self.seed is not None:
            torch.manual_seed(self.seed)
            torch.cuda.manual_seed(self.seed)

        # The network is created on the first call only; later calls keep
        # training the existing one.
        if not hasattr(self, "bivae"):
            num_users, num_items = train_set.matrix.shape[0], train_set.matrix.shape[1]
            network = BiVAE(
                k=self.k,
                user_encoder_structure=[num_items] + self.user_encoder_structure,
                item_encoder_structure=[num_users] + self.item_encoder_structure,
                act_fn=self.act_fn,
                likelihood=self.likelihood,
                cap_priors=self.cap_priors,
                feature_dim=feature_dim,
                batch_size=self.batch_size,
            )
            self.bivae = network.to(self.device)

        learn(
            self.bivae,
            train_set,
            n_epochs=self.n_epochs,
            batch_size=self.batch_size,
            learn_rate=self.learning_rate,
            beta_kl=self.beta_kl,
            verbose=self.verbose,
            device=self.device,
        )

        return self


    def score(self, user_idx, item_idx=None):
        """Score one item, or every known item, for a user.

        Parameters
        ----------
        user_idx: int, required
            The index of the user for whom to perform score prediction.

        item_idx: int, optional, default: None
            The index of the item for which to perform score prediction.
            If None, scores for all known items will be returned.

        Returns
        -------
        res : A scalar or a Numpy array
            Relative scores that the user gives to the item or to all known items

        Raises
        ------
        ScoreException
            If the user, or the requested item, is unknown to the model.
        """
        if self.is_unknown_user(user_idx):
            raise ScoreException("Can't make score prediction for user %d" % user_idx)

        score_all_items = item_idx is None
        if not score_all_items and self.is_unknown_item(item_idx):
            raise ScoreException("Can't make score prediction for item %d" % item_idx)

        network = self.bivae
        # Posterior mean of the user's factors as a single-row matrix.
        user_factors = network.mu_theta[user_idx].view(1, -1)

        if score_all_items:
            all_scores = network.decode_user(user_factors, network.mu_beta)
            return all_scores.cpu().numpy().ravel()

        item_factors = network.mu_beta[item_idx].view(1, -1)
        single_score = network.decode_user(user_factors, item_factors)
        single_score = single_score.cpu().numpy().ravel()
        # Only the single-item path is rescaled to the rating range.
        return scale(single_score, self.min_rating, self.max_rating, 0.0, 1.0)


    def get_vector_measure(self):
        """Return the similarity measure to use for ANN search.

        Returns
        -------
        measure: MEASURE_DOT
            Dot product aka. inner product
        """
        return MEASURE_DOT


    def get_user_vectors(self):
        """Return the user vectors that serve as queries for ANN search.

        Returns
        -------
        out: numpy.array
            Matrix of user vectors for all users available in the model.
        """
        return self.bivae.mu_theta.detach().cpu().numpy()


    def get_item_vectors(self):
        """Return the item vectors used to build the ANN search index.

        Returns
        -------
        out: numpy.array
            Matrix of item vectors for all items available in the model.
        """
        return self.bivae.mu_beta.detach().cpu().numpy()



# Functions

In [40]:
def predict_ranking(
    model,
    data,
    usercol='userID',
    itemcol='itemID',
    predcol='pred',
    remove_seen=False,
):
    """Score every (user, item) pair known to a Cornac model.

    The result can be fed to ranking metrics such as NDCG.

    Args:
        model (cornac.models.Recommender): A recommender model from Cornac
        data (pandas.DataFrame): Interactions used to decide which pairs count
            as seen; only read when ``remove_seen`` is True
        usercol (str): Name of the user column
        itemcol (str): Name of the item column
        predcol (str): Name of the prediction column in the output
        remove_seen (bool): Flag to remove (user, item) pairs seen in the training data

    Returns:
        pandas.DataFrame: Dataframe with usercol, itemcol, predcol
    """
    catalogue = list(model.iid_map.keys())
    n_items = len(catalogue)

    # One block of len(catalogue) rows per user, in uid_map order.
    user_column, item_column, score_column = [], [], []
    for raw_uid, inner_uid in model.uid_map.items():
        user_column += [raw_uid] * n_items
        item_column += catalogue
        score_column += model.score(inner_uid).tolist()

    all_predictions = pd.DataFrame(
        data={usercol: user_column, itemcol: item_column, predcol: score_column}
    )

    if not remove_seen:
        return all_predictions

    # Tag the seen pairs with a marker column; after an outer merge only the
    # pairs that were never seen have a null marker.
    seen_pairs = data[[usercol, itemcol]].assign(dummycol=np.ones(data.shape[0]))
    merged = pd.merge(seen_pairs, all_predictions, on=[usercol, itemcol], how="outer")
    never_seen = merged["dummycol"].isnull()
    return merged[never_seen].drop("dummycol", axis=1)

In [41]:
def predict_ranking_user(
    model,
    data,
    user_id,
    usercol='userID',
    itemcol='itemID',
    predcol='pred',
    remove_seen=False,
):
    """Score every item known to a Cornac model for a single user.

    Args:
        model (cornac.models.Recommender): A recommender model from Cornac
        data (pandas.DataFrame): Interactions used to decide which items the
            user has already seen; only read when ``remove_seen`` is True
        user_id: Raw user id, i.e. a key of ``model.uid_map``
        usercol (str): Name of the user column in ``data``
        itemcol (str): Name of the item column
        predcol (str): Name of the prediction column in the output
        remove_seen (bool): Flag to remove items the user has seen in ``data``

    Returns:
        pandas.DataFrame: Dataframe with itemcol, predcol

    Raises:
        ValueError: If ``user_id`` is not present in ``model.uid_map``
    """
    # Resolve the user through the model that was passed in, not through a
    # notebook-level global.
    user_idx = dict(model.uid_map).get(user_id)
    if user_idx is None:
        raise ValueError("Unknown user_id: %r" % (user_id,))

    all_predictions = pd.DataFrame(
        data={
            itemcol: list(model.iid_map.keys()),
            predcol: model.score(user_idx).tolist(),
        }
    )

    if not remove_seen:
        return all_predictions

    user_data = data.loc[data[usercol] == user_id]
    # Tag the user's seen items with a marker column; after an outer merge
    # only the unseen items have a null marker.
    tempdf = pd.concat(
        [
            user_data[[itemcol]],
            pd.DataFrame(
                data=np.ones(user_data.shape[0]), columns=["dummycol"], index=user_data.index
            ),
        ],
        axis=1,
    )
    merged = pd.merge(tempdf, all_predictions, on=[itemcol], how="outer")
    return merged[merged["dummycol"].isnull()].drop("dummycol", axis=1)

In [42]:
def score_new_user(
        symbols, 
        model_,
        itemcol='itemID',
        predcol='pred',
        remove_seen=False,
        ):
    """Score all known items for a user who was not part of training.

    The user is described by the items in ``symbols``: a 0/1 vector over the
    model's items is pushed through the trained user encoder and its mean
    head, and the resulting latent vector is decoded against the item means.

    Args:
        symbols (iterable): Raw item ids the new user has interacted with;
            each must be a key of ``model_.iid_map``. May be empty.
        model_: Fitted model exposing ``iid_map`` and ``bivae``
            (``user_encoder``, ``user_mu``, ``mu_beta``, ``decode_user``)
        itemcol (str): Name of the item column in the output
        predcol (str): Name of the prediction column in the output
        remove_seen (bool): Flag to drop the items listed in ``symbols``

    Returns:
        pandas.DataFrame: Dataframe with itemcol, predcol

    Raises:
        ValueError: If any entry of ``symbols`` is unknown to the model
    """
    symbols = list(symbols)
    unknown = [symbol for symbol in symbols if symbol not in model_.iid_map]
    if unknown:
        raise ValueError("Unknown symbols: %r" % (unknown,))

    # An explicit integer dtype keeps the index array valid when `symbols`
    # is empty.
    user_symbol_idx = np.array(
        [model_.iid_map[symbol] for symbol in symbols], dtype=int
    )
    user_vec = np.zeros(len(model_.iid_map.keys()))
    user_vec[user_symbol_idx] = 1

    bivae = model_.bivae
    # Run on whatever device the trained encoder lives on instead of assuming
    # a GPU is present.
    device = next(bivae.user_mu.parameters()).device
    user_ten = torch.from_numpy(user_vec).float().to(device)

    # Pure inference: no autograd graph is needed.
    with torch.no_grad():
        user_lat_vec = bivae.user_mu(bivae.user_encoder(user_ten)).view(1, -1)
        preds = bivae.decode_user(user_lat_vec, bivae.mu_beta)
    preds = preds.detach().cpu().numpy().ravel()

    items = list(model_.iid_map.keys())
    all_predictions = pd.DataFrame(
        data={itemcol: items, predcol: preds}
    )

    if not remove_seen:
        return all_predictions

    # Tag the user's own items with a marker column; after an outer merge
    # only the remaining items have a null marker.
    tempdf = pd.concat(
        [
            pd.DataFrame(data=symbols, columns=[itemcol]),
            pd.DataFrame(
                data=np.ones(len(symbols)), columns=["dummycol"]
            ),
        ],
        axis=1,
    )
    merged = pd.merge(tempdf, all_predictions, on=[itemcol], how="outer")
    return merged[merged["dummycol"].isnull()].drop("dummycol", axis=1)

# score_new_user_gradio

In [56]:
def score_new_user_gradio(
        symbols,
        model_=None,
        itemcol='itemID',
        predcol='pred',
        remove_seen=False,
        top_k=10
        ):
    """Gradio-friendly wrapper: return the top-k recommended symbols as text.

    Scores all items for the given symbols with ``score_new_user`` and formats
    the ``top_k`` highest-scoring item ids as a newline-separated string.

    Parameters
    ----------
    symbols : list or None
        Symbols selected in the UI. ``None`` is passed on as an empty list.
    model_ : object, optional
        Trained model to score with. When omitted, the notebook-level global
        ``model_`` is looked up at call time, so this function can be defined
        before the model is loaded and always uses the current model.
    itemcol : str, default 'itemID'
        Item column name used for scoring and for picking the output symbols.
    predcol : str, default 'pred'
        Score column name used for scoring and for ranking.
    remove_seen : bool, default False
        When True, the input symbols are excluded from the recommendations.
    top_k : int, default 10
        Number of symbols to return.

    Returns
    -------
    str
        Up to ``top_k`` item ids, best first, joined by newlines.

    Raises
    ------
    NameError
        If no model is passed and no global ``model_`` is defined.
    """
    if model_ is None:
        model_ = globals().get("model_")
        if model_ is None:
            raise NameError(
                "No model available: pass `model_` or define a global `model_` first."
            )

    all_preds = score_new_user(
        symbols if symbols is not None else [],
        model_,
        itemcol=itemcol,
        predcol=predcol,
        remove_seen=remove_seen,
    )

    # Rank by the configured column names rather than hard-coded 'pred'/'itemID'.
    ranked = all_preds.sort_values(by=predcol, ascending=False)
    top_k_symbols = [str(symbol) for symbol in ranked[itemcol].iloc[:top_k]]
    return '\n'.join(top_k_symbols)
    

# Recommendation generation

In [2]:
# Embedding model built with the library defaults (no arguments passed);
# it is handed to the FAISS vector store loader in a later cell.
embeddings = HuggingFaceEmbeddings()

In [14]:
# Load the persisted FAISS index from disk (path is relative to the notebook's working directory).
# NOTE(review): presumably the index was built with the same embedding model as `embeddings` — confirm.
# NOTE(review): load_local may deserialize pickled data; only load index files you trust.
new_db = FAISS.load_local("../../vector_db/phase3_symbols", embeddings)

In [15]:
def get_recommendations(query, top_k):
    """Print the vector-store hits for a free-text query.

    Looks up the ``top_k`` most relevant entries in ``new_db`` and prints one
    line per hit whose relevance score is non-negative, numbered by its
    position in the search result. Nothing is returned.
    """
    hits = new_db.similarity_search_with_relevance_scores(query=query, k=top_k)
    rows = [
        (doc.metadata.get('symbol'), doc.metadata.get('name'), score)
        for doc, score in hits
    ]
    for rank, (symbol, name, score) in enumerate(rows, start=1):
        # Hits with a negative relevance score are not shown.
        if score >= 0:
            print('{}. {} : {} |score : {}'.format(rank, symbol, name, score))


In [29]:
def text_to_symbols(query, top_k, min_results=4):
    """Translate a free-text preference into a list of stock symbols.

    Queries ``new_db`` for the ``top_k`` most relevant entries and collects
    their ``symbol`` metadata in result order. Hits with a non-negative
    relevance score are always kept. Hits with a negative score are kept only
    while fewer than ``min_results`` symbols have been collected, so a weak
    query still yields a few symbols.

    Parameters
    ----------
    query : str
        Free-text description of the stocks the user is interested in.
    top_k : int
        Number of hits requested from the vector store (upper bound on the
        length of the result).
    min_results : int, default 4
        Number of symbols below which negative-relevance hits are still
        accepted. ``0`` keeps non-negative hits only.

    Returns
    -------
    list
        Symbols of the accepted hits, in the order the store returned them.
    """
    hits = new_db.similarity_search_with_relevance_scores(query=query, k=top_k)

    reco_symbols = []
    for doc, score in hits:
        # Negative-relevance hits only pad a list that is still too short.
        if score >= 0 or len(reco_symbols) < min_results:
            reco_symbols.append(doc.metadata.get('symbol'))
    return reco_symbols

In [30]:
# Example free-text preference used to try out the text -> symbols lookup.
stock_pref = 'technology'

# top_k is only an upper bound: text_to_symbols may return fewer symbols
# when some of the hits have a negative relevance score.
reco_symbols = text_to_symbols(stock_pref, top_k = 10)
reco_symbols



['ECL', 'SLTL', 'GEST', 'HBS']

# BiVAE

# Sandbox

In [47]:
# Path to the trained BiVAECF checkpoint, relative to the notebook's working directory.
# Forward slashes are used so the path resolves on both Windows and POSIX systems.
# NOTE(review): the checkpoint is a .pkl file, presumably unpickled by BiVAECF.load —
# only load checkpoints from a trusted source.
save_loc = '../../models/BiVAECF_alldata_24_02_16_15_32/BiVAECF/2024-02-16_15-32-24-532947.pkl'
model_ = BiVAECF.load(save_loc)

In [48]:
# Sandbox: map a hand-picked basket of symbols to the model's item indices
# and allocate an all-zero interaction vector with one slot per item.
symbols = ['SHOT', 'SFIN', 'LCBF', 'MAL', 'TANG', 'ASIY']
user_symbol_idx = np.array(list(map(model_.iid_map.get, symbols)))
user_vec = np.zeros(len(model_.iid_map))
user_symbol_idx

array([ 72, 155,  67, 205, 240,  41])

In [49]:
# Flag the sandbox user's symbols in the interaction vector.
# Note: this mutates `user_vec` in place.
user_vec[user_symbol_idx] =1

In [58]:
# df.itemID.value_counts()

In [51]:
# Symbols the sandbox user already holds.
symbols = ['SHOT','SFIN', 'LCBF', 'MAL', 'TANG', 'ASIY']

# Score every item for this user; remove_seen=True drops the symbols above
# from the returned frame so only new candidates remain.
new_user_preds = score_new_user(
    symbols, 
    model_,
    itemcol='itemID',
    predcol='pred',
    remove_seen=True,
    )

In [52]:
# Sanity check: number of candidate items left after the seen symbols were removed.
len(new_user_preds)

269

In [53]:
# Five highest-scoring candidate symbols for the sandbox user, best first.
list(
    new_user_preds
    .sort_values(by='pred', ascending=False)
    .head(5)['itemID']
    .values
)

['BIL', 'EXPO', 'LIOC', 'ACL', 'LOFC']

In [54]:
# list(df.itemID.values)

In [57]:
import gradio as gr

# The dropdown must list selectable symbols: with allow_custom_value=False and
# no choices the user cannot pick anything. The model's item ids are used as
# choices, since those are what the scoring function looks up.
demo = gr.Interface(
    fn = score_new_user_gradio,
    inputs = gr.Dropdown(
             choices=sorted(model_.iid_map.keys()),
             multiselect=True,
             label="symbols",
             allow_custom_value = False,
             scale=5
        ),
    outputs = ["text"],
)
# NOTE(review): share=True asks Gradio for a public share link in addition to
# the local URL; drop it if the demo should stay local-only.
demo.launch(share = True)

# businesses in the domain of shipping and logistics

Running on local URL:  http://127.0.0.1:7862

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.


