# Multientity Recommender
This is a proof of concept for a multi-entity recommender. For 1000 popular books, TV shows, and movies, the GPT Completion API is used to generate 30 keywords per entity that capture its semantic significance; these keywords are then used to represent the entity.

### Datasets
* The 1000 most popular books are sourced and formatted from the Kaggle Goodreads dataset: https://www.kaggle.com/datasets/jealousleopard/goodreadsbooks?resource=download

* The TV shows and movies are sourced and formatted from the IMDb datasets: https://datasets.imdbws.com/

### Keywords from GPT-4 Completion API
Keywords are generated to capture plot, themes, mood, pace, tags, demographics, actors, directors, countries, and awards.

In [7]:
import asyncio
from dataclasses import dataclass
import functools
import json
import hashlib
import os
import pathlib

import backoff
from gensim.models import KeyedVectors
from matplotlib import pyplot as plt
import numpy as np
import openai
from openai.embeddings_utils import get_embedding, aget_embedding
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from tqdm.asyncio import tqdm

from movierecs.any2vec import BaseWord2VecRecommender
from pkg.pools import get_tmdb_movie_metadata_for_filters
from pkg.models.uri import Uri
from pkg.movie_metadata import batch_get_movie_metadata

# Credentials are read from the environment so that real secrets never have to
# be committed with the notebook. The placeholders are kept as fallbacks so the
# cell behaves exactly as before when the variables are not set.
openai.organization = os.environ.get("OPENAI_ORGANIZATION", "REDACTED")
openai.api_key = os.environ.get("OPENAI_API_KEY", "REDACTED")
# Root directory for the input datasets, the cached API responses and the saved
# word vectors. Kept as a plain string because later cells join it with both
# os.path.join and string concatenation.
DATA_DIR = "data"

In [None]:
# 0ad2149024069816da4072d21bcbb0b5553326e6
# NOTE(review): the hex string above is presumably the SHA-1 of the prompt from
# an earlier run — confirm against the value this cell displays.
#
# Prompt template sent to the chat model. `{entity_type}` and `{entity}` are
# filled in with str.format for every entity. The text is hashed below, so any
# edit to it (even whitespace) produces a different hash.
KEYWORDS_PROMPT = """
Your task is to generate keywords that succinctly describe the {entity_type} {entity}.
These keywords will be used to generate an embedding vector for the entity, \
which will be used to calculate entity-similarities.
The keywords should capture the following information about the entity:
- plot, themes, mood, pace and tags
- demographics of the types people who will love this entity
- people involved, such as actors, directors and writers
- countries
- awards received
Output a comma separated list of 30 keywords.
"""

# SHA-1 hex digest of the prompt text. It is used as a directory name for the
# on-disk keyword and embedding caches, so changing the prompt starts a fresh
# cache instead of reusing results produced with an older prompt.
KEYWORDS_PROMPT_HASH = hashlib.sha1(KEYWORDS_PROMPT.encode()).hexdigest()
KEYWORDS_PROMPT_HASH  # bare expression: shown as the notebook cell's output

In [None]:
# URI_STRING_MAP maps each entity URI to a display string for its title, parsed
# from the metadata extracted from the datasets.
URI_STRING_MAP_PATH = os.path.join(DATA_DIR, "uri_string_map.json")
# The file is only read, so plain "r" is used ("r+" would needlessly require
# write permission). JSON text is UTF-8, so the encoding is pinned rather than
# left to the platform default.
with open(URI_STRING_MAP_PATH, "r", encoding="utf-8") as f:
    # JSON object keys always load as plain strings, but the rest of the
    # notebook indexes this map with Uri objects and reads `.entity` from its
    # keys, so the keys are converted to Uri on load.
    URI_STRING_MAP = {Uri(key): title for key, title in json.load(f).items()}

In [None]:
@backoff.on_exception(
    backoff.expo,
    openai.error.RateLimitError,
    max_time=300,
)
async def get_entity_keywords(uri: Uri) -> str:
    """Return the GPT-generated keywords for ``uri``, caching them on disk.

    The keywords are stored in
    ``DATA_DIR/<prompt hash>/keywords/<namespace>:<entity>/<uri>.txt``. When
    that file already exists its content is returned and no API call is made.
    Otherwise ``KEYWORDS_PROMPT`` is filled in with the entity type and the
    title from ``URI_STRING_MAP`` and sent to the ``gpt-4`` chat model; the
    reply is written to the cache file and returned.

    Calls that raise ``openai.error.RateLimitError`` are retried with
    exponential backoff for up to 300 seconds by the decorator.

    Args:
        uri: Entity to describe. Its ``namespace`` and ``entity`` attributes
            name the cache sub-directory and ``str(uri)`` names the file.

    Returns:
        The raw text returned by the model (or read back from the cache).

    Raises:
        KeyError: If ``uri`` has no entry in ``URI_STRING_MAP``.
    """
    # DATA_DIR is a plain string, so it has to be wrapped in Path before the
    # "/" operator can be used to join path segments.
    entity_dir = (
        pathlib.Path(DATA_DIR)
        / KEYWORDS_PROMPT_HASH
        / "keywords"
        / f"{uri.namespace}:{uri.entity}"
    )
    entity_dir.mkdir(parents=True, exist_ok=True)
    cache_path = entity_dir / f"{uri}.txt"

    # Check the one path directly instead of listing the whole directory.
    if cache_path.is_file():
        with cache_path.open("r") as cached:
            return cached.read()

    completion = await openai.ChatCompletion.acreate(
        model="gpt-4",
        messages=[
            {
                "role": "user",
                "content": KEYWORDS_PROMPT.format(
                    entity_type=uri.entity,
                    entity=URI_STRING_MAP[uri],
                ),
            }
        ],
    )

    keywords = completion.choices[0].message["content"]

    with cache_path.open("w") as out:
        out.write(keywords)

    return keywords

In [None]:
sem = asyncio.BoundedSemaphore(10)

async def f(uri):
    async with sem:
        return await get_entity_keywords(uri)
    
# Load domain of URIs of books, movies, TV shows. 
with open("domain.json", "r+") as fp:
    uris = json.load(fp)

keywords = await tqdm.gather(*[f(uri) for uri in uris])

URI_TO_KEYWORDS = dict(zip(uris, keywords))

### Embeddings 
The `text-embedding-ada-002` model is used to generate a 1536-dimensional embedding from each entity's keywords; this vector represents the entity in the embedding space.

In [13]:
# OpenAI embedding model used both for the entity keywords and for free-text
# recommendation queries.
EMBEDDING_MODEL = "text-embedding-ada-002"

In [None]:
@backoff.on_exception(
    backoff.expo,
    openai.error.RateLimitError,
    max_time=300,
)
async def get_entity_embedding(
    uri: Uri,
    model: str = EMBEDDING_MODEL,
) -> list[float]:
    """Return the embedding of ``uri``'s keywords, caching it on disk as JSON.

    The vector is stored in
    ``DATA_DIR/<prompt hash>/embedding/<namespace>:<entity>/<uri>.txt``. When
    that file already exists it is loaded and returned without an API call.
    Otherwise the keywords in ``URI_TO_KEYWORDS`` are embedded with ``model``,
    and the result is written to the cache file and returned.

    Calls that raise ``openai.error.RateLimitError`` are retried with
    exponential backoff for up to 300 seconds by the decorator.

    Args:
        uri: Entity whose keywords are embedded.
        model: Name of the embedding model passed to ``aget_embedding``.

    Returns:
        The embedding vector.

    Raises:
        KeyError: If ``uri`` has no entry in ``URI_TO_KEYWORDS``.
    """
    # DATA_DIR is a plain string, so it has to be wrapped in Path before the
    # "/" operator can be used to join path segments.
    # NOTE(review): the cache path does not include `model`, so a cached vector
    # is returned even when a different model is requested — confirm intended.
    entity_dir = (
        pathlib.Path(DATA_DIR)
        / KEYWORDS_PROMPT_HASH
        / "embedding"
        / f"{uri.namespace}:{uri.entity}"
    )
    entity_dir.mkdir(parents=True, exist_ok=True)
    cache_path = entity_dir / f"{uri}.txt"

    # Check the one path directly instead of listing the whole directory.
    if cache_path.is_file():
        with cache_path.open("r") as cached:
            return json.load(cached)

    keywords = URI_TO_KEYWORDS[uri]
    embedding = await aget_embedding(keywords, model)

    with cache_path.open("w") as out:
        json.dump(embedding, out)

    return embedding

In [None]:
sem = asyncio.BoundedSemaphore(10)

async def f(uri):
    async with sem:
        return await get_entity_embedding(uri)

embeddings = await tqdm.gather(*[f(uri) for uri in URI_TO_KEYWORDS.keys()])

URI_TO_EMBEDDING = dict(zip(URI_TO_KEYWORDS.keys(), embeddings))

### Recommendations
The embeddings are stored in a Word2Vec-style model (gensim `KeyedVectors`), keyed by each entity's URI string.

In [None]:
# The vector size is taken from the first embedding; this assumes every
# embedding has the same length.
wv = KeyedVectors(vector_size=len(next(iter(URI_TO_EMBEDDING.values()))))
# Keys are stored as strings so the recommender can be queried with str(uri).
wv.add_vectors(
    keys=[str(uri) for uri in URI_TO_EMBEDDING],
    weights=list(URI_TO_EMBEDDING.values()),
)
# DATA_DIR is a plain string, so it is wrapped in Path before joining with "/"
# (the bare `DATA_DIR / "..."` raises TypeError for a str).
wv.save(str(pathlib.Path(DATA_DIR) / "multientity-ada-sample-20230503-3000.wordvectors"))

In [10]:
# Load the saved vectors into the recommender. The path is built with
# os.path.join, as for URI_STRING_MAP_PATH, instead of concatenating a
# hard-coded "/" separator.
rec = BaseWord2VecRecommender(
    w2v_params=os.path.join(DATA_DIR, "multientity-ada-sample-20230503-3000.wordvectors")
)

In [29]:
@functools.cache
def get_embedding_memoised(query: str, model: str) -> list[float]:
    """Embed ``query`` with ``model``, reusing the result on repeated calls.

    ``functools.cache`` keys the cache on the call arguments and never evicts
    entries, so each distinct query/model pair is sent to the API only once
    per session.
    """
    vector = get_embedding(query, model)
    return vector

def get_recommendations(
    query: "str | list[Uri | str]",
    filter: "list[str] | None" = None,
    num_items: int = 3,
):
    """Print the entities closest to ``query``, one per line with an emoji.

    Args:
        query: Either free text, which is embedded with ``EMBEDDING_MODEL``
            and passed to the recommender as a vector, or a list of URIs
            (``Uri`` or ``str``), which are passed as string keys.
        filter: Entity types to restrict the candidate pool to; the notebook
            uses ``"book"``, ``"movie"`` and ``"tv"``. ``None`` or an empty
            list keeps every entity in ``URI_STRING_MAP``. The name shadows
            the builtin but is kept because callers pass it by keyword.
        num_items: Number of recommendations requested from the recommender.

    Returns:
        None. The recommendations are printed, not returned.

    Raises:
        TypeError: If ``query`` is neither a string nor a list.
    """
    # Reject unsupported queries up front so the caller gets a clear TypeError
    # instead of an UnboundLocalError on `recs` further down.
    if not isinstance(query, (str, list)):
        raise TypeError(
            f"query must be a str or a list of URIs, got {type(query).__name__}"
        )

    # Candidate pool as string keys, optionally limited to some entity types.
    if filter:
        pool = {str(k) for k in URI_STRING_MAP if k.entity in filter}
    else:
        pool = {str(k) for k in URI_STRING_MAP}

    if isinstance(query, str):
        embedding = get_embedding_memoised(query, EMBEDDING_MODEL)
        query_items = [np.array(embedding)]
    else:
        query_items = [str(uri) for uri in query]

    recs = rec.get_recommendations(
        query_items=query_items,
        num_items=num_items,
        pool=pool,
    )

    emoji_by_entity = {"movie": "🍿", "tv": "📺"}
    for r in recs:
        uri = Uri(r)
        # Anything that is not a movie or a TV show is shown with the book emoji.
        emoji = emoji_by_entity.get(uri.entity, "📖")

        print(emoji, URI_STRING_MAP[uri])

In [21]:
# Free-text query with no entity-type filter, so every entity is a candidate.
get_recommendations("Book that has wizards")

📖 Harry Potter Boxed Set  Books 1-5 (Harry Potter  #1-5) by J.K. Rowling (2014)
📖 Harry Potter and the Half-Blood Prince (Harry Potter  #6) by J.K. Rowling (2014)
📖 Son of a Witch (The Wicked Years  #2) by Gregory Maguire (2014)


In [31]:
# Free-text query with the candidate pool restricted to books.
get_recommendations("personal growth", filter=["book"])

📖 The 7 Habits of Highly Effective People: Powerful Lessons in Personal Change by Stephen R. Covey (2014)
📖 Think and Grow Rich: The Landmark Bestseller Now Revised and Updated for the 21st Century by Napoleon Hill (2014)
📖 Emotional Intelligence: Why It Can Matter More Than IQ by Daniel Goleman (2014)


  pool = [k for k in pool if k not in keys]
  candidates = [i for i in candidates if i not in queries]


In [32]:
# Same free-text query, this time with the pool restricted to movies.
get_recommendations("personal growth", filter=["movie"])

🍿 Yes Man (2008)
🍿 Stand by Me (1986)
🍿 The Karate Kid (1984)


In [30]:
# Free-text query with the candidate pool restricted to TV shows.
get_recommendations("hilarious slapstick comedy", filter=["tv"])

📺 Mr. Bean (1990)
📺 Monty Python's Flying Circus (1969)
📺 Blackadder (1982)


  pool = [k for k in pool if k not in keys]
  candidates = [i for i in candidates if i not in queries]


In [34]:
# Books close to The Dark Knight
# Item-to-item query: the list form passes the movie's URI to the recommender
# as a key instead of embedding free text.
get_recommendations([Uri("imdb:movie:tt0468569")], filter=["book"])

📖 Batman: Arkham Asylum - A Serious House on Serious Earth by Grant Morrison (2014)
📖 V for Vendetta by Alan Moore (2014)
📖 Kingdom Come by Mark Waid (2014)


In [35]:
# Free-text query with the pool restricted to two entity types at once.
get_recommendations("hard hitting, thinker", filter=["book", "movie"])

📖 Blink: The Power of Thinking Without Thinking by Malcolm Gladwell (2014)
📖 Intensity by Dean Koontz (2014)
🍿 Die Hard with a Vengeance (1995)


  pool = [k for k in pool if k not in keys]
  candidates = [i for i in candidates if i not in queries]
