In [None]:
import os
import json
import boto3
from typing import List, Tuple, Dict

import mlflow
import torch
import numpy as np
from mlflow.tracking import MlflowClient
from mlflow.store.artifact.s3_artifact_repo import S3ArtifactRepository


# === Cấu hình MLflow + MinIO ===
def configure_mlflow(
    s3_endpoint: str, aws_key: str, aws_secret: str, tracking_uri: str
):
    os.environ.update(
        {
            "AWS_ACCESS_KEY_ID": aws_key,
            "AWS_SECRET_ACCESS_KEY": aws_secret,
            "MLFLOW_S3_ENDPOINT_URL": s3_endpoint,
            "MLFLOW_S3_IGNORE_TLS": "true",
        }
    )
    mlflow.set_tracking_uri(tracking_uri)

    session = boto3.session.Session(
        aws_access_key_id=aws_key, aws_secret_access_key=aws_secret
    )
    S3ArtifactRepository._get_s3_client = lambda self: session.client(
        "s3", endpoint_url=s3_endpoint
    )


# === IDMapper đơn giản ===
class SimpleIDMapper:
    def __init__(self, mapping_path: str):
        with open(mapping_path, "r") as f:
            mapping = json.load(f)
        self.item_to_index: Dict[str, int] = mapping["item_to_index"]
        self.index_to_item: Dict[int, str] = mapping["index_to_item"]


# === Load TorchScript model + IDMapper từ registered model ===
def load_champion_model(
    model_name: str,
) -> Tuple[torch.jit.ScriptModule, SimpleIDMapper]:
    client = MlflowClient()
    versions = client.search_model_versions(f"name='{model_name}'")
    champs = [v for v in versions if v.tags.get("champion", "").lower() == "true"]
    if not champs:
        raise RuntimeError(f"No champion version found for model '{model_name}'")

    champ = max(champs, key=lambda v: v.creation_timestamp)
    run_id = champ.run_id
    model = mlflow.pytorch.load_model(f"models:/{model_name}/{champ.version}")
    model.eval()

    id_mapper_path = f"runs:/{run_id}/id_mapper/id_mapper.json"
    id_mapper_local = mlflow.artifacts.download_artifacts(id_mapper_path)
    id_mapper = SimpleIDMapper(id_mapper_local)

    print(f"Loaded model v{champ.version} (run_id={run_id}) and IDMapper.")
    return model, id_mapper


# === Inference Utilities ===
def get_item_embedding(
    model, id_mapper, item_id, device=torch.device("cpu")
) -> torch.Tensor:
    idx = torch.tensor([id_mapper.item_to_index[item_id]], device=device)
    return model.embeddings(idx).squeeze(0)


def get_topk_similar(
    model, id_mapper, item_id, top_k=10, device=torch.device("cpu")
) -> List[Tuple[str, int, float]]:
    target_idx = id_mapper.item_to_index[item_id]
    weight = model.embeddings.weight[:-1].to(device)  # exclude padding_idx
    with torch.no_grad():
        target_emb = weight[target_idx]
        sims = torch.matmul(weight, target_emb)
    sims[target_idx] = -np.inf  # exclude self
    topk = torch.topk(sims, k=top_k)
    return [
        (id_mapper.index_to_item[int(i)], int(i), float(s))
        for i, s in zip(topk.indices, topk.values)
    ]


def predict_batch(
    model, id_mapper, batch: Dict[str, List[str]], device=torch.device("cpu")
) -> np.ndarray:
    tgt = torch.tensor(
        [id_mapper.item_to_index[i] for i in batch["target_items"]], device=device
    )
    ctx = torch.tensor(
        [id_mapper.item_to_index[i] for i in batch["context_items"]], device=device
    )
    with torch.no_grad():
        return model(tgt, ctx).cpu().numpy()


if __name__ == "__main__":
    configure_mlflow(
        s3_endpoint="http://127.0.0.1:9010",
        aws_key="admin",
        aws_secret="Password1234",
        tracking_uri="http://localhost:5002",
    )

    model_name = "item2vec_skipgram"
    model, id_mapper = load_champion_model(model_name)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    test_item = "B00000IV95"
    test_idx = id_mapper.item_to_index[test_item]
    emb = get_item_embedding(model, id_mapper, test_item, device)
    print(
        f"\n🔹 Embedding for '{test_item}' (index {test_idx}): {emb[:5].detach().cpu().numpy()}"
    )

    topk = get_topk_similar(model, id_mapper, test_item, top_k=5, device=device)
    print(f"\n🔹 Top 5 similar items to '{test_item}':")
    for iid, idx, score in topk:
        print(f" - {iid} (index {idx}): score = {score:.4f}")

    batch = {
        "target_items": [test_item, "B0792X1RSC"],
        "context_items": ["B0792X1RSC", test_item],
    }
    scores = predict_batch(model, id_mapper, batch, device)
    print(f"\n🔹 Batch prediction scores:")
    for t, c, s in zip(batch["target_items"], batch["context_items"], scores):
        print(f" - ({t}, {c}): score = {s:.4f}")



✅ Loaded model v1 (run_id=aac89aa42a3c447eac1294546df933e6) and IDMapper.

🔹 Embedding for 'B00000IV95' (index 36): [ 0.05149313  0.0443965  -0.01442701  0.07874709  0.07239346]

🔹 Top 5 similar items to 'B00000IV95':
 - B0C3H818H4 (index 4009): score = 2.2501
 - B0C48KPLZ2 (index 4026): score = 1.6529
 - 0975277324 (index 5): score = 1.6294
 - B0C1FX3BGK (index 3976): score = 1.5719
 - B00000IV35 (index 35): score = 1.5157

🔹 Batch prediction scores:
 - (B00000IV95, B0792X1RSC): score = 0.5309
 - (B0792X1RSC, B00000IV95): score = 0.5309


# Load embedding to S3 vector

In [2]:
import os
import sys
import json
import mlflow
import torch
import numpy as np
from dotenv import load_dotenv
from mlflow.tracking import MlflowClient
from mlflow.store.artifact.s3_artifact_repo import S3ArtifactRepository
import boto3
import warnings

# Suppress PyTorch warnings
warnings.filterwarnings("ignore", category=UserWarning)

# Configuration
PROJECT_ROOT = os.getenv(
    "PROJECT_ROOT", "/home/duong/Documents/datn1/src/model_item2vec"
)
sys.path.append(PROJECT_ROOT)

load_dotenv()

# MLflow/MinIO config
S3_ENDPOINT = os.getenv("MLFLOW_S3_ENDPOINT", "http://127.0.0.1:9010")
AWS_KEY_MINIO = os.getenv("AWS_ACCESS_KEY_ID_MINIO", "admin")
AWS_SECRET_MINIO = os.getenv("AWS_SECRET_ACCESS_KEY_MINIO", "Password1234")
TRACKING_URI = os.getenv("MLFLOW_TRACKING_URI", "http://localhost:5002")

os.environ["AWS_ACCESS_KEY_ID"] = AWS_KEY_MINIO
os.environ["AWS_SECRET_ACCESS_KEY"] = AWS_SECRET_MINIO
os.environ["MLFLOW_S3_ENDPOINT_URL"] = S3_ENDPOINT
os.environ["MLFLOW_S3_IGNORE_TLS"] = "true"

mlflow.set_tracking_uri(TRACKING_URI)

# Configure boto3 for MinIO (for MLflow artifacts)
minio_session = boto3.session.Session(
    aws_access_key_id=AWS_KEY_MINIO, aws_secret_access_key=AWS_SECRET_MINIO
)
S3ArtifactRepository._get_s3_client = lambda self: minio_session.client(
    "s3", endpoint_url=S3_ENDPOINT
)

# S3 Vectors config
S3_VECTOR_BUCKET = os.getenv("S3_VECTOR_BUCKET", "recsys-ops-s3-vector")
S3_VECTOR_INDEX = os.getenv("S3_VECTOR_INDEX", "item2vec-index-dim-256")
AWS_REGION = "us-east-1"
AWS_KEY_S3 = os.getenv("AWS_ACCESS_KEY_ID_AWS", "AKIA3TD2SE4D4EJQWMMS")
AWS_SECRET_S3 = os.getenv(
    "AWS_SECRET_ACCESS_KEY_AWS", "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"
)

# Model config
MODEL_NAME = os.getenv("MODEL_NAME", "item2vec_skipgram")
TAG_NAME = os.getenv("MODEL_TAG", "champion")


# SimpleIDMapper
class SimpleIDMapper:
    def __init__(self, mapping_path: str):
        with open(mapping_path, "r") as f:
            mapping = json.load(f)
        self.item_to_index = mapping["item_to_index"]
        self.index_to_item = mapping["index_to_item"]


# Load model and ID mapping
def load_model_from_mlflow(model_name: str, tag: str) -> tuple:
    client = MlflowClient()
    versions = client.search_model_versions(f"name='{model_name}'")
    champs = [v for v in versions if v.tags.get(tag, "").lower() == "true"]
    if not champs:
        raise ValueError(f"No version tagged '{tag}' found for model '{model_name}'.")

    champ = max(champs, key=lambda v: v.creation_timestamp)
    run_id = champ.run_id
    model_uri = f"models:/{model_name}/{champ.version}"
    print(f"Loading model from URI: {model_uri}")
    model = mlflow.pytorch.load_model(model_uri)
    model.eval()

    id_mapper_path = f"runs:/{run_id}/id_mapper/id_mapper.json"
    id_mapper_local = mlflow.artifacts.download_artifacts(id_mapper_path)
    id_mapper = SimpleIDMapper(id_mapper_local)

    print(
        f"Loaded model version {champ.version} (run_id={run_id}) with {len(id_mapper.item_to_index)} items."
    )
    return model, id_mapper


# Extract embeddings
def get_all_embeddings(model, id_mapper: SimpleIDMapper) -> tuple:
    item_ids = list(id_mapper.item_to_index.keys())
    item_indices = [id_mapper.item_to_index[id_] for id_ in item_ids]

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    tensor_indices = torch.tensor(item_indices, device=device)
    with torch.no_grad():
        embeddings = model.embeddings(tensor_indices).detach().cpu().numpy()
    print(f"Extracted embeddings with shape {embeddings.shape}")
    return item_ids, embeddings


# Check index configuration
def check_index_config(
    bucket_name: str, index_name: str, region: str, aws_key: str, aws_secret: str
):
    try:
        s3_session = boto3.session.Session(
            aws_access_key_id=aws_key, aws_secret_access_key=aws_secret
        )
        client = s3_session.client("s3vectors", region_name=region)
        response = client.list_indexes(vectorBucketName=bucket_name)
        indexes = response.get("indexes", [])
        for idx in indexes:
            if idx["indexName"] == index_name:
                idx_serializable = json.loads(json.dumps(idx, default=str))
                print(f"Index config: {json.dumps(idx_serializable, indent=2)}")
                return idx_serializable
        print(f"Index {index_name} not found.")
        return None
    except Exception as e:
        print(f"Failed to check index config: {str(e)}")
        return None


# Index to S3 Vectors
def index_embeddings_to_s3_vectors(
    item_ids: list,
    embeddings: np.ndarray,
    bucket_name: str,
    index_name: str,
    region: str,
    aws_key: str,
    aws_secret: str,
):
    try:
        s3_session = boto3.session.Session(
            aws_access_key_id=aws_key, aws_secret_access_key=aws_secret
        )
        client = s3_session.client("s3vectors", region_name=region)
        print(f"Connecting to S3 Vectors in region {region}")

        vector_dim = embeddings.shape[1]

        # Check if bucket exists
        response = client.list_vector_buckets()
        buckets = [b["vectorBucketName"] for b in response.get("vectorBuckets", [])]
        if bucket_name not in buckets:
            raise ValueError(
                f"Vector bucket '{bucket_name}' does not exist. Please create it via AWS Console or CLI first."
            )

        # Delete existing index to clear all old vectors
        response = client.list_indexes(vectorBucketName=bucket_name)
        indexes = [idx["indexName"] for idx in response.get("indexes", [])]
        if index_name in indexes:
            client.delete_index(vectorBucketName=bucket_name, indexName=index_name)
            print(f"Deleted existing index '{index_name}' to clear all old vectors.")

        # Create new index
        client.create_index(
            vectorBucketName=bucket_name,
            indexName=index_name,
            dimension=vector_dim,
            distanceMetric="cosine",
            dataType="float32",
        )
        print(f"Created new index '{index_name}' with dim={vector_dim}")

        # Normalize embeddings for COSINE distance
        embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
        print(
            f"Normalized embeddings, sample norm: {np.linalg.norm(embeddings[0]):.4f}"
        )

        # Prepare vectors with metadata
        vectors = [
            {
                "key": item_id,
                "data": {"float32": vec.tolist()},
                "metadata": {
                    "item_id": str(item_id),
                    "category": "item",
                    "index_timestamp": "2025-07-27",
                },
            }
            for item_id, vec in zip(item_ids, embeddings)
        ]
        print(f"Sample vector metadata: {vectors[0]['metadata']}")
        print(f"Sample vector first 5 values: {vectors[0]['data']['float32'][:5]}")

        # Put vectors in batches (max 500 per request)
        batch_size = 500
        for i in range(0, len(vectors), batch_size):
            batch = vectors[i : i + batch_size]
            client.put_vectors(
                vectorBucketName=bucket_name, indexName=index_name, vectors=batch
            )
            print(f"Inserted batch {i//batch_size + 1} ({len(batch)} vectors)")

        print(
            f"Completed inserting {len(vectors)} embeddings into S3 Vectors index '{index_name}' in bucket '{bucket_name}'"
        )

        # Save embeddings and item_ids for debugging
        np.save("embeddings.npy", embeddings)
        with open("item_ids.json", "w") as f:
            json.dump(item_ids, f)
        print("Saved embeddings and item_ids for debugging.")

    except Exception as e:
        print(f"Failed to index embeddings to S3 Vectors: {str(e)}")
        raise


# Main
def main():
    print("Starting embedding indexing...")
    model, id_mapper = load_model_from_mlflow(MODEL_NAME, TAG_NAME)
    item_ids, embeddings = get_all_embeddings(model, id_mapper)

    print("Checking index configuration...")
    check_index_config(
        S3_VECTOR_BUCKET, S3_VECTOR_INDEX, AWS_REGION, AWS_KEY_S3, AWS_SECRET_S3
    )

    print("Indexing embeddings...")
    index_embeddings_to_s3_vectors(
        item_ids,
        embeddings,
        S3_VECTOR_BUCKET,
        S3_VECTOR_INDEX,
        AWS_REGION,
        AWS_KEY_S3,
        AWS_SECRET_S3,
    )


if __name__ == "__main__":
    main()



Starting embedding indexing...
Loading model from URI: models:/item2vec_skipgram/1
Loaded model version 1 (run_id=aac89aa42a3c447eac1294546df933e6) with 4143 items.
Extracted embeddings with shape (4143, 256)
Checking index configuration...
Index config: {
  "vectorBucketName": "recsys-ops-s3-vector",
  "indexName": "item2vec-index-dim-256",
  "indexArn": "arn:aws:s3vectors:us-east-1:796973475591:bucket/recsys-ops-s3-vector/index/item2vec-index-dim-256",
  "creationTime": "2025-07-27 09:00:53+07:00"
}
Indexing embeddings...
Connecting to S3 Vectors in region us-east-1
Deleted existing index 'item2vec-index-dim-256' to clear all old vectors.
Created new index 'item2vec-index-dim-256' with dim=256
Normalized embeddings, sample norm: 1.0000
Sample vector metadata: {'item_id': '0439893577', 'category': 'item', 'index_timestamp': '2025-07-27'}
Sample vector first 5 values: [-0.11034437268972397, 0.10069891065359116, -0.10829982906579971, 0.0698055773973465, -0.04339940473437309]
Inserted ba

# Test query s3 vector

In [None]:
import os
import sys
import json
import mlflow
import torch
import numpy as np
from dotenv import load_dotenv
from mlflow.tracking import MlflowClient
from mlflow.store.artifact.s3_artifact_repo import S3ArtifactRepository
import boto3
import warnings

# Suppress PyTorch warnings
warnings.filterwarnings("ignore", category=UserWarning)

# Configuration
PROJECT_ROOT = os.getenv(
    "PROJECT_ROOT", "/home/duong/Documents/datn1/src/model_item2vec"
)
sys.path.append(PROJECT_ROOT)

load_dotenv()

# MLflow/MinIO config
S3_ENDPOINT = os.getenv("MLFLOW_S3_ENDPOINT", "http://127.0.0.1:9010")
AWS_KEY_MINIO = os.getenv("AWS_ACCESS_KEY_ID_MINIO", "admin")
AWS_SECRET_MINIO = os.getenv("AWS_SECRET_ACCESS_KEY_MINIO", "Password1234")
TRACKING_URI = os.getenv("MLFLOW_TRACKING_URI", "http://localhost:5002")

os.environ["AWS_ACCESS_KEY_ID"] = AWS_KEY_MINIO
os.environ["AWS_SECRET_ACCESS_KEY"] = AWS_SECRET_MINIO
os.environ["MLFLOW_S3_ENDPOINT_URL"] = S3_ENDPOINT
os.environ["MLFLOW_S3_IGNORE_TLS"] = "true"

mlflow.set_tracking_uri(TRACKING_URI)

# Configure boto3 for MinIO (for MLflow artifacts)
minio_session = boto3.session.Session(
    aws_access_key_id=AWS_KEY_MINIO, aws_secret_access_key=AWS_SECRET_MINIO
)
S3ArtifactRepository._get_s3_client = lambda self: minio_session.client(
    "s3", endpoint_url=S3_ENDPOINT
)

# S3 Vectors config
S3_VECTOR_BUCKET = os.getenv("S3_VECTOR_BUCKET", "recsys-ops-s3-vector")
S3_VECTOR_INDEX = os.getenv("S3_VECTOR_INDEX", "item2vec-index-dim-256")
AWS_REGION = "us-east-1"
AWS_KEY_S3 = os.getenv("AWS_ACCESS_KEY_ID_AWS", "AKIA3TD2SE4D4EJQWMMS")
AWS_SECRET_S3 = os.getenv(
    "AWS_SECRET_ACCESS_KEY_AWS", "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"
)

# Model config
MODEL_NAME = os.getenv("MODEL_NAME", "item2vec_skipgram")
TAG_NAME = os.getenv("MODEL_TAG", "champion")


# SimpleIDMapper
class SimpleIDMapper:
    def __init__(self, mapping_path: str):
        with open(mapping_path, "r") as f:
            mapping = json.load(f)
        self.item_to_index = mapping["item_to_index"]
        self.index_to_item = mapping["index_to_item"]


# Load model and ID mapping
def load_model_from_mlflow(model_name: str, tag: str) -> tuple:
    client = MlflowClient()
    versions = client.search_model_versions(f"name='{model_name}'")
    champs = [v for v in versions if v.tags.get(tag, "").lower() == "true"]
    if not champs:
        raise ValueError(f"No version tagged '{tag}' found for model '{model_name}'.")

    champ = max(champs, key=lambda v: v.creation_timestamp)
    run_id = champ.run_id
    model_uri = f"models:/{model_name}/{champ.version}"
    print(f"Loading model from URI: {model_uri}")
    model = mlflow.pytorch.load_model(model_uri)
    model.eval()

    id_mapper_path = f"runs:/{run_id}/id_mapper/id_mapper.json"
    id_mapper_local = mlflow.artifacts.download_artifacts(id_mapper_path)
    id_mapper = SimpleIDMapper(id_mapper_local)

    print(
        f"Loaded model version {champ.version} (run_id={run_id}) with {len(id_mapper.item_to_index)} items."
    )
    return model, id_mapper


# Extract embeddings
def get_all_embeddings(model, id_mapper: SimpleIDMapper) -> tuple:
    item_ids = list(id_mapper.item_to_index.keys())
    item_indices = [id_mapper.item_to_index[id_] for id_ in item_ids]

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    tensor_indices = torch.tensor(item_indices, device=device)
    with torch.no_grad():
        embeddings = model.embeddings(tensor_indices).detach().cpu().numpy()
    print(f"Extracted embeddings with shape {embeddings.shape}")
    return item_ids, embeddings


# Query S3 Vectors
def query_s3_vectors(
    query_embedding: np.ndarray,
    bucket_name: str,
    index_name: str,
    region: str,
    aws_key: str,
    aws_secret: str,
    top_k: int = 10,
    filter_dict: dict = None,
):
    try:
        s3_session = boto3.session.Session(
            aws_access_key_id=aws_key, aws_secret_access_key=aws_secret
        )
        client = s3_session.client("s3vectors", region_name=region)
        print(f"Querying S3 Vectors in region {region}")

        # Normalize query vector for COSINE distance
        query_embedding = query_embedding / np.linalg.norm(query_embedding)
        print(
            f"Query vector shape: {query_embedding.shape}, norm: {np.linalg.norm(query_embedding):.4f}"
        )
        print(f"Query vector first 5 values: {query_embedding[:5].tolist()}")

        # Prepare query
        query_params = {
            "vectorBucketName": bucket_name,
            "indexName": index_name,
            "queryVector": {"float32": query_embedding.tolist()},
            "topK": top_k,
            "returnMetadata": True,
            "returnDistance": True,
        }
        if filter_dict:
            print(f"Applying filter: {filter_dict}")
            query_params["filter"] = filter_dict

        # Execute query
        response = client.query_vectors(**query_params)
        results = response.get("vectors", [])
        print(f"Queried S3 Vectors, found {len(results)} results")
        print(f"Raw response: {json.dumps(response, indent=2)}")

        # Print results
        for result in results:
            metadata = result.get("metadata") or result.get("Metadata", {})
            print(
                f"Key: {result['key']}, Score: {result['distance']:.4f}, Metadata: {metadata}"
            )

        return results

    except Exception as e:
        print(f"Failed to query S3 Vectors: {str(e)}")
        raise


# Get sample vector for debugging
def get_sample_vector(
    bucket_name: str,
    index_name: str,
    key: str,
    region: str,
    aws_key: str,
    aws_secret: str,
):
    try:
        s3_session = boto3.session.Session(
            aws_access_key_id=aws_key, aws_secret_access_key=aws_secret
        )
        client = s3_session.client("s3vectors", region_name=region)
        response = client.get_vectors(
            vectorBucketName=bucket_name,
            indexName=index_name,
            keys=[key],
            returnMetadata=True,
        )
        print(f"Raw get_vectors response: {json.dumps(response, indent=2)}")
        vectors = response.get("vectors", [])
        if vectors:
            vector = vectors[0]
            metadata = vector.get("metadata") or vector.get("Metadata", {})
            print(f"Retrieved vector: Key: {vector['key']}, Metadata: {metadata}")
            print(f"Vector first 5 values: {vector['data']['float32'][:5]}")
            print(f"Vector norm: {np.linalg.norm(vector['data']['float32']):.4f}")
        else:
            print(f"No vector found for key: {key}")
        return vectors

    except Exception as e:
        print(f"Failed to get vector for key {key}: {str(e)}")
        return []


def main():
    print("Starting embedding querying...")
    model, id_mapper = load_model_from_mlflow(MODEL_NAME, TAG_NAME)
    item_ids, embeddings = get_all_embeddings(model, id_mapper)

    # Debug: Get a sample vector
    print("Retrieving sample vector...")
    sample_key = item_ids[0]  # e.g., '0439893577'
    get_sample_vector(
        S3_VECTOR_BUCKET,
        S3_VECTOR_INDEX,
        sample_key,
        AWS_REGION,
        AWS_KEY_S3,
        AWS_SECRET_S3,
    )

    # Query with sample embedding
    print("Testing query with first embedding...")
    query_embedding = embeddings[0]
    query_s3_vectors(
        query_embedding,
        S3_VECTOR_BUCKET,
        S3_VECTOR_INDEX,
        AWS_REGION,
        AWS_KEY_S3,
        AWS_SECRET_S3,
        top_k=10,
        filter_dict=None,
    )

    # Query with filter
    print("Testing query with metadata filter...")
    filter_dict = {"category": {"$eq": "item"}}
    query_s3_vectors(
        query_embedding,
        S3_VECTOR_BUCKET,
        S3_VECTOR_INDEX,
        AWS_REGION,
        AWS_KEY_S3,
        AWS_SECRET_S3,
        top_k=10,
        filter_dict=filter_dict,
    )


if __name__ == "__main__":
    main()

Starting embedding querying...
Loading model from URI: models:/item2vec_skipgram/1




Loaded model version 1 (run_id=aac89aa42a3c447eac1294546df933e6) with 4143 items.
Extracted embeddings with shape (4143, 256)
Retrieving sample vector...
Raw get_vectors response: {
  "ResponseMetadata": {
    "RequestId": "056e336e-d3b4-484d-8329-eef23b2e4335",
    "HostId": "",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "date": "Sun, 27 Jul 2025 02:11:01 GMT",
      "content-type": "application/json",
      "content-length": "119",
      "connection": "keep-alive",
      "x-amz-request-id": "056e336e-d3b4-484d-8329-eef23b2e4335",
      "access-control-allow-origin": "*",
      "vary": "origin, access-control-request-method, access-control-request-headers",
      "access-control-expose-headers": "*"
    },
    "RetryAttempts": 0
  },
  "vectors": [
    {
      "key": "0439893577",
      "metadata": {
        "index_timestamp": "2025-07-27",
        "item_id": "0439893577",
        "category": "item"
      }
    }
  ]
}
Retrieved vector: Key: 0439893577, Metadata: {'index_ti

# Load pre-recommend for each item to Redis

In [None]:
import json
import os
import redis
import torch
import mlflow
import numpy as np
import sys
from tqdm.auto import tqdm
import boto3
from mlflow.tracking import MlflowClient
from mlflow.store.artifact.s3_artifact_repo import S3ArtifactRepository

# === CONFIGURATION ===
PROJECT_ROOT = "/home/duong/Documents/datn1/src/model_item2vec"
sys.path.append(PROJECT_ROOT)

CONFIG = {
    "mlf_model_name": os.getenv("MODEL_NAME", "item2vec_skipgram"),
    "s3_vector_bucket": os.getenv("S3_VECTOR_BUCKET", "recsys-ops-s3-vector"),
    "s3_vector_index": os.getenv("S3_VECTOR_INDEX", "item2vec-index-dim-256"),
    "aws_region": "us-east-1",
    "aws_access_key_id_aws": os.getenv("AWS_ACCESS_KEY_ID_AWS", "AKIA3TD2SE4D4EJQWMMS"),
    "aws_secret_access_key_aws": os.getenv(
        "AWS_SECRET_ACCESS_KEY_AWS", "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"
    ),
    "redis_host": os.getenv("REDIS_HOST", "localhost"),
    "redis_port": int(os.getenv("REDIS_PORT", 6379)),
    "redis_db": int(os.getenv("REDIS_DB", 0)),
    "redis_password": os.getenv("REDIS_PASSWORD", "123456"),
    "batch_size": int(os.getenv("BATCH_SIZE", 256)),
    "top_k": int(os.getenv("TOP_K", 10)),
    "top_K": int(os.getenv("TOP_K_LARGE", 20)),
    "output_file": os.getenv("OUTPUT_FILE", "../../data/batch_recs.jsonl"),
    "s3_endpoint": os.getenv("MLFLOW_S3_ENDPOINT", "http://127.0.0.1:9010"),
    "aws_key": os.getenv("AWS_ACCESS_KEY_ID_MINIO", "admin"),
    "aws_secret": os.getenv("AWS_SECRET_ACCESS_KEY_MINIO", "Password1234"),
}


# ENV SETUP
def configure_mlflow():
    os.environ.update(
        {
            "AWS_ACCESS_KEY_ID": CONFIG["aws_key"],
            "AWS_SECRET_ACCESS_KEY": CONFIG["aws_secret"],
            "MLFLOW_S3_ENDPOINT_URL": CONFIG["s3_endpoint"],
            "MLFLOW_S3_IGNORE_TLS": "true",
        }
    )
    mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URI", "http://localhost:5002"))

    session = boto3.session.Session(
        aws_access_key_id=CONFIG["aws_key"], aws_secret_access_key=CONFIG["aws_secret"]
    )
    S3ArtifactRepository._get_s3_client = lambda self: session.client(
        "s3", endpoint_url=CONFIG["s3_endpoint"]
    )


# CLIENTS
redis_client = redis.Redis(
    host=CONFIG["redis_host"],
    port=CONFIG["redis_port"],
    db=CONFIG["redis_db"],
    decode_responses=True,
    password=CONFIG["redis_password"],
)

s3vectors_client = boto3.client(
    "s3vectors",
    region_name=CONFIG["aws_region"],
    aws_access_key_id=CONFIG["aws_access_key_id_aws"],
    aws_secret_access_key=CONFIG["aws_secret_access_key_aws"],
)


# SimpleIDMapper
class SimpleIDMapper:
    def __init__(self, mapping_path: str):
        with open(mapping_path, "r") as f:
            mapping = json.load(f)
        self.item_to_index = mapping["item_to_index"]
        self.index_to_item = {
            i: item_id for i, item_id in enumerate(mapping["index_to_item"])
        }


# Load TorchScript model + IDMapper
def load_model():
    configure_mlflow()
    client = MlflowClient()
    versions = client.search_model_versions(f"name='{CONFIG['mlf_model_name']}'")
    champs = [v for v in versions if v.tags.get("champion", "").lower() == "true"]
    if not champs:
        raise RuntimeError(
            f"No champion version found for model '{CONFIG['mlf_model_name']}'"
        )

    champ = max(champs, key=lambda v: v.creation_timestamp)
    run_id = champ.run_id
    model = mlflow.pytorch.load_model(
        f"models:/{CONFIG['mlf_model_name']}/{champ.version}"
    )
    model.eval()

    id_mapper_path = f"runs:/{run_id}/id_mapper/id_mapper.json"
    id_mapper_local = mlflow.artifacts.download_artifacts(id_mapper_path)
    id_mapper = SimpleIDMapper(id_mapper_local)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    print(f"Loaded model v{champ.version} (run_id={run_id}) and IDMapper.")
    return model, id_mapper


# Inference Utilities
def get_item_embedding(
    model, id_mapper, item_id, device=torch.device("cpu")
) -> torch.Tensor:
    idx = torch.tensor([id_mapper.item_to_index[item_id]], device=device)
    return model.embeddings(idx).squeeze(0)


def get_topk_similar(
    model, id_mapper, item_id, top_k=10, device=torch.device("cpu")
) -> list:
    target_idx = id_mapper.item_to_index[item_id]
    weight = model.embeddings.weight[:-1].to(device)  # exclude padding_idx
    with torch.no_grad():
        target_emb = weight[target_idx]
        sims = torch.matmul(weight, target_emb)
    sims[target_idx] = -np.inf  # exclude self
    topk = torch.topk(sims, k=top_k)
    return [
        (id_mapper.index_to_item[int(i)], float(s))
        for i, s in zip(topk.indices, topk.values)
    ]


# Compute Recommendations
def compute_recommendations():
    model, id_mapper = load_model()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Clear all existing recommendation keys in Redis
    keys = redis_client.keys("rec:*")
    if keys:
        pipe = redis_client.pipeline()
        for key in keys:
            pipe.delete(key)
        pipe.execute()
        print(f"Cleared {len(keys)} existing recommendation keys in Redis.")
    else:
        print("No existing recommendation keys in Redis to clear.")

    all_indices = list(id_mapper.index_to_item.keys())

    recs = []

    for i in tqdm(range(0, len(all_indices), CONFIG["batch_size"]), desc="Processing"):
        batch_indices = all_indices[i : i + CONFIG["batch_size"]]
        batch_item_ids = [id_mapper.index_to_item[idx] for idx in batch_indices]

        # Generate batch embeddings
        batch_indices_tensor = torch.tensor(batch_indices, device=device)
        with torch.no_grad():
            batch_embeddings = (
                model.embeddings(batch_indices_tensor).detach().cpu().numpy()
            )
        # Normalize embeddings for COSINE distance
        batch_embeddings = batch_embeddings / np.linalg.norm(
            batch_embeddings, axis=1, keepdims=True
        )

        batch_neighbors = []
        for target_id, vec in zip(batch_item_ids, batch_embeddings):
            # Search for top-K + 1 neighbors in S3 Vectors
            try:
                response = s3vectors_client.query_vectors(
                    vectorBucketName=CONFIG["s3_vector_bucket"],
                    indexName=CONFIG["s3_vector_index"],
                    queryVector={"float32": vec.tolist()},
                    topK=CONFIG["top_K"] + 1,
                    returnMetadata=True,
                    returnDistance=True,
                )
                neighbors = response.get("vectors", [])
                neighbor_ids = [n["key"] for n in neighbors if n["key"] != target_id][
                    : CONFIG["top_K"]
                ]
                batch_neighbors.append(neighbor_ids)
            except Exception as e:
                print(f"Failed to query S3 Vectors for {target_id}: {e}")
                batch_neighbors.append([])

        batch_scores = []
        for target_id, neighbors in zip(batch_item_ids, batch_neighbors):
            if not neighbors:
                batch_scores.append([])
                continue

            # Compute scores using model
            sample_input = {
                "target_items": [target_id] * len(neighbors),
                "context_items": neighbors,
            }
            try:
                scores = predict_batch(model, id_mapper, sample_input, device)
                batch_scores.append(scores.tolist())
            except Exception as e:
                print(f"Prediction failed for {target_id}: {e}")
                batch_scores.append([0.0] * len(neighbors))

        # Write to Redis and JSONL
        for target_id, neighbors, scores in zip(
            batch_item_ids, batch_neighbors, batch_scores
        ):
            if not neighbors:
                continue
            sorted_pairs = sorted(
                zip(neighbors, scores), key=lambda x: x[1], reverse=True
            )
            top_neighbors, top_scores = zip(*sorted_pairs[: CONFIG["top_k"]])
            rec = {
                "target_item": target_id,
                "rec_item_ids": list(top_neighbors),
                "rec_scores": list(top_scores),
            }

            recs.append(rec)
            redis_key = f"rec:{target_id}"
            redis_client.set(redis_key, json.dumps(rec))
            print(f"Stored recommendation for {target_id} in Redis.")

    os.makedirs(os.path.dirname(CONFIG["output_file"]), exist_ok=True)
    with open(CONFIG["output_file"], "w") as f:
        for rec in recs:
            f.write(json.dumps(rec) + "\n")

    print(
        f"Completed. Stored {len(recs)} recommendations to Redis and {CONFIG['output_file']}"
    )


# Inference Utility for Batch Prediction
def predict_batch(
    model, id_mapper, batch: dict, device=torch.device("cpu")
) -> np.ndarray:
    tgt = torch.tensor(
        [id_mapper.item_to_index[i] for i in batch["target_items"]], device=device
    )
    ctx = torch.tensor(
        [id_mapper.item_to_index[i] for i in batch["context_items"]], device=device
    )
    with torch.no_grad():
        return model(tgt, ctx).cpu().numpy()


# MAIN
if __name__ == "__main__":
    compute_recommendations()

In [None]:
import redis
import json

# Cấu hình Redis (đảm bảo giống CONFIG trong script gốc)
redis_client = redis.Redis(
    host="localhost",  # Hoặc thay bằng giá trị từ CONFIG["redis_host"]
    port=6379,  # CONFIG["redis_port"]
    db=0,  # CONFIG["redis_db"]
    decode_responses=True,  # Đảm bảo trả về string thay vì bytes
    password="123456",  # Nếu có auth
)

# Item ID cần truy vấn
item_id = "B0002YV94U"
key = f"rec:{item_id}"

# Lấy dữ liệu từ Redis
rec_json = redis_client.get(key)

if rec_json:
    rec_data = json.loads(rec_json)
    print("Recommendation found:")
    print(json.dumps(rec_data, indent=2))
else:
    print(f"No recommendation found for item {item_id}")

## Load popular item to Redis

In [None]:
import pandas as pd
import redis
from sqlalchemy import create_engine

from dotenv import load_dotenv
import os

dotenv_path = os.path.abspath("../../.env")
load_dotenv(dotenv_path)

POSTGRES_URI = os.environ["POSTGRES_URI_OLTP"]
conn_str = POSTGRES_URI
table = "public.reviews"

# Redis config
redis_client = redis.Redis(
    host="localhost", port=6379, db=0, decode_responses=True, password="123456"
)

# 1. Load từ PostgreSQL
engine = create_engine(conn_str)
query = f"""
    SELECT 
        parent_asin,
        rating
    FROM {table}
    WHERE parent_asin IS NOT NULL
"""
df = pd.read_sql(query, engine)

# 2. Tính toán thống kê
agg_df = (
    df.groupby("parent_asin")
    .agg(rating_count=("rating", "count"), rating_avg=("rating", "mean"))
    .reset_index()
)

# 3. Tính popularity score
agg_df["score"] = agg_df["rating_count"] * (agg_df["rating_avg"] - 3.0)

# 4. Lấy top phổ biến theo score
TOP_K = 500
top_df = agg_df.sort_values("score", ascending=False).head(TOP_K)

# In ra để kiểm tra trước khi lưu
print("Top popular parent_asin (score-based):")
print(top_df[["parent_asin", "rating_count", "rating_avg", "score"]])

# 5. Ghi vào Redis nếu OK
redis_key = "popular_parent_asin_score"
redis_client.delete(redis_key)

for _, row in top_df.iterrows():
    redis_client.zadd(redis_key, {row["parent_asin"]: float(row["score"])})

print(f"\nĐã lưu {len(top_df)} popular parent_asin vào Redis key: {redis_key}")

🔍 Top popular parent_asin (score-based):
     parent_asin  rating_count  rating_avg  score
3934  B0BW3QTWJJ           438    4.780822  780.0
2559  B07C4NGT17           226    4.867257  422.0
801   B0054TRQA4           216    4.750000  378.0
1223  B00D8STBHY           202    4.816832  367.0
3589  B09QPXVW35           166    4.885542  313.0
...          ...           ...         ...    ...
91    B000067NXE            45    4.333333   60.0
3290  B08PMPDGXM            36    4.638889   59.0
311   B000NV7L0I            42    4.404762   59.0
103   B000088UPW            56    4.053571   59.0
1451  B00IL7IFP6            33    4.787879   59.0

[500 rows x 4 columns]

✅ Đã lưu 500 popular parent_asin vào Redis key: popular_parent_asin_score


In [None]:
import redis

# Kết nối Redis
redis_client = redis.Redis(
    host="localhost", port=6379, db=0, decode_responses=True, password="123456"
)

# Redis key bạn đã lưu trước đó
redis_key = "popular_parent_asin_score"

# Truy vấn top 10 phổ biến nhất (score cao nhất)
top_items = redis_client.zrevrange(redis_key, 0, 9, withscores=True)

# In ra kết quả
print("Top 10 popular parent_asin from Redis:")
for rank, (asin, score) in enumerate(top_items, start=1):
    print(f"{rank:2d}. ASIN: {asin} | Score: {score:.2f}")

🔥 Top 10 popular parent_asin from Redis:
 1. ASIN: B0BW3QTWJJ | Score: 780.00
 2. ASIN: B07C4NGT17 | Score: 422.00
 3. ASIN: B0054TRQA4 | Score: 378.00
 4. ASIN: B00D8STBHY | Score: 367.00
 5. ASIN: B09QPXVW35 | Score: 313.00
 6. ASIN: B07N29HQMN | Score: 257.00
 7. ASIN: B00FZMDAO6 | Score: 248.00
 8. ASIN: B0BG94QRLZ | Score: 241.00
 9. ASIN: B09PH8LV57 | Score: 238.00
10. ASIN: B004S8F7QM | Score: 238.00
