# Explore Retrievers & Chains
<img src="images/demo-stack.png" alt="summary" width="1000"/>

    

## Setup

In [None]:
%%capture
%pip install sentence_transformers langchain langchain-openai langchain_community openai tiktoken python-dotenv gradio graphdatascience altair
%pip install "vegafusion[embed]"

In [1]:
import pandas as pd

# Notebook-wide DataFrame display settings: at most 7 rows per frame,
# cell text truncated at 100 characters, display width set to 0.
pd.options.display.max_rows = 7
pd.options.display.max_colwidth = 100
pd.options.display.width = 0

In [2]:
# Read connection settings and credentials from a local `.env` file.
# Every value below is None when the corresponding variable is not set.
from dotenv import load_dotenv
import os

load_dotenv('.env', override=True)

# Neo4j connection details
NEO4J_URI = os.environ.get('NEO4J_URI')
NEO4J_USERNAME = os.environ.get('NEO4J_USERNAME')
NEO4J_PASSWORD = os.environ.get('NEO4J_PASSWORD')

# OpenAI: chat model name and API key
LLM = 'gpt-4o'
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

## Discovery & Search Retriever

<img src="images/app-flow-semantic-search.png" alt="summary" width="1000"/>


In [3]:
# Neo4jVector is imported from langchain_community (installed in the setup
# cell); the older `langchain.vectorstores.*` path is a legacy re-export
# that is not available in every langchain release.
from langchain_community.vectorstores import Neo4jVector
from langchain_openai import OpenAIEmbeddings
import pandas as pd

# Free-text query used for every search in this section
search_prompt = 'denim jeans'

embedding_model = OpenAIEmbeddings()

# define retriever
# The retrieval query receives `node` and `score` from the vector index
# lookup and returns the product text, its similarity score, and a metadata
# map carrying the score and product code.
vector_only_search = Neo4jVector.from_existing_index(
    embedding=embedding_model,
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    index_name='product_text_embeddings',
    retrieval_query="""
    WITH node AS product, score
    RETURN product.productCode AS productCode,
        product.text AS text, score,
        {score:score, productCode: product.productCode} AS metadata
        ORDER BY score DESC
        """)

# similarity search: top 100 products by vector similarity
vector_only_res = vector_only_search.similarity_search(search_prompt, k=100)

# Visualize as a dataframe (one row per returned document)
vector_only_df = pd.DataFrame([{'productCode': d.metadata['productCode'],
                                'document': d.page_content,
                                'score': d.metadata['score']} for d in vector_only_res])
vector_only_df

Unnamed: 0,productCode,document,score
0,252298,##Product\nName: Didi denim\nType: Trousers\nGroup: Garment Lower body\nGarment Type: Dresses La...,0.938463
1,598423,##Product\nName: Night Denim\nType: Trousers\nGroup: Garment Lower body\nGarment Type: Dresses L...,0.936840
2,727804,##Product\nName: Didi HW Skinny denim\nType: Trousers\nGroup: Garment Lower body\nGarment Type: ...,0.934703
...,...,...,...
97,663133,##Product\nName: RELAXED SKINNY\nType: Trousers\nGroup: Garment Lower body\nGarment Type: Trouse...,0.922477
98,820827,##Product\nName: Jade HW Skinny Button dnm\nType: Trousers\nGroup: Garment Lower body\nGarment T...,0.922452
99,309864,##Product\nName: Skinny Cheapo 89\nType: Trousers\nGroup: Garment Lower body\nGarment Type: Trou...,0.922402


In [7]:
# Customer whose purchase history personalizes the ranking
CUSTOMER_ID = "daae10780ecd14990ea190a1e9917da33fe96cd8cfa5e80b67b4600171aa77e0"

# Retriever over the same text-embedding index, with a graph pattern added:
# graphScore counts matches from each candidate product, through customers
# who purchased one of its articles, to articles also purchased by
# $customerId. Results are ordered by graphScore, then vectorScore, and the
# query keeps only the first 15 rows.
kg_search = Neo4jVector.from_existing_index(
    index_name='product_text_embeddings',
    embedding=embedding_model,
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    retrieval_query="""
    WITH node AS product, score AS vectorScore

    OPTIONAL MATCH(product)<-[:VARIANT_OF]-(:Article)<-[:PURCHASED]-(:Customer)
    -[:PURCHASED]->(a:Article)<-[:PURCHASED]-(:Customer {customerId: $customerId})

    WITH count(a) AS graphScore, 
        product.text AS text, 
        vectorScore, 
        product.productCode AS productCode
    RETURN text,
        (1+graphScore)*vectorScore AS score,
        {productCode: productCode, 
            graphScore:graphScore, 
            vectorScore:vectorScore} AS metadata
    ORDER BY graphScore DESC, vectorScore DESC LIMIT 15
    """)

# Similarity search with the customer id bound into the graph pattern
kg_res = kg_search.similarity_search(
    search_prompt,
    k=100,
    params={'customerId': CUSTOMER_ID},
)

# Tabulate the personalized results, one row per returned document
vector_kg_df = pd.DataFrame(
    [dict(productCode=doc.metadata['productCode'],
          document=doc.page_content,
          vectorScore=doc.metadata['vectorScore'],
          graphScore=doc.metadata['graphScore'])
     for doc in kg_res]
)
vector_kg_df

Unnamed: 0,productCode,document,vectorScore,graphScore
0,670698,##Product\nName: Rachel HW Denim TRS\nType: Trousers\nGroup: Garment Lower body\nGarment Type: T...,0.922642,22
1,706016,##Product\nName: Jade HW Skinny Denim TRS\nType: Trousers\nGroup: Garment Lower body\nGarment Ty...,0.926760,11
2,777038,##Product\nName: Bono NW slim denim\nType: Trousers\nGroup: Garment Lower body\nGarment Type: Tr...,0.926300,8
...,...,...,...,...
12,598423,##Product\nName: Night Denim\nType: Trousers\nGroup: Garment Lower body\nGarment Type: Dresses L...,0.936840,0
13,727804,##Product\nName: Didi HW Skinny denim\nType: Trousers\nGroup: Garment Lower body\nGarment Type: ...,0.934703,0
14,652924,##Product\nName: &DENIM Jeggings HW\nType: Trousers\nGroup: Garment Lower body\nGarment Type: Tr...,0.934462,0


In [8]:
# Compare rankings: attach each product's position in the vector-only
# results (vectorRank) to the personalized results (graphRank), keeping
# every row of the personalized frame.
pd.merge(
    vector_only_df.reset_index(names='vectorRank')[['productCode', 'vectorRank']],
    vector_kg_df.reset_index(names='graphRank'),
    how='right',
    on='productCode',
)

Unnamed: 0,productCode,vectorRank,graphRank,document,vectorScore,graphScore
0,670698,95,0,##Product\nName: Rachel HW Denim TRS\nType: Trousers\nGroup: Garment Lower body\nGarment Type: T...,0.922642,22
1,706016,41,1,##Product\nName: Jade HW Skinny Denim TRS\nType: Trousers\nGroup: Garment Lower body\nGarment Ty...,0.926760,11
2,777038,47,2,##Product\nName: Bono NW slim denim\nType: Trousers\nGroup: Garment Lower body\nGarment Type: Tr...,0.926300,8
...,...,...,...,...,...,...
12,598423,1,12,##Product\nName: Night Denim\nType: Trousers\nGroup: Garment Lower body\nGarment Type: Dresses L...,0.936840,0
13,727804,2,13,##Product\nName: Didi HW Skinny denim\nType: Trousers\nGroup: Garment Lower body\nGarment Type: ...,0.934703,0
14,652924,3,14,##Product\nName: &DENIM Jeggings HW\nType: Trousers\nGroup: Garment Lower body\nGarment Type: Tr...,0.934462,0


## Recommendations Chain

<img src="images/app-flow-recommendations.png" alt="summary" width="1000"/>


In [None]:
from typing import Dict
# Neo4jGraph is imported from langchain_community (installed in the setup
# cell); the older `langchain.graphs` path is a legacy re-export that is
# not available in every langchain release.
from langchain_community.graphs import Neo4jGraph
import json

# Graph handle used for the direct Cypher queries in this section
kg = Neo4jGraph(url=NEO4J_URI, username=NEO4J_USERNAME, password=NEO4J_PASSWORD)

In [None]:
# Load one row per matched (product, article, department) pattern: the
# article id, descriptive product and department fields, and the article's
# stored `graphEmbedding` property. The FROM_DEPARTMENT relationship is
# matched without a direction.
graph_emb_df = pd.DataFrame(kg.query('''
MATCH (p:Product)<-[:VARIANT_OF]-(a:Article)-[:FROM_DEPARTMENT]-(d)
RETURN a.articleId AS articleId,
    p.prodName AS productName,
    p.productTypeName AS productTypeName,
    d.departmentName AS departmentName,
    d.sectionName AS sectionName,
    p.detailDesc AS detailDesc,
    a.graphEmbedding AS embedding
'''))

In [None]:
import numpy as np
from sklearn.manifold import TSNE

df = graph_emb_df.copy()

# Keep only rows whose embedding has at least one non-zero component
filtered_node_df = (
    df.loc[df['embedding'].apply(lambda vec: np.count_nonzero(vec) > 0)]
    .reset_index(drop=True)
)

# Project the embeddings to two dimensions with a fixed random seed
tsne = TSNE(init='random', learning_rate="auto", n_components=2, random_state=7474)
E = tsne.fit_transform(np.stack(filtered_node_df['embedding'], axis=0))

# Append the 2-D coordinates as `x` / `y` columns next to the article fields
coord_df = pd.concat(
    [filtered_node_df, pd.DataFrame(E, columns=['x', 'y'])],
    axis=1,
)

In [None]:
import altair as alt
import warnings

# NOTE: this silences every warning for the rest of the kernel session
warnings.filterwarnings('ignore')

alt.data_transformers.disable_max_rows()

# Scatter plot of a reproducible sample of the projected articles.
# The sample size is capped at the number of available rows so the cell
# also runs when fewer than 5000 articles have a usable embedding.
# Altair's configure_* methods return a new chart, so they are chained
# and the final result is assigned (a bare `chart.configure_legend(...)`
# call would be discarded).
chart = (
    alt.Chart(coord_df.sample(n=min(5000, len(coord_df)), random_state=7474))
    .mark_circle(size=60)
    .encode(
        x='x',
        y='y',
        tooltip=['productName', 'productTypeName', 'departmentName', 'sectionName', 'detailDesc'],
    )
    .properties(title="Article Embedding (2D Representation)", width=650, height=600)
    .configure_axis(titleFontSize=20)
    .configure_legend(labelFontSize=20)
)
chart

In [None]:
from typing import Dict
import json

def format_res_dicts(d: Dict) -> Dict:
    """Flatten one query result row into a single-level dict.

    Every top-level entry except ``"metadata"`` is copied as-is. Entries of
    the ``"metadata"`` mapping are then merged in, skipping values that are
    ``None``; on a key collision the metadata value wins.

    Args:
        d: A result row, optionally containing a ``"metadata"`` mapping.

    Returns:
        A new dict; the input is not modified. A row whose ``"metadata"``
        is missing or ``None`` is returned without any metadata entries
        instead of raising.
    """
    res = {k: v for k, v in d.items() if k != "metadata"}
    # Tolerate rows without metadata (absent key or null value)
    metadata = d.get("metadata") or {}
    res.update((k, v) for k, v in metadata.items() if v is not None)
    return res

# Number of nearest neighbours requested from the vector index ($vectorTopK)
vector_top_k = 100
# Maximum number of products returned by the recommendation query ($resTopK)
res_top_k = 30

In [None]:
# Recommendation query. For each article variant of $productCode that has a
# graph embedding, look up $vectorTopK neighbours in the
# 'article_graph_embeddings' index, drop hits scoring 1.0 or above
# (presumably the search article itself -- confirm), map the remaining
# articles to their products and keep the best score per product. The
# product's properties are returned as metadata with `text`,
# `textEmbedding` and `id` nulled out; at most $resTopK rows come back.
retrieval_query_template = """
MATCH(searchProduct:Product {productCode: $productCode})<-[:VARIANT_OF]-(searchArticle:Article)
WHERE  searchArticle.graphEmbedding IS NOT NULL
CALL db.index.vector.queryNodes('article_graph_embeddings', $vectorTopK, searchArticle.graphEmbedding) YIELD node, score
WHERE score < 1.0
MATCH (node)-[:VARIANT_OF]->(product)
RETURN product.`text` AS text, 
    max(score) AS score, 
    product {.*, `text`: Null, `textEmbedding`: Null, id: Null} AS metadata
ORDER by score DESC LIMIT $resTopK"""

# Product used as the running example for recommendations
PRODUCT_CODE = 670698

res = kg.query(
    retrieval_query_template,
    params=dict(productCode=PRODUCT_CODE,
                vectorTopK=vector_top_k,
                resTopK=res_top_k),
)

# Preview the flattened rows as a dataframe (the query returns a list of dicts)
pd.DataFrame(list(map(format_res_dicts, res)))

In [None]:
# Import relevant libraries
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain.schema import StrOutputParser

#Instantiate LLM
llm = ChatOpenAI(temperature=0, model_name=LLM, streaming=True)

In [None]:
# Prompt for the recommendation message. Template variables that must be
# supplied when the prompt is formatted: customerName, productDescription,
# timeofYear (used twice), customerInterests, and context (the candidate
# product list the model is told to restrict itself to).
prompt = PromptTemplate.from_template("""
You are a personal assistant named Sally for a fashion, home, and beauty company called HRM.
Your customer, {customerName}, is currently browsing the website. 
Please write an engaging message to them recommending and summarizing products that pair well
with their interests and the item they are currently viewing given: 
- Item they are currently viewing: {productDescription}
- The current season / time of year: {timeofYear} 
- Recent searches: {customerInterests}

Please only mention the product candidates listed in the context below. 
Do not come up with or add any new products to the list.
The below candidates are recommended based on the shared purchase patterns of other customers in the HRM database.
Select the best 4 to 5 product subset from the context that best match the time of year: {timeofYear} and to pair 
with the current product being viewed and recent searches. 
For example, even if scarfs are listed here, they may not be appropriate for a summer time of year so best not to include those.
Each product comes with an http `url` field. 
Make sure to provide that http url with descriptive name text in markdown for each product. Do not alter the url.

# Context:
{context}
""")

# Query sizes read by `retriever` at call time
vector_top_k = 100
res_top_k = 30

def retriever(product_code):
    """Return recommendation candidates for a product as a JSON string.

    Runs `retrieval_query_template` for `product_code` with the module-level
    `vector_top_k` / `res_top_k` settings, flattens each row with
    `format_res_dicts`, and serializes the list with a 1-space indent.
    """
    rows = kg.query(
        retrieval_query_template,
        params=dict(productCode=product_code,
                    vectorTopK=vector_top_k,
                    resTopK=res_top_k),
    )
    return json.dumps(list(map(format_res_dicts, rows)), indent=1)

In [None]:
# RunnableLambda is used below and must be imported alongside RunnableParallel;
# without it this cell raises NameError on a fresh kernel.
from langchain_core.runnables import RunnableLambda, RunnableParallel

# Map the caller's snake_case input keys onto the prompt's template variables.
# `context` is produced by running `retriever` on the product code; the other
# entries are passed through unchanged. The filled prompt goes to the LLM and
# the reply is parsed to a plain string.
chain = (
        RunnableParallel(
            {'context': (lambda x: x['product_code']) | RunnableLambda(retriever),
             'customerName': (lambda x: x['customer_name']),
             'productDescription': (lambda x: x['product_description']),
             'customerInterests': (lambda x: x['customer_interests']),
             'timeofYear': (lambda x: x['time_of_year']),
             })
        | prompt
        | llm
        | StrOutputParser())

In [None]:
# Generate and print a recommendation message for the example product,
# using the earlier search text as the customer's recent interests.
print(chain.invoke(dict(
    product_code=PRODUCT_CODE,
    customer_interests=search_prompt,
    product_description='',
    customer_name='Alex Smith',
    time_of_year='July, 2024',
)))