# Neo4j Generative AI Workshop

## Setup

In [1]:
%%capture
%pip install sentence_transformers langchain openai tiktoken python-dotenv gradio graphdatascience

In [107]:
from graphdatascience import GraphDataScience
from dotenv import load_dotenv
import os
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from langchain.embeddings import OpenAIEmbeddings, BedrockEmbeddings, SentenceTransformerEmbeddings

### Setup Credentials and Environment Variables

To make this easy, you can write the credentials and env variables directly into the below cell

If you like you can use an environments file instead by copying `ws.env.template` to `ws.env` and filling credentials and variables in there. This is a best practice for the future, but fine to skip for this workshop.

In [108]:
# Neo4j
NEO4J_URI = 'bolt://localhost:7687' #change this
NEO4J_PASSWORD = 'password' #cahnge this
NEO4J_USERNAME = 'neo4j'
AURA_DS = False

# AI
EMBEDDING_MODEL = 'openai' #or sentence_transformer or aws
LLM = 'gpt-3.5' #LLM=gpt-3.5 #or gpt-4 or claudev2

# OpenAI - Required when using OpenAI models
os.environ['OPENAI_API_KEY'] = 'sk-...' #cahnge this

# AWS - Only required when using AWS Bedrock models
#os.environ['AWS_ACCESS_KEY_ID'] =
#os.environ['AWS_SECRET_ACCESS_KEY'] =
#os.environ['AWS_DEFAULT_REGION=us-east-1'] =

In [109]:
# You can skip this if not using a ws.env file
if os.path.exists('ws.env'):
    load_dotenv('ws.env', override=True)

    # Neo4j
    NEO4J_URI = os.getenv('NEO4J_URI')
    NEO4J_USERNAME = os.getenv('NEO4J_USERNAME')
    NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')
    AURA_DS = eval(os.getenv('AURA_DS').title())

    # AI
    EMBEDDING_MODEL = os.getenv('EMBEDDING_MODEL')
    LLM = os.getenv('LLM')

## Knowledge Graph Building

### Get Source Data

In [110]:
department_df = pd.read_csv('https://storage.googleapis.com/neo4j-workshop-data/genai-hm/department.csv')
department_df

Unnamed: 0,departmentNo,departmentName,sectionNo,sectionName
0,1676,Jersey Basic,16,Womens Everyday Basics
1,1339,Clean Lingerie,61,Womens Lingerie
2,3608,Tights basic,62,"Womens Nightwear, Socks & Tigh"
3,5883,Jersey Basic,26,Men Underwear
4,2032,Jersey,8,Mama
...,...,...,...,...
261,7510,Woven,28,Men Edition
262,3420,Small Accessories Extended,66,Womens Small accessories
263,5231,Jacket,31,Mens Outerwear
264,8090,Promotion/Other/Offer,29,Men Other


In [111]:
product_df = pd.read_csv('https://storage.googleapis.com/neo4j-workshop-data/genai-hm/product.csv')
product_df

Unnamed: 0,productCode,prodName,productTypeNo,productTypeName,productGroupName,garmentGroupNo,garmentGroupName,detailDesc
0,108775,Strap top,253,Vest top,Garment Upper body,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,110065,OP T-shirt (Idro),306,Bra,Underwear,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
2,111565,20 den 1p Stockings,304,Underwear Tights,Socks & Tights,1021,Socks and Tights,"Semi shiny nylon stockings with a wide, reinfo..."
3,111586,Shape Up 30 den 1p Tights,273,Leggings/Tights,Garment Lower body,1021,Socks and Tights,Tights with built-in support to lift the botto...
4,111593,Support 40 den 1p Tights,304,Underwear Tights,Socks & Tights,1021,Socks and Tights,"Semi shiny tights that shape the tummy, thighs..."
...,...,...,...,...,...,...,...,...
8039,936862,EDC Marla dress,265,Dress,Garment Full body,1023,Special Offers,Calf-length dress in a patterned Tencel™ lyoce...
8040,936979,Class Filippa Necklace,77,Necklace,Accessories,1019,Accessories,Metal chain necklace with a pendant. Adjustabl...
8041,937138,Flirty Albin bracelet pk,68,Bracelet,Accessories,1019,Accessories,Metal chain bracelets. Two plain and two with ...
8042,942187,ED Sasha tee,255,T-shirt,Garment Upper body,1005,Jersey Fancy,"Oversized, straight-cut T-shirt in a soft moda..."


In [112]:
article_df = pd.read_csv('https://storage.googleapis.com/neo4j-workshop-data/genai-hm/article.csv')
article_df

Unnamed: 0,articleId,productCode,departmentNo,prodName,productTypeName,graphicalAppearanceNo,graphicalAppearanceName,colourGroupCode,colourGroupName
0,108775015,108775,1676,Strap top,Vest top,1010016,Solid,9,Black
1,108775044,108775,1676,Strap top,Vest top,1010016,Solid,10,White
2,110065001,110065,1339,OP T-shirt (Idro),Bra,1010016,Solid,9,Black
3,111565001,111565,3608,20 den 1p Stockings,Underwear Tights,1010016,Solid,9,Black
4,111586001,111586,3608,Shape Up 30 den 1p Tights,Leggings/Tights,1010016,Solid,9,Black
...,...,...,...,...,...,...,...,...,...
13346,936862001,936862,3090,EDC Marla dress,Dress,1010001,All over pattern,52,Pink
13347,936979001,936979,4344,Class Filippa Necklace,Necklace,1010016,Solid,5,Gold
13348,937138001,937138,4345,Flirty Albin bracelet pk,Bracelet,1010016,Solid,5,Gold
13349,942187001,942187,1919,ED Sasha tee,T-shirt,1010016,Solid,9,Black


In [113]:
customer_df = pd.read_csv('https://storage.googleapis.com/neo4j-workshop-data/genai-hm/customer.csv')
customer_df

Unnamed: 0,customerId,fn,active,clubMemberStatus,fashionNewsFrequency,age,postalCode
0,00264b7d4cd6498292e8a355b699c2d07725d123f04867...,1.0,1.0,ACTIVE,Regularly,53.0,2c29ae653a9282cce4151bd87643c907644e09541abc28...
1,005c6d3bb66c86aab606814cd9995a12f99b3a44b58c72...,,,PRE-CREATE,NONE,,177b4a2258a85a2247daaa7cdffba96a74c741ea8a6605...
2,00abec3de294e03d192db15b91e154853ee1c89415e7cd...,,,ACTIVE,NONE,49.0,86557a458110ac98f4ca80e5a815ba2e8ea086dd8039b0...
3,00f311a42124fc44d117135f34e1fca29fcac271e6fbd0...,1.0,1.0,ACTIVE,Regularly,55.0,1a80c5651ae36327a86e71d5b967cf62c31126d1b57ae0...
4,0132cd2eb3c6b1f66784f65f94ddd8352add2653e0caf5...,,,ACTIVE,NONE,49.0,49f7ec29bcacbbf2120af5162f9f99c212e9dd26b48d79...
...,...,...,...,...,...,...,...
995,fdf1294f414faac2b00a725f5d80c34f98a744d9b8b3ce...,,,ACTIVE,NONE,32.0,0cd87888c3a13ebbb1e90cac6b9fbf34c51afa40865f55...
996,fe6faeed37fe86e885928d3ab30d8d9b072d6643c8aa15...,1.0,1.0,ACTIVE,Regularly,46.0,fe234b03107b233aec5695dc4c3fbe8e638338643f4e14...
997,fef793ec3a7d62d782824517355d74ded50964dce33009...,,,ACTIVE,NONE,46.0,5799a39cffe701ebdb12181348bf10f9e23abcc3868c43...
998,ffb925b11e1bb2e375d22a02d67907994eb8cb92ec2e7d...,,,ACTIVE,NONE,34.0,ebdd8c5c893683c3cf52c011d4e35024e46d183c95f0fa...


In [114]:
transaction_df = pd.read_csv('https://storage.googleapis.com/neo4j-workshop-data/genai-hm/transaction.csv')
transaction_df

Unnamed: 0,tDat,customerId,articleId,price,salesChannelId,txId
0,2018-09-20,0ddcd6055c5830c1fda493843d051edb04ce1bf888aa4b...,653428002,0.135576,1,2445
1,2018-09-20,210f113fe87db5d6391e986dc06b8e4369e46284e3b989...,636587001,0.008458,1,6182
2,2018-09-20,210f113fe87db5d6391e986dc06b8e4369e46284e3b989...,640462002,0.032186,1,6183
3,2018-09-20,211a2ef477fcfc8fc40a63ffa70bb41086dd06ca85d4af...,645422002,0.014390,2,6188
4,2018-09-20,211a2ef477fcfc8fc40a63ffa70bb41086dd06ca85d4af...,645422002,0.014390,2,6189
...,...,...,...,...,...,...
23194,2020-09-22,b6be55f233772b5fc4a1ebedf36542fb3e1b6c15c23c7e...,921266007,0.016932,2,31779124
23195,2020-09-22,b6be55f233772b5fc4a1ebedf36542fb3e1b6c15c23c7e...,812530004,0.010153,2,31779125
23196,2020-09-22,b6be55f233772b5fc4a1ebedf36542fb3e1b6c15c23c7e...,942187001,0.016932,2,31779126
23197,2020-09-22,b6be55f233772b5fc4a1ebedf36542fb3e1b6c15c23c7e...,866731001,0.025407,2,31779127


### Connect to Neo4j

In [115]:
# Use Neo4j URI and credentials according to our setup
gds = GraphDataScience(
    NEO4J_URI,
    auth=(NEO4J_USERNAME, NEO4J_PASSWORD),
    aura_ds=AURA_DS)

# Necessary if you enabled Arrow on the db - this is true for AuraDS
gds.set_database("neo4j")

### Create Constraints

In [116]:
# one uniqueness constraint for each node label
gds.run_cypher('CREATE CONSTRAINT unique_department_no IF NOT EXISTS FOR (n:Department) REQUIRE n.departmentNo IS UNIQUE')
gds.run_cypher('CREATE CONSTRAINT unique_product_code IF NOT EXISTS FOR (n:Product) REQUIRE n.productCode IS UNIQUE')
gds.run_cypher('CREATE CONSTRAINT unique_article_id IF NOT EXISTS FOR (n:Article) REQUIRE n.articleId IS UNIQUE')
gds.run_cypher('CREATE CONSTRAINT unique_customer_id IF NOT EXISTS FOR (n:Customer) REQUIRE n.customerId IS UNIQUE')

### Helper Functions

In [117]:
from typing import Tuple, Union
from numpy.typing import ArrayLike


def make_map(x):
    if type(x) == str:
        return x, x
    elif type(x) == tuple:
        return x
    else:
        raise Exception("Entry must of type string or tuple")


def make_set_clause(prop_names: ArrayLike, element_name='n', item_name='rec'):
    clause_list = []
    for prop_name in prop_names:
        clause_list.append(f'{element_name}.{prop_name} = {item_name}.{prop_name}')
    return 'SET ' + ', '.join(clause_list)


def make_node_merge_query(node_key_name: str, node_label: str, cols: ArrayLike):
    template = f'''UNWIND $recs AS rec\nMERGE(n:{node_label} {{{node_key_name}: rec.{node_key_name}}})'''
    prop_names = [x for x in cols if x != node_key_name]
    if len(prop_names) > 0:
        template = template + '\n' + make_set_clause(prop_names)
    return template + '\nRETURN count(n) AS nodeLoadedCount'


def make_rel_merge_query(source_target_labels: Union[Tuple[str, str], str],
                         source_node_key: Union[Tuple[str, str], str],
                         target_node_key: Union[Tuple[str, str], str],
                         rel_type: str,
                         cols: ArrayLike,
                         rel_key: str = None):
    source_target_label_map = make_map(source_target_labels)
    source_node_key_map = make_map(source_node_key)
    target_node_key_map = make_map(target_node_key)

    merge_statement = f'MERGE(s)-[r:{rel_type}]->(t)'
    if rel_key is not None:
        merge_statement = f'MERGE(s)-[r:{rel_type} {{{rel_key}: rec.{rel_key}}}]->(t)'

    template = f'''\tUNWIND $recs AS rec
    MATCH(s:{source_target_label_map[0]} {{{source_node_key_map[0]}: rec.{source_node_key_map[1]}}})
    MATCH(t:{source_target_label_map[1]} {{{target_node_key_map[0]}: rec.{target_node_key_map[1]}}})\n\t''' + merge_statement
    prop_names = [x for x in cols if x not in [rel_key, source_node_key_map[1], target_node_key_map[1]]]
    if len(prop_names) > 0:
        template = template + '\n\t' + make_set_clause(prop_names, 'r')
    return template + '\n\tRETURN count(r) AS relLoadedCount'


def chunks(xs, n=10_000):
    n = max(1, n)
    return [xs[i:i + n] for i in range(0, len(xs), n)]


def load_nodes(gds: GraphDataScience, node_df: pd.DataFrame, node_key_col: str, node_label: str, chunk_size=10_000):
    records = node_df.to_dict('records')
    print(f'======  loading {node_label} nodes  ======')
    total = len(records)
    print(f'staging {total:,} records')
    query = make_node_merge_query(node_key_col, node_label, node_df.columns.copy())
    cumulative_count = 0
    for recs in chunks(records, chunk_size):
        res = gds.run_cypher(query, params={'recs': recs})
        cumulative_count += res.iloc[0, 0]
        print(f'Loaded {cumulative_count:,} of {total:,} nodes')


def load_rels(gds: GraphDataScience,
              rel_df: pd.DataFrame,
              source_target_labels: Union[Tuple[str, str], str],
              source_node_key: Union[Tuple[str, str], str],
              target_node_key: Union[Tuple[str, str], str],
              rel_type: str,
              rel_key: str = None,
              chunk_size=10_000):
    records = rel_df.to_dict('records')
    print(f'======  loading {rel_type} relationships  ======')
    total = len(records)
    print(f'staging {total:,} records')
    query = make_rel_merge_query(source_target_labels, source_node_key,
                                 target_node_key, rel_type, rel_df.columns.copy(), rel_key)
    cumulative_count = 0
    for recs in chunks(records, chunk_size):
        res = gds.run_cypher(query, params={'recs': recs})
        cumulative_count += res.iloc[0, 0]
        print(f'Loaded {cumulative_count:,} of {total:,} relationships')

### Load Nodes

In [118]:
%%time
load_nodes(gds, department_df, 'departmentNo', 'Department')

staging 266 records
Loaded 266 of 266 nodes
CPU times: user 7.49 ms, sys: 2.01 ms, total: 9.5 ms
Wall time: 5.92 s


In [119]:
%%time
load_nodes(gds, product_df, 'productCode', 'Product')

staging 8,044 records
Loaded 8,044 of 8,044 nodes
CPU times: user 181 ms, sys: 8.21 ms, total: 189 ms
Wall time: 15.2 s


In [120]:
%%time
load_nodes(gds, article_df.drop(columns=['productCode', 'departmentNo']), 'articleId', 'Article')

staging 13,351 records
Loaded 10,000 of 13,351 nodes
Loaded 13,351 of 13,351 nodes
CPU times: user 264 ms, sys: 10.8 ms, total: 275 ms
Wall time: 10.9 s


In [121]:
%%time
load_nodes(gds, customer_df, 'customerId', 'Customer')

staging 1,000 records
Loaded 1,000 of 1,000 nodes
CPU times: user 22.1 ms, sys: 2.77 ms, total: 24.9 ms
Wall time: 2.78 s


### Load Relationships

In [122]:
%%time
load_rels(gds, article_df[['articleId', 'departmentNo']], source_target_labels=('Article', 'Department'),
          source_node_key='articleId', target_node_key='departmentNo',
          rel_type='FROM_DEPARTMENT')

staging 13,351 records
Loaded 10,000 of 13,351 relationships
Loaded 13,351 of 13,351 relationships
CPU times: user 81.6 ms, sys: 4.18 ms, total: 85.8 ms
Wall time: 11.5 s


In [123]:
%%time
load_rels(gds, article_df[['articleId', 'productCode']], source_target_labels=('Article', 'Product'),
          source_node_key='articleId',target_node_key='productCode',
          rel_type='VARIANT_OF')

staging 13,351 records
Loaded 10,000 of 13,351 relationships
Loaded 13,351 of 13,351 relationships
CPU times: user 92.2 ms, sys: 3.94 ms, total: 96.1 ms
Wall time: 7.57 s


In [124]:
%%time
load_rels(gds, transaction_df, source_target_labels=('Customer', 'Article'),
          source_node_key='customerId', target_node_key='articleId',
          rel_type='PURCHASED')

staging 23,199 records
Loaded 10,000 of 23,199 relationships
Loaded 20,000 of 23,199 relationships
Loaded 23,199 of 23,199 relationships
CPU times: user 382 ms, sys: 12.4 ms, total: 394 ms
Wall time: 16.5 s


### Convert Transaction Dates

In [125]:
gds.run_cypher('''
MATCH (:Customer)-[r:PURCHASED]->()
SET r.tDat = date(r.tDat)
''')

## Vector Search
In this Section We will build Text Embeddings of Product and demonstrate how to leverage the Neo4j vector index for vector search.

### Creating Text Embeddings

In [126]:
def load_embedding_model(embedding_model_name: str):
    if embedding_model_name == "openai":
        embeddings = OpenAIEmbeddings()
        dimension = 1536
        print("Embedding Model: openai")
    elif embedding_model_name == "aws":
        embeddings = BedrockEmbeddings()
        dimension = 1536
        print("Embedding Model: aws")
    else:
        embeddings = SentenceTransformerEmbeddings(
            model_name="all-MiniLM-L6-v2", cache_folder="/embedding_model")
        print("Embedding Model: sentence transformer")
        dimension = 384
    return embeddings, dimension

In [127]:
embedding_model, dimension = load_embedding_model(EMBEDDING_MODEL)

Embedding Model: openai


In [133]:
product_emb_df = product_df[['productCode', 'prodName', 'productTypeName', 'productGroupName', 'garmentGroupName', 'detailDesc']]
product_emb_df = product_emb_df[product_emb_df.detailDesc.notnull()]

In [134]:
def create_doc(row):
    return f'''
##Product
Name: {row.prodName}
Type: {row.productTypeName}
Group: {row.productGroupName}
Garment Type: {row.garmentGroupName}
Description: {row.detailDesc}
'''

product_emb_df['text'] = product_emb_df.apply(create_doc, axis=1)
product_emb_df = product_emb_df.drop(columns=['prodName', 'productTypeName', 'productGroupName', 'garmentGroupName', 'detailDesc'])
product_emb_df

Unnamed: 0,productCode,text
0,108775,\n##Product\nName: Strap top\nType: Vest top\n...
1,110065,\n##Product\nName: OP T-shirt (Idro)\nType: Br...
2,111565,\n##Product\nName: 20 den 1p Stockings\nType: ...
3,111586,\n##Product\nName: Shape Up 30 den 1p Tights\n...
4,111593,\n##Product\nName: Support 40 den 1p Tights\nT...
...,...,...
8039,936862,\n##Product\nName: EDC Marla dress\nType: Dres...
8040,936979,\n##Product\nName: Class Filippa Necklace\nTyp...
8041,937138,\n##Product\nName: Flirty Albin bracelet pk\nT...
8042,942187,\n##Product\nName: ED Sasha tee\nType: T-shirt...


In [137]:
%%time

count = 0
embeddings = []
for docs in chunks(product_emb_df.text, n=500):
    count += len(docs)
    print(f'Embedded {count} of {product_emb_df.shape[0]}')
    embeddings.extend(embedding_model.embed_documents(docs))

Embedded 500 of 8018
Embedded 1000 of 8018
Embedded 1500 of 8018
Embedded 2000 of 8018
Embedded 2500 of 8018
Embedded 3000 of 8018
Embedded 3500 of 8018
Embedded 4000 of 8018
Embedded 4500 of 8018
Embedded 5000 of 8018
Embedded 5500 of 8018
Embedded 6000 of 8018
Embedded 6500 of 8018
Embedded 7000 of 8018
Embedded 7500 of 8018
Embedded 8000 of 8018
Embedded 8018 of 8018
CPU times: user 1.81 s, sys: 206 ms, total: 2.02 s
Wall time: 25.8 s


In [139]:
product_emb_df['textEmbedding'] = embeddings
product_emb_df

Unnamed: 0,productCode,text,textEmbedding
0,108775,\n##Product\nName: Strap top\nType: Vest top\n...,"[-0.03165835786785922, 0.010735359633722455, -..."
1,110065,\n##Product\nName: OP T-shirt (Idro)\nType: Br...,"[-0.012618998246342304, 0.006922205543577324, ..."
2,111565,\n##Product\nName: 20 den 1p Stockings\nType: ...,"[-0.004641780213279706, -0.0002350378369905564..."
3,111586,\n##Product\nName: Shape Up 30 den 1p Tights\n...,"[-0.004515952400410634, -0.00448439686023379, ..."
4,111593,\n##Product\nName: Support 40 den 1p Tights\nT...,"[-0.011169078802241895, 0.002781850762048061, ..."
...,...,...,...
8039,936862,\n##Product\nName: EDC Marla dress\nType: Dres...,"[-0.02878346256636124, 0.01268400503673369, -0..."
8040,936979,\n##Product\nName: Class Filippa Necklace\nTyp...,"[-0.016470516922509736, 0.01204499569297766, -..."
8041,937138,\n##Product\nName: Flirty Albin bracelet pk\nT...,"[-0.032225362565817726, 0.02832645888555731, -..."
8042,942187,\n##Product\nName: ED Sasha tee\nType: T-shirt...,"[-0.008373181850647395, 0.007136464695464776, ..."


#### Create Vector Property

In [140]:
records = product_emb_df[['productCode', 'textEmbedding', 'text']].to_dict('records')
print(f'======  loading Product text embeddings ======')
total = len(records)
print(f'staging {total:,} records')
cumulative_count = 0
for recs in chunks(records, n=100):
    res = gds.run_cypher('''
    UNWIND $recs AS rec
    MATCH(n:Product {productCode: rec.productCode})
    CALL db.create.setNodeVectorProperty(n, "textEmbedding", rec.textEmbedding)
    SET n.text = rec.text
    RETURN count(n) AS propertySetCount
    ''', params={'recs': recs})
    cumulative_count += res.iloc[0, 0]
    print(f'Set {cumulative_count:,} of {total:,} text embeddings')

staging 8,018 records
Set 100 of 8,018 text embeddings
Set 200 of 8,018 text embeddings
Set 300 of 8,018 text embeddings
Set 400 of 8,018 text embeddings
Set 500 of 8,018 text embeddings
Set 600 of 8,018 text embeddings
Set 700 of 8,018 text embeddings
Set 800 of 8,018 text embeddings
Set 900 of 8,018 text embeddings
Set 1,000 of 8,018 text embeddings
Set 1,100 of 8,018 text embeddings
Set 1,200 of 8,018 text embeddings
Set 1,300 of 8,018 text embeddings
Set 1,400 of 8,018 text embeddings
Set 1,500 of 8,018 text embeddings
Set 1,600 of 8,018 text embeddings
Set 1,700 of 8,018 text embeddings
Set 1,800 of 8,018 text embeddings
Set 1,900 of 8,018 text embeddings
Set 2,000 of 8,018 text embeddings
Set 2,100 of 8,018 text embeddings
Set 2,200 of 8,018 text embeddings
Set 2,300 of 8,018 text embeddings
Set 2,400 of 8,018 text embeddings
Set 2,500 of 8,018 text embeddings
Set 2,600 of 8,018 text embeddings
Set 2,700 of 8,018 text embeddings
Set 2,800 of 8,018 text embeddings
Set 2,900 of 8,0

#### Create Vector Index

In [141]:
%%time

gds.run_cypher(f'CALL db.index.vector.createNodeIndex("product-text-embeddings", "Product", "textEmbedding", {dimension}, "cosine")')

# wait for full index creation (timeout after 300 seconds)
gds.run_cypher('CALL db.awaitIndex("product-text-embeddings", 300)')

CPU times: user 5.43 ms, sys: 2.86 ms, total: 8.29 ms
Wall time: 1min 7s


### Vector Search Using Cypher

In [142]:
#search_prompt = 'denim jeans, loose fit, high-waist'
search_prompt = 'Oversized Sweaters'

In [143]:
query_vector = embedding_model.embed_query(search_prompt)
print(f'query vector length: {len(query_vector)}')
print(f'query vector sample: {query_vector[:10]}')

query vector length: 1536
query vector sample: [-0.023104330485734847, -0.013533096737628139, 0.0017341322934734697, -0.033839277865887946, -0.024241894442575403, 0.011094523575023529, -0.006240261242417779, -0.0017504765389690473, 0.005606101350692616, -0.024660309734965396]


In [170]:
gds.run_cypher('''
CALL db.index.vector.queryNodes("product-text-embeddings", 10, $queryVector)
YIELD node AS product, score
RETURN product.productCode AS productCode,
    product.text AS text,
    score
''', params={'queryVector': query_vector})

Unnamed: 0,productCode,text,score
0,842001,\n##Product\nName: Betsy Oversized\nType: Swea...,0.942261
1,817392,\n##Product\nName: Japp oversize sweater\nType...,0.939759
2,709418,\n##Product\nName: DIV Anni oversize hood\nTyp...,0.928798
3,860833,\n##Product\nName: Runar sweater\nType: Sweate...,0.926939
4,893141,\n##Product\nName: Sandy\nType: Sweater\nGroup...,0.925745
5,812167,\n##Product\nName: Macy\nType: Sweater\nGroup:...,0.925685
6,690623,\n##Product\nName: Simba\nType: Sweater\nGroup...,0.924674
7,557247,\n##Product\nName: Petar Sweater(1)\nType: Swe...,0.923877
8,687934,\n##Product\nName: Sister off shoulder\nType: ...,0.92315
9,594834,\n##Product\nName: Dolly hood\nType: Sweater\n...,0.923108


### Vector Search Using Langchain

We can also do this with langchain which is a recommended approach going forward.  To do this we use the Neo4jVector class and call the method to sert it up from an existing index in the graph.

In [43]:
from langchain.vectorstores.neo4j_vector import Neo4jVector

In [145]:
kg_vector_search = Neo4jVector.from_existing_index(
    embedding=embedding_model,
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    index_name='product-text-embeddings')

In [146]:
res = kg_vector_search.similarity_search(search_prompt, k=10)
res

[Document(page_content='\n##Product\nName: Betsy Oversized\nType: Sweater\nGroup: Garment Upper body\nGarment Type: Knitwear\nDescription: Oversized, V-neck jumper in a soft, loose knit containing some wool and alpaca wool. Dropped shoulders, long, wide sleeves, wide ribbing around the neckline, cuffs and hem, and slits in the sides.\n', metadata={'prodName': 'Betsy Oversized', 'garmentGroupName': 'Knitwear', 'garmentGroupNo': 1003, 'productCode': 842001, 'productTypeName': 'Sweater', 'productTypeNo': 252, 'detailDesc': 'Oversized, V-neck jumper in a soft, loose knit containing some wool and alpaca wool. Dropped shoulders, long, wide sleeves, wide ribbing around the neckline, cuffs and hem, and slits in the sides.', 'productGroupName': 'Garment Upper body'}),
 Document(page_content='\n##Product\nName: Japp oversize sweater\nType: Sweater\nGroup: Garment Upper body\nGarment Type: Jersey Basic\nDescription: Relaxed-fit top in sweatshirt fabric with a ribbed turtle neck, dropped shoulders

In [147]:
# Visualize as a dataframe
pd.DataFrame([{'document': d.page_content} for d in res])

Unnamed: 0,document
0,\n##Product\nName: Betsy Oversized\nType: Swea...
1,\n##Product\nName: Japp oversize sweater\nType...
2,\n##Product\nName: DIV Anni oversize hood\nTyp...
3,\n##Product\nName: Runar sweater\nType: Sweate...
4,\n##Product\nName: Sandy\nType: Sweater\nGroup...
5,\n##Product\nName: Macy\nType: Sweater\nGroup:...
6,\n##Product\nName: Simba\nType: Sweater\nGroup...
7,\n##Product\nName: Petar Sweater(1)\nType: Swe...
8,\n##Product\nName: Sister off shoulder\nType: ...
9,\n##Product\nName: Dolly hood\nType: Sweater\n...


## Semantic Search with Context
Using Explicit Relationships in EN terprise data


Above we see how you can use the vector index to find semantic similar products in user searches.  but there is a rich graph full of other information in it. Lets leverage our knowledge graph to make this better

An important piece of information expressed in this graph, but not directly in the documents, is customer purchasing behavior.  We can use A Cypher Query to make recommendations without any document behavior. this is similar to collaborative filtering but generalized to purchase history (not necessarily rating based)

#### Example Purchase History

Consider the below customer

In [171]:
CUSTOMER_ID = "daae10780ecd14990ea190a1e9917da33fe96cd8cfa5e80b67b4600171aa77e0"
print('Customer Purchase History')
gds.run_cypher('''
    MATCH(c:Customer {customerId: $customerId})-[:PURCHASED]->(:Article)
    -[:VARIANT_OF]->(p:Product)
    RETURN p.productCode AS productCode,
        p.prodName AS prodName,
        p.productTypeName AS productTypeName,
        p.garmentGroupName AS garmentGroupName,
        p.detailDesc AS detailDesc,
        count(*) AS purchaseCount
    ORDER BY purchaseCount DESC
''', params={'customerId': CUSTOMER_ID})

Customer Purchase History


Unnamed: 0,productCode,prodName,productTypeName,garmentGroupName,detailDesc,purchaseCount
0,569974,DONT USE ROLAND HOOD,Hoodie,Jersey Basic,Top in sweatshirt fabric with a lined drawstri...,2
1,557247,Petar Sweater(1),Sweater,Jersey Basic,Oversized top in sturdy sweatshirt fabric with...,2
2,733027,Tove,Top,Jersey Fancy,Short top in soft cotton jersey with a round n...,1
3,753724,Rosemary,Dress,Dresses Ladies,Short dress in woven fabric with 3/4-length sl...,1
4,687016,DORIS CREW,Sweater,Jersey Fancy,Top in sweatshirt fabric with a motif on the f...,1
5,691072,JEKYL SWEATSHIRT,Sweater,Jersey Basic,Top in sweatshirt fabric with long raglan slee...,1
6,244267,Silver lake,Sweater,Knitwear,Purl-knit jumper in a cotton blend with a slig...,1
7,606711,Rylee flatform,Heeled sandals,Shoes,"Sandals with imitation suede straps, an elasti...",1
8,660519,Haven back detail,Bra,"Under-, Nightwear","Push-up bra in lace and mesh with underwired, ...",1
9,585480,Adore strapless push,Bra,"Under-, Nightwear",Strapless balconette bra in microfibre with un...,1


#### Graph Patterns For Retrieval Query

In [102]:
# This is the example Pattern we can use to predict likely customer preferences based on collaborative behavior
gds.run_cypher('''
    MATCH(c:Customer {customerId: $customerId})-[:PURCHASED]->(:Article)
    <-[:PURCHASED]-(:Customer)-[:PURCHASED]->(:Article)
    -[:VARIANT_OF]->(p:Product)
    RETURN p.productCode AS productCode,
        p.prodName AS prodName,
        p.productTypeName AS productTypeName,
        p.garmentGroupName AS garmentGroupName,
        p.detailDesc AS detailDesc,
        count(*) AS score
    ORDER BY score DESC LIMIT 10
''', params={'customerId': CUSTOMER_ID})

Unnamed: 0,productCode,prodName,productTypeName,garmentGroupName,detailDesc,score
0,685816,RONNY REG RN T-SHIRT,T-shirt,Jersey Basic,Round-necked T-shirt in soft cotton jersey.,17
1,599580,Timeless Midrise Brief,Swimwear bottom,Swimwear,Fully lined bikini bottoms with a mid waist an...,16
2,684209,Simple as That Triangle Top,Bikini top,Swimwear,"Lined, non-wired, triangle bikini top with a w...",13
3,688537,Simple as that Cheeky Tanga,Swimwear bottom,Swimwear,Fully lined bikini bottoms with a mid waist an...,12
4,778064,Claudine t-shirt,T-shirt,Jersey Basic,Fitted top in soft organic cotton jersey with ...,9
5,656719,Serpente HW slim trouser,Trousers,Trousers,Tailored trousers in a stretch weave with two ...,6
6,615141,Juanos,Top,Jersey Fancy,"Long-sleeved, fitted top in ribbed jersey with...",6
7,776237,Shake it in Balconette.,Bikini top,Swimwear,"Lined balconette bikini top with underwired, p...",6
8,685813,PETAR SWEATSHIRT,Sweater,Jersey Basic,Top in soft sweatshirt fabric. Slightly looser...,6
9,685814,RICHIE HOOD,Hoodie,Jersey Basic,Hoodie in sweatshirt fabric made from a cotton...,5


In [173]:
# This is the example Pattern we can use to predict likely customer preferences based on collaborative behavior
kg_personalized_vector_search = Neo4jVector.from_existing_index(
    embedding=embedding_model,
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    index_name='product-text-embeddings',
    retrieval_query=f"""
    WITH node AS product, score AS searchScore

    OPTIONAL MATCH(product)<-[:VARIANT_OF]-(:Article)<-[:PURCHASED]-(:Customer)
    -[:PURCHASED]->(a:Article)<-[:PURCHASED]-(:Customer {{customerId: '{CUSTOMER_ID}'}})

    WITH count(a) AS purchaseScore, product.text AS text, searchScore, product.productCode AS productCode
    RETURN text,
        (1+purchaseScore)*searchScore AS score,
        {{productCode: productCode, purchaseScore:purchaseScore, searchScore:searchScore}} AS metadata
    ORDER BY purchaseScore DESC, searchScore DESC LIMIT 15
    """)

In [174]:
res = kg_personalized_vector_search.similarity_search(search_prompt, k=100)

# Visualize as a dataframe
pd.DataFrame([{'productCode': d.metadata['productCode'],
               'document': d.page_content,
               'searchScore': d.metadata['searchScore'],
               'purchaseScore': d.metadata['purchaseScore']} for d in res])

Unnamed: 0,productCode,document,searchScore,purchaseScore
0,677930,\n##Product\nName: Queen Sweater\nType: Sweate...,0.918759,4
1,516712,\n##Product\nName: Jess oversize LS\nType: Top...,0.918976,2
2,669682,\n##Product\nName: Irma sweater\nType: Sweater...,0.917643,2
3,640755,\n##Product\nName: Allen Sweater\nType: Sweate...,0.922795,1
4,687948,\n##Product\nName: Annie Oversized Hood\nType:...,0.921786,1
5,709991,\n##Product\nName: SISTER OL\nType: Sweater\nG...,0.921149,1
6,687856,\n##Product\nName: Jacket Oversize\nType: Jack...,0.919922,1
7,686265,\n##Product\nName: Family Crew Ladies\nType: S...,0.91864,1
8,845520,\n##Product\nName: Dolls Printed\nType: Sweate...,0.917355,1
9,674826,\n##Product\nName: Fine knit\nType: Sweater\nG...,0.917115,1


In [172]:
# OPTIONAL version without Langchain
# gds.run_cypher('''
# CALL db.index.vector.queryNodes("product-text-embeddings", 100, $queryVector)
# YIELD node AS product, score AS searchScore
#
# OPTIONAL MATCH(product)<-[:VARIANT_OF]-(:Article)<-[:PURCHASED]-(:Customer)
# -[:PURCHASED]->(a:Article)<-[:PURCHASED]-(:Customer {customerId: $customerId})
#
# WITH product.text AS text, count(a) AS purchaseScore, searchScore, product.productCode AS productCode
# RETURN text, (1+purchaseScore)*searchScore AS score, productCode, purchaseScore, searchScore
# ORDER BY purchaseScore DESC, searchScore DESC LIMIT 15
# ''', params={'queryVector': embedding_model.embed_query("Oversized Sweater"), 'customerId': CUSTOMER_ID})

## KG Powered Inference for AI

We saw before how could use graph pattern matching to personalize search and make it more relevant.

TODO: We also saw how we could use similar tools to power semantic search and analytics on entities connected to documents

Graph pattern matching is very power and can work well in a lot of scenarios.

In addition to this, we also have Graph Data Science, which can allow as to enrich the current Knowledge graph with machine learning, that can
1. Provide addition information to improve relvancy of search results at scale
2. Provide additional inferences to GenAI

We will show an example of how this works using Node Embedding and K-Nearest Neighbor algorithms



### Embedding and KNN

In [None]:
pd.set_option('display.max_rows', 12)
pd.set_option('display.max_colwidth', 500)
pd.set_option('display.width', 0)

In [None]:
def clear_all_graphs():
    g_names = gds.graph.list().graphName.tolist()
    for g_name in g_names:
        g = gds.graph.get(g_name)
        g.drop()

#### Clear Past Analysis (If rerunning this Notebook)

In [None]:
clear_all_graphs()

In [None]:
gds.run_cypher('''
    MATCH(:Article)-[r:CUSTOMERS_ALSO_PURCHASED]->()
    CALL {
        WITH r
        DELETE r
    } IN TRANSACTIONS OF 1000 ROWS
    ''')

#### Apply GDS FastRP Node Embeddings and K-Nearest Neighbor (KNN) Similarity

First, apply a graph projection to structure the portion of the graph we need in an optimized in-memory format for graph ML.

In [None]:
%%time
# graph projection
g, _ = gds.graph.project('proj',['Customer', 'Article', 'Product'],
                         {'PURCHASED':{'orientation':'UNDIRECTED'}, 'VARIANT_OF':{'orientation':'UNDIRECTED'}},
                         readConcurrency=4)

CPU times: user 2.2 ms, sys: 1.31 ms, total: 3.51 ms
Wall time: 431 ms


Next, we will generate node embeddings for similarity calculation.  In this case, we will use FastRP (Fast Random Projection) which is a fast, scalable, and robust embedding algorithm. FastRP calculates embeddings using probabilistic sampling and linear algebra.

In [None]:
%%time
# embeddings (writing back Article embeddings in case we want to introspect later)
gds.fastRP.mutate(g, mutateProperty='embedding', embeddingDimension=128, randomSeed=7474, concurrency=4)
gds.graph.writeNodeProperties(g, ['embedding'], ['Article'])

CPU times: user 4.06 ms, sys: 1.65 ms, total: 5.71 ms
Wall time: 4.27 s


writeMillis                 1358
graphName                   proj
nodeProperties       [embedding]
propertiesWritten          21596
Name: 0, dtype: object

This is what the node embeddings look like:

In [None]:
gds.run_cypher('MATCH(n:Article) RETURN n.articleId, n.embedding LIMIT 3')

Unnamed: 0,n.articleId,n.embedding
0,108775015,"[0.07331004738807678, 0.17926183342933655, -0.07301197201013565, -0.049210064113140106, 0.16503135859966278, -0.15523408353328705, -0.10822394490242004, -0.02722267620265484, 0.014662966132164001, 0.180541530251503, -0.08186649531126022, 0.23589280247688293, -0.1366392821073532, 0.11961854994297028, -0.02922603115439415, 0.04286094754934311, 0.04672681540250778, 0.086881123483181, -0.08363135904073715, -0.05791114270687103, -0.07598546147346497, 0.009629392065107822, -0.014021783135831356, 0..."
1,108775044,"[0.07349064946174622, 0.0967743992805481, -0.038152799010276794, -0.07673473656177521, 0.22684209048748016, -0.0060753063298761845, 0.16888433694839478, -0.11225447058677673, 0.09668536484241486, 0.03443354740738869, -0.0913434624671936, 0.14213915169239044, -0.1980677992105484, 0.15377040207386017, -0.1279836893081665, -0.03599829971790314, -0.0967002660036087, 0.14265014231204987, 0.03261440247297287, -0.11126242578029633, 0.06963685154914856, 0.2129615694284439, 0.006723809987306595, 0.12..."
2,110065001,"[-0.044018954038619995, 0.08427691459655762, -0.02065543830394745, 0.09351789951324463, -0.14247480034828186, -0.07779307663440704, -0.08055605739355087, 0.05282197892665863, 0.10809334367513657, -0.10579649358987808, -0.09285050630569458, 0.005664631258696318, 0.012895558029413223, -0.24933193624019623, -0.042424216866493225, 0.18484896421432495, -0.1699368953704834, -0.012699112296104431, 0.08738791197538376, -0.0476275309920311, 0.012392558157444, 0.024528082460165024, 0.080563485622406, ..."


Finally, we can do our similarity inference with K-Nearest Neighbor (KNN) and write back to the graph.
We will use a slightly low cutoff of 0.75 similarity score to extend the result size for exploration.  We can provide a higher cutoff at query time if needed.

In [None]:
%%time
# KNN
_ = gds.knn.write(g, nodeProperties=['embedding'], nodeLabels=['Article'],
              writeRelationshipType='CUSTOMERS_ALSO_PURCHASED', writeProperty='score',
             # sampleRate=1.0,maxIterations=1000,similarityCutoff=0.75,
              concurrency=4)
_

Knn:   0%|          | 0/100 [00:00<?, ?%/s]

CPU times: user 1.07 s, sys: 220 ms, total: 1.29 s
Wall time: 1min 30s


ranIterations                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          91
didConverge                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   

In [None]:
# clear graph projection once done
g.drop()

graphName                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               proj
database                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   

### Recommendations Based on Search Prompt

In [None]:
search_res_df = gds.run_cypher('''
CALL db.index.vector.queryNodes("product-text-embeddings", 10, $queryVector)
YIELD node AS product, score
RETURN product.prodName AS prodName,
    product.productTypeName AS productTypeName,
    product.garmentGroupName AS garmentGroupName,
    product.detailDesc AS detailDesc,
    score
    ORDER BY score DESC
''', params={'queryVector': query_vector})

In [None]:
recommendation_res_df =gds.run_cypher('''
CALL db.index.vector.queryNodes("product-text-embeddings", 5, $queryVector)
YIELD node AS search_res_product, score AS search_score
MATCH(search_res_product)<-[:VARIANT_OF]-(a0)-[s:CUSTOMERS_ALSO_PURCHASED]->(a)-[:VARIANT_OF]-(p)
WITH p.prodName AS prodName,
    p.productTypeName AS productTypeName,
    p.garmentGroupName AS garmentGroupName,
    p.detailDesc AS detailDesc,
    sum(s.score) AS aggScore,
    sum(search_score) AS search_score
RETURN prodName, productTypeName, garmentGroupName, search_score*aggScore AS score, detailDesc
    ORDER BY aggScore DESC LIMIT 10
''', params={'queryVector': query_vector})

In [None]:
print(f'User Search Prompt: "{search_prompt}"\n\n')
print('Search Results:\n')
display(search_res_df)
print('\nUser May Also Be Interested In:\n')
display(recommendation_res_df)

User Search Prompt: "winter sweater with zipper"


Search Results:



Unnamed: 0,prodName,productTypeName,garmentGroupName,detailDesc,score
0,Catfish Zip,Sweater,Knitwear,"Soft, textured-knit jumper with a ribbed stand-up collar, zip at the top, dropped shoulders and ribbing at the cuffs and hem.",0.933434
1,Yolo Zip L/S,Sweater,Knitwear,"Jumper in a soft, rib knit with a stand-up collar, a visible zip at the front and long sleeves.",0.931779
2,BANANA HALF ZIP SWEATER,Sweater,Knitwear,"Jumper in a soft rib knit with a high stand-up collar, zip at the top, long raglan sleeves with elbow patches, and ribbing at the cuffs and hem.",0.930935
3,Ben zip hoodie,Sweater,Knitwear,"Fine-knit jacket in a soft viscose blend with a drawstring hood, zip and front pockets. Gently dropped shoulders and ribbing at the cuffs and hem.",0.930924
4,Raven Half Zip Sweater,Sweater,Jersey Fancy,"Sports top in stretch, fast-drying functional fabric with a stand-up collar, zip at the front with a chin guard, and a yoke at the back. Low dropped shoulders and long sleeves with thumbholes at the cuffs. Rounded and slightly longer at the back.",0.93011
5,Southern Sweater,Sweater,Knitwear,"Jumper in a soft rib knit with pointelle details, low dropped shoulders and long sleeves. Double ribbed trim around the neckline, and ribbing at the cuffs and hem.",0.929089
6,Zorro half-zip sweater,Sweater,Jersey Basic,"Short top in sweatshirt fabric with a high, ribbed stand-up collar with a zip at the front. Dropped shoulders, long sleeves and ribbing at the cuffs and hem. Soft brushed inside.",0.928748
7,BAY BLOCK STRIPE ZIP-UP,Sweater,Knitwear,"Jumper in a soft, fine-knit modal and cotton blend with a ribbed stand-up collar, zip at the top and long sleeves.",0.928719
8,Yolo Zip LS,Sweater,Knitwear,"Fitted jumper in a soft, rib knit with a turtle neck, visible zip at the top and long sleeves.",0.92771
9,Håkan half zip knit,Sweater,Knitwear,"Jumper in a soft cotton knit with a high, ribbed stand-up collar and zip at the top. Long sleeves, and ribbing at the cuffs and hem.",0.927483



User May Also Be Interested In:



Unnamed: 0,prodName,productTypeName,garmentGroupName,score,detailDesc
0,Niffler Trousers,Trousers,Woven/Jersey/Knitted mix Baby,3.248279,"Pull-on trousers in washed, stretch twill with an elasticated, drawstring waist, front pockets, a fake back pocket and tapered legs."
1,Sunspot Seamless Crop Top,Vest top,Jersey Fancy,2.949513,"Short, fitted sports top with a racer back and elasticated hem. The sports top is designed with the minimum number of seams for a more comfortable fit and increased mobility."
2,STINA 3p boxer,Underwear bottom,"Under-, Nightwear",2.818649,Boxer briefs in cotton jersey with an elasticated waist and lined gusset.
3,CC COSMO dress BG,Dress,Jersey Basic,0.814822,"Dress in soft sweatshirt fabric with an embroidered motif at the top, long sleeves, twisted seams at the front and side pockets. Ribbing around the neckline, cuffs and hem and short slits at the hem. Slightly longer at the back. Soft brushed inside. The dress is made partly from recycled cotton."
4,Cappucino Brazilian High Waist,Underwear bottom,"Under-, Nightwear",0.814637,"Brazilian briefs in lace and mesh with a mid waist, lined gusset, wide sides and high cut at the back."
5,ED Duno 2p.,Top,Jersey Fancy,0.81436,"Long-sleeved tops in soft, organic cotton jersey with a slightly wider neckline."
6,Cannes Crew,Sweater,"Under-, Nightwear",0.813946,"Round-necked jumper in a soft, fine-knit viscose blend with long sleeves, short slits in the sides and ribbing around the neckline, cuffs and hem. Slightly longer at the back."
7,Berry utility denim Trs,Trousers,Trousers,0.813191,"Ankle-length jeans in sturdy cotton denim with a high waist, zip fly and button, front and back pockets, leg pockets with a flap and concealed press-stud and tapered legs with a tab and metal buckle at the ankles. The cotton content of the jeans is partly recycled."
8,LOGG Amaretto blazer,Blazer,Outdoor,0.812981,"Jacket woven in a linen blend with narrow notch lapels, a button at the front and welt front pockets with a flap. Lined."
9,JANINE SWEATER NEON,Sweater,Jersey Fancy,0.812765,"Top in soft sweatshirt fabric with a text print motif, dropped shoulders, long sleeves, ribbing around the neckline and cuffs and a drawstring at the hem. Soft brushed inside."


### Personalized Recommendation

In [None]:
CUSTOMER_ID = "daae10780ecd14990ea190a1e9917da33fe96cd8cfa5e80b67b4600171aa77e0"

#### Example Purchase History

In [None]:
gds.run_cypher('''
    MATCH(c:Customer {customerId:$customerId})-[r:PURCHASED]->(a)-[:VARIANT_OF]->(p:Product)
        RETURN a.articleId AS articleId,
        a.prodName AS prodName,
        r.tDat AS purchaseDate,
        a.productTypeName AS productTypeName,
        p.detailDesc AS detailDesc
        ORDER BY purchaseDate DESC
''', params = { 'customerId':CUSTOMER_ID, 'queryVector': query_vector})

Unnamed: 0,articleId,prodName,purchaseDate,productTypeName,detailDesc
0,753724004,Rosemary,2019-08-05,Dress,"Short dress in woven fabric with 3/4-length sleeves with an opening and ties at the cuffs, and a gently rounded hem. Unlined."
1,733027002,Tove,2019-08-05,Top,"Short top in soft cotton jersey with a round neckline, short sleeves and a seam at the hem with a decorative knot detail at the front."
2,713577001,Malte r-neck,2019-06-27,Sweater,"Jumper in soft, patterned, fine-knit cotton with ribbing around the neckline, cuffs and hem."
3,731142001,Lead Superskinny,2019-06-27,Trousers,"Chinos in stretch twill with a zip fly and button, side pockets, welt back pockets and skinny legs."
4,687016004,DORIS CREW,2019-06-22,Sweater,"Top in sweatshirt fabric with a motif on the front and ribbing around the neckline, cuffs and hem. Soft brushed inside."
...,...,...,...,...,...
20,620425001,Karin headband,2018-10-12,Hairband,Wide hairband in cotton jersey with a twisted detail.
21,662328001,Survivor,2018-10-12,Blouse,"Straight-cut blouse in a crêpe weave with a collar, concealed buttons down the front and fake flap front pockets. Yoke with a pleat at the back, long sleeves with pleats and buttoned cuffs, and a straight cut hem with slits in the sides."
22,682848003,Skinny RW Ankle Milo Zip,2018-10-12,Trousers,"5-pocket, ankle-length jeans in washed stretch denim with hard-worn details, a regular waist, zip fly and button, and skinny legs with a zip at the hems. The jeans are made partly from recycled cotton."
23,691072002,JEKYL SWEATSHIRT,2018-10-12,Sweater,"Top in sweatshirt fabric with long raglan sleeves and ribbing around the neckline, cuffs and hem. Soft brushed inside. Regular fit."


#### Personalized Product Recommendations

In [None]:
personalized_res_df = gds.run_cypher('''
    MATCH(c:Customer {customerId:$customerId})-[r:PURCHASED]->(a0)
    WITH a0
    MATCH(a0)-[s:CUSTOMERS_ALSO_PURCHASED]->(a)-[:VARIANT_OF]->(p:Product)
    WITH p, sum(s.score) AS aggRecScore
    WITH p, aggRecScore, gds.similarity.cosine($queryVector, p.textEmbedding) AS cosineSimilarity
    RETURN p.productCode AS productCode,
        aggRecScore,
        cosineSimilarity as searchScore,
        p.productTypeName AS productType,
        p.prodName AS name,
        p.detailDesc AS description
        ORDER BY searchScore DESC LIMIT 10
''', params = {'customerId':CUSTOMER_ID, 'queryVector': query_vector})

In [None]:
print(f'User Search Prompt: "{search_prompt}"\n\n')
print('Search Results:\n')
display(search_res_df)
print('\nUser May Also Be Interested In:\n')
display(personalized_res_df)

User Search Prompt: "winter sweater with zipper"


Search Results:



Unnamed: 0,prodName,productTypeName,garmentGroupName,detailDesc,score
0,Catfish Zip,Sweater,Knitwear,"Soft, textured-knit jumper with a ribbed stand-up collar, zip at the top, dropped shoulders and ribbing at the cuffs and hem.",0.933434
1,Yolo Zip L/S,Sweater,Knitwear,"Jumper in a soft, rib knit with a stand-up collar, a visible zip at the front and long sleeves.",0.931779
2,BANANA HALF ZIP SWEATER,Sweater,Knitwear,"Jumper in a soft rib knit with a high stand-up collar, zip at the top, long raglan sleeves with elbow patches, and ribbing at the cuffs and hem.",0.930935
3,Ben zip hoodie,Sweater,Knitwear,"Fine-knit jacket in a soft viscose blend with a drawstring hood, zip and front pockets. Gently dropped shoulders and ribbing at the cuffs and hem.",0.930924
4,Raven Half Zip Sweater,Sweater,Jersey Fancy,"Sports top in stretch, fast-drying functional fabric with a stand-up collar, zip at the front with a chin guard, and a yoke at the back. Low dropped shoulders and long sleeves with thumbholes at the cuffs. Rounded and slightly longer at the back.",0.93011
5,Southern Sweater,Sweater,Knitwear,"Jumper in a soft rib knit with pointelle details, low dropped shoulders and long sleeves. Double ribbed trim around the neckline, and ribbing at the cuffs and hem.",0.929089
6,Zorro half-zip sweater,Sweater,Jersey Basic,"Short top in sweatshirt fabric with a high, ribbed stand-up collar with a zip at the front. Dropped shoulders, long sleeves and ribbing at the cuffs and hem. Soft brushed inside.",0.928748
7,BAY BLOCK STRIPE ZIP-UP,Sweater,Knitwear,"Jumper in a soft, fine-knit modal and cotton blend with a ribbed stand-up collar, zip at the top and long sleeves.",0.928719
8,Yolo Zip LS,Sweater,Knitwear,"Fitted jumper in a soft, rib knit with a turtle neck, visible zip at the top and long sleeves.",0.92771
9,Håkan half zip knit,Sweater,Knitwear,"Jumper in a soft cotton knit with a high, ribbed stand-up collar and zip at the top. Long sleeves, and ribbing at the cuffs and hem.",0.927483



User May Also Be Interested In:



Unnamed: 0,productCode,aggRecScore,searchScore,productType,name,description
0,640755,0.741509,0.850858,Sweater,Allen Sweater,"Top in sweatshirt fabric made from a cotton blend with a stand-up collar with a zip at the front and a kangaroo pocket. Dropped shoulders, long sleeves and ribbing at the cuffs and hem. Soft brushed inside."
1,697980,0.751843,0.839161,Sweater,Nicky,"Jumper in a soft, fine knit with ribbing around the neckline, cuffs and hem."
2,934053,0.768552,0.838258,Sweater,Chain,"Jumper in a soft knit containing some wool. Ribbed neckline decorated with a metal chain at the front, long, wide sleeves and ribbing at the cuffs and hem. The polyester content of the jumper is recycled."
3,935858,1.554463,0.834631,Sweater,Piper sweatshirt,"Top in soft sweatshirt fabric with a rounded, frill-trimmed collar in woven fabric, an opening with a button at the back of the neck, dropped shoulders and long, wide sleeves with ribbing at the cuffs. Soft brushed inside. The polyester content of the sweatshirt is recycled."
4,656401,10.335908,0.834584,Sweater,PASTRY SWEATER,"Jumper in soft, textured-knit cotton with long raglan sleeves and ribbing around the neckline, cuffs and hem."
5,557247,8.560783,0.83333,Sweater,Petar Sweater,"Oversized top in sturdy sweatshirt fabric with dropped shoulders and ribbing around the neckline, cuffs and hem. Soft brushed inside."
6,714826,0.747204,0.832716,Sweater,Shaun,"Jumper in a soft knit containing some wool with ribbing at the top, a wide V-neck front and back, long raglan sleeves and ribbing around the neckline, cuffs and hem."
7,674826,0.709781,0.828895,Sweater,Fine knit,"Jumper in fine-knit slub cotton with dropped shoulders, long sleeves and roll edges around the neckline, cuffs and hem."
8,851339,0.770838,0.828868,Sweater,Papaya Hood,"Oversized jacket in soft sweatshirt fabric with a small embroidered text detail on the front. Jersey-lined, drawstring hood, a zip down the front, front pockets, ribbing at the cuffs and wide ribbing at the hem. Soft brushed inside."
9,244267,6.236464,0.825758,Sweater,Silver lake,Purl-knit jumper in a cotton blend with a slightly wider neckline and 3/4-length sleeves.


## LLM For Generating Grounded Content

Let's use an LLM to automatically generate content for targeted marketing campaigns grounded with our knowledge graph using the above tools.
Here is a quick example for generating promotional emails. but you can create all sorts of content with this!

In [None]:
# Import relevant libraries
from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate
from langchain.chat_models import ChatOpenAI, BedrockChat
from langchain.schema import StrOutputParser

In [16]:
#load LLM

def load_llm(llm_name: str):
    if llm_name == "gpt-4":
        print("LLM: Using GPT-4")
        return ChatOpenAI(temperature=0, model_name="gpt-4", streaming=True)
    elif llm_name == "gpt-3.5":
        print("LLM: Using GPT-3.5")
        return ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo", streaming=True)
    elif llm_name == "claudev2":
        print("LLM: ClaudeV2")
        return BedrockChat(
            model_id="anthropic.claude-v2",
            model_kwargs={"temperature": 0.0, "max_tokens_to_sample": 1024},
            streaming=True,
        )
    print("LLM: Using GPT-3.5")
    return ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo", streaming=True)


llm = load_llm(LLM)

LLM: Using GPT-3.5


### Create Knowledge Graph Stores for Retrieval
To ground our content Generation we need to define retrievers to pull information from our knowledge graph.  Let's make two stores:
1. Personalized Search Retriever (`kg_personalized_search`): Based on recent customer searches and purchase history, pull relevant products
2. Recommendations retriever (`kg_recommendations`): Based on recent customer searches, what else may we recommend to them?

In [131]:
# This will be a function so we can change per customer id
def kg_personalized_search_gen(customer_id):
    return Neo4jVector.from_existing_index(
        embedding=embedding_model,
        url=NEO4J_URI,
        username=NEO4J_USERNAME,
        password=NEO4J_PASSWORD,
        index_name='product-text-embeddings',
        retrieval_query=f"""
    WITH node as product, score as searchScore
    OPTIONAL MATCH(product)<-[:VARIANT_OF]-(:Article)<-[r:CUSTOMERS_ALSO_PURCHASED]-(:Article)
    <-[:PURCHASED]-(:Customer {{customerId: '{customer_id}'}})
    WITH  product, searchScore, sum(r.score) AS purchaseScore
    RETURN '##Product:\n' +
        'prodName: ' + product.prodName + '\n' +
        'productTypeName' + product.productTypeName + '\n' +
        'garmentGroupName' + product.garmentGroupName + '\n' +
        'detailDesc: ' + product.detailDesc + '\n' +
        'url: ' + 'https://representative-domain/product/' + product.productCode
        AS text, (1.0 + purchaseScore)*searchScore AS score, {{source: 'https://representative-domain/product/' + product.productCode}} AS metadata
    ORDER BY score DESC LIMIT 5
    """
    )

In [137]:
kg_recommendations = Neo4jVector.from_existing_index(
    embedding=embedding_model,
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    index_name='product-text-embeddings',
    retrieval_query=f"""
    WITH node as product, score as searchScore
    MATCH(product)<-[:VARIANT_OF]-(:Article)-[r:CUSTOMERS_ALSO_PURCHASED]->(:Article)-[:VARIANT_OF]-(recProduct)
    WITH  recProduct, searchScore, sum(r.score) AS recScore
    RETURN '##Product:\n' +
        'prodName: ' + recProduct.prodName + '\n' +
        'productTypeName' + recProduct.productTypeName + '\n' +
        'garmentGroupName' + recProduct.garmentGroupName + '\n' +
        'detailDesc: ' + recProduct.detailDesc + '\n' +
        'url: ' + 'https://representative-domain/product/' + recProduct.productCode
        AS text, (1.0 + recScore)*searchScore AS score, {{source: 'https://representative-domain/product/' + recProduct.productCode}} AS metadata
    ORDER BY score DESC LIMIT 5
    """
)

### Prompt Engineering
Now let's define our prompts. We will combine two together:
1. A system prompt which, in this case tells the LLM how to generated the message
2. Human prompt: In this case just wraps the search prompt entered by the customer

This will allow us to pass the customer search to the retrievers, but then also to the LLM for addition context when drafting the message.

In [138]:
general_system_template = '''
You are a personal assistant named Sally for a fashion, home, and beauty company called HRM.
write an email to {customerName}, one of your customers, to promote and summarize products relevant for them given the current season / time of year: {timeOfYear} .
Please only mention the Products listed below. Do not come up with or add any new products to the list.
Each product description comes with a "url" field. make sure to link to the url with descriptive name text for each product so the customer can easily find them.

---
# Relevant Products:
{searchProds}

# Customer May Also Be Interested In:
{recProds}
---
'''
general_user_template = "{searchPrompt}"
messages = [
    SystemMessagePromptTemplate.from_template(general_system_template),
    HumanMessagePromptTemplate.from_template(general_user_template),
]
prompt = ChatPromptTemplate.from_messages(messages)

### Create a Chain
Now let's put a chain together that will leverage the retrievers, prompts, and LLM model. This is where Langchain shines, putting RAG together in a simple way.

In addition to the personalized search and recommendations context, we will allow for som other parameters

1. `customerName`: Ordinarily this will be pulled from Neo4j, but it has been scrubbed from the data for obvious reasons so we will provide our own name here.
2. `timeOfYear`: The time of year as a date, season, month, etc. the LLM can tailor the language appropriately.

You can potentially add other creative parameters here to help the LLM write relevant messages.


In [139]:
# Helper function
def format_docs(docs):
    return "\n\n".join([d.page_content for d in docs])

def chain_gen(customer_id):
    return ({'searchProds': (lambda x:x['searchPrompt']) | kg_personalized_search_gen(customer_id).as_retriever(k=100) | format_docs,
              'recProds': (lambda x:x['searchPrompt']) | kg_recommendations.as_retriever(k=5) | format_docs,
              'customerName': lambda x:x['customerName'],
              'timeOfYear': lambda x:x['timeOfYear'],
              "searchPrompt":  lambda x:x['searchPrompt']}
             | prompt
             | llm
             | StrOutputParser())

### Examples Runs

In [140]:
CUSTOMER_ID = "daae10780ecd14990ea190a1e9917da33fe96cd8cfa5e80b67b4600171aa77e0"

chain = chain_gen(CUSTOMER_ID)

In [141]:
print(chain.invoke({'searchPrompt':"Oversized Sweaters", 'customerName':'Alex Smith', 'timeOfYear':'Nov, 2023'}))

Dear Alex Smith,

I hope this email finds you well. As the weather gets colder, it's the perfect time to update your wardrobe with cozy and stylish oversized sweaters. At HRM, we have a wide range of options that I think you'll love. Let me introduce you to some of our top picks for this season:

1. Betsy Oversized Sweater: This oversized, V-neck jumper is made from a soft, loose knit containing wool and alpaca wool. It features dropped shoulders, long, wide sleeves, wide ribbing around the neckline, cuffs, and hem, as well as slits on the sides. You can find it [here](https://representative-domain/product/842001).

2. Japp Oversize Sweater: This relaxed-fit top is made from sweatshirt fabric and features a ribbed turtle neck, dropped shoulders, long, wide sleeves, and ribbing at the cuffs and hem. It is longer at the back, adding a stylish touch. You can find it [here](https://representative-domain/product/817392).

3. HUBBY Oversized Sweater: Made from sturdy sweatshirt fabric, this 

In [142]:
print(chain.invoke({'searchPrompt':"western boots", 'customerName':'Alex Smith', 'timeOfYear':'Nov, 2023'}))

Dear Alex Smith,

I hope this email finds you well. As the weather gets cooler and the holiday season approaches, I wanted to share some exciting products that I think you'll love. 

First, we have the Brush Western Boot. These ankle boots in imitation leather feature covered elastication at the sides and decorative tabs and buckles. With a comfortable heel height of approximately 4 cm, they are perfect for both style and comfort. You can find them [here](https://representative-domain/product/806766).

Another great option is the Lindsay Western Boot. Made from imitation suede, these ankle boots have a zip on one side and a loop at the back. The fabric linings, insoles, and rubber soles ensure maximum comfort. The heel measures 7.5 cm, adding a touch of elegance to your outfit. You can check them out [here](https://representative-domain/product/673580).

If you're looking for something with a pointed toe, the Wilma Western Boot SPEED is a great choice. These ankle boots feature elastic

Feel free to experiment and try more!

### Demo App
Now lets use the above tools to create a demo app with Gradio.  We will need to make a couple more functions, but otherwise easy to fire up from a Notebook!

In [143]:
# Create a means to generate and cache chains...so we can quickly try different customer ids
personalized_search_chain_cache = dict()
def get_chain(customer_id):
    if customer_id in personalized_search_chain_cache:
        return personalized_search_chain_cache[customer_id]
    chain = chain_gen(customer_id)
    personalized_search_chain_cache[customer_id] = chain
    return chain

In [144]:
import gradio as gr

def message_generator(*x):
    chain = get_chain(x[0])
    return chain.invoke({'searchPrompt':x[3], 'customerName':x[2], 'timeOfYear': x[1]})

customer_id = gr.Textbox(value=CUSTOMER_ID, label="Customer ID")
time_of_year = gr.Textbox(value="Nov, 2023", label="Time Of Year")
customer_name = gr.Textbox(value='Alex Smith', label="Customer Name")
search_prompt = gr.Textbox(value='Oversized Sweaters', label="Search Prompt(s)")
message_result = gr.Markdown( label="Message")

demo = gr.Interface(fn=message_generator, inputs=[customer_id, time_of_year, customer_name, search_prompt], outputs=message_result)
demo.launch(share=True, debug=True)

Running on local URL:  http://127.0.0.1:7873

Could not create share link. Missing file: /Users/zachblumenfeld/opt/anaconda3/envs/Downloads/lib/python3.10/site-packages/gradio/frpc_darwin_arm64_v0.2. 

Please check your internet connection. This can happen if your antivirus software blocks the download of this file. You can install manually by following these steps: 

1. Download this file: https://cdn-media.huggingface.co/frpc-gradio-0.2/frpc_darwin_arm64
2. Rename the downloaded file to: frpc_darwin_arm64_v0.2
3. Move the file to this location: /Users/zachblumenfeld/opt/anaconda3/envs/Downloads/lib/python3.10/site-packages/gradio





Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7873 <> None




## Wrap Up