### IMPORTS AND CONN ESTABLISHMENT

In [1]:
import os
import csv
import pandas as pd
from openai import OpenAI
from neo4j import GraphDatabase
# import genai
from dotenv import load_dotenv
load_dotenv()
import tiktoken
import yaml
# from utils.embeddings_utils import get_embedding

In [2]:
class Neo4jConnection:
    """Small wrapper around a Neo4j driver that runs a query and returns a list.

    The instance can also be used as a context manager
    (``with Neo4jConnection(uri, user, pwd) as conn:``); the driver is then
    closed automatically when the block exits.
    """

    def __init__(self, uri, user, pwd):
        """Create the driver for ``uri`` using ``user``/``pwd`` as basic auth.

        A failure while creating the driver is printed, not raised; the
        driver stays ``None`` and a later ``query`` call fails its assertion.
        """
        self.__uri = uri
        self.__user = user
        self.__pwd = pwd
        self.__driver = None
        try:
            self.__driver = GraphDatabase.driver(self.__uri, auth=(self.__user, self.__pwd))
        except Exception as e:
            print("Failed to create the driver:", e)

    def __enter__(self):
        """Return the connection itself so it can be bound with ``as``."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the driver on leaving the ``with`` block; never suppress errors."""
        self.close()
        return False

    def close(self):
        """Close the underlying driver if one was created."""
        if self.__driver is not None:
            self.__driver.close()

    def query(self, query, parameters=None, db=None):
        """Run ``query`` and return its records as a list.

        Args:
            query: Query text handed to ``session.run``.
            parameters: Optional query parameters handed to ``session.run``.
            db: Optional database name; when ``None`` the driver's default
                session is used.

        Returns:
            ``list`` of whatever ``session.run`` yields, or ``None`` when the
            query failed (the error is printed instead of raised).

        Raises:
            AssertionError: If the driver was never created.
        """
        assert self.__driver is not None, "Driver not initialized!"
        session = None
        response = None
        try:
            session = self.__driver.session(database=db) if db is not None else self.__driver.session()
            response = list(session.run(query, parameters))
        except Exception as e:
            # Best-effort by design: report the failure and hand back None.
            print("Query failed:", e)
        finally:
            # Always release the session, including after a failed query.
            if session is not None:
                session.close()
        return response

In [32]:
# Load the notebook settings (API key, Neo4j credentials, user question) from YAML.
with open("../config.yaml", "r") as file:
    config = yaml.safe_load(file)

# OpenAI credentials.
openai_api_key = config["openai"]["api_key"]

# Neo4j connection details, read from the "neo4j" section in one pass.
neo4j_uri, neo4j_user, neo4j_password = (
    config["neo4j"][field] for field in ("uri", "user", "password")
)

# The question stored under user -> quest (later assigned to user_query).
user = config['user']['quest']


In [4]:
# py2neo provides the Graph client created below.
from py2neo import Graph, Node, Relationship, NodeMatcher

## Connection Strings
# Wrapper connection built on the neo4j driver, followed by a quick node count.
conn = Neo4jConnection(neo4j_uri, neo4j_user, neo4j_password)
conn.query("MATCH (n) RETURN COUNT(n)")

# Establish the py2neo connection with the same URI and credentials
graph = Graph(
    neo4j_uri,
    auth=(neo4j_user, neo4j_password),
)

### Checking the connection by looking over the data in the Database
graph.run("MATCH (n) return count(n)")

count(n)
46206


### CREATING EMBEDDINGS

In [9]:
# Embedding settings.
# NOTE(review): none of these three names is referenced by the cells visible in
# this notebook; the embedding calls pass 'text-embedding-ada-002' directly,
# which differs from embedding_model below — confirm which model is intended.
embedding_model = "text-embedding-3-small"
embedding_encoding = "cl100k_base"
max_tokens = 8000 

In [10]:
def get_plots(limit=None):
    """Fetch the ARTICLE nodes that have a ``detail_desc`` from Neo4j.

    Connects with the ``neo4j_uri`` / ``neo4j_user`` / ``neo4j_password``
    values loaded from the config file, so this reads from the same database
    the rest of the notebook writes to.

    Args:
        limit: Optional maximum number of rows to return. ``None`` (the
            default) returns every matching article.

    Returns:
        The records returned by ``driver.execute_query``; each one exposes
        ``id``, ``title``, ``product_group_name``, ``product_type_name`` and
        ``detail_desc``.
    """
    query = """MATCH (m:ARTICLE) WHERE m.detail_desc IS NOT NULL
    RETURN m.article_id AS id,m.title as title, m.product_group_name AS product_group_name, m.product_type_name as product_type_name, m.detail_desc as detail_desc """
    parameters = {}
    if limit is not None:
        # Passed as a query parameter rather than formatted into the text.
        query += "LIMIT $limit"
        parameters["limit"] = int(limit)

    driver = GraphDatabase.driver(neo4j_uri, auth=(neo4j_user, neo4j_password))
    try:
        driver.verify_connectivity()
        movies, _summary, _keys = driver.execute_query(query, parameters_=parameters)
    finally:
        # Release the driver even when connectivity or the query fails.
        driver.close()
    return movies

In [17]:
def generate_embeddings(file_name, limit=None, model='text-embedding-ada-002'):
    """Embed every article description and write the vectors to a CSV file.

    Args:
        file_name: Path of the CSV to (over)write. The file gets a header row
            and two columns, ``title`` and ``embedding``.
        limit: Passed through to ``get_plots``.
        model: Name of the OpenAI embedding model. The default is the model
            this function previously hard-coded.
    """
    # Fetch the rows before opening the output so that a database failure
    # does not truncate an existing embeddings file.
    movies = get_plots(limit=limit)
    llm = OpenAI(api_key=openai_api_key)
    fieldnames = ['title', 'embedding']

    # The context manager closes the file even if an API call raises part-way.
    with open(file_name, 'w', encoding='utf8', newline='') as csvfile_out:
        output_plot = csv.DictWriter(csvfile_out, fieldnames=fieldnames)
        output_plot.writeheader()
        for movie in movies:
            desc = f"{movie['detail_desc']}"
            response = llm.embeddings.create(
                input=desc,
                model=model)
            output_plot.writerow({
                'title': movie['title'],
                'embedding': response.data[0].embedding
            })
# Runs the full export: one embeddings API request per article returned by get_plots.
generate_embeddings('detail-plot-embeddings.csv')

In [None]:
# Reload the file written by generate_embeddings (columns: 'title', 'embedding').
df = pd.read_csv("detail-plot-embeddings.csv")

#### Adding embeddings back to the database

In [36]:
# Store each embedding on the ARTICLE node(s) whose title matches the CSV row.
for index, row in df.iterrows():
    # pandas represents an empty CSV cell as NaN, never None, so an
    # `is not None` test lets missing embeddings through; pd.notna catches both.
    if pd.notna(row['embedding']):
        graph.run("""
        MATCH (a:ARTICLE {title: $title})
        SET a.embedding = apoc.convert.fromJsonList($embedding)
        """, title=row['title'], embedding=row['embedding'])

In [56]:
# query = """MATCH (n:ARTICLE)
# RETURN n.article_id AS ArticleID, apoc.meta.type(n.article_id) AS Type
# LIMIT 10;"""

# conn.query(query)

In [47]:
# result = graph.run("MATCH (a:ARTICLE) RETURN a.article_id AS article_id, a.embedding AS embedding").data()

# # Prepare embeddings for similarity search
# article_ids = [row['article_id'] for row in result]
# embeddings = [row['embedding'] for row in result]


In [46]:
# from sklearn.metrics.pairwise import cosine_similarity
# import numpy as np

# # Compute similarity scores
# similarities = cosine_similarity([Ques_embedding], embeddings)

# # Rank articles by similarity
# sorted_indices = np.argsort(-similarities[0])  # Descending order
# top_articles = [(article_ids[i], similarities[0][i]) for i in sorted_indices[:5]]  # Top 5
# print("Top Recommendations:", top_articles)


### CREATING VECTOR INDEX
##### to perform vector search 

In [37]:
# Create (if it does not exist yet) a vector index named moviePlots1 over
# ARTICLE.embedding, configured for 1536-dimensional vectors and cosine similarity.
# NOTE(review): the retrieval cells further down query an index named 'plot',
# not 'moviePlots1' — confirm which index is meant to be used.
query_vectorIndex = """CREATE VECTOR INDEX moviePlots1 IF NOT EXISTS
FOR (m:ARTICLE)
ON m.embedding
OPTIONS {indexConfig: {
 `vector.dimensions`: 1536,
 `vector.similarity_function`: 'cosine'
}}"""
conn.query(query_vectorIndex)

[]

#### Check all the existing vector indexes

In [88]:
# List only the indexes whose type is VECTOR, with their state and population percentage.
query_checkout_vectors = """SHOW INDEXES  YIELD id, name, type, state, populationPercent WHERE type = "VECTOR" """
graph.run(query_checkout_vectors)

id,name,type,state,populationPercent
2,moviePlots1,VECTOR,ONLINE,100.0
3,plot,VECTOR,ONLINE,100.0


### User Query

In [33]:
user_query = "I have a green hooodie what can it go good with"

In [34]:
from openai import OpenAI  # NOTE: duplicate of the import in the first cell
# Initialize the OpenAI client with the key loaded from the config file
client = OpenAI(api_key = openai_api_key)
def get_embedding(text, model="text-embedding-ada-002"):
    """Return the embedding vector for ``text``.

    Newline characters in ``text`` are turned into spaces, the result is sent
    as a one-element input list through the module-level ``client``, and the
    embedding of the first returned item is handed back.
    """
    single_line = " ".join(text.split("\n"))
    response = client.embeddings.create(input=[single_line], model=model)
    first_item = response.data[0]
    return first_item.embedding
text_input = user_query
# Embed the user question; the vector is used as the search key for retrieval
Ques_embedding = get_embedding(text_input, model='text-embedding-ada-002')

# Now the 'Ques_embedding' variable contains the embedding for the text input
# print(Ques_embedding)


In [35]:
### Retrieval Using Neo4j

In [36]:
# NOTE(review): openaikey holds the literal text 'openai_api_key', not the key
# loaded from config; neither it nor OPENAI_ENDPOINT is used in the visible cells.
openaikey = 'openai_api_key'
OPENAI_ENDPOINT = 'https://api.openai.com/v1/'
# Ask the vector index named 'plot' for the 35 nodes nearest to $Ques_embedding.
query_string = """
    CALL db.index.vector.queryNodes(
        'plot', 
        35, 
        $Ques_embedding
        ) YIELD node AS movie, score
    RETURN  movie.article_id, movie.title, movie.detail_desc, score
"""

# conn.query returns None when the query fails (it prints the error instead of raising).
result = conn.query(query_string, {'Ques_embedding':Ques_embedding
                
})
# print(result)
# Column labels, listed in the same order as the RETURN clause above.
column_names = ['article_id','title', 'detail_desc', 'similarity_score'] 
answer  = pd.DataFrame(result, columns=column_names)

answer.head(20)

Unnamed: 0,article_id,title,detail_desc,similarity_score
0,626445001,RUSSEL CROP HOODIE,Young Girl Hooded top in printed sweatshirt fa...,0.895126
1,921918001,ED Dores hoodie,"H&M+ Long, wide hoodie in soft sweatshirt fabr...",0.894989
2,845734001,LOGG Atwood. 1,H&M+ Long-sleeved jumper in a soft knit contai...,0.894684
3,545376001,L.O.G.G. Basic Hood,H&M+ Long-sleeved top in sweatshirt fabric wit...,0.894424
4,711719001,LOGG Velour hood,H&M+ Long-sleeved top in velour with a drawstr...,0.894272
5,735412001,Basic Hood jkt set,"Baby Essentials & Complements Top, hooded jack...",0.894043
6,735412005,Hood jkt set,"Baby Essentials & Complements Top, hooded jack...",0.894043
7,935547001,ED Nice hoodie,H&M+ Long hoodie in supersoft cotton jersey wi...,0.893768
8,885803002,Cortina Fleece Jacket (C),Ladies H&M Sport Hooded jacket in soft fleece ...,0.893738
9,885803005,Cortina Fleece Jacket,Ladies H&M Sport Hooded jacket in soft fleece ...,0.893738


### Prep to prompt

In [37]:
### trial 01


import pandas as pd  # NOTE: duplicate of the import in the first cell

def prepare_context_from_df(retrieved_items_df):
    """Format retrieved items as a numbered list for the prompt.

    Args:
        retrieved_items_df: DataFrame with ``title`` and ``detail_desc``
            columns.

    Returns:
        One ``"<n>. <title>: <detail_desc>\\n"`` line per row, concatenated in
        row order; an empty string for an empty frame. Numbering always starts
        at 1 and follows row position, whatever the frame's index holds.
    """
    titles = retrieved_items_df['title']
    descriptions = retrieved_items_df['detail_desc']
    # enumerate() numbers by position, so filtered, re-sorted or string-indexed
    # frames are still listed as 1, 2, 3, ...
    return ''.join(
        f"{position}. {title}: {detail_desc}\n"
        for position, (title, detail_desc) in enumerate(zip(titles, descriptions), start=1)
    )

# Build the prompt context from the first three rows of the retrieval result


context = prepare_context_from_df(answer.head(3))
print(context)



1. RUSSEL CROP HOODIE: Young Girl Hooded top in printed sweatshirt fabric with long sleeves, ribbed cuffs and a raw-edge hem.
2. ED Dores hoodie: H&M+ Long, wide hoodie in soft sweatshirt fabric. Double-layered drawstring hood, dropped shoulders, long sleeves, discreet side pockets and ribbing at the cuffs and hem. Soft brushed inside. The polyester content of the hoodie is recycled.
3. LOGG Atwood. 1: H&M+ Long-sleeved jumper in a soft knit containing some wool with a drawstring hood, low dropped shoulders, slits in the sides and ribbing at the cuffs and hem. Slightly longer at the back.



### Generation Phase

In [45]:
def generate_answer(user_query, context, model="gpt-3.5-turbo", max_tokens=200):
    """Ask the chat model for a pairing recommendation.

    Args:
        user_query: The question as typed by the user.
        context: Text listing the retrieved items, inserted into the prompt.
        model: Chat model name; defaults to the previously hard-coded model.
        max_tokens: Upper bound on the reply length; defaults to the
            previously hard-coded value.

    Returns:
        The reply text of the first choice with surrounding whitespace
        removed, or an empty string when the reply carries no text.
    """
    messages = [
        {"role": "system", "content": "You are a helpful assistant providing product recommendations."},
        {"role": "user", "content": f"User Query: {user_query}\nContext:\n{context}\nProvide a recommendation for what can be paired with the user's described item."}
    ]
    # Generate the response through the module-level OpenAI client.
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        max_tokens=max_tokens,
    )

    # The content may be None; fall back to "" so .strip() cannot fail.
    content = response.choices[0].message.content
    return (content or "").strip()

# Example usage
# NOTE(review): user_query is replaced here by the question read from the config
# (`user`), while `context` was retrieved with the embedding of the earlier
# user_query value — confirm both refer to the same question.
user_query = user
context = context  # no-op self-assignment
# NOTE: this rebinds `answer` from the retrieval DataFrame to the model's reply text.
answer = generate_answer(user_query, context)



In [46]:
answer

'Pairing Suggestion: RUSSEL CROP HOODIE - High-waisted distressed jeans and ankle boots.'

In [47]:
def parse_result_to_dict(result_text):
    """Parse ``"Key: value"`` lines into a dictionary.

    Each line is split at its first ``": "`` only, so a value may itself
    contain ``": "``. Lines without that separator (blank lines, free text)
    are skipped instead of raising.

    Args:
        result_text: Text with one ``Key: value`` pair per line.

    Returns:
        Dict mapping each stripped key to its stripped value; a later
        duplicate key overwrites an earlier one.
    """
    result_dict = {}
    for line in result_text.strip().split("\n"):
        key, separator, value = line.partition(": ")
        if not separator:
            # Not a "Key: value" line — nothing to record.
            continue
        result_dict[key.strip()] = value.strip()
    return result_dict
result_dict = parse_result_to_dict(answer)

In [49]:
result_dict

{'Pairing Suggestion': 'RUSSEL CROP HOODIE - High-waisted distressed jeans and ankle boots.'}

In [50]:
# Embed the model's suggestion so it can be used as the next vector-search key.
# NOTE(review): this relies on the reply containing a 'Pairing Suggestion: ...' line,
# which the prompt in generate_answer does not ask for — a differently worded reply raises KeyError.
answer_embedding = get_embedding(result_dict['Pairing Suggestion'], model='text-embedding-ada-002')

In [53]:
# NOTE(review): openaikey holds the literal text 'openai_api_key', not the key
# loaded from config; neither it nor OPENAI_ENDPOINT is used in the visible cells.
openaikey = 'openai_api_key'
OPENAI_ENDPOINT = 'https://api.openai.com/v1/'
# Same vector search as the earlier retrieval cell, but asking for 50 neighbours.
query_string = """
    CALL db.index.vector.queryNodes(
        'plot', 
        50, 
        $Ques_embedding
        ) YIELD node AS movie, score
    RETURN  movie.article_id, movie.title, movie.detail_desc, score
"""

# The $Ques_embedding parameter is bound to answer_embedding here (the embedded
# suggestion), not to the embedding of the user's question.
result = conn.query(query_string, {'Ques_embedding':answer_embedding
                
})
# print(result)
# Column labels, listed in the same order as the RETURN clause above.
column_names = ['article_id','title', 'detail_desc', 'similarity_score'] 
# NOTE: rebinds `answer` from the model's reply text back to a DataFrame.
answer  = pd.DataFrame(result, columns=column_names)

answer

Unnamed: 0,article_id,title,detail_desc,similarity_score
0,571650001,Taylor Fancy Denim,"Divided Selected Low-rise, ankle-length jeans ...",0.927399
1,571650002,Taylor fancy denim Slim LW,"Divided Selected Low-rise, ankle-length jeans ...",0.927399
2,569498002,Alala Denim,"Divided Selected Low-rise, ankle-length jeans ...",0.922989
3,547871002,Perrie Fancy,"Divided Collection 5-pocket, ankle-length jean...",0.922928
4,741752001,Petite Perrie slim mom denim T,"Divided Collection 5-pocket, ankle-length jean...",0.922928
5,640021001,Perrie Denim Trash TRS,"Divided Collection 5-pocket, ankle-length jean...",0.922546
6,640021005,Perrie Trash HW Denim TRS,"Divided Collection 5-pocket, ankle-length jean...",0.922546
7,640021013,Perrie HW Denim Trash TRS,"Divided Collection 5-pocket, ankle-length jean...",0.922546
8,613279001,Perrie Denim 2 Fancy TRS,"Divided Collection 5-pocket, high-waisted, ank...",0.922455
9,874955001,BELLE FLARE CROP,"Kids Girl Cropped leggings in washed, superstr...",0.92218


In [56]:
answer.groupby('detail_desc').count()

Unnamed: 0_level_0,article_id,title,similarity_score
detail_desc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Contemporary Casual 5-pocket ankle-length jeans in washed stretch denim with hard-worn details, a regular waist, zip fly and button and super-skinny legs with raw-edge hems.",1,1,1
"Divided Asia keys 5-pocket, ankle-length jeans with a high, asymmetric waist, zip fly and button and straight-cut, slightly wider legs with raw-edge frayed hems.",1,1,1
"Divided Asia keys Ankle-length jeans in washed denim with worn details, a high waist, zip fly and button and straight, wide legs.",1,1,1
"Divided Collection 5-pocket ankle-length jeans in washed denim with embroidered and hard-worn details in a loose fit. High waist, low crotch, button fly and straight legs with raw-edge hems.",1,1,1
"Divided Collection 5-pocket, ankle-length jeans in washed denim made partly from recycled cotton decorated with rhinestones. Slightly looser fit with an extra-high waist and straight legs.",1,1,1
"Divided Collection 5-pocket, ankle-length jeans in washed denim with a high waist and gently tapered legs with raw-edge hems.",1,1,1
"Divided Collection 5-pocket, ankle-length jeans in washed denim with a high waist, button fly and gently tapered legs with raw-edge hems.",1,1,1
"Divided Collection 5-pocket, ankle-length jeans in washed denim with a high waist, zip fly and button and gently tapered legs.",2,2,2
"Divided Collection 5-pocket, ankle-length jeans in washed denim with hard-worn details. Extra-high waist, a button fly and gently tapered legs with raw-edge hems.",3,3,3
"Divided Collection 5-pocket, ankle-length jeans in washed stretch denim with hard-worn details, a high waist and gently tapered legs with raw-edge hems.",1,1,1
