In [3]:
import os
from dotenv import load_dotenv
load_dotenv()

True

In [4]:
# Langchain
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.chains.query_constructor.base import AttributeInfo

from langchain_pinecone import PineconeVectorStore
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

  from tqdm.autonotebook import tqdm


In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pinecone import Pinecone

In [54]:
loader = DirectoryLoader(
    path="./data",
    loader_cls=CSVLoader,
    show_progress=True)

docs = loader.load()


100%|██████████| 1/1 [00:00<00:00, 30.43it/s]


In [55]:

metadata_field_info = [
    AttributeInfo(name="label", description="Category of the product", type="string"),
    AttributeInfo(name="url", description="The official website", type="string"),
    AttributeInfo(name="brand", description="Brand of the product", type="string"),
    AttributeInfo(name="name", description="The name of the product", type="string"),
    AttributeInfo(name="price", description="The price of the product", type="string"),
    AttributeInfo(name="skin type", description="type of the skin either applicable to all types or normal or oily or dry", type="string"),
    AttributeInfo(name="concern", description="For what reason it should used", type="string")
    
]

In [58]:
fields_to_convert_list = ["concern"]
fields_to_convert_int = ["price"]

In [59]:
def convert_to_list(doc, field):
    if field in doc.metadata and doc.metadata[field]:
        doc.metadata[field] = doc.metadata[field].split(", ")

def convert_to_int(doc, field):
    if field in doc.metadata and doc.metadata[field]:
        try:
            doc.metadata[field] = int(doc.metadata[field])
        except ValueError:
            pass 

In [60]:
for doc in docs:

    page_content_dict = dict(line.split(": ", 1)
                            for line in doc.page_content.split("\n") if ": " in line)
    

    doc.page_content = 'Overview: ' + page_content_dict.get('Plot', '') + '. Keywords: ' + page_content_dict.get('Genre', '')
    

    doc.metadata = {field.name: page_content_dict.get(field.name) for field in metadata_field_info}
    
    for field in fields_to_convert_list:
        convert_to_list(doc, field)
    
    for field in fields_to_convert_int:
        convert_to_int(doc, field)

In [61]:
len(docs)

967

In [62]:
docs[111]

Document(page_content='Overview: . Keywords: ', metadata={'label': 'face-moisturisers', 'url': 'https://www.myntra.com/face-moisturisers/loreal/loreal-paris-skin-perfect-age-20-anti-imperfections--whitening-sustainable-cream-50g/625425/buy', 'brand': 'LOreal', 'name': 'Paris Skin Perfect Age 20+ Anti-Imperfections + Whitening Sustainable Cream 50g', 'price': 'â‚¹ 189', 'skin type': 'All', 'concern': ['pigmentation,blackheads and whiteheads,general care']})

In [63]:
docs[111].page_content

'Overview: . Keywords: '

In [64]:
docs[111].metadata

{'label': 'face-moisturisers',
 'url': 'https://www.myntra.com/face-moisturisers/loreal/loreal-paris-skin-perfect-age-20-anti-imperfections--whitening-sustainable-cream-50g/625425/buy',
 'brand': 'LOreal',
 'name': 'Paris Skin Perfect Age 20+ Anti-Imperfections + Whitening Sustainable Cream 50g',
 'price': 'â‚¹ 189',
 'skin type': 'All',
 'concern': ['pigmentation,blackheads and whiteheads,general care']}

In [65]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
documents = text_splitter.split_documents(docs)

In [66]:
len(documents)

967

In [67]:
pinecone_api_key = os.getenv("PINECONE_API_KEY")
pc = Pinecone(api_key=pinecone_api_key)

In [68]:
import google.generativeai as genai
from dotenv import load_dotenv
import os
from langchain_google_genai import ChatGoogleGenerativeAI

load_dotenv()
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
llm = ChatGoogleGenerativeAI(model="gemini-pro")

In [69]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

embeddings=GoogleGenerativeAIEmbeddings(model = "models/embedding-001")

In [70]:
temp = documents[0:31]

In [71]:
temp

[Document(page_content='Overview: . Keywords:', metadata={'label': 'face-moisturisers', 'url': 'https://www.myntra.com/face-moisturisers/lakme/lakme-absolute-perfect-radiance-skin-lightening-day-creme-15g/4384871/buy', 'brand': 'Lakme', 'name': 'Absolute Perfect Radiance Skin Lightening Day Creme 15g', 'price': 'â‚¹ 79', 'skin type': 'All', 'concern': ['general care,,']}),
 Document(page_content='Overview: . Keywords:', metadata={'label': 'face-moisturisers', 'url': 'https://www.myntra.com/face-moisturisers/biotique/biotique-bio-morning-nectar-flawless-sustainable-skin-moisturizer-190ml/1661465/buy', 'brand': 'Biotique', 'name': 'Bio Morning Nectar Flawless Sustainable Skin Moisturizer 190ml', 'price': 'â‚¹ 165', 'skin type': 'All', 'concern': ['uneven skin tone,hydration,dark spots']}),
 Document(page_content='Overview: . Keywords:', metadata={'label': 'face-moisturisers', 'url': 'https://www.myntra.com/face-moisturisers/nivea/nivea-unisex-aloe-hydration-skin-cream-100-ml/8529167/buy'

In [72]:
index_name = "recommend-products"
index = PineconeVectorStore.from_documents(temp,embeddings,index_name=index_name)
