In [1]:
# Imports

import pandas as pd
import pinecone


from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.pinecone import Pinecone
from langchain_community.vectorstores.pinecone import Pinecone
from langchain.docstore.document import Document

import warnings

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas/langchain;
# consider narrowing the filter once the notebook is stable.
warnings.filterwarnings("ignore")

import sys

# Add the parent directory to the import path; the `utils.*` imports that
# follow presumably resolve from there (depends on the kernel's working
# directory -- confirm).
sys.path.append("../")

from utils.config import PINECONE_CONFIG, AZURE_ADA_PARAMS, PINECONE_INDEX
from utils.helper import convert_to_unix_epoch
from utils.constants import PROPERTIES_NAMESPACE

  from tqdm.autonotebook import tqdm


In [2]:
# Embedding client built from the project's AZURE_ADA_PARAMS settings; it is
# passed as `embedding=` to the Pinecone upload at the end of the notebook.
# NOTE(review): presumably an Azure-hosted ADA embedding deployment, judging
# by the config name only -- confirm in utils/config.
embeddings = OpenAIEmbeddings(**AZURE_ADA_PARAMS)

In [3]:
# Initialise the pinecone client module with the project's PINECONE_CONFIG.
# NOTE(review): module-level `pinecone.init` appears to belong to the older
# pinecone-client API; newer client releases use a client class instead --
# confirm the pinned client version before upgrading.
pinecone.init(**PINECONE_CONFIG)

In [4]:
# Read only the first ROW_LIMIT rows of each listings export.
ROW_LIMIT = 20000

rental = pd.read_csv("../data/rental_full.csv", nrows=ROW_LIMIT)
resale = pd.read_csv("../data/resale_full.csv", nrows=ROW_LIMIT)

In [5]:
# Normalise the area column name in both frames: "atreainsqft" becomes
# "areainsqft", the name the metadata selection further down expects.
COLUMN_RENAMES = {"atreainsqft": "areainsqft"}

rental = rental.rename(columns=COLUMN_RENAMES)
resale = resale.rename(columns=COLUMN_RENAMES)

In [6]:
# Stack rental rows on top of resale rows and renumber the index 0..n-1.
listing_frames = [rental, resale]
df = pd.concat(listing_frames, ignore_index=True)

In [85]:
# Missing values in object-dtype (string) columns become empty strings.

string_columns = df.select_dtypes(include="object").columns

blank_filled = df[string_columns].fillna(value="")
df[string_columns] = blank_filled

In [86]:
# Missing values in every non-object column are replaced with that column's
# most frequent value.

numeric_columns = df.select_dtypes(exclude="object").columns

# mode() may return several rows when values tie; only the first row is used.
column_modes = df[numeric_columns].mode().iloc[0]
df[numeric_columns] = df[numeric_columns].fillna(value=column_modes)

In [87]:
# Derive `unix_timestamp` by passing each `createdon` value through the
# project helper `convert_to_unix_epoch`.
created_on = df["createdon"]
df["unix_timestamp"] = created_on.apply(convert_to_unix_epoch)

In [88]:
# Keep a handle on the original-case `summary` column; the frame is
# lower-cased next and this text is written back afterwards.
# NOTE(review): this is a reference, not a `.copy()`. It stays intact because
# the lower-casing step rebinds `df` to a new frame -- revisit if that step is
# ever changed to mutate the frame in place.
summary = df["summary"]

In [89]:
def _lower_if_str(value):
    """Return ``value`` lower-cased when it is a string, otherwise unchanged."""
    return value.lower() if isinstance(value, str) else value


def _lower_string_values(column):
    """Lower-case the string values of one column.

    Numeric and boolean columns cannot contain strings, so they are handed
    back untouched instead of paying a Python-level call per cell.
    """
    if pd.api.types.is_numeric_dtype(column):
        return column
    return column.map(_lower_if_str)


# Lower-case every string value in the frame, column by column. This replaces
# `DataFrame.applymap`, which newer pandas releases deprecate. `apply` returns
# a new frame, so objects taken from the old `df` are not modified.
df = df.apply(_lower_string_values)

In [90]:
# Put back the original-case summary text saved in `summary` before the
# lower-casing step; this column is later used as the documents' page content.
df["summary"] = summary

In [91]:
# Columns carried into each document's metadata.
METADATA_COLUMNS = [
    "propertyid",
    "cityname",
    "areainsqft",
    "number_of_rooms",
    "furnishing_status",
    "site_view",
    "locality",
    "property_class",
    "number_of_bathroom",
    "propertytype",
    "totalprice",
    "day",
    "year",
    "month",
    "unix_timestamp",
]

# Summary text per row, plus one metadata dict per row in the same row order.
summaries = df["summary"]
metadata_frame = df[METADATA_COLUMNS]
metadata = metadata_frame.to_dict(orient="records")

In [93]:
# Build one Document per listing: the summary text is the page content and
# the matching row dict is its metadata.
#
# The two sequences are paired by position with zip() rather than looked up
# with `summaries[i]`. On a Series an integer key is resolved against the
# index labels, so the lookup only matches the i-th metadata record while the
# frame keeps a default 0..n-1 index.
assert len(summaries) == len(metadata), "summaries and metadata are out of step"

docs = [
    Document(page_content=text, metadata=row_metadata)
    for text, row_metadata in zip(summaries, metadata)
]

In [96]:
# Send `docs` to the PINECONE_INDEX index under PROPERTIES_NAMESPACE, using
# `embeddings` as the embedding function. The call is left as the cell's last
# expression, so the returned vector store object is shown as the cell output.
# `Pinecone` is bound twice in the import cell; the later langchain_community
# import is the one in effect, as the output repr shows.
# NOTE(review): re-running this cell presumably embeds and uploads every
# document again -- confirm whether that is intended before a full re-run.
Pinecone.from_documents(
    documents=docs,
    embedding=embeddings,
    namespace=PROPERTIES_NAMESPACE,
    index_name=PINECONE_INDEX,
)

<langchain_community.vectorstores.pinecone.Pinecone at 0x7f0d046bd2d0>