In [16]:
import os, pandas as pd
from sentence_transformers import SentenceTransformer
from sqlalchemy import create_engine, text
from langchain.docstore.document import Document
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.embeddings.fastembed import FastEmbedEmbeddings

from langchain_iris import IRISVector

In [17]:
username = 'demo'
password = 'demo'
hostname = os.getenv('IRIS_HOSTNAME', 'localhost')
port = '1972' 
namespace = 'USER'
CONNECTION_STRING = f"iris://{username}:{password}@{hostname}:{port}/{namespace}"

In [18]:
engine = create_engine(CONNECTION_STRING)

In [19]:
# Load a pre-trained sentence transformer model. This model's output vectors are of size 384
model = SentenceTransformer('all-MiniLM-L6-v2') 



In [20]:
# with engine.connect() as conn:
#     with conn.begin():# Load 
#         sql = f"""
#                 CREATE TABLE UserReviews3 (
#     id INT PRIMARY KEY AUTO_INCREMENT,
#     description TEXT,
#     video TEXT
# )
#                 """
#         result = conn.execute(text(sql))


In [21]:
with engine.connect() as conn:
    with conn.begin():# Load 
        sql = f"""
                CREATE TABLE UserReviews0 (
    id INT PRIMARY KEY AUTO_INCREMENT,
    description TEXT,
    video TEXT,
    detail TEXT,
    description_vector VECTOR(DOUBLE, 384)
)
                """
        result = conn.execute(text(sql))

In [8]:
video_path = "../data/test.mov"
description = "a video taken at hackMIT for fun"
video_data =  video_path 


In [10]:
single_embedding =model.encode(description, normalize_embeddings=True).tolist()
print(single_embedding)

[-0.03906959295272827, -0.008342157118022442, -0.026566706597805023, -0.08878551423549652, 0.046086590737104416, 0.020340759307146072, 0.0887850821018219, -0.013871023431420326, 0.027919361367821693, 0.030563069507479668, 0.04630373790860176, -0.013182888738811016, -0.016081584617495537, 0.019593533128499985, -0.011516713537275791, -0.016755985096096992, 0.007793714292347431, 0.04765964299440384, -0.0044016470201313496, -0.024359963834285736, 0.029004549607634544, -0.11794460564851761, 0.048272520303726196, -0.015762722119688988, -0.06334342807531357, 0.04784588888287544, 0.004962285980582237, 0.08252301067113876, -0.042643092572689056, -0.06237834319472313, -0.005695562344044447, 0.010634432546794415, -0.024357780814170837, 0.03637763112783432, -0.07625733315944672, -0.11438402533531189, 0.021384360268712044, -0.0058635869063436985, -0.00525261927396059, 0.007380882743746042, -0.04616823047399521, -0.01485674548894167, 0.09347249567508698, -0.008224139921367168, -0.03535199537873268, 

In [20]:
description2 = "hello hello"
single_embedding2 =model.encode(description2, normalize_embeddings=True).tolist()
print(single_embedding)

[-0.03906959295272827, -0.008342157118022442, -0.026566706597805023, -0.08878551423549652, 0.046086590737104416, 0.020340759307146072, 0.0887850821018219, -0.013871023431420326, 0.027919361367821693, 0.030563069507479668, 0.04630373790860176, -0.013182888738811016, -0.016081584617495537, 0.019593533128499985, -0.011516713537275791, -0.016755985096096992, 0.007793714292347431, 0.04765964299440384, -0.0044016470201313496, -0.024359963834285736, 0.029004549607634544, -0.11794460564851761, 0.048272520303726196, -0.015762722119688988, -0.06334342807531357, 0.04784588888287544, 0.004962285980582237, 0.08252301067113876, -0.042643092572689056, -0.06237834319472313, -0.005695562344044447, 0.010634432546794415, -0.024357780814170837, 0.03637763112783432, -0.07625733315944672, -0.11438402533531189, 0.021384360268712044, -0.0058635869063436985, -0.00525261927396059, 0.007380882743746042, -0.04616823047399521, -0.01485674548894167, 0.09347249567508698, -0.008224139921367168, -0.03535199537873268, 

In [19]:
with engine.connect() as conn:
    with conn.begin():# Load 
        sql = """
                INSERT INTO UserReviews7
                 (description, video,  description_vector)
                VALUES (:description, :video, TO_VECTOR(:description_vector))
            """
        conn.execute(
            text(sql),
            {"description": description, "video": video_data, "description_vector": str(single_embedding)}
        )
        print("Row inserted successfully.")

Row inserted successfully.


In [21]:
with engine.connect() as conn:
    with conn.begin():# Load 
        sql = """
                INSERT INTO UserReviews7
                 (description, video,  description_vector)
                VALUES (:description, :video, TO_VECTOR(:description_vector))
            """
        conn.execute(
            text(sql),
            {"description": description2, "video": video_data, "description_vector": str(single_embedding2)}
        )
        print("Row inserted successfully.")

Row inserted successfully.


In [22]:
description_search = "hackMIT"
search_vector = model.encode(description_search, normalize_embeddings=True).tolist() # Convert search phrase into a vector


In [28]:
with engine.connect() as conn:
    with conn.begin():
        sql = text("""
            SELECT TOP 1 * FROM UserReviews7
            ORDER BY VECTOR_DOT_PRODUCT(description_vector, TO_VECTOR(:search_vector)) DESC
        """)

        results = conn.execute(sql, {'search_vector': str(search_vector)}).fetchall()


In [29]:
print(results)

[(1, 'a video taken at hackMIT for fun', '../data/test.mov', '-.039069592952728271484,-.0083421571180224418641,-.026566706597805023193,-.088785514235496520996,.046086590737104415893,.020340759307146072387,.08878 ... (8832 characters truncated) ... 966,-.013672383502125740051,.00043012772221118211746,-.0085372552275657653808,.035218089818954467773,-.054146107286214828491,-.0026657404378056526184')]


In [30]:
results_df = pd.DataFrame(results).iloc[:, :-1] # Remove vector
pd.set_option('display.max_colwidth', None)  # Easier to read description
results_df.head()

Unnamed: 0,id,description,video
0,1,a video taken at hackMIT for fun,../data/test.mov


# Langchain

In [2]:
import getpass
from dotenv import load_dotenv

load_dotenv(override=True)

if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")

In [31]:
loader = TextLoader("../data/hackMIT_details.txt", encoding='utf-8')
documents = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=400, chunk_overlap=20)
docs = text_splitter.split_documents(documents)
print(docs)

Created a chunk of size 414, which is longer than the specified 400
Created a chunk of size 488, which is longer than the specified 400
Created a chunk of size 404, which is longer than the specified 400
Created a chunk of size 441, which is longer than the specified 400


[Document(metadata={'source': '../data/hackMIT_details.txt'}, page_content='Our journey in developing Rewind has been a fascinating exploration of cutting-edge technology and complex problem-solving. Rewind is an advanced memory system designed to harness the power of Retrieval-Augmented Generation (RAG) combined with intersystem vector search to enhance information retrieval and memory management.\n\nConceptualization and Design'), Document(metadata={'source': '../data/hackMIT_details.txt'}, page_content='The project began with a clear vision: to create a memory system that seamlessly integrates retrieval mechanisms with generative capabilities, allowing for more accurate and contextually relevant information retrieval. Our primary goal was to build a system that not only stores vast amounts of data but also intelligently retrieves and utilizes this information to enhance decision-making and knowledge retention.'), Document(metadata={'source': '../data/hackMIT_details.txt'}, page_cont

In [32]:
embeddings = OpenAIEmbeddings()
COLLECTION_NAME = "state_of_the_union_test"

db = IRISVector.from_documents(
    embedding=embeddings,
    documents=docs,
    collection_name=COLLECTION_NAME,
    connection_string=CONNECTION_STRING,
)


  embeddings = OpenAIEmbeddings()


In [33]:

print(f"Number of docs in vector store: {len(db.get()['ids'])}")
query = "What did we build at HackMIT?"
docs_with_score = db.similarity_search_with_score(query)
for doc, score in docs_with_score:
    print("-" * 80)
    print("Score: ", score)
    print(doc.page_content)
    print("-" * 80)

Number of docs in vector store: 126
--------------------------------------------------------------------------------
Score:  0.218652334724994
Technical Challenges and Innovations
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Score:  0.223700608703322
Building Rewind has been a rewarding experience, showcasing the power of advanced retrieval and generation techniques. It has opened up new possibilities for how we interact with and utilize information, setting the stage for future innovations in memory systems and intelligent search technologies.
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Score:  0.224538392432496
The project began with a clear vision: to create a memory system that seamlessly integrates retrieval mechanisms with generative capabilities, a