# Performant with Mistral model

In [1]:
import os

# Connection settings come from the environment so that URLs (which may embed
# credentials) do not have to be committed with the notebook. Each falls back
# to the original empty string when its variable is unset.
redis_url = os.environ.get("REDIS_URL", "")
inference_server_url = os.environ.get("INFERENCE_SERVER_URL", "")

## Preprocess the data

In [2]:
from langchain.document_loaders import PyPDFDirectoryLoader

# Folder holding the sample PDFs, relative to the notebook's working directory.
pdf_folder_path = 'sample'

# Load the PDFs in that folder through LangChain's directory loader.
loader = PyPDFDirectoryLoader(pdf_folder_path)
docs = loader.load()

In [3]:
from PyPDF2 import PdfReader

In [4]:
def getProjectName(doc, end):
    """Return the first-page text of a PDF that precedes a marker.

    Args:
        doc: Passed straight to ``PdfReader``; the notebook supplies a file path.
        end: Marker string. Everything before its *last* occurrence on the
            first page is returned.

    Returns:
        The leading slice of the first page's extracted text, not stripped.

    Raises:
        ValueError: If ``end`` does not occur in the first page's text.
    """
    reader = PdfReader(doc)
    text = reader.pages[0].extract_text()
    # rfind returns -1 for a missing marker; slicing with -1 would silently
    # drop only the last character and return almost the whole page.
    end_idx = text.rfind(end)
    if end_idx == -1:
        raise ValueError(f"marker {end!r} not found on first page of {doc!r}")
    return text[:end_idx]

In [5]:
def getDetailAndAuthor(doc, start, end):
    """Return the first-page text of a PDF that lies between two markers.

    The slice begins right after the *first* occurrence of ``start`` and stops
    at the *last* occurrence of ``end``.

    Args:
        doc: Passed straight to ``PdfReader``; the notebook supplies a file path.
        start: Marker that opens the section (excluded from the result).
        end: Marker that closes the section (excluded from the result).

    Returns:
        The text between the two markers, not stripped. An empty string is
        returned if the last ``end`` precedes the end of the first ``start``.

    Raises:
        ValueError: If either marker is absent from the first page's text.
    """
    reader = PdfReader(doc)
    text = reader.pages[0].extract_text()

    # find/rfind return -1 on a miss; using that value in the slice would
    # yield a plausible-looking but wrong substring instead of an error.
    start_idx = text.find(start)
    if start_idx == -1:
        raise ValueError(f"start marker {start!r} not found on first page of {doc!r}")
    end_idx = text.rfind(end)
    if end_idx == -1:
        raise ValueError(f"end marker {end!r} not found on first page of {doc!r}")

    return text[start_idx + len(start):end_idx]

In [6]:
def getTechnology(doc, start):
    """Return the first-page text of a PDF that follows a marker.

    Args:
        doc: Passed straight to ``PdfReader``; the notebook supplies a file path.
        start: Marker string. Everything after its *first* occurrence on the
            first page is returned.

    Returns:
        The trailing slice of the first page's extracted text, not stripped.

    Raises:
        ValueError: If ``start`` does not occur in the first page's text.
    """
    reader = PdfReader(doc)
    text = reader.pages[0].extract_text()
    # find returns -1 on a miss, which would make the slice begin at
    # len(start) - 1 and return unrelated text.
    start_idx = text.find(start)
    if start_idx == -1:
        raise ValueError(f"marker {start!r} not found on first page of {doc!r}")
    return text[start_idx + len(start):]

In [7]:
# Detail section: page-1 text between the first 'Details' and the last 'Author'.
andromedaDetail = getDetailAndAuthor('./sample/Andromeda.pdf', 'Details', 'Author')

In [8]:
# Author section: page-1 text between the first 'Author' and the last 'Technology'.
andromedaAuthor = getDetailAndAuthor('./sample/Andromeda.pdf', 'Author', 'Technology')

In [9]:
# Technology section: everything on page 1 after the first 'Technology'.
andromedaTechnology = getTechnology('./sample/Andromeda.pdf', 'Technology')

In [10]:
# Project name: everything on page 1 before the last 'Details'.
andromedaName = getProjectName('./sample/Andromeda.pdf', 'Details')

In [11]:
# Sanity check: show the four fields extracted from the Andromeda PDF.
print(f'''
Name: {andromedaName}

Detail: {andromedaDetail}


Author: {andromedaAuthor}

Technology: {andromedaTechnology}
''')


Name: Andromeda   

Detail:   The Andromeda galaxy, also known as M31, is a spiral galaxy located approximately 2.5 million light-years away from Earth in the constellation Andromeda. It is the closest major galaxy to our own Milky Way galaxy and is a member of the Local Group of galaxies, which also includes the Milky Way, the Triangulum galaxy (M33), and several smaller galaxies and dwarf galaxies.  The Andromeda galaxy has a diameter of about 100,000 light-years and is estimated to contain about 2 trillion stars, making it one of the largest galaxies in the observable universe. It is also home to a variety of other celestial objects, including black holes, supernovae, and planetary nebulae.  The Andromeda galaxy is classified as a barred spiral galaxy, meaning it has a central bar-shaped structure composed of stars and gas. It also has several spiral arms that wind outward from the center, containing many stars, gas, and dust. The galaxy is believed to be rotating rapidly, with sta

In [12]:
# Detail section: page-1 text between the first 'Details' and the last 'Author'.
milkywayDetail = getDetailAndAuthor('./sample/Milkyway.pdf', 'Details', 'Author')

In [13]:
# Author section: page-1 text between the first 'Author' and the last 'Technology'.
milkywayAuthor = getDetailAndAuthor('./sample/Milkyway.pdf', 'Author', 'Technology')

In [14]:
# Technology section: everything on page 1 after the first 'Technology'.
milkywayTechnology = getTechnology('./sample/Milkyway.pdf', 'Technology')

In [15]:
# Project name: everything on page 1 before the last 'Details'.
milkywayName = getProjectName('./sample/Milkyway.pdf', 'Details')

In [16]:
# Sanity check: show the four fields extracted from the Milkyway PDF.
print(f'''
Name: {milkywayName}


Detail: {milkywayDetail}


Author: {milkywayAuthor}

Technology: {milkywayTechnology}
''')


Name: Milkyway   


Detail:   The Milky Way[c] is the galaxy that includes the Solar System, with the name describing the galaxy's appearance from Earth: a hazy band of light seen in the night sky formed from stars that cannot be individually distinguished by the naked eye. The term Milky Way is a translation of the Latin via lactea, from the Greek γαλαξίας κύκλος (galaxías kýklos), meaning "milky circle".[26][27] From Earth, the Milky Way appears as a band because its disk-shaped structure is viewed from within. Galileo Galilei first resolved the band of light into individual stars with his telescope in 1610. Until the early 1920s, most astronomers thought that the Milky Way contained all the stars in the Universe.[28] Following the 1920 Great Debate between the astronomers Harlow Shapley and Heber Doust Curtis,[29] observations by Edwin Hubble showed that the Milky Way is just one of many galaxies.   


Author:   Dr. Moiya McTier  

Technology:    .NET 



## Storing the project details in a CSV file

In [17]:
import csv
import pandas as pd

In [18]:
# One row per project, in the same column order as `fields` below.
# NOTE(review): the extracted strings are stored unstripped (the displayed
# name carries trailing spaces) — confirm whether they should be .strip()'d.
rows = [
    ['1',andromedaName, andromedaDetail, andromedaAuthor, andromedaTechnology],
    ['2',milkywayName, milkywayDetail, milkywayAuthor, milkywayTechnology]
]

# Output path and header row for the CSV.
filename = "projects.csv"
fields = ['number', 'name', 'detail', 'author', 'technology']

In [19]:
# newline='' is what the csv module requires of the file object (otherwise
# Windows output gains blank rows), and an explicit UTF-8 encoding keeps the
# non-ASCII text in the extracted details from failing on locale encodings.
with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
    csvwriter = csv.writer(csvfile)

    # Header row first, then one row per project.
    csvwriter.writerow(fields)
    csvwriter.writerows(rows)

In [20]:
# Read the CSV back via `filename` so the read path cannot drift from the
# path the file was written to.
all_prods_df = pd.read_csv(filename)

In [21]:
# Rebind rather than mutate with inplace=True; the result is the same frame
# with a fresh 0..n-1 index, and the cell stays safe to re-run.
all_prods_df = all_prods_df.reset_index(drop=True)
all_prods_df.head()

Unnamed: 0,number,name,detail,author,technology
0,1,Andromeda,"The Andromeda galaxy, also known as M31, is ...",Ivan Yefremov,Java
1,2,Milkyway,The Milky Way[c] is the galaxy that includes...,Dr. Moiya McTier,.NET


In [22]:
# Construct a primary key (currently a straight copy of the `number` column)
all_prods_df['primary_key'] = all_prods_df['number']

In [23]:
# Display the frame again to confirm the primary_key column is present.
all_prods_df.head()

Unnamed: 0,number,name,detail,author,technology,primary_key
0,1,Andromeda,"The Andromeda galaxy, also known as M31, is ...",Ivan Yefremov,Java,1
1,2,Milkyway,The Milky Way[c] is the galaxy that includes...,Dr. Moiya McTier,.NET,2


In [24]:
# Convert rows to {row_index: {column: value}} so each project's fields can be
# looked up by row position.
# NOTE(review): .head(2) caps this at two rows, which matches the two sample
# projects; additional rows would be dropped silently.
product_metadata = ( 
    all_prods_df
     .head(2)
     .to_dict(orient='index')
)

In [25]:
# Inspect the record for the first row (index 0).
product_metadata[0]

{'number': 1,
 'name': 'Andromeda   ',
 'detail': '  The Andromeda galaxy, also known as M31, is a spiral galaxy located approximately 2.5 million light-years away from Earth in the constellation Andromeda. It is the closest major galaxy to our own Milky Way galaxy and is a member of the Local Group of galaxies, which also includes the Milky Way, the Triangulum galaxy (M33), and several smaller galaxies and dwarf galaxies.  The Andromeda galaxy has a diameter of about 100,000 light-years and is estimated to contain about 2 trillion stars, making it one of the largest galaxies in the observable universe. It is also home to a variety of other celestial objects, including black holes, supernovae, and planetary nebulae.  The Andromeda galaxy is classified as a barred spiral galaxy, meaning it has a central bar-shaped structure composed of stars and gas. It also has several spiral arms that wind outward from the center, containing many stars, gas, and dust. The galaxy is believed to be rota

## Setup Redis and Vector DB

In [26]:
# Text to be embedded and converted to vectors: the `name` field of each record.
texts = [record['name'] for record in product_metadata.values()]


In [27]:
# Per-record dicts, in the same order as `texts`, to be stored alongside each vector.
metadatas = list(product_metadata.values())

In [28]:
# Display the metadata list for inspection.
metadatas

[{'number': 1,
  'name': 'Andromeda   ',
  'detail': '  The Andromeda galaxy, also known as M31, is a spiral galaxy located approximately 2.5 million light-years away from Earth in the constellation Andromeda. It is the closest major galaxy to our own Milky Way galaxy and is a member of the Local Group of galaxies, which also includes the Milky Way, the Triangulum galaxy (M33), and several smaller galaxies and dwarf galaxies.  The Andromeda galaxy has a diameter of about 100,000 light-years and is estimated to contain about 2 trillion stars, making it one of the largest galaxies in the observable universe. It is also home to a variety of other celestial objects, including black holes, supernovae, and planetary nebulae.  The Andromeda galaxy is classified as a barred spiral galaxy, meaning it has a central bar-shaped structure composed of stars and gas. It also has several spiral arms that wind outward from the center, containing many stars, gas, and dust. The galaxy is believed to be r

In [29]:
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores.redis import Redis as RedisVectorStore
from langchain.schema import Document

In [30]:
# Index name handed to the Redis vector store.
index_name = "projects"
# No model name is passed, so HuggingFaceEmbeddings falls back to its own default.
embedding = HuggingFaceEmbeddings()
# Build the store from `texts` with the matching `metadatas` entries.
# NOTE(review): `redis_url` is an empty string in the first cell as committed —
# it must be set to a reachable Redis instance before this cell can succeed.
vectorstore = RedisVectorStore.from_texts(
    texts=texts,
    metadatas=metadatas,
    embedding=embedding,
    index_name=index_name,
    redis_url=redis_url
)

  from .autonotebook import tqdm as notebook_tqdm


In [32]:
# Write the index schema to a YAML file (presumably so the index can be
# reopened later with the same schema — confirm against downstream usage).
vectorstore.write_schema("projects_schema.yaml")