# Building a Custom Implementation of the LangChain Embeddings Class
This notebook documents the steps involved in creating a custom implementation of the LangChain `Embeddings` class, whose source is available on [GitHub](https://github.com/langchain-ai/langchain/blob/master/libs/core/langchain_core/embeddings/embeddings.py).

<script src="https://emgithub.com/embed-v2.js?target=https%3A%2F%2Fgithub.com%2Flangchain-ai%2Flangchain%2Fblob%2Fmaster%2Flibs%2Fcore%2Flangchain_core%2Fembeddings%2Fembeddings.py&style=default&type=code&showBorder=on&showLineNumbers=on&showFileMeta=on&showFullPath=on&showCopy=on"></script>

## Imports

In [1]:
from urllib.parse import quote_plus

import google.generativeai as genai
from bs4 import BeautifulSoup
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.vectorstores import MongoDBAtlasVectorSearch
from langchain_community.document_loaders import BSHTMLLoader
from langchain_google_genai import ChatGoogleGenerativeAI
from pymongo import MongoClient

In [2]:
# Load the Google API key from a local file so it is never hard-coded in the
# notebook itself.
with open('google_api_key.txt') as f:
    # strip() drops the trailing newline (and any stray surrounding whitespace)
    # that text editors usually leave at the end of the file; without it the
    # newline would become part of the key.
    api_key = f.read().strip()

In [3]:
class Embeddings():
    """Embeddings wrapper around `genai.embed_content`.

    Exposes the two methods this notebook relies on: `embed_documents` for
    the texts that get stored, and `embed_query` for the search text.
    """

    def __init__(self, model='models/text-embedding-004', api_key=api_key, dim=64):
        """Store the model settings and configure the `genai` module.

        Args:
            model: Name of the embedding model passed to `genai.embed_content`.
            api_key: API key handed to `genai.configure`. The default is the
                value the notebook-level `api_key` had when this class was
                defined (default arguments are evaluated once, at definition).
            dim: Value passed as `output_dimensionality`, i.e. the requested
                length of every returned vector.
        """
        self.model, self.dim = model, dim
        # Module-level call: this applies to `genai` as a whole, not just to
        # this instance.
        genai.configure(api_key=api_key)

    def _embed(self, text, task_type):
        """Embed a single text with the given task type and return its vector."""
        response = genai.embed_content(model=self.model, content=text,
                                       task_type=task_type,
                                       output_dimensionality=self.dim)
        return response['embedding']

    def embed_documents(self, texts: list[str]) -> list[list[float]]:
        """Embed each text for storage, one request per text.

        Args:
            texts: The document texts to embed.

        Returns:
            One embedding per input text, in the same order. An empty input
            yields an empty list.
        """
        return [self._embed(text, 'RETRIEVAL_DOCUMENT') for text in texts]

    def embed_query(self, text: str) -> list[float]:
        """Embed a search query.

        Uses the 'RETRIEVAL_QUERY' task type, the counterpart of the
        'RETRIEVAL_DOCUMENT' task type used for the stored texts.

        Args:
            text: The query text.

        Returns:
            The embedding of the query.
        """
        return self._embed(text, 'RETRIEVAL_QUERY')
        

In [4]:

# Read the Atlas connection details. The file is expected to hold the user
# name, the password and the cluster host on separate lines, in that order.
with open('mongo_info.txt') as f:
    # Skip blank lines (e.g. an extra newline at the end of the file) so the
    # three-way unpacking below does not fail on them.
    user, password, url = (line.strip() for line in f if line.strip())

# User name and password must be percent-escaped inside a MongoDB URI,
# otherwise characters such as '@', ':' or '/' corrupt the connection string.
# NOTE(review): assumes mongo_info.txt stores the raw, unescaped credentials.
string = (
    f'mongodb+srv://{quote_plus(user)}:{quote_plus(password)}@{url}'
    '/?retryWrites=true&w=majority&appName=website-database'
)

client = MongoClient(string)
embeddings = Embeddings()


In [5]:
# Smoke test: embed a short query and display the length of the returned
# vector, which should match the `dim` the Embeddings object was built with.
query_vector = embeddings.embed_query('will this work?')
len(query_vector)

64

In [6]:
# Web pages to load, split, embed and store. Order is preserved because the
# later cells build `pages` and `docs` positionally from this list.
ed = [
    # eastern.edu department pages
    "https://www.eastern.edu/academics/colleges-seminary/college-health-and-sciences/departments/department-mathematical-5",
    "https://www.eastern.edu/academics/colleges-seminary/college-health-and-sciences/departments/department-mathematical-6",
    "https://www.eastern.edu/academics/colleges-seminary/college-health-and-sciences/departments/ms-data-faqs",
    "https://www.eastern.edu/academics/colleges-seminary/college-health-and-sciences/departments/department-mathematical-10",
    # asu.edu / aznext.pipelineaz.com pages (two of these are PDF files)
    "https://wpcarey.asu.edu/aznext",
    "https://aznext.pipelineaz.com/static_assets/sites/myfutureaz.pipelineaz.com/AZNext_Brochure_ABDA_Certificate_2022.pdf",
    "https://degrees.apps.asu.edu/minors/major/ASU00/BABDACERT/applied-business-data-analytics?init=false&nopassive=true",
    "https://aznext.pipelineaz.com/static_assets/sites/aznext.pipelineaz.com/AZNext.Brochure.-.ASU.Salesforce.Developer.Academy.participants.pdf",
    # alfred.edu pages
    "https://www.alfred.edu/academics/undergrad-majors-minors/environmental-studies.cfm",
    "https://www.alfred.edu/about/",
    # ucvts.org page
    "https://www.ucvts.org/domain/300",
]

In [7]:
from langchain.document_loaders import WebBaseLoader

# Fetch every URL in `ed` with its own loader. `pages` keeps one entry per
# URL, in the same order as `ed`; each entry is whatever `.load()` returned
# for that URL.
pages = list(map(lambda page_url: WebBaseLoader(page_url).load(), ed))

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [8]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split every loaded page into chunks of at most 1000 characters, trying the
# separators in order: paragraph break, line break, sentence end, then space.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
    # Raw string: "\." is not a valid escape sequence in a normal string.
    separators=["\n\n", "\n", r"(?<=\. )", " "],
    # The sentence separator is a regular expression (a look-behind for
    # ". "). Without this flag the separators are treated as literal text and
    # the pattern would never match.
    is_separator_regex=True,
    length_function=len,
)

# One list of chunks per entry of `pages`, in the same order.
docs = [text_splitter.split_documents(page) for page in pages]

In [None]:
# Target collection: database 'website-database', collection 'education-v2'.
collection = client['website-database']['education-v2']

# Insert the documents in MongoDB Atlas with their embedding: one
# `from_documents` call (and one resulting vector-store object) per entry of
# `docs`.
# NOTE(review): re-running this cell presumably inserts the same chunks
# again -- confirm before executing it more than once.
docsearches = list(map(
    lambda chunks: MongoDBAtlasVectorSearch.from_documents(
        chunks, embeddings, collection=collection
    ),
    docs,
))

In [None]:
# Create a vector search object on the 'website-database.education-v2'
# namespace, using the same connection string and embeddings object as the
# rest of the notebook and the index named "vector_index".
mongo_uri = string
vector_search = MongoDBAtlasVectorSearch.from_connection_string(
    mongo_uri,
    'website-database.education-v2',
    embeddings,
    index_name="vector_index",
)