### Step-00 : importing the libraries and packages

In [1]:
# importing libraries and packages
import os
# we need to import Repo (from GitPython) for cloning a GitHub repository
from git import Repo

# for context-aware splitting we need to import Language
from langchain.text_splitter import Language

# once the repo is cloned, we need GenericLoader and LanguageParser for loading its files as documents
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import LanguageParser

# RecursiveCharacterTextSplitter is needed for splitting the documents into chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter

# we need to import OpenAIEmbeddings for embedding the chunked documents
from langchain.embeddings.openai import OpenAIEmbeddings

# we need to import Chroma for storing the embedded data
from langchain.vectorstores import Chroma

# selecting our models through ChatOpenAI
from langchain.chat_models import ChatOpenAI

# for keeping a running summary of the conversation as memory
from langchain.memory import ConversationSummaryMemory

# for chaining together all the pieces imported on the earlier lines
from langchain.chains import ConversationalRetrievalChain

### Step-01 : cloning the repository

In [None]:
# show the current working directory; the relative paths used in this
# notebook ("files/", "./data") are resolved against it
os.getcwd()

In [16]:
# Create the folder (inside the current directory) that will hold all the
# files and folders cloned from the GitHub repository.
# os.makedirs(..., exist_ok=True) replaces the shell command `! mkdir files`:
# it does not fail when the folder already exists, so the cell can be re-run
# safely, and it does not depend on which shell the kernel runs under.
os.makedirs("files", exist_ok=True)

In [None]:
# Clone the repository into repo_path, or reuse the checkout from an earlier run.
# Repo.clone_from() raises when the target folder already contains files, so the
# clone is skipped if a ".git" folder is already present and the existing
# repository is opened instead. This keeps the cell safe to re-run.
repo_url = "https://github.com/entbappy/End-to-end-ML-Project-Implementation"
repo_path = "files/"

if os.path.isdir(os.path.join(repo_path, ".git")):
    repo = Repo(repo_path)
else:
    repo = Repo.clone_from(repo_url, to_path=repo_path)

# last expression of the cell: display the Repo object
repo

### Step-02 : Load the repository through GenericLoader

In [22]:
# Create `loader`, a GenericLoader built with the from_filesystem() class method.
# It walks the project's source folder inside the cloned repository, keeps only
# the ".py" files, and parses them with LanguageParser configured for Python.
# The folder path is built with os.path.join() so that the trailing "/" of
# repo_path does not produce a doubled separator ("files//src/mlProject").
repo_path = "files/"
source_dir = os.path.join(repo_path, "src", "mlProject")

loader = GenericLoader.from_filesystem(
    source_dir,
    glob="**/*",
    suffixes=[".py"],
    # NOTE(review): parser_threshold=500 is passed straight to LanguageParser;
    # confirm its exact meaning in the LanguageParser documentation.
    parser=LanguageParser(language=Language.PYTHON, parser_threshold=500),
)

In [23]:
# Run the loader configured above: load() returns the loaded documents,
# which are kept in `documents` for the splitting step.
documents = loader.load()

In [None]:
# Show the content and the metadata of the first loaded document.
# A notebook cell only displays its LAST expression, so the page content is
# printed explicitly (otherwise it would be silently discarded) and the
# metadata is left as the final expression of the cell.
first_document = documents[0]
print(first_document.page_content)
first_document.metadata

### Step-03 : splitting the documents into chunks

In [28]:
# Build the splitter only; nothing is split in this cell.
# from_language() creates a RecursiveCharacterTextSplitter set up for Python
# source code, using the chunk size and chunk overlap given below.
python_chunk_size = 2000
python_chunk_overlap = 200

documents_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON,
    chunk_size=python_chunk_size,
    chunk_overlap=python_chunk_overlap,
)

In [30]:
# Apply the splitter created above to every loaded document.
# split_documents() returns the chunks, which are stored in `chunked_texts`
# and embedded in a later step.
chunked_texts = documents_splitter.split_documents(documents)

### Step-04 : Setting the API KEY for embedding

In [34]:
# Import what is needed to read the API key from the environment.
# NOTE(review): `os` is already imported in the first cell; the import is
# repeated here only so that this cell is self-contained.
import os
from dotenv import load_dotenv
# load_dotenv() is the last expression of the cell, so its return value is shown
# as the cell output. Presumably it loads the variables of a local .env file into
# the process environment (the next cell reads OPENAI_API_KEY from there) - confirm.
load_dotenv()

True

In [36]:
# Read the OpenAI key from the process environment (populated from the .env file).
# The value is None when the variable is not set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

### Step-05 : loading the embedding model through OpenAIEmbeddings and storing the vectors in Chroma

In [37]:
# Instantiate the OpenAI embedding model used to embed the chunks.
# NOTE(review): disallowed_special=() presumably turns off the special-token
# check during tokenization, which matters for source-code text - confirm.
embeddings = OpenAIEmbeddings(disallowed_special=())

In [47]:
# Build the vector store `vectordb` with the Chroma.from_documents() class method:
# every chunk in `chunked_texts` is embedded with the OpenAI `embeddings` model
# and the store is configured to live in the "./data" folder.
# NOTE(review): re-running this cell against an existing "./data" folder may add
# the same chunks a second time - verify before re-running.
vectordb = Chroma.from_documents(
    documents=chunked_texts,
    embedding=embeddings,
    persist_directory="./data",
)

# write the vector store to the persist directory
vectordb.persist()

### Step-06 : loading the chat model that we are going to use

In [40]:
# Instantiate the chat model with its default settings.
# NOTE(review): no model name is passed, so the library default is used;
# which model that is depends on the installed langchain version - confirm.
llm = ChatOpenAI()

### Step-07 : Setting up the conversation summary memory

In [42]:
# Create the conversation memory that the chain will use.
# ConversationSummaryMemory receives the chat model `llm`, stores its content
# under the "chat_history" key, and is asked to return messages.
memory = ConversationSummaryMemory(
    llm=llm,
    memory_key="chat_history",
    return_messages=True,
)

### Step-08 : Chaining all the things together

In [46]:
# Wire everything together; this cell only builds the chain, nothing is asked yet.
# 1) expose the vector store as a retriever using "mmr" search with k = 3
# 2) combine the chat model, the retriever and the memory into `model_object`
#    with the ConversationalRetrievalChain.from_llm() class method
code_retriever = vectordb.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 3},
)

model_object = ConversationalRetrievalChain.from_llm(
    llm,
    retriever=code_retriever,
    memory=memory,
)


### Step-09 : prompt designing

In [44]:
# the question that will be asked about the cloned code base
question = "what is DataIngestion class?"

### Step-10 : Final Result

In [None]:
# Send the question through the chain and show the generated answer.
# The chain returns a mapping; the answer text is stored under "answer".
result = model_object(question)
answer_text = result["answer"]
print(answer_text)

##### Let's convert this notebook into modular code:
Modular coding is about code reusability. The same code is often needed by several different functionalities, so instead of copying it we organise it into reusable modules that can be imported wherever they are needed.
