### 1. Importing packages

In [2]:
# pip install openai tiktoken chromadb langchain langchain-community flask GitPython python-dotenv gunicorn

Collecting openai
  Downloading openai-1.38.0-py3-none-any.whl.metadata (22 kB)
Collecting tiktoken
  Downloading tiktoken-0.7.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (6.6 kB)
Collecting chromadb
  Downloading chromadb-0.5.5-py3-none-any.whl.metadata (6.8 kB)
Collecting langchain
  Downloading langchain-0.2.12-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-community
  Downloading langchain_community-0.2.11-py3-none-any.whl.metadata (2.7 kB)
Collecting flask
  Using cached flask-3.0.3-py3-none-any.whl.metadata (3.2 kB)
Collecting GitPython
  Using cached GitPython-3.1.43-py3-none-any.whl.metadata (13 kB)
Collecting gunicorn
  Downloading gunicorn-22.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting anyio<5,>=3.5.0 (from openai)
  Using cached anyio-4.4.0-py3-none-any.whl.metadata (4.6 kB)
Collecting distro<2,>=1.7.0 (from openai)
  Using cached distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Using cached httpx-0.27.0-py3-none-any

In [6]:
import os
from git import Repo
from langchain.text_splitter import Language
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import LanguageParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationSummaryMemory
from langchain.chains import ConversationalRetrievalChain

### 2. Clone Github repositories

#### 2.1. Check current working directory

In [7]:
%pwd

'/Users/prathameshmohite/Desktop/source-code-analysis-1/research'

#### 2.2. Create test repository

In [8]:
!mkdir test_repo

mkdir: test_repo: File exists


#### 2.3. Clone a remote respository using URL

In [10]:
repo_path = "test_repo/"

Repo.clone_from(
    url="https://github.com/entbappy/End-to-end-ML-Project-Implementation", 
    to_path=repo_path
    )

<git.repo.base.Repo '/Users/prathameshmohite/Desktop/source-code-analysis-1/research/test_repo/.git'>

In [11]:
repo_path = "test_repo/"

loader = GenericLoader.from_filesystem(
    repo_path+'/src/mlProject',
    glob = "**/*",
    suffixes=[".py"], # I want to focus on python files only
    parser = LanguageParser(
        language=Language.PYTHON, 
        parser_threshold=500
    )
)

In [12]:
documents = loader.load()

In [13]:
documents

[Document(metadata={'source': 'test_repo/src/mlProject/__init__.py', 'language': <Language.PYTHON: 'python'>}, page_content='import os\nimport sys\nimport logging\n\nlogging_str = "[%(asctime)s: %(levelname)s: %(module)s: %(message)s]"\n\nlog_dir = "logs"\nlog_filepath = os.path.join(log_dir,"running_logs.log")\nos.makedirs(log_dir, exist_ok=True)\n\n\nlogging.basicConfig(\n    level= logging.INFO,\n    format= logging_str,\n\n    handlers=[\n        logging.FileHandler(log_filepath),\n        logging.StreamHandler(sys.stdout)\n    ]\n)\n\nlogger = logging.getLogger("mlProjectLogger")'),
 Document(metadata={'source': 'test_repo/src/mlProject/pipeline/__init__.py', 'language': <Language.PYTHON: 'python'>}, page_content=''),
 Document(metadata={'source': 'test_repo/src/mlProject/pipeline/prediction.py', 'language': <Language.PYTHON: 'python'>}, page_content="import joblib \nimport numpy as np\nimport pandas as pd\nfrom pathlib import Path\n\n\nclass PredictionPipeline:\n    def __init__(

### Chunkings

In [14]:
documents_splitter = RecursiveCharacterTextSplitter.from_language(
    language = Language.PYTHON,
    chunk_size = 2000,
    chunk_overlap = 200 # Thumb rule in langchain community: Chunk overlap should be 10% of total chunk size
)

In [15]:
texts = documents_splitter.split_documents(documents)

In [16]:
len(texts)

19

In [17]:
print(texts[3].page_content)

from mlProject.config.configuration import ConfigurationManager
from mlProject.components.model_evaluation import ModelEvaluation
from mlProject import logger


STAGE_NAME = "Model evaluation stage"

class ModelEvaluationTrainingPipeline:
    def __init__(self):
        pass

    def main(self):
        config = ConfigurationManager()
        model_evaluation_config = config.get_model_evaluation_config()
        model_evaluation_config = ModelEvaluation(config=model_evaluation_config)
        model_evaluation_config.save_results()




if __name__ == '__main__':
    try:
        logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<")
        obj = ModelEvaluationTrainingPipeline()
        obj.main()
    except Exception as e:
        logger.exception(e)
        raise e


### Embedding model

In [18]:
# Import the load_dotenv function from the dotenv module to load environment 
# variables from a .env file.
from dotenv import load_dotenv

# Load environment variables from the .env file.
load_dotenv()

# Get the OpenAI API key from the loaded environment variables.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [19]:
embeddings=OpenAIEmbeddings(disallowed_special=())

  warn_deprecated(


### Knowledge base (vector DB)

In [20]:
vectordb = Chroma.from_documents(
    texts, 
    embedding=embeddings, 
    persist_directory='./data'
)

vectordb.persist()

  warn_deprecated(


### LLM Wrapper

In [21]:
# llm = ChatOpenAI(model_name="gpt-4")
llm = ChatOpenAI()

  warn_deprecated(


In [22]:
# For model to remember the context of the entire conversation and save it in memory
memory = ConversationSummaryMemory(
    llm=llm,
    memory_key = "chat_history", 
    return_messages=True
)

In [23]:
qa = ConversationalRetrievalChain.from_llm(
    llm, 
    retriever=vectordb.as_retriever(
        search_type="mmr", # There are many different search types to narrow down your search. 
        # Maximal Marginal Relevance (MMR): MMR is a principle used in information retrieval to prioritize the 
        # selection of documents that are both relevant to a query and diverse from each other. 
        # It aims to balance relevance and diversity in the results returned to the user.
        search_kwargs={"k":3}
    ), 
    memory=memory
)

### Q&A

In [24]:
question = "what is DataIngestion class?"

In [25]:
result = qa(question)
print(result['answer'])

  warn_deprecated(
Number of requested results 20 is greater than number of elements in index 19, updating n_results = 19


The `DataIngestion` class is a class responsible for downloading and extracting files. It is used in the `DataIngestionTrainingPipeline` class to download and extract files as part of the data ingestion process in a machine learning project. The `DataIngestion` class likely takes a configuration object as a parameter to specify details like source URL, local data file paths, and directories for extracting the downloaded files.


In [26]:
question = "what is ModelTrainer class?"
result = qa(question)
print(result['answer'])

Number of requested results 20 is greater than number of elements in index 19, updating n_results = 19


The `ModelTrainer` class is a part of the mlProject package and is used for training models. It initializes with a configuration object and has a method `train()` that is called to start the training process for the model.


In [27]:
question = "List all the unused libraries"
result = qa(question)
print(result['answer'])

Number of requested results 20 is greater than number of elements in index 19, updating n_results = 19


To identify unused libraries, you would typically analyze the codebase for imports that are not being used anywhere in the code. However, from the code snippets you provided, it seems that all the imported libraries are being used within the defined functions and classes. 

Therefore, based on the code you provided, there are no clearly unused libraries. If there are additional parts of the codebase that you haven't shared where you suspect unused libraries, you would need to review those sections for further analysis.


In [34]:
question = "Should I make any corrections in the entire code repo? Include the exact file name"
result = qa(question)
print(result['answer'])

Number of requested results 20 is greater than number of elements in index 19, updating n_results = 19


There seems to be an inconsistency in the file extension used in the code repo. In the provided code snippets, there is a function `load_json` that is loading JSON files, but the function is actually loading YAML files according to the comments. This discrepancy should be corrected to ensure consistency. 

The file name that might need correction is likely `load_json` because it is loading YAML files based on the comments in the code, while the function name suggests it loads JSON files.
