In [7]:

import os
import uuid
import subprocess
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from rank_bm25 import BM25Okapi
from langchain.document_loaders import DirectoryLoader, NotebookLoader,TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from utils import clean_and_tokenize
import json
import glob 
from tabulate import tabulate
def load_folder_documents(folder_path):
    extensions = ['txt', 'md', 'markdown', 'rst', 'py', 'js', 'java', 'c', 'cpp', 'cs', 'go', 'rb', 'php', 'scala', 'html', 'htm', 'xml', 'json', 'yaml', 'yml', 'ini', 'toml', 'cfg', 'conf', 'sh', 'bash', 'css', 'scss', 'sql', 'gitignore', 'dockerignore', 'editorconfig', 'ipynb']
    documents_dict = {}

    for ext in extensions:
        glob_pattern = f'**/*.{ext}'
        try:
            loader = None
            if ext == 'ipynb':
                loader = NotebookLoader(folder_path, include_outputs=True, max_output_length=20, remove_newline=True)
            else:
                loader = DirectoryLoader(folder_path, glob=glob_pattern, loader_cls=TextLoader)

            loaded_documents = loader.load() if callable(loader.load) else []
            
            if loaded_documents:
                for doc in loaded_documents:
                    file_path = doc.metadata['source']
                    relative_path = os.path.relpath(file_path, folder_path)
                    file_id = str(uuid.uuid4())
                    doc.metadata['source'] = relative_path
                    doc.metadata['file_id'] = file_id

                    documents_dict[file_id] = doc
        except Exception as e:
            print(f"Error loading files with pattern '{glob_pattern}': {e}")
            print(f"Problematic folder path: {folder_path}")
            continue

    return list(documents_dict.values())

def load_and_index_files(repo_path):
    main_documents = load_folder_documents(repo_path)

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=200)

    split_documents = []
    for original_doc in main_documents:
        split_docs = text_splitter.split_documents([original_doc])
        for split_doc in split_docs:
            split_doc.metadata['file_id'] = original_doc.metadata['file_id']
            split_doc.metadata['source'] = original_doc.metadata['source']

        split_documents.extend(split_docs)

    index = None
    if split_documents:
        tokenized_documents = [clean_and_tokenize(doc.page_content) for doc in split_documents]
        index = BM25Okapi(tokenized_documents)
    
    file_sources = [doc.metadata['source'] for doc in split_documents]
    # Create a list of lists to represent your data
    data = []
    for doc in split_documents:
        data.append([doc.metadata['file_id'], doc.metadata['source'], doc.page_content])

    # Define the table headers
    headers = ["File ID", "Source", "Page Content"]

    # Use tabulate to print the data in a table
    print(tabulate(data, headers=headers, tablefmt="grid"))
    return index, split_documents, file_sources, [doc.metadata['source'] for doc in split_documents]

In [9]:
path = '/home/phucsaiyan/Documents/stage/clonechatgpt'
test_load_and_index = load_and_index_files(path)

Error loading files with pattern '**/*.txt': 'utf-8' codec can't decode byte 0xfb in position 0: invalid start byte
Problematic folder path: /home/phucsaiyan/Documents/stage/clonechatgpt
Error loading files with pattern '**/*.ipynb': [Errno 21] Is a directory: '/home/phucsaiyan/Documents/stage/clonechatgpt'
Problematic folder path: /home/phucsaiyan/Documents/stage/clonechatgpt
+--------------------------------------+----------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| File ID                              | Source                     | Page Content                                                                                                                                 