In [2]:
import os
import uuid
import subprocess
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from rank_bm25 import BM25Okapi
from langchain.document_loaders import DirectoryLoader, NotebookLoader,TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from utils import clean_and_tokenize
import json
import glob 
from tabulate import tabulate
def clone_github_repo(github_url, local_path):
    try:
        subprocess.run(['git', 'clone', github_url, local_path], check=True)
        return True
    except subprocess.CalledProcessError as e:
        print(f"Failed to clone repository: {e}")
        return False

def load_folder_documents(folder_path):
    extensions = ['txt', 'md', 'markdown', 'rst', 'py', 'js', 'java', 'c', 'cpp', 'cs', 'go', 'rb', 'php', 'scala', 'html', 'htm', 'xml', 'json', 'yaml', 'yml', 'ini', 'toml', 'cfg', 'conf', 'sh', 'bash', 'css', 'scss', 'sql', 'gitignore', 'dockerignore', 'editorconfig', 'ipynb']
    documents_dict = {}

    for ext in extensions:
        glob_pattern = f'**/*.{ext}'
        try:
            loader = None
            if ext == 'ipynb':
                loader = NotebookLoader(folder_path, include_outputs=True, max_output_length=20, remove_newline=True)
            else:
                loader = DirectoryLoader(folder_path, glob=glob_pattern, loader_cls=TextLoader)

            loaded_documents = loader.load() if callable(loader.load) else []
            
            if loaded_documents:
                for doc in loaded_documents:
                    file_path = doc.metadata['source']
                    relative_path = os.path.relpath(file_path, folder_path)
                    file_id = str(uuid.uuid4())
                    doc.metadata['source'] = relative_path
                    doc.metadata['file_id'] = file_id

                    documents_dict[file_id] = doc
        except Exception as e:
            print(f"Error loading files with pattern '{glob_pattern}': {e}")
            print(f"Problematic folder path: {folder_path}")
            continue

    return list(documents_dict.values())

def load_and_index_files(repo_path, additional_folder_path):
    main_documents = load_folder_documents(repo_path)
    additional_documents = []
    additional_documents = load_folder_documents(additional_folder_path)

    all_documents = main_documents + additional_documents

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=200)

    split_documents = []
    for original_doc in all_documents:
        split_docs = text_splitter.split_documents([original_doc])
        for split_doc in split_docs:
            split_doc.metadata['file_id'] = original_doc.metadata['file_id']
            split_doc.metadata['source'] = original_doc.metadata['source']

        split_documents.extend(split_docs)

    index = None
    if split_documents:
        tokenized_documents = [clean_and_tokenize(doc.page_content) for doc in split_documents]
        index = BM25Okapi(tokenized_documents)
    
    file_sources = [doc.metadata['source'] for doc in split_documents]
    # Create a list of lists to represent your data
    data = []
    for doc in split_documents:
        data.append([doc.metadata['file_id'], doc.metadata['source'], doc.page_content])

    # Define the table headers
    headers = ["File ID", "Source", "Page Content"]

    # Use tabulate to print the data in a table
    print(tabulate(data, headers=headers, tablefmt="grid"))
    return index, split_documents, file_sources, [doc.metadata['source'] for doc in split_documents]

[nltk_data] Downloading package punkt to /home/phucsaiyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [38]:
import os
import tempfile
from dotenv import load_dotenv
import langchain
from langchain import PromptTemplate, LLMChain
from langchain.llms import OpenAI
from config import WHITE, GREEN, RESET_COLOR, model_name
from utils import format_user_question
from file_processing import clone_github_repo, load_and_index_files
from questions import ask_question, QuestionContext

In [44]:
def clone_github_repo(github_url, local_path):
    try:
        subprocess.run(['git', 'clone', github_url, local_path], check=True)
        return True
    except subprocess.CalledProcessError as e:
        print(f"Failed to clone repository: {e}")
        return False
def load_folder_documents(folder_path):
    extensions = ['txt', 'md', 'markdown', 'rst', 'py', 'js', 'java', 'c', 'cpp', 'cs', 'go', 'rb', 
                  'php', 'scala', 'html', 'htm', 'xml', 'json', 'yaml', 'yml', 'ini', 'toml', 'cfg', 
                  'conf', 'sh', 'bash', 'css', 'scss', 'sql', 'gitignore', 'dockerignore', 'editorconfig', 
                  'ipynb']
    documents_dict = {}

    for ext in extensions:
        glob_pattern = f'**/*.{ext}'
        try:
            loader = None
            if ext == 'ipynb':
                loader = NotebookLoader(folder_path, include_outputs=True, max_output_length=20, remove_newline=True)
            else:
                loader = DirectoryLoader(folder_path, glob=glob_pattern, loader_cls=TextLoader )

            loaded_documents = loader.load() if callable(loader.load) else []
            
            # Filter out directories from the list of loaded documents
            loaded_documents = [doc for doc in loaded_documents if not os.path.isdir(doc.metadata['source'])]
            
            if loaded_documents:
                for doc in loaded_documents:
                    file_path = doc.metadata['source']
                    relative_path = os.path.relpath(file_path, folder_path)
                    file_id = str(uuid.uuid4())
                    doc.metadata['source'] = relative_path
                    doc.metadata['file_id'] = file_id

                    documents_dict[file_id] = doc
        except Exception as e:
            print(f"Error loading files with pattern '{glob_pattern}': {e}")
            print(f"Problematic folder path: {folder_path}")
            continue

    return list(documents_dict.values())

In [45]:
github_url  = 'https://github.com/phucpham24/AIreadme_gen.git'
repo_name = github_url.split("/")[-1]
with tempfile.TemporaryDirectory() as local_path:
        if clone_github_repo(github_url, local_path):
            a = load_folder_documents(local_path)
            if index is None:
                print("No documents were found to index. Exiting.")
                exit()

            print("Repository cloned. Indexing files...")
a

Cloning into '/tmp/tmp6yv5e3b_'...


Error loading files with pattern '**/*.txt': 'utf-8' codec can't decode byte 0xfb in position 0: invalid start byte
Problematic folder path: /tmp/tmp6yv5e3b_
Error loading files with pattern '**/*.ipynb': [Errno 21] Is a directory: '/tmp/tmp6yv5e3b_'
Problematic folder path: /tmp/tmp6yv5e3b_
Repository cloned. Indexing files...


[Document(page_content="# AI GitHub Reader with ChatGPT-4\n\nAI GitHub Reader with ChatGPT-4 is a powerful tool that allows you to explore and ask questions about a GitHub code repository using OpenAI's advanced ChatGPT-4 language model. This AI-powered assistant can provide detailed answers based on the contents of the indexed repository files, including code, text, and Jupyter Notebook files.\n\n## Prerequisites\n\n- Python 3.6+\n- OpenAI API key (set in the environment variable OPENAI_API_KEY)\n\n## Usage\n\n1. Set the OpenAI API key as an environment variable `OPENAI_API_KEY`.\n2. Run the script: `app.py`\n3. Enter the GitHub URL of the repository you wish to explore.\n4. Ask questions about the repository. To exit the program, type `exit()`.\n\n## Key Features\n\n- Clones and indexes the contents of a GitHub repository.\n- Supports various file types, including code, text, and Jupyter Notebook files.\n- Utilizes OpenAI's advanced ChatGPT-4 language model for generating highly accu

In [47]:
github_url = input("Enter the GitHub URL of the repository: ")
repo_name = github_url.split("/")[-1]
print("Cloning the repository...")
add_doc = input("Enter the additional doc of the repository: ")
print("adding files into the repository...")
with tempfile.TemporaryDirectory() as local_path:
        if clone_github_repo(github_url, local_path):
            index, documents, file_type_counts, filenames = load_and_index_files(local_path, add_doc)
            if index is None:
                print("No documents were found to index. Exiting.")
                exit()

            print("Repository cloned. Indexing files...")

index, documents, file_type_counts, filenames

Cloning the repository...
adding files into the repository...


Cloning into '/tmp/tmp_fl4xld4'...


Error loading files with pattern '**/*.txt': 'utf-8' codec can't decode byte 0xfb in position 0: invalid start byte
Problematic folder path: /tmp/tmp_fl4xld4
Error loading files with pattern '**/*.json': Json schema does not match the Unstructured schema
Problematic folder path: /tmp/tmp_fl4xld4
Error loading files with pattern '**/*.ipynb': [Errno 21] Is a directory: '/tmp/tmp_fl4xld4'
Problematic folder path: /tmp/tmp_fl4xld4
Error loading files with pattern '**/*.json': Json schema does not match the Unstructured schema
Problematic folder path: /home/phucsaiyan/Documents/stage/test
Error loading files with pattern '**/*.ipynb': [Errno 21] Is a directory: '/home/phucsaiyan/Documents/stage/test'
Problematic folder path: /home/phucsaiyan/Documents/stage/test
Repository cloned. Indexing files...


(<rank_bm25.BM25Okapi at 0x7fea61855180>,
 [Document(page_content="AI GitHub Reader with ChatGPT-4\n\nAI GitHub Reader with ChatGPT-4 is a powerful tool that allows you to explore and ask questions about a GitHub code repository using OpenAI's advanced ChatGPT-4 language model. This AI-powered assistant can provide detailed answers based on the contents of the indexed repository files, including code, text, and Jupyter Notebook files.\n\nPrerequisites\n\nPython 3.6+\n\nOpenAI API key (set in the environment variable OPENAI_API_KEY)\n\nUsage\n\nSet the OpenAI API key as an environment variable OPENAI_API_KEY.\n\nRun the script: app.py\n\nEnter the GitHub URL of the repository you wish to explore.\n\nAsk questions about the repository. To exit the program, type exit().\n\nKey Features\n\nClones and indexes the contents of a GitHub repository.\n\nSupports various file types, including code, text, and Jupyter Notebook files.\n\nUtilizes OpenAI's advanced ChatGPT-4 language model for genera

In [19]:
print(filenames[1])
print(documents)


config.py
[Document(page_content="AI GitHub Reader with ChatGPT-4\n\nAI GitHub Reader with ChatGPT-4 is a powerful tool that allows you to explore and ask questions about a GitHub code repository using OpenAI's advanced ChatGPT-4 language model. This AI-powered assistant can provide detailed answers based on the contents of the indexed repository files, including code, text, and Jupyter Notebook files.\n\nPrerequisites\n\nPython 3.6+\n\nOpenAI API key (set in the environment variable OPENAI_API_KEY)\n\nUsage\n\nSet the OpenAI API key as an environment variable OPENAI_API_KEY.\n\nRun the script: app.py\n\nEnter the GitHub URL of the repository you wish to explore.\n\nAsk questions about the repository. To exit the program, type exit().\n\nKey Features\n\nClones and indexes the contents of a GitHub repository.\n\nSupports various file types, including code, text, and Jupyter Notebook files.\n\nUtilizes OpenAI's advanced ChatGPT-4 language model for generating highly accurate and detailed

[nltk_data] Downloading package punkt to /home/phucsaiyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [33]:
import langchain

from langchain.document_loaders import DirectoryLoader, TextLoader

# Define the folder path where your JSON files are located
folder_path = '/home/phucsaiyan/Documents/stage/test'

# Create a DirectoryLoader instance
loader = DirectoryLoader(folder_path, glob='**/*.json', loader_cls=TextLoader)

# Load the JSON files
loaded_documents = loader.load()

# Now, loaded_documents contains the JSON data from the specified folder

In [35]:
loaded_documents

[Document(page_content='{"_type": "ptag", "name": "JSON_OUTPUT_VERSION", "path": "1.0", "pattern": "in development"}\n{"_type": "ptag", "name": "TAG_EXTRA_DESCRIPTION", "path": "anonymous", "pattern": "Include tags for non-named objects like lambda"}\n{"_type": "ptag", "name": "TAG_EXTRA_DESCRIPTION", "path": "fileScope", "pattern": "Include tags of file scope"}\n{"_type": "ptag", "name": "TAG_EXTRA_DESCRIPTION", "path": "pseudo", "pattern": "Include pseudo tags"}\n{"_type": "ptag", "name": "TAG_EXTRA_DESCRIPTION", "path": "qualified", "pattern": "Include an extra class-qualified tag entry for each tag"}\n{"_type": "ptag", "name": "TAG_EXTRA_DESCRIPTION", "path": "subparser", "pattern": "Include tags generated by subparsers"}\n{"_type": "ptag", "name": "TAG_FIELD_DESCRIPTION", "parserName": "Python", "path": "nameref", "pattern": "the original name for the tag"}\n{"_type": "ptag", "name": "TAG_FIELD_DESCRIPTION", "path": "access", "pattern": "Access (or export) of class members"}\n{"_t

In [37]:
import json

def validate_json_file(file_path):
    try:
        with open(file_path, 'r') as file:
            json_data = json.load(file)
            # If parsing succeeds, the JSON is valid
            return True, json_data
    except json.JSONDecodeError as e:
        # If parsing fails, the JSON is invalid
        return False, str(e)

# Provide the path to your JSON file
file_path = '/home/phucsaiyan/Documents/stage/test/output.json'

# Validate the JSON file
is_valid, data = validate_json_file(file_path)

if is_valid:
    print("Valid JSON data:")
    print(data)
else:
    print("Invalid JSON data. Error message:")
    print(data)

Invalid JSON data. Error message:
Extra data: line 2 column 1 (char 93)



def my_function():
    print("Hello, World!")

def another_function():
    pass

