In [1]:
import os 
from dotenv import load_dotenv
from pathlib import Path
import os
import shutil
import json
from datetime import datetime
# Resolve the project root as the parent of the directory the kernel started in.
# The starting directory is remembered in a module-level sentinel so that
# re-running this cell does not climb one more directory level on every run
# (Path.cwd() changes after the os.chdir() call below).
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = Path.cwd()
PROJECT_ROOT = _NOTEBOOK_START_DIR.parent  # One level above the starting directory
os.chdir(PROJECT_ROOT)

# Populate the process environment (python-dotenv); called after the chdir,
# so it runs with the project root as the working directory.
load_dotenv()

# Retrieve environment variables
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

In [None]:
import os
import json
import asyncio
import nest_asyncio
import logging
import time
from dotenv import load_dotenv
from openai import AsyncOpenAI

# Load environment variables
load_dotenv()

# Set up logging: INFO and above is emitted through the root logger.
logging.basicConfig(level=logging.INFO)

# Initialize OpenAI async client.
# Both the API key and the endpoint are read from the environment
# (OPENAI_API_KEY and BASE_URL).
client = AsyncOpenAI(api_key=os.getenv('OPENAI_API_KEY'),base_url=os.getenv('BASE_URL'))

# Token Rate Limit Configuration
# Maximum number of tokens that may be recorded inside one time window.
token_limit_per_minute = 2000000
# Default chunk size (in characters) for chunk_text_by_characters; also used
# to derive the default concurrency of process_text_chunks.
char_limit=200000
# Usage history: call_gpt4 appends one time.time() timestamp per token used,
# and enforce_rate_limit drops entries older than the window.
token_counter = []
token_counter_time_window = 60  # seconds

# Chunk text by characters
def chunk_text_by_characters(text, char_limit=char_limit):
    """Lazily yield consecutive slices of ``text``.

    Every slice holds at most ``char_limit`` characters; only the final
    slice may be shorter. An empty ``text`` yields nothing.
    """
    yield from (
        text[start:start + char_limit]
        for start in range(0, len(text), char_limit)
    )

# Rate limiting enforcement
async def enforce_rate_limit():
    """Wait until the recorded token usage is below the per-window budget.

    Reads the module-level ``token_counter`` (a list of timestamps, oldest
    first), ``token_limit_per_minute`` and ``token_counter_time_window``.
    Entries older than the window are removed in place. While the number of
    remaining entries is at or above the limit, the coroutine sleeps until
    the oldest entry leaves the window and then checks again.
    """
    while True:
        current_time = time.time()
        cutoff = current_time - token_counter_time_window

        # Count the expired prefix first and delete it in a single slice:
        # popping from the front one element at a time is quadratic on a list
        # that holds one entry per token.
        expired = 0
        for stamp in token_counter:
            if stamp >= cutoff:
                break
            expired += 1
        if expired:
            del token_counter[:expired]

        # An empty history never needs to wait (this also avoids indexing an
        # empty list when the configured limit is zero or negative).
        if not token_counter or len(token_counter) < token_limit_per_minute:
            return

        earliest_token_time = token_counter[0]
        sleep_time = (earliest_token_time + token_counter_time_window) - current_time
        if sleep_time > 0:
            logging.info(f"Rate limit approaching. Sleeping for {sleep_time:.2f} seconds.")
        # Sleeping once is not enough: other tasks may have recorded usage in
        # the meantime, so loop and re-evaluate the budget after waking up.
        await asyncio.sleep(max(sleep_time, 0))

# Chat-completion call (the model is selected by the `model=` argument below)
async def call_gpt4(question, text_chunk, max_retries=5):
    """Ask the chat model which file paths in ``text_chunk`` are relevant to ``question``.

    Args:
        question: The goal/question inserted into the prompt.
        text_chunk: The piece of context inserted into the prompt.
        max_retries: Number of additional attempts after the first one fails.

    Returns:
        The parsed JSON object returned by the model, or ``None`` when every
        attempt failed.

    Side effects:
        Waits on ``enforce_rate_limit()`` before each attempt and appends one
        timestamp per reported token to the module-level ``token_counter``.
    """
    prompt = f"""
    You are going to look at the file contents in separate chunks based on the chunks that are returned. 
    Return a JSON with key \"response\" containing a list of file paths starting with 'srcRepo'.

    Goal/Question: {question}

    Context:
    {text_chunk}
    """

    for attempt in range(max_retries + 1):
        await enforce_rate_limit()
        try:
            response = await client.chat.completions.create(
                # model="gpt-4o",
                model="us.amazon.nova-micro-v1:0",
                messages=[{"role": "user", "content": prompt}],
                temperature=0,
                top_p=0.95,
                response_format={"type": "json_object"}
            )

            # Usage accounting is best-effort: a response without usage data
            # is still a successful response and must not trigger a retry.
            total_tokens_used = getattr(response.usage, "total_tokens", None) or 0
            if total_tokens_used > 0:
                token_counter.extend([time.time()] * total_tokens_used)
            return json.loads(response.choices[0].message.content)

        except Exception as e:
            # Any failure (transport, API, malformed JSON) leads to a retry.
            logging.error(f"Unexpected error: {e}")

        # Do not back off after the final attempt; nothing follows it.
        if attempt >= max_retries:
            break
        sleep_time = 2 ** (attempt + 1)
        logging.warning(f"Retrying after {sleep_time} seconds...")
        await asyncio.sleep(sleep_time)

    logging.error("Max retries exceeded.")
    return None

# Process chunks asynchronously with concurrency control
async def process_text_chunks(question, text_chunks, max_concurrent_requests=(token_limit_per_minute//char_limit)-1):
    """Run ``call_gpt4`` for every chunk with a cap on simultaneous calls.

    Returns the list produced by ``asyncio.gather``: one result per chunk,
    in the same order as ``text_chunks``.
    """
    gate = asyncio.Semaphore(max_concurrent_requests)

    async def _bounded_call(piece):
        # Hold one semaphore slot for the duration of the request.
        await gate.acquire()
        try:
            return await call_gpt4(question, piece)
        finally:
            gate.release()

    pending = []
    for piece in text_chunks:
        pending.append(_bounded_call(piece))
    return await asyncio.gather(*pending)

# Main execution
async def main(question):
    """Query the model over 'tmp/file_tree.txt' in chunks and save the answers.

    The file is split into 128000-character chunks, processed with at most
    five concurrent requests, and the per-chunk results are written to
    'results.json'. Errors are logged rather than raised.
    """
    input_file = 'tmp/file_tree.txt'
    try:
        with open(input_file, 'r', encoding='utf-8') as source:
            tree_text = source.read()

        pieces = [*chunk_text_by_characters(tree_text, char_limit=128000)]
        answers = await process_text_chunks(question, pieces, max_concurrent_requests=5)

        with open('results.json', 'w') as sink:
            json.dump(answers, sink, indent=2)

        logging.info("Processing complete. Results saved to 'results.json'.")

    except FileNotFoundError:
        logging.error(f"Input file '{input_file}' not found.")
    except Exception as err:
        logging.error(f"An error occurred: {err}")

# Patch asyncio with nest_asyncio so that asyncio.run() below can be called
# from an environment that already has a running event loop, such as Jupyter.
nest_asyncio.apply()

# Example usage: run the chunked query over 'tmp/file_tree.txt' for a sample
# question; main() writes the per-chunk answers to 'results.json'.
user_question = "how to get current month cloud costs with boto3 sdk?"
asyncio.run(main(user_question))


In [3]:
import os
from pathlib import Path
from dotenv import load_dotenv
import mimetypes

# Load environment variables
load_dotenv()

# NOTE(review): the block below is disabled, so this cell does not change the
# working directory; the relative paths used further down resolve against
# whatever directory the kernel is currently in.
# # Navigate to project root
# PROJECT_ROOT = Path(__file__).resolve().parent.parent
# os.chdir(PROJECT_ROOT)

def is_text_file(file_path):
    """Heuristically decide whether ``file_path`` refers to a text file.

    Checks, in order: a known text extension, a guessed MIME type starting
    with 'text', and finally the absence of a NUL byte in the first 1024
    bytes of the file. Returns False when the file cannot be read.
    """
    text_extensions = frozenset((
        '.txt', '.md', '.py', '.js', '.java', '.c', '.cpp', '.h', '.css',
        '.html', '.xml', '.json', '.yaml', '.yml', '.ini', '.conf', '.sh',
        '.bat', '.csv', '.log',
    ))

    _, extension = os.path.splitext(file_path)
    if extension.lower() in text_extensions:
        return True

    guessed_type = mimetypes.guess_type(file_path)[0]
    if guessed_type is not None and guessed_type.startswith('text'):
        return True

    # Last resort: sniff the start of the file for a NUL byte.
    try:
        with open(file_path, 'rb') as handle:
            head = handle.read(1024)
    except Exception:
        return False
    return b'\x00' not in head

def summarize_folder(folder_path, summary_file):
    """Write a listing of the immediate children of ``folder_path``.

    The output file starts with a title line, followed by a 'Subfolders:'
    section and a 'Files:' section (each only when non-empty), with names in
    sorted order. Entries that are neither a directory nor a regular file are
    left out.
    """
    entries = sorted(os.listdir(folder_path))
    subfolders = [name for name in entries
                  if os.path.isdir(os.path.join(folder_path, name))]
    files = [name for name in entries
             if os.path.isfile(os.path.join(folder_path, name))]

    # Assemble the whole report first, then write it in one go.
    report = [f"Summary of '{os.path.basename(folder_path)}':\n\n"]
    if subfolders:
        report.append("Subfolders:\n")
        report.extend(f"- {name}\n" for name in subfolders)
    if files:
        report.append("\nFiles:\n")
        report.extend(f"- {name}\n" for name in files)

    with open(summary_file, 'w', encoding='utf-8') as out:
        out.writelines(report)

def create_repo_with_summaries(source_directory, target_directory):
    """Mirror the text files of ``source_directory`` into ``target_directory``.

    The directory structure is recreated under the target. Files accepted by
    ``is_text_file`` are copied as UTF-8 text (undecodable bytes are
    dropped); other files are skipped. For every subdirectory encountered, a
    '<name>-gitRag.txt' summary produced by ``summarize_folder`` is written
    next to its mirrored location.
    """
    if not os.path.exists(target_directory):
        os.makedirs(target_directory)

    for current_dir, child_dirs, file_names in os.walk(source_directory):
        mirrored_dir = os.path.join(
            target_directory,
            os.path.relpath(current_dir, source_directory),
        )
        if not os.path.exists(mirrored_dir):
            os.makedirs(mirrored_dir)

        # Copy the text files of this directory.
        for file_name in file_names:
            origin = os.path.join(current_dir, file_name)
            if not is_text_file(origin):
                continue
            destination = os.path.join(mirrored_dir, file_name)
            with open(origin, 'r', encoding='utf-8', errors='ignore') as reader, \
                 open(destination, 'w', encoding='utf-8') as writer:
                writer.write(reader.read())

        # Describe each child directory in a sibling summary file.
        for child in child_dirs:
            summarize_folder(
                os.path.join(current_dir, child),
                os.path.join(mirrored_dir, f"{child}-gitRag.txt"),
            )

# Example usage: mirror the text files of 'gitRagRepo' into 'newGitRagRepo'
# and write a '<subfolder>-gitRag.txt' summary for every subfolder. Both
# paths are relative to the current working directory.
create_repo_with_summaries('gitRagRepo', 'newGitRagRepo')


In [None]:
import os
import json
import asyncio
import nest_asyncio
import logging
import time
from dotenv import load_dotenv
from openai import AsyncOpenAI

# Load environment variables
load_dotenv()

# Set up logging: INFO and above is emitted through the root logger.
logging.basicConfig(level=logging.INFO)

# Initialize OpenAI async client.
# Both the API key and the endpoint are read from the environment
# (OPENAI_API_KEY and BASE_URL).
client = AsyncOpenAI(api_key=os.getenv('OPENAI_API_KEY'), base_url=os.getenv('BASE_URL'))

# Token Rate Limit Configuration
# Maximum number of tokens that may be recorded inside one time window.
token_limit_per_minute = 2000000
# NOTE(review): char_limit is not referenced by the functions defined in this cell.
char_limit = 200000
# Usage history: should_traverse appends one time.time() timestamp per token
# used, and enforce_rate_limit drops entries older than the window.
token_counter = []
token_counter_time_window = 60  # seconds

# Rate limiting enforcement
async def enforce_rate_limit():
    """Wait until the recorded token usage is below the per-window budget.

    Reads the module-level ``token_counter`` (a list of timestamps, oldest
    first), ``token_limit_per_minute`` and ``token_counter_time_window``.
    Entries older than the window are removed in place. While the number of
    remaining entries is at or above the limit, the coroutine sleeps until
    the oldest entry leaves the window and then checks again.
    """
    while True:
        current_time = time.time()
        cutoff = current_time - token_counter_time_window

        # Count the expired prefix first and delete it in a single slice:
        # popping from the front one element at a time is quadratic on a list
        # that holds one entry per token.
        expired = 0
        for stamp in token_counter:
            if stamp >= cutoff:
                break
            expired += 1
        if expired:
            del token_counter[:expired]

        # An empty history never needs to wait (this also avoids indexing an
        # empty list when the configured limit is zero or negative).
        if not token_counter or len(token_counter) < token_limit_per_minute:
            return

        earliest_token_time = token_counter[0]
        sleep_time = (earliest_token_time + token_counter_time_window) - current_time
        if sleep_time > 0:
            logging.info(f"Rate limit approaching. Sleeping for {sleep_time:.2f} seconds.")
        # Sleeping once is not enough: other tasks may have recorded usage in
        # the meantime, so loop and re-evaluate the budget after waking up.
        await asyncio.sleep(max(sleep_time, 0))

# Chat-completion call (the model is selected by the `model=` argument below)
async def should_traverse(question, summary_text, max_retries=5):
    """Ask the chat model whether a summarized subfolder is worth descending into.

    Args:
        question: The question inserted into the prompt.
        summary_text: The folder summary inserted into the prompt.
        max_retries: Number of additional attempts after the first one fails.

    Returns:
        ``True`` or ``False`` according to the model's "traverse" answer;
        ``False`` when every attempt failed.

    Side effects:
        Waits on ``enforce_rate_limit()`` before each attempt and appends one
        timestamp per reported token to the module-level ``token_counter``.
    """
    prompt = f"""
    Based on the provided summary, should we traverse deeper into this subfolder for the given question?

    Question: {question}

    Summary:
    {summary_text}

    Respond with a JSON object: {{"traverse": true or false}}
    """

    for attempt in range(max_retries + 1):
        await enforce_rate_limit()
        try:
            response = await client.chat.completions.create(
                model="us.amazon.nova-micro-v1:0",
                messages=[{"role": "user", "content": prompt}],
                temperature=0,
                top_p=0.95,
                response_format={"type": "json_object"}
            )

            # Usage accounting is best-effort: a response without usage data
            # is still a successful response and must not trigger a retry.
            total_tokens_used = getattr(response.usage, "total_tokens", None) or 0
            if total_tokens_used > 0:
                token_counter.extend([time.time()] * total_tokens_used)

            decision = json.loads(response.choices[0].message.content)['traverse']
            if isinstance(decision, str):
                # A quoted answer such as "false" is a non-empty string and
                # would otherwise count as a yes.
                return decision.strip().lower() not in ('false', 'no', '0', '')
            return bool(decision)

        except Exception as e:
            # Any failure (transport, API, malformed or incomplete JSON)
            # leads to a retry.
            logging.error(f"Unexpected error: {e}")

        # Do not back off after the final attempt; nothing follows it.
        if attempt >= max_retries:
            break
        sleep_time = 2 ** (attempt + 1)
        logging.warning(f"Retrying after {sleep_time} seconds...")
        await asyncio.sleep(sleep_time)

    logging.error("Max retries exceeded.")
    return False

async def traverse_repo(question, directory):
    """Walk ``directory`` top-down and collect files from relevant subfolders.

    In every visited directory, each '<subfolder>-gitRag.txt' summary file is
    passed to ``should_traverse``; subfolders whose summary gets a negative
    answer are not descended into. All other files of the visited directories
    are collected.

    Args:
        question: The question forwarded to ``should_traverse``.
        directory: Root of the tree to walk.

    Returns:
        A list of paths of the non-summary files found in visited directories.
    """
    # Compared case-insensitively: the summaries are written with the
    # '-gitRag.txt' spelling, which a case-sensitive '-gitrag.txt' check
    # never matches.
    summary_suffix = '-gitrag.txt'
    files_of_interest = []

    for root, dirs, files in os.walk(directory):
        summary_files = [name for name in files if name.lower().endswith(summary_suffix)]

        # Read the summaries with context managers so no handle stays open.
        summaries = []
        for name in summary_files:
            with open(os.path.join(root, name), 'r', encoding='utf-8') as handle:
                summaries.append(handle.read())

        traverse_decisions = await asyncio.gather(*[
            should_traverse(question, text) for text in summaries
        ])

        for name, decision in zip(summary_files, traverse_decisions):
            if decision:
                continue
            # Strip only the trailing marker to recover the subfolder name.
            dir_name = name[:-len(summary_suffix)]
            # Removing an entry from `dirs` in place stops os.walk from
            # descending into it; skip summaries without a matching subfolder.
            if dir_name in dirs:
                dirs.remove(dir_name)

        skipped = set(summary_files)
        files_of_interest.extend(
            os.path.join(root, name) for name in files if name not in skipped
        )

    return files_of_interest

async def main(question):
    """Select files under 'newGitRagRepo' for ``question`` and save the list.

    The paths returned by ``traverse_repo`` are written to 'results.json'.
    """
    selected_paths = await traverse_repo(question, 'newGitRagRepo')

    with open('results.json', 'w') as sink:
        json.dump(selected_paths, sink, indent=2)

    logging.info("Traversal complete. Files of interest saved to 'results.json'.")

# Patch asyncio with nest_asyncio so that asyncio.run() below can be called
# from an environment that already has a running event loop, such as Jupyter.
nest_asyncio.apply()

# Example usage: run the summary-guided traversal of 'newGitRagRepo' for a
# sample question; main() writes the selected paths to 'results.json'.
user_question = "how to get current month cloud costs with boto3 sdk?"
asyncio.run(main(user_question))
