In [2]:
import google.generativeai as genai
genai.configure(api_key='ADD API KEY')

for m in genai.list_models():
  if 'generateContent' in m.supported_generation_methods:
    print(m.name)

models/gemini-1.0-pro-latest
models/gemini-1.0-pro
models/gemini-pro
models/gemini-1.0-pro-001
models/gemini-1.0-pro-vision-latest
models/gemini-pro-vision
models/gemini-1.5-pro-latest
models/gemini-1.5-pro-001
models/gemini-1.5-pro
models/gemini-1.5-pro-exp-0801
models/gemini-1.5-flash-latest
models/gemini-1.5-flash-001
models/gemini-1.5-flash
models/gemini-1.5-flash-001-tuning


In [4]:
model = genai.GenerativeModel('gemini-pro')


In [5]:
%%time

import os
import pathlib
import json

# Assuming files_path is defined and is a Path object
files_path = pathlib.Path('/Users/moutasemhome/Human-vs.-Synthetic-Datasets-Advancing-Niche-Model-Training-for-qa/data/torch_code_base.json/step_1_torch_repo/full_codebase')
print(f"Processing files in: {files_path.absolute()}")

# Ensure the output directory exists
output_dir = pathlib.Path('/Users/moutasemhome/Human-vs.-Synthetic-Datasets-Advancing-Niche-Model-Training-for-qa/data/step_3_generated_qa')
output_dir.mkdir(parents=True, exist_ok=True)
print(f"Output will be saved to: {output_dir.absolute()}")

# Track processed files
processed_files_tracker = output_dir / 'processed_files.txt'
if not processed_files_tracker.exists():
    processed_files_tracker.touch()
with open(processed_files_tracker, 'r') as f:
    processed_files = set(f.read().splitlines())

total_files = sum(1 for _ in files_path.glob('*.json'))
print(f"Total files to process: {total_files}")
print(f"Already processed files: {len(processed_files)}")
files_left = total_files - len(processed_files)
print(f"Files left to process: {files_left}")

file_processed = False
failed_responses = {}

for file_path in files_path.glob('*.json'):
    if file_path.name in processed_files:
        continue  # Skip this file since it's already been processed

    print(f"Processing file: {file_path}")
    with open(file_path, 'r', encoding='utf-8') as file:
        # Load JSON content
        json_content = json.load(file)
        content = json_content['content']
    
    # Prepare the prompt based on the content type
    content_type = json_content['type']
    
    prompt = f"""Please generate a series of detailed questions and answers in clean JSON format, suitable for direct parsing with json.loads, that would challenge a developer or deep learning engineer's understanding of the following content about pytorch and deep learning. The focus areas include:
    
    - Implementation Nuances: Specific code choices, their potential trade-offs, and alternative implementations.
    - Theoretical Underpinnings: Conceptual knowledge behind algorithms and techniques used in the content.
    - Practical Implications: Real-world scenarios, edge cases, and failure modes relevant to the code or models.
    - Efficiency and Best Practices: Strategies for optimizing PyTorch code for better performance and adherence to best practices.
    
    The content for analysis is: {content}
    
    Guidelines for question and answer generation:
    - Clear and Specific: Questions must directly relate to the content, with relevant code snippets included for clarity.
    - Insightful Answers: Provide deep insights, efficiency tips, and potential improvements in the answers.
    - Standalone Value: Each QA pair must stand alone in value, understandable without the context of other pairs.
    - Clean JSON Format: Structure the output as a JSON array of objects, each with 'question' and 'answer' keys, ensuring the format is correct for json.loads parsing.
    
    Illustrative Examples:
    
    [
        {{
            "question": "Why is ReLU used as the activation function in the given convolutional neural network layer implementation?",
            "answer": "ReLU is chosen for its computational efficiency and ability to mitigate vanishing gradients. While tanh might offer smoother gradients, it can be computationally slower and still susceptible to saturation issues. Consider the task and dataset characteristics when experimenting with alternatives."
        }},
        {{
            "question": "What advantages does creating a custom dataset class in PyTorch offer over using predefined datasets, and how does the DataLoader interface enhance batch processing?",
            "answer": "Creating a custom dataset class allows for greater flexibility in preprocessing and handling data that may not fit the standard formats of predefined datasets. Utilizing the DataLoader interface enables efficient batch processing, automatic batching, shuffling, and parallel data loading. These features are crucial for handling large datasets and optimizing the training process."
        }},
        {{
            "question": "Explain the significance of the .to(device) method in PyTorch code, especially in the context of model training across different hardware.",
            "answer": "The .to(device) method allows for device-agnostic coding by dynamically allocating tensors or models to the available hardware, optimizing computational efficiency and leveraging GPU acceleration. This significantly speeds up model training and inference times by utilizing parallel processing capabilities of GPUs."
        }},
        ...
    ]
    
    Please adhere to the JSON format meticulously, ensuring each object within the array follows the '{{"question": "", "answer": ""}}' structure, ready for immediate parsing and utilization in development environments."""

    
    try:
        # Placeholder for the model call and response handling
        response = model.generate_content(prompt).text

        print(f"Received response for {file_path.name}")

        response = model.generate_content(prompt).text

        print(f"Received response for {file_path.name}")
       

        # Extracting the file name without the .json extension for the output
        base_file_name = os.path.basename(file_path).rsplit('.', 1)[0]  # Removes the last extension
        output_file_path = output_dir / f'{base_file_name}_qa.txt'

        # Writing the response to the new file
        with open(output_file_path, 'w', encoding='utf-8') as output_file:
            output_file.write(response)


        # After successfully processing and saving:
        with open(processed_files_tracker, 'a') as f:
            f.write(f"{file_path.name}\n")  # Record this file as processed
        
        file_processed = True
        print(f"Output saved to: {output_file_path}")
    except Exception as e:
        print(f"Failed to process {file_path}: {e}")
        failed_responses[str(file_path)] = str(e)

if not file_processed:
    print("No files were processed. Check if the input directory exists and contains .json files.")

# Optionally, handle failed_responses here
if failed_responses:
    print("Some files failed to process:")
    for file_path, error in failed_responses.items():
        print(f"{file_path}: {error}")

# Update counts at the end
with open(processed_files_tracker, 'r') as f:
    processed_files = set(f.read().splitlines())
files_processed = len(processed_files)
files_left = total_files - files_processed
print(f"Total files processed: {files_processed}")
print(f"Files left to process: {files_left}")


Processing files in: /Users/moutasemhome/Human-vs.-Synthetic-Datasets-Advancing-Niche-Model-Training-for-qa/data/torch_code_base.json/step_1_torch_repo/full_codebase
Output will be saved to: /Users/moutasemhome/Human-vs.-Synthetic-Datasets-Advancing-Niche-Model-Training-for-qa/data/step_3_generated_qa
Total files to process: 2672
Already processed files: 0
Files left to process: 2672
Processing file: /Users/moutasemhome/Human-vs.-Synthetic-Datasets-Advancing-Niche-Model-Training-for-qa/data/torch_code_base.json/step_1_torch_repo/full_codebase/test_python_dispatch.json
Received response for test_python_dispatch.json
Failed to process /Users/moutasemhome/Human-vs.-Synthetic-Datasets-Advancing-Niche-Model-Training-for-qa/data/torch_code_base.json/step_1_torch_repo/full_codebase/test_python_dispatch.json: 429 Resource has been exhausted (e.g. check quota).
Processing file: /Users/moutasemhome/Human-vs.-Synthetic-Datasets-Advancing-Niche-Model-Training-for-qa/data/torch_code_base.json/step_