In [None]:
!pip install groq
!pip install jsonlines

In [None]:
import os
import json
import re
import time
import pandas as pd
import requests
from groq import Groq
from google.colab import drive
from IPython.display import display, HTML
from tqdm.notebook import tqdm

# Install required packages
!pip install groq pandas tqdm requests

# Mount Google Drive
drive.mount('/content/drive')

# Create project directory if it doesn't exist
PROJECT_DIR = "/content/drive/MyDrive/socratic_project"
os.makedirs(PROJECT_DIR, exist_ok=True)
print(f"Project directory created at: {PROJECT_DIR}")

# Setup Groq client
# The key is read with getpass rather than input() so the secret is not
# echoed back into the cell output.
from getpass import getpass

API_KEY = getpass("Enter your Groq API key: ").strip()
if not API_KEY:
    # Fail here with a clear message instead of on the first API call.
    raise ValueError("A Groq API key is required to continue.")
os.environ["GROQ_API_KEY"] = API_KEY
client = Groq(api_key=API_KEY)

# Function to parse the socratic dialogue from model output

# A JSON array of objects embedded anywhere in the text (greedy, spans lines).
_JSON_ARRAY = re.compile(r'\[\s*\{.*\}\s*\]', re.DOTALL)

# A speaker label at the start of a (stripped) line, e.g. "Teacher: ...",
# "STUDENT: ..." or the markdown-bold form "**Teacher:** ...".
_SPEAKER_LINE = re.compile(r'^\**\s*(teacher|student)\s*:\**\s*(.*)$', re.IGNORECASE)


def _is_turn_list(obj):
    """Return True if *obj* is a non-empty list of dicts with role and content."""
    return (
        isinstance(obj, list)
        and bool(obj)
        and all(isinstance(t, dict) and "role" in t and "content" in t for t in obj)
    )


def parse_dialogue(text):
    """Turn raw model output into a list of ``{"role", "content"}`` dicts.

    Strategy, in order:
      1. If the text contains a JSON array of objects that decodes and every
         element has ``role`` and ``content`` keys, return it as-is.
      2. Otherwise scan line by line for ``Teacher:`` / ``Student:`` labels
         (case-insensitive, leading whitespace and ``**`` tolerated); lines
         without a label are appended to the current speaker's turn.
      3. If no turn was found at all, return a single teacher turn holding
         the raw text.

    Args:
        text: The model's reply as a string.

    Returns:
        A non-empty list of dicts, each with ``role`` and ``content`` keys.
    """
    # First, try to extract JSON directly if it exists
    match = _JSON_ARRAY.search(text)
    if match:
        try:
            parsed = json.loads(match.group(0))
        except json.JSONDecodeError:
            parsed = None
        if _is_turn_list(parsed):
            return parsed
        print("JSON extraction failed, trying manual parsing")

    # Manual parsing as fallback
    dialogue = []
    current_role = None
    current_content = []

    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        speaker = _SPEAKER_LINE.match(line)
        if speaker:
            # A new label closes the turn that was being collected.
            if current_role:
                dialogue.append({"role": current_role, "content": " ".join(current_content).strip()})
            current_role = speaker.group(1).lower()
            current_content = [speaker.group(2)]
        elif current_role:
            current_content.append(line)

    # Add the last dialogue turn
    if current_role and current_content:
        dialogue.append({"role": current_role, "content": " ".join(current_content).strip()})

    # If parsing failed completely, create a placeholder
    if not dialogue:
        print("Warning: Could not parse dialogue structure. Using raw text.")
        dialogue = [{"role": "teacher", "content": text}]

    return dialogue

# Instead of using files.upload(), manually upload the file to your Google Drive
# and then access it directly
import sys  # used for the early exits below

print("\nPlease upload your class7.txt file to Google Drive in the socratic_project folder")
print("Once uploaded, enter the filename below:")

input_file = input("Filename (default: class7.txt): ").strip() or "class7.txt"
file_path = f"{PROJECT_DIR}/{input_file}"

# Check if file exists
if not os.path.exists(file_path):
    print(f"File not found at {file_path}")
    print("Please upload the file to your Google Drive and try again.")
    sys.exit()

# Read the questions from the file.
# "utf-8-sig" decodes plain UTF-8 and also drops a leading byte-order mark,
# which would otherwise end up glued to the first question.
with open(file_path, "r", encoding="utf-8-sig") as f:
    questions_raw = f.readlines()

# Clean up questions (one per non-blank line) and visualize them
questions = [q.strip() for q in questions_raw if q.strip()]
if not questions:
    print(f"No questions found in {file_path}")
    sys.exit()

# Display the questions in a DataFrame for verification
df_questions = pd.DataFrame({
    "Question Number": range(1, len(questions) + 1),
    "Question": questions
})

print(f"\nLoaded {len(questions)} questions. Here's a preview:")
display(df_questions.head(10))

# Ask user to verify questions look correct before proceeding
proceed = input("\nDo the questions look correctly separated? (yes/no): ")
if proceed.strip().lower() not in ['yes', 'y']:
    print("Please fix the input file and re-run.")
    sys.exit()

# Define how many questions to process (full dataset or sample)
process_all = input("\nProcess all questions or just a sample? (all/sample): ")
if process_all.strip().lower() in ['sample', 's']:
    try:
        sample_size = int(input("How many questions to process? (e.g., 5): "))
    except ValueError:
        print("Sample size must be a whole number. Please re-run.")
        sys.exit()
    if sample_size < 1:
        # A zero or negative size would slice from the end of the list.
        print("Sample size must be at least 1. Please re-run.")
        sys.exit()
    questions = questions[:sample_size]
    # Report the real count: the file may hold fewer questions than requested.
    print(f"Processing a sample of {len(questions)} questions.")
else:
    print(f"Processing all {len(questions)} questions.")

# Function to find latest checkpoint
def find_latest_checkpoint(project_dir=None):
    """Load the highest-numbered ``checkpoint_<N>.json`` file.

    Only names that match ``checkpoint_<digits>.json`` exactly are considered,
    so an unrelated file such as ``checkpoint_backup.json`` no longer hides the
    valid checkpoints.

    Args:
        project_dir: Directory to scan. Defaults to the module-level
            ``PROJECT_DIR`` when omitted.

    Returns:
        A ``(latest_idx, completed_results)`` tuple: the number embedded in the
        newest checkpoint's name and the decoded JSON content of that file.
        Returns ``(0, [])`` when there is no checkpoint, or when the directory
        or the file cannot be read or decoded (the error is printed).
    """
    if project_dir is None:
        project_dir = PROJECT_DIR

    try:
        numbered = []
        for name in os.listdir(project_dir):
            found = re.fullmatch(r"checkpoint_(\d+)\.json", name)
            if found:
                numbered.append((int(found.group(1)), name))
        if not numbered:
            return 0, []

        latest_idx, latest_file = max(numbered)

        with open(os.path.join(project_dir, latest_file), "r", encoding="utf-8") as f:
            completed_results = json.load(f)

        return latest_idx, completed_results
    except (OSError, ValueError) as e:
        # OSError: unreadable directory/file; ValueError: invalid JSON.
        print(f"Error finding checkpoints: {e}")
        return 0, []

# Call API with retry
def call_groq_with_retry(prompt, model="llama3-70b-8192", max_tokens=2000, max_retries=5):
    """Send a single-message chat completion request, retrying on failure.

    Between failed attempts the function waits ``2 ** attempt`` seconds
    (1, 2, 4, ...). It does not wait after the final attempt, since no
    further call follows.

    Args:
        prompt: Text sent as the content of one ``user`` message.
        model: Model name passed to the completion call.
        max_tokens: ``max_tokens`` value passed to the completion call.
        max_retries: Total number of attempts before giving up.

    Returns:
        The response object returned by ``client.chat.completions.create``.

    Raises:
        Exception: When every attempt failed; the last underlying error is
            attached as ``__cause__``.
    """
    last_error = None
    for attempt in range(max_retries):
        try:
            return client.chat.completions.create(
                messages=[{"role": "user", "content": prompt}],
                model=model,
                max_tokens=max_tokens
            )
        except Exception as e:
            last_error = e
            if attempt == max_retries - 1:
                print(f"Error calling Groq API: {e}. No retries left.")
                break
            wait_time = 2 ** attempt  # Exponential backoff
            print(f"Error calling Groq API: {e}. Retrying in {wait_time} seconds...")
            time.sleep(wait_time)

    # If all retries fail
    raise Exception(f"Failed to call Groq API after {max_retries} attempts") from last_error

# Processing constants
CHECKPOINT_FREQUENCY = 25  # a checkpoint is written every 25 questions
BATCH_SIZE = 50  # questions are walked in batches of 50

# Pick up from the latest checkpoint, if there is one
start_idx, results = find_latest_checkpoint()
if start_idx <= 0:
    results = []
else:
    print(f"Resuming from question {start_idx} with {len(results)} completed results")

# One tracking row per question; rows before the resume point start as "Completed"
tracking_data = [
    {
        "Question": q,
        "Status": "Completed" if position < start_idx else "Pending",
        "Factual Answer": "",
        "Dialogue Turns": 0,
    }
    for position, q in enumerate(questions)
]

results_tracking = pd.DataFrame(tracking_data)

# Process each batch
for batch_start in range(start_idx, len(questions), BATCH_SIZE):
    batch_end = min(batch_start + BATCH_SIZE, len(questions))
    print(f"\nProcessing batch from {batch_start+1} to {batch_end}")

    # Process each question in the batch
    for i in range(batch_start, batch_end):
        question = questions[i]
        print(f"\n\n{'='*80}\nProcessing question {i+1}/{len(questions)}:\n{question}\n{'='*80}")

        # Status written to the tracking row once this question is done.
        # It stays "Completed" unless one of the two generation steps fails,
        # so an error status is not overwritten at the end of the iteration.
        final_status = "Completed"

        # Update status
        results_tracking.loc[i, "Status"] = "Processing"
        display(results_tracking.iloc[i:i+1])

        # Generate factual answer (stored as the "rejected" response)
        print("\nGenerating factual answer...")
        factual_prompt = f"Answer this Class 7 NCERT Science question factually and directly: {question}"
        try:
            factual_response = call_groq_with_retry(
                prompt=factual_prompt,
                model="llama3-70b-8192",
                max_tokens=500
            )
            factual_answer = factual_response.choices[0].message.content
            print(f"Factual answer: {factual_answer[:100]}...")
            results_tracking.loc[i, "Factual Answer"] = factual_answer[:50] + "..."
        except Exception as e:
            print(f"Error generating factual answer: {e}")
            factual_answer = f"Error: Could not generate factual answer - {str(e)}"
            final_status = "Error-Factual"
            results_tracking.loc[i, "Status"] = final_status

        # Generate socratic dialogue (stored as the "chosen" response)
        print("\nGenerating socratic dialogue...")
        socratic_prompt = f"""
        Create a socratic dialogue between a teacher and student for this 7th grade NCERT Science question:
        "{question}"

        The teacher should NOT give direct answers but guide the student through reasoning steps.
        The teacher should ask questions that help the student think and discover the answer themselves.

        Format the dialogue as a JSON array with alternating 'teacher' and 'student' roles like this:
        [
          {{"role": "teacher", "content": "First teacher message"}},
          {{"role": "student", "content": "First student response"}},
          {{"role": "teacher", "content": "Second teacher message"}},
          ...and so on
        ]

        Ensure the dialogue has at least 3-4 exchanges and leads to understanding.
        Always start with the teacher role and end with the teacher providing a concluding explanation.
        """

        try:
            socratic_response = call_groq_with_retry(
                prompt=socratic_prompt,
                model="llama3-70b-8192",
                max_tokens=2000
            )
            socratic_text = socratic_response.choices[0].message.content
            socratic_dialogue = parse_dialogue(socratic_text)

            # Print a sample of the dialogue
            print(f"\nDialogue has {len(socratic_dialogue)} turns. Sample:")
            for turn in socratic_dialogue[:2]:  # Show first 2 turns
                print(f"{turn['role'].upper()}: {turn['content'][:100]}...")

            results_tracking.loc[i, "Dialogue Turns"] = len(socratic_dialogue)
        except Exception as e:
            print(f"Error generating socratic dialogue: {e}")
            socratic_dialogue = [
                {"role": "teacher", "content": f"Error: Could not generate dialogue - {str(e)}"}
            ]
            final_status = "Error-Dialogue"
            results_tracking.loc[i, "Status"] = final_status
            results_tracking.loc[i, "Dialogue Turns"] = 0

        # Create example and add to results. Failed generations are still
        # appended (with their error text) so len(results) keeps matching the
        # number of questions processed.
        example = {
            "question": question,
            "rejected": factual_answer,
            "chosen": socratic_dialogue
        }

        results.append(example)

        # Update status
        results_tracking.loc[i, "Status"] = final_status
        display(results_tracking.iloc[i:i+1])

        # Save checkpoint every CHECKPOINT_FREQUENCY questions and after the last one
        if (i + 1) % CHECKPOINT_FREQUENCY == 0 or i == len(questions) - 1:
            checkpoint_file = f"{PROJECT_DIR}/checkpoint_{i+1}.json"
            with open(checkpoint_file, "w") as f:
                json.dump(results, f, indent=2)
            print(f"\nCheckpoint saved to {checkpoint_file}")

        # Add a small delay to avoid rate limits
        time.sleep(1)

    # Save full results at the end of each batch
    full_results_file = f"{PROJECT_DIR}/full_results_{batch_end}.json"
    with open(full_results_file, "w") as f:
        json.dump(results, f, indent=2)
    print(f"Full results saved to {full_results_file}")

# Save the final dataset
final_filename = f"{PROJECT_DIR}/socratic_dpo_dataset_final.json"
with open(final_filename, "w") as f:
    json.dump(results, f, indent=2)

print(f"\nDataset creation complete! Final dataset saved to {final_filename}")

# Generate summary statistics
dialogue_lengths = [len(ex["chosen"]) for ex in results]  # turns per dialogue
factual_lengths = [len(ex["rejected"].split()) for ex in results]  # words per answer

print("\nSummary Statistics:")
print(f"Total questions processed: {len(results)}")
if results:
    print(f"Average dialogue turns: {sum(dialogue_lengths)/len(dialogue_lengths):.1f}")
    print(f"Average factual answer length: {sum(factual_lengths)/len(factual_lengths):.1f} words")
else:
    # Averages are undefined for an empty dataset; skip them instead of
    # dividing by zero.
    print("No results were generated, so no averages are available.")
# Display a formatted sample of the final dataset
def display_example(example, index):
    """Render one dataset example as an HTML fragment.

    The question, the rejected answer and every dialogue turn are
    HTML-escaped before insertion, so characters such as ``<``, ``>`` and
    ``&`` in the text are shown literally instead of being read as markup.

    Args:
        example: Dict with ``question``, ``rejected`` and ``chosen`` keys;
            ``chosen`` is an iterable of dicts with ``role`` and ``content``.
        index: Zero-based position of the example; shown one-based.

    Returns:
        The HTML fragment as a string.
    """
    from html import escape  # local import: keeps this function self-contained

    question = escape(str(example['question']))
    rejected = escape(str(example['rejected']))

    parts = [f"""
    <div style="background-color:#f5f5f5; padding:15px; margin:10px 0; border-radius:5px;">
        <h3>Example {index+1}: {question}</h3>
        <div style="margin:10px 0; padding:10px; background-color:#ffe6e6; border-radius:5px;">
            <h4>Rejected (Factual Answer):</h4>
            <p>{rejected}</p>
        </div>
        <div style="margin:10px 0; padding:10px; background-color:#e6ffe6; border-radius:5px;">
            <h4>Chosen (Socratic Dialogue):</h4>
    """]

    for turn in example['chosen']:
        content = escape(str(turn["content"]))
        # Any role other than "teacher" is rendered with the student styling.
        if turn['role'] == 'teacher':
            parts.append(f'<p style="color:#0066cc"><strong>Teacher:</strong> {content}</p>')
        else:
            parts.append(f'<p style="color:#cc6600"><strong>Student:</strong> {content}</p>')

    parts.append("""
        </div>
    </div>
    """)
    return "".join(parts)

print("\nDisplaying samples from the final dataset:")
# Render at most the first three examples
for i, sample in enumerate(results[:3]):
    display(HTML(display_example(sample, i)))

In [None]:



import json
import os
from google.colab import drive

# Make sure Google Drive is mounted
drive.mount('/content/drive', force_remount=False)

# Path to your project directory
PROJECT_DIR = "/content/drive/MyDrive/socratic_project"

# Ask for the file to clean
print("Available JSON files in your project directory:")
# Sorted so the numbering shown to the user is stable between runs.
json_files = sorted(f for f in os.listdir(PROJECT_DIR) if f.endswith('.json'))
for i, file in enumerate(json_files):
    print(f"{i+1}. {file}")

try:
    file_idx = int(input("\nEnter the number of the file you want to clean: ")) - 1
except ValueError:
    file_idx = -1  # non-numeric input is reported as an invalid selection below

if file_idx < 0 or file_idx >= len(json_files):
    print("Invalid selection")
else:
    # Use the file the user actually picked from the list above.
    input_file = json_files[file_idx]
    input_path = os.path.join(PROJECT_DIR, input_file)

    # Load the JSON file
    with open(input_path, 'r') as f:
        data = json.load(f)

    print(f"\nLoaded {len(data)} entries from {input_file}")

    # Define validation functions
    def is_valid_entry(entry):
        """Return True if *entry* is a usable question/rejected/chosen record.

        An entry is valid when all of the following hold:
          * it is a dict;
          * ``question`` is a non-blank string;
          * ``rejected`` is a non-blank string that does not start with
            ``"Error:"`` (the prefix used for failed generations);
          * ``chosen`` is a list of at least two turns;
          * every turn is a dict with a non-blank string ``content``;
          * roles strictly alternate, starting with ``"teacher"`` and
            followed by ``"student"``.

        Malformed values (wrong types, missing keys) make the entry invalid
        rather than raising.
        """
        if not isinstance(entry, dict):
            return False

        # Check if question exists and is not empty
        question = entry.get("question")
        if not isinstance(question, str) or not question.strip():
            return False

        # Check if rejected answer exists and is not an error message.
        # Only a leading "Error:" counts, so an answer that merely contains
        # the word is kept.
        rejected = entry.get("rejected")
        if not isinstance(rejected, str) or not rejected.strip():
            return False
        if rejected.lstrip().startswith("Error:"):
            return False

        # Check if chosen dialogue exists and has minimum turns
        chosen = entry.get("chosen")
        if not isinstance(chosen, list) or len(chosen) < 2:
            return False

        # Check structure (teacher/student alternating) and non-empty content
        expected_role = "teacher"
        for turn in chosen:
            if not isinstance(turn, dict) or turn.get("role") != expected_role:
                return False
            content = turn.get("content")
            if not isinstance(content, str) or not content.strip():
                return False
            expected_role = "student" if expected_role == "teacher" else "teacher"

        return True

    # Filter the data: a single pass splits entries into kept and removed
    valid_data = []
    removed = []
    for entry in data:
        (valid_data if is_valid_entry(entry) else removed).append(entry)

    print(f"Found {len(valid_data)} valid entries out of {len(data)} total")
    print(f"Removed {len(data) - len(valid_data)} invalid entries")

    # Ask for confirmation
    if input("\nSave cleaned dataset? (yes/no): ").strip().lower() in ['yes', 'y']:
        # splitext removes only the final extension, so a name containing
        # other dots keeps the rest of its stem.
        output_file = os.path.splitext(input_file)[0] + "_cleaned.json"
        output_path = os.path.join(PROJECT_DIR, output_file)

        with open(output_path, 'w') as f:
            json.dump(valid_data, f, indent=2)

        print(f"Cleaned dataset saved to {output_file}")

        # Display examples of removed entries if there are any
        if removed:
            print("\nExamples of removed entries:")
            for i, entry in enumerate(removed[:3]):  # Show up to 3 examples
                print(f"\nInvalid Entry #{i+1}:")
                if not isinstance(entry, dict):
                    # Nothing to look up by key; show the raw value instead.
                    print(f"Not a JSON object: {str(entry)[:100]}")
                    continue

                print(f"Question: {entry.get('question', 'MISSING')}")

                rejected = entry.get('rejected')
                if rejected:
                    print(f"Rejected: {str(rejected)[:100]}...")
                else:
                    print("Rejected: MISSING")

                chosen = entry.get('chosen')
                print(f"Chosen: {len(chosen) if isinstance(chosen, list) else 'INVALID'} turns")
                if isinstance(chosen, list) and chosen:
                    first_turn = chosen[0]
                    if isinstance(first_turn, dict):
                        role = first_turn.get('role', 'MISSING')
                        content = str(first_turn.get('content', 'MISSING'))[:50]
                        print(f"First turn: {role} - {content}...")
                    else:
                        print(f"First turn: INVALID - {str(first_turn)[:50]}...")