# In [None]:  (Jupyter notebook cell marker left over from export)
import openai
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import time
import optuna
import logging
import random
import os

# Set up OpenAI API key from environment variable.
# os.getenv returns None when OPENAI_API_KEY is unset, so the key must be
# exported in the environment before this script is run.
openai.api_key = os.getenv("OPENAI_API_KEY")

# Set up logging for Optuna (DEBUG is the most verbose level)
optuna.logging.set_verbosity(optuna.logging.DEBUG)

# Load the datasets
train_df = pd.read_csv('/Users/akhi/Desktop/TCS Kaggle Project/train.csv')
test_df = pd.read_csv('/Users/akhi/Desktop/TCS Kaggle Project/test.csv')

# Limit the dataset for optimization: a smaller subset keeps each Optuna
# trial (one API call per row) quick. The sample size is capped at the number
# of available rows because DataFrame.sample raises ValueError when n exceeds
# the frame length and replace is False.
SAMPLE_SIZE = 100
train_df = train_df.sample(n=min(SAMPLE_SIZE, len(train_df)), random_state=42)

# Combine question and options into a single prompt
def create_prompt(row):
    """Render one dataset row as a multiple-choice prompt.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with the keys
            'prompt', 'A', 'B', 'C', 'D' and 'E'.

    Returns:
        A string with the question on the first line, one "X) option" line
        per choice, a blank line, and the instruction to answer with only
        the letter.
    """
    option_lines = [f"{letter}) {row[letter]}\n" for letter in ("A", "B", "C", "D", "E")]
    return (
        f"Question: {row['prompt']}\n"
        + "".join(option_lines)
        + "\n"
        + "Please provide the letter of the correct option. Only provide the letter."
    )

# Build the full prompt text for every sampled row and store it on the frame
combined_prompts = train_df.apply(create_prompt, axis=1)
train_df['prompt_combined'] = combined_prompts

# Plain Python lists, index-aligned with each other: queries[i] is the prompt
# whose expected letter is true_answers[i]
queries = train_df['prompt_combined'].tolist()
true_answers = train_df['answer'].tolist()

def generate_response(prompt, temperature):
    """Ask the chat model to answer one multiple-choice prompt.

    Args:
        prompt: The full question text sent as the user message.
        temperature: Sampling temperature forwarded to the API call.

    Returns:
        The model's reply with surrounding whitespace stripped, or an empty
        string when a non-rate-limit error occurs or when all five attempts
        hit the rate limit.
    """
    chat_messages = [
        {"role": "system", "content": "You are a helpful assistant. Answer the question with only the letter of the correct option."},
        {"role": "user", "content": prompt},
    ]

    attempts_left = 5  # Total number of tries before giving up
    while attempts_left > 0:
        attempts_left -= 1
        try:
            completion = openai.ChatCompletion.create(
                model="gpt-4",
                messages=chat_messages,
                temperature=temperature,  # Value under evaluation (suggested by Optuna)
                max_tokens=10,  # Keep the reply short so it is little more than the letter
            )
            answer_text = completion['choices'][0]['message']['content']
            return answer_text.strip()
        except openai.error.RateLimitError as rate_err:
            # Only rate-limit errors are retried, after a short random pause
            print(f"Rate limit error: {rate_err}. Retrying...")
            time.sleep(random.uniform(0.5, 2))
        except Exception as err:
            # Any other failure is reported and treated as "no answer"
            print(f"Error generating response: {err}")
            return ""

    # Every attempt was rate limited
    return ""

def extract_letter(response):
    """Pull the chosen option letter (A-E) out of a model reply.

    A standalone letter (one with no letter or digit directly before or
    after it, e.g. "B", "B)", "(B)" or "Answer: B") is preferred. Taking the
    first A-E character blindly would misread "Answer: B" as "A", because
    the capital "A" of "Answer" comes first. If no standalone letter exists,
    the first A-E character anywhere in the text is used as a fallback.

    Args:
        response: The raw reply text. None or an empty string is accepted.

    Returns:
        One of "A", "B", "C", "D", "E", or the string "Invalid" when the
        reply contains none of those uppercase letters.
    """
    if not response:
        return "Invalid"

    options = "ABCDE"
    last_index = len(response) - 1
    first_embedded = None  # First A-E character found inside a longer word

    for idx, char in enumerate(response):
        if char not in options:
            continue
        starts_token = idx == 0 or not response[idx - 1].isalnum()
        ends_token = idx == last_index or not response[idx + 1].isalnum()
        if starts_token and ends_token:
            return char
        if first_embedded is None:
            first_embedded = char

    return first_embedded if first_embedded is not None else "Invalid"

def evaluate_accuracy(temperature):
    """Measure answer accuracy over the sampled questions at one temperature.

    Each entry of the module-level ``queries`` list is sent to the model and
    the extracted letter is compared with the matching entry of
    ``true_answers``. Both sides are stripped and upper-cased before the
    comparison so this score is computed the same way as the final accuracy
    reported later in the script.

    Args:
        temperature: Sampling temperature passed to ``generate_response``.

    Returns:
        Fraction of questions answered correctly, in the range 0.0-1.0.
        Returns 0.0 when there are no questions, instead of dividing by zero.
    """
    if not true_answers:
        return 0.0

    correct = 0
    for prompt, true_answer in zip(queries, true_answers):
        response = generate_response(prompt, temperature)
        predicted_answer = extract_letter(response)
        # str() keeps a non-string label from raising on .strip()
        if predicted_answer.strip().upper() == str(true_answer).strip().upper():
            correct += 1

    return correct / len(true_answers)

def objective(trial):
    """Optuna objective: score one candidate temperature by its accuracy.

    Args:
        trial: The Optuna trial object used to draw the 'temperature'
            parameter from the range 0.0-1.0.

    Returns:
        The accuracy returned by ``evaluate_accuracy`` for the drawn value.
    """
    candidate = trial.suggest_float('temperature', 0.0, 1.0)

    # Progress messages bracket the evaluation, which makes one API call per question
    print(f"Starting trial with temperature: {candidate}")
    score = evaluate_accuracy(candidate)
    print(f"Finished trial with temperature: {candidate}, accuracy: {score}")

    return score

# Build a study that maximizes the objective's return value (accuracy)
print("Creating Optuna study...")
study = optuna.create_study(direction='maximize')

# Run the search; the trial count is kept small for quicker optimization
print("Starting optimization...")
N_TRIALS = 5
study.optimize(objective, n_trials=N_TRIALS)

# Temperature of the best-scoring trial
best_temperature = study.best_params['temperature']
print(f"Best Temperature: {best_temperature}")

# Re-run every prompt at the best temperature found by Optuna and keep one
# record per question for the report below
results = []
correct = 0

start_time = time.time()  # Start timing

for i, prompt in enumerate(queries):
    expected = true_answers[i]
    response = generate_response(prompt, best_temperature)
    predicted_answer = extract_letter(response)

    record = {
        "Prompt": prompt,
        "True Answer": expected,
        "Predicted Answer": predicted_answer
    }
    results.append(record)

    # Show the first few prompts with their true and predicted answers for
    # debugging; raise the bound to print more samples
    if i < 5:
        print(
            f"Sample {i + 1}",
            f"Prompt:\n{prompt}",
            f"True Answer: {expected}",
            f"Predicted Answer: {predicted_answer}",
            "",
            sep="\n",
        )

end_time = time.time()  # End timing
processing_time = end_time - start_time

results_df = pd.DataFrame(results)

# Calculate accuracy: a prediction counts as correct when it equals the true
# answer after stripping whitespace and upper-casing both sides
predictions = results_df['Predicted Answer']
correct = sum(
    true.strip().upper() == pred.strip().upper()
    for true, pred in zip(true_answers, predictions)
)
accuracy = correct / len(true_answers)
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Processing Time: {processing_time:.2f} seconds")

# Display the full dataset with results
print(results_df)

# Classification report
predicted_answers = results_df['Predicted Answer']
print("\nClassification Report:")
print(classification_report(true_answers, predicted_answers, zero_division=0))

# Confusion matrix
# The label list is built once and sorted so rows/columns appear in a stable
# order and the matrix and its display labels always agree. Labels that occur
# only in the predictions (such as "Invalid") are included; otherwise those
# predictions would be left out of the matrix.
labels = sorted(set(true_answers) | set(predicted_answers))
cm = confusion_matrix(true_answers, predicted_answers, labels=labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
disp.plot(cmap='viridis')
plt.title('Confusion Matrix - Prompt Engineering')
plt.savefig('confusion_matrix_promptengineering.png')  # Save the confusion matrix plot
plt.show()
