In [None]:
import os
import re
import time

import matplotlib.pyplot as plt
import openai
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

# Configure the OpenAI client with the key read from the OPENAI_API_KEY environment variable
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Read train.csv and test.csv into DataFrames
train_csv_path = '/Users/akhi/Desktop/TCS Kaggle Project/train.csv'
test_csv_path = '/Users/akhi/Desktop/TCS Kaggle Project/test.csv'
train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

# Combine question and options into a single prompt
def create_prompt(row):
    """Format one dataset row as a multiple-choice prompt.

    Args:
        row: Mapping-like object indexable by 'prompt', 'A', 'B', 'C',
            'D' and 'E' (e.g. a DataFrame row or a dict).

    Returns:
        A string with the question, the five lettered options on separate
        lines, a blank line, and the instruction to answer with a letter.
    """
    lines = [f"Question: {row['prompt']}"]
    for letter in ("A", "B", "C", "D", "E"):
        lines.append(f"{letter}) {row[letter]}")
    question_block = "\n".join(lines)
    instruction = "Please provide the letter of the correct option. Only provide the letter."
    return question_block + "\n\n" + instruction

# Build one combined prompt per training row and store it as a new column
combined_prompts = train_df.apply(create_prompt, axis=1)
train_df['prompt_combined'] = combined_prompts

# Plain Python lists of the prompts and their reference answers, in row order
queries = train_df['prompt_combined'].to_list()
true_answers = train_df['answer'].to_list()

def generate_response(prompt):
    """Send one prompt to the gpt-3.5-turbo chat model and return its reply.

    Args:
        prompt: Text sent as the user message.

    Returns:
        The message content of the first returned choice, with leading and
        trailing whitespace removed.
    """
    system_message = {
        "role": "system",
        "content": "You are a helpful assistant. Answer the question with only the letter of the correct option.",
    }
    user_message = {"role": "user", "content": prompt}

    # temperature=0 is requested so that answers are as repeatable as possible
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[system_message, user_message],
        temperature=0,
    )

    first_choice = completion['choices'][0]
    reply_text = first_choice['message']['content']
    return reply_text.strip()

# An option letter that stands on its own, i.e. is not part of a longer word.
_OPTION_LETTER_RE = re.compile(r"\b[A-E]\b")


def extract_letter(response):
    """Return the answer letter (A-E) contained in a model reply.

    Only a letter that stands alone counts: "B", "B)", "(B)" or
    "The answer is B." all yield "B". Capital letters that merely begin a
    word are ignored, so "Answer: C" yields "C" rather than the "A" of
    "Answer".

    Args:
        response: The reply text returned by the model.

    Returns:
        The first standalone letter in A-E, or the unchanged ``response``
        when none is present so the raw reply can be inspected for debugging.
    """
    match = _OPTION_LETTER_RE.search(response)
    if match:
        return match.group(0)
    return response  # No letter found: surface the raw reply for debugging

# Evaluate on the entire dataset: query the model once per prompt and record
# the prompt, the reference answer and the extracted prediction.
results = []

start_time = time.time()  # Start timing

for i, prompt in enumerate(queries):
    response = generate_response(prompt)
    predicted_answer = extract_letter(response)
    results.append({
        "Prompt": prompt,
        "True Answer": true_answers[i],
        "Predicted Answer": predicted_answer
    })

    # Print a sample of the prompts, true answers, and predicted answers for debugging
    if i < 5:  # Adjust this number to print more samples if needed
        print(f"Sample {i + 1}")
        print(f"Prompt:\n{prompt}")
        print(f"True Answer: {true_answers[i]}")
        print(f"Predicted Answer: {predicted_answer}")
        print()

end_time = time.time()  # End timing
processing_time = end_time - start_time

results_df = pd.DataFrame(results)

# Normalise both sides once (trim whitespace, upper-case) and use the same
# normalised labels for every metric below, so that accuracy, the
# classification report and the confusion matrix cannot disagree about
# whether e.g. "b " and "B" are the same answer.
y_true = [answer.strip().upper() for answer in true_answers]
y_pred = [answer.strip().upper() for answer in results_df['Predicted Answer']]

# Calculate accuracy
correct = sum(1 for true, pred in zip(y_true, y_pred) if true == pred)
accuracy = correct / len(y_true)
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Processing Time: {processing_time:.2f} seconds")

# Display the full dataset with results
print(results_df)

# Classification report
print("\nClassification Report:")
print(classification_report(y_true, y_pred, zero_division=0))

# Confusion matrix. Labels are sorted so the axis order is alphabetical and
# identical from run to run (a bare set has no defined iteration order).
labels = sorted(set(y_true))
cm = confusion_matrix(y_true, y_pred, labels=labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
disp.plot(cmap='viridis')
plt.title('Confusion Matrix - Prompt Engineering')
plt.savefig('confusion_matrix_promptengineering.png')  # Save the confusion matrix plot
plt.show()
