In [None]:
# Set your API key directly (replace with your actual OpenAI API key)
# NOTE(review): `OpenAI` is only imported further down in this file, and the same
# two assignments are repeated right after that import, so this cell looks like a
# leftover that cannot run on its own in file order — confirm before removing.
api_key = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"  # ⚠️ Make sure to secure this or use secrets management in production

# Initialize the OpenAI client with the API key
client = OpenAI(api_key=api_key)

In [5]:
# Import necessary libraries
import pandas as pd
import torch
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from openai import OpenAI
import os
import time
from typing import List, Dict, Tuple, Any
import random

# Take the API key from the OPENAI_API_KEY environment variable so the real key
# never has to be stored in the notebook; the placeholder below is used only
# when the variable is not set.
api_key = os.environ.get("OPENAI_API_KEY", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")

# Initialize the OpenAI client with the API key
client = OpenAI(api_key=api_key)

In [6]:
class RAGEvaluator:
    def __init__(self):
        """
        Initialize the RAG evaluator to track metrics related to RAG performance
        """
        self.retrieval_metrics = {
            'relevance_scores': [],          # Relevance scores for retrieved questions
            'query_similarity_scores': [],   # Query-question similarity scores
            'category_matches': [],          # Whether retrieved questions match requested categories
            'difficulty_matches': []         # Whether retrieved questions match requested difficulty
        }

        self.response_metrics = {
            'evaluation_quality': [],        # Metric for quality of evaluations
            'reference_similarity': [],      # Similarity between user responses and reference answers
            'evaluation_time': []            # Time taken to evaluate responses
        }

        self.user_feedback = {
            'question_relevance': [],        # User ratings of question relevance
            'evaluation_fairness': [],       # User ratings of evaluation fairness
            'overall_satisfaction': None     # Overall user satisfaction with the interview
        }

    def log_retrieval_metrics(self, query_embedding, retrieved_questions, question_embeddings,
                             requested_categories=None, requested_difficulty=None):
        """
        Log metrics related to question retrieval.

        The cosine similarity between the query and every retrieved question is
        appended to both ``relevance_scores`` and ``query_similarity_scores``.
        When a category and/or difficulty filter was requested, one boolean per
        retrieved question records whether that question satisfies it.  Labels
        are compared ignoring case and surrounding whitespace, so a request for
        "Medium" matches a question stored as "medium".

        Args:
            query_embedding: The embedding of the query used for retrieval
            retrieved_questions: List of retrieved question dicts; each must
                carry an 'original_index' key, which is used to pick that
                question's entry out of the similarity vector
            question_embeddings: Matrix of all question embeddings
            requested_categories: Categories requested by the user (a list, or a
                single category given as a string)
            requested_difficulty: Difficulty level requested by the user
        """
        from sklearn.metrics.pairwise import cosine_similarity

        def _label(value):
            # Normalise a label so matching ignores case and padding.
            return str(value).strip().casefold()

        # Get indices of retrieved questions
        retrieved_indices = [q['original_index'] for q in retrieved_questions]

        # Calculate relevance scores (using cosine similarity)
        similarities = cosine_similarity([query_embedding], question_embeddings)[0]
        retrieved_similarities = similarities[retrieved_indices].tolist()
        self.retrieval_metrics['relevance_scores'].extend(retrieved_similarities)
        self.retrieval_metrics['query_similarity_scores'].extend(retrieved_similarities)

        # Log category and difficulty matches if provided
        if requested_categories:
            if isinstance(requested_categories, str):
                # A bare string would otherwise be iterated character by character.
                requested_categories = [requested_categories]
            wanted_categories = {_label(category) for category in requested_categories}
            self.retrieval_metrics['category_matches'].extend(
                _label(q.get('category', '')) in wanted_categories
                for q in retrieved_questions
            )

        if requested_difficulty:
            wanted_difficulty = _label(requested_difficulty)
            self.retrieval_metrics['difficulty_matches'].extend(
                _label(q.get('difficulty_level', '')) == wanted_difficulty
                for q in retrieved_questions
            )

    def log_response_evaluation(self, question, user_response, reference_answer, evaluation, score, evaluation_time):
        """
        Log metrics related to response evaluation.

        Records the evaluation time, the cosine similarity between the user's
        response and the reference answer, and a heuristic "evaluation quality"
        (the fraction of the reference answer's key terms that appear in the
        evaluation text; 0.5 when the reference yields no key terms).  If any
        of the similarity/quality computation fails, 0.0 is recorded for both so
        the metric lists stay aligned with ``evaluation_time``.

        The sentence encoder is loaded on first use and then reused for later
        calls on the same evaluator, instead of being reloaded every time.

        Args:
            question: The question that was asked (not used by the metrics)
            user_response: User's response to the question
            reference_answer: Reference answer from the question bank
            evaluation: Evaluation text from the LLM
            score: Score given by the LLM (not used by the metrics)
            evaluation_time: Time taken to evaluate the response
        """
        from sentence_transformers import SentenceTransformer
        from sklearn.metrics.pairwise import cosine_similarity

        # Record evaluation time
        self.response_metrics['evaluation_time'].append(evaluation_time)

        # Calculate similarity between user response and reference answer
        try:
            # getattr keeps this working for instances created before the cache
            # attribute existed (it is not set in __init__).
            encoder = getattr(self, '_encoder', None)
            if encoder is None:
                encoder = SentenceTransformer('all-MiniLM-L6-v2')
                self._encoder = encoder

            user_embedding = encoder.encode([user_response])[0]
            ref_embedding = encoder.encode([reference_answer])[0]
            similarity = cosine_similarity([user_embedding], [ref_embedding])[0][0]
            self.response_metrics['reference_similarity'].append(similarity)

            # Evaluate if the evaluation text mentions key points from reference answer
            # This is a simple heuristic - could be enhanced with more sophisticated methods
            eval_quality = 0.5  # Default quality score
            key_terms = self._extract_key_terms(reference_answer)
            if key_terms:
                evaluation_lower = evaluation.lower()  # hoisted: invariant across terms
                mentioned_terms = sum(1 for term in key_terms if term.lower() in evaluation_lower)
                eval_quality = mentioned_terms / len(key_terms)
            self.response_metrics['evaluation_quality'].append(eval_quality)
        except Exception as e:
            print(f"Error calculating evaluation metrics: {e}")
            self.response_metrics['reference_similarity'].append(0.0)
            self.response_metrics['evaluation_quality'].append(0.0)

    def _extract_key_terms(self, text, max_terms=5):
        """Extract key terms from a text using a simple frequency-based approach"""
        import re
        from collections import Counter

        # Remove common words and punctuation
        words = re.findall(r'\b[A-Za-z]{3,}\b', text.lower())
        stop_words = {'the', 'and', 'is', 'in', 'to', 'of', 'that', 'for', 'are', 'with', 'as', 'can', 'be', 'this', 'an', 'or'}
        filtered_words = [w for w in words if w not in stop_words]

        # Return the most common words
        return [word for word, _ in Counter(filtered_words).most_common(max_terms)]

    def collect_user_feedback(self, question_idx, relevance_rating=None, fairness_rating=None):
        """
        Collect user feedback on question relevance and evaluation fairness

        Args:
            question_idx: Index of the question being rated
            relevance_rating: User rating of question relevance (1-5)
            fairness_rating: User rating of evaluation fairness (1-5)
        """
        if relevance_rating is not None:
            self.user_feedback['question_relevance'].append((question_idx, relevance_rating))

        if fairness_rating is not None:
            self.user_feedback['evaluation_fairness'].append((question_idx, fairness_rating))

    def set_overall_satisfaction(self, rating):
        """Set overall satisfaction rating from the user"""
        self.user_feedback['overall_satisfaction'] = rating

    def generate_evaluation_report(self):
        """
        Generate a comprehensive evaluation report for the RAG system

        Returns:
            Formatted report as a string
        """
        import numpy as np

        report = "# RAG System Evaluation Report\n\n"

        # Retrieval metrics
        report += "## Retrieval Performance\n\n"

        if self.retrieval_metrics['relevance_scores']:
            avg_relevance = np.mean(self.retrieval_metrics['relevance_scores'])
            report += f"- Average Relevance Score: {avg_relevance:.3f}\n"

        if self.retrieval_metrics['category_matches']:
            category_match_rate = np.mean(self.retrieval_metrics['category_matches'])
            report += f"- Category Match Rate: {category_match_rate:.2%}\n"

        if self.retrieval_metrics['difficulty_matches']:
            difficulty_match_rate = np.mean(self.retrieval_metrics['difficulty_matches'])
            report += f"- Difficulty Match Rate: {difficulty_match_rate:.2%}\n"

        # Response evaluation metrics
        report += "\n## Response Evaluation Performance\n\n"

        if self.response_metrics['evaluation_time']:
            avg_eval_time = np.mean(self.response_metrics['evaluation_time'])
            report += f"- Average Evaluation Time: {avg_eval_time:.2f} seconds\n"

        if self.response_metrics['evaluation_quality']:
            avg_quality = np.mean(self.response_metrics['evaluation_quality'])
            report += f"- Evaluation Quality Score: {avg_quality:.2f}\n"

        if self.response_metrics['reference_similarity']:
            avg_similarity = np.mean(self.response_metrics['reference_similarity'])
            report += f"- Average Response-Reference Similarity: {avg_similarity:.3f}\n"

        # User feedback
        report += "\n## User Feedback\n\n"

        if self.user_feedback['question_relevance']:
            ratings = [r for _, r in self.user_feedback['question_relevance']]
            avg_relevance = np.mean(ratings)
            report += f"- Average Question Relevance Rating: {avg_relevance:.2f}/5\n"

        if self.user_feedback['evaluation_fairness']:
            ratings = [r for _, r in self.user_feedback['evaluation_fairness']]
            avg_fairness = np.mean(ratings)
            report += f"- Average Evaluation Fairness Rating: {avg_fairness:.2f}/5\n"

        if self.user_feedback['overall_satisfaction'] is not None:
            report += f"- Overall User Satisfaction: {self.user_feedback['overall_satisfaction']}/5\n"

        # Overall assessment
        report += "\n## Overall Assessment\n\n"

        # Calculate overall retrieval score
        retrieval_score = 0
        if self.retrieval_metrics['relevance_scores']:
            retrieval_score = np.mean(self.retrieval_metrics['relevance_scores'])

        # Calculate overall evaluation score
        eval_score = 0
        if self.response_metrics['evaluation_quality']:
            eval_score = np.mean(self.response_metrics['evaluation_quality'])

        # Calculate overall user satisfaction
        user_score = 0
        if self.user_feedback['overall_satisfaction'] is not None:
            user_score = self.user_feedback['overall_satisfaction'] / 5
        elif self.user_feedback['question_relevance']:
            ratings = [r for _, r in self.user_feedback['question_relevance']]
            user_score = np.mean(ratings) / 5

        # Calculate overall RAG quality score
        rag_quality = (retrieval_score * 0.4) + (eval_score * 0.3) + (user_score * 0.3)
        report += f"- Overall RAG Quality Score: {rag_quality:.2f} (0-1 scale)\n"

        # Add recommendations based on scores
        report += "\n## Recommendations\n\n"

        if retrieval_score < 0.7:
            report += "- Consider improving retrieval by refining embeddings or using hybrid retrieval\n"

        if eval_score < 0.7:
            report += "- Enhance evaluation prompts to better assess user responses against reference answers\n"

        if user_score < 0.7:
            report += "- Focus on improving user experience and question relevance\n"

        return report

In [7]:
import time
from typing import List, Dict, Tuple, Any, Optional

class MockInterviewer:
    def __init__(self, qb_path: str):
        """
        Load the question bank, embed every question and reset interview state.

        The CSV at ``qb_path`` is read and checked for the expected columns; a
        lowercase ``question`` / ``answer`` column is copied to ``questions`` /
        ``Answer`` when the expected name is missing.  If anything in loading or
        embedding raises, a built-in sample question bank is embedded and used
        instead.  Afterwards the bank's index is moved into an
        ``original_index`` column and the result containers are emptied.

        Args:
            qb_path: Path to the CSV file containing the question bank
        """
        try:
            bank = pd.read_csv(qb_path)
            self.question_bank = bank
            print(f"Loaded question bank with {len(bank)} questions.")

            # Show a small preview so the loaded data can be verified by eye
            print("\nFirst few rows of the question bank:")
            print(bank.head(2))

            # Warn about every missing required column; two of them have a
            # lowercase alias that can stand in for the expected name.
            aliases = {'questions': 'question', 'Answer': 'answer'}
            for required in ('questions', 'category', 'company', 'difficulty_level', 'Answer'):
                if required in bank.columns:
                    continue
                print(f"Warning: Required column '{required}' not found in the question bank.")
                alias = aliases.get(required)
                if alias is not None and alias in bank.columns:
                    print(f"Using '{alias}' column instead of '{required}'")
                    bank[required] = bank[alias]

            # Initialize the sentence transformer model for embeddings
            print("\nLoading the sentence transformer model...")
            self.encoder = SentenceTransformer('all-MiniLM-L6-v2')
            print("Model loaded successfully!")

            # Pick the column holding the question text
            if 'questions' in bank.columns:
                questions_col = 'questions'
            else:
                questions_col = 'question'

            # Precompute embeddings for all questions in the bank
            print("Computing embeddings for questions...")
            question_texts = bank[questions_col].tolist()
            self.question_embeddings = self.encoder.encode(question_texts)
            print(f"Generated {len(self.question_embeddings)} embeddings.")

        except Exception as e:
            print(f"Error loading question bank: {e}")
            print("Creating sample question bank as fallback...")
            self.question_bank = self.create_sample_qb()
            print("Initializing embeddings with sample data...")
            self.encoder = SentenceTransformer('all-MiniLM-L6-v2')
            sample_texts = self.question_bank['questions'].tolist()
            self.question_embeddings = self.encoder.encode(sample_texts)

        # RAG evaluation support: metric collector for this interviewer
        self.evaluator = RAGEvaluator()

        # Keep each row's index in the bank as an explicit 'original_index' column
        self.question_bank = self.question_bank.reset_index().rename(
            columns={'index': 'original_index'}
        )

        # Per-interview results and the user profile
        self.responses = {}
        self.evaluations = {}
        self.scores = {}
        self.user_profile = {}

    def create_sample_qb(self):
        """Create a sample question bank for demonstration"""
        print("Creating a sample question bank for demonstration...")

        sample_data = {
            "questions": [
                "Explain the difference between supervised and unsupervised learning.",
                "What is the purpose of regularization in machine learning?",
                "Explain the concept of backpropagation in neural networks.",
                "What is the CAP theorem in distributed systems?",
                "How would you implement a recommendation system for an e-commerce platform?",
                "Explain the concept of REST API and its principles.",
                "What are microservices and what are their advantages?",
                "How do you handle concurrency issues in a distributed system?",
                "Explain the concept of Docker and containerization.",
                "How would you design a URL shortening service?"
            ],
            "category": ["ML", "ML", "ML", "System Design", "ML", "Web Development", "System Design", "System Design", "DevOps", "System Design"],
            "company": ["Google", "Meta", "Amazon", "Microsoft", "Netflix", "Amazon", "Google", "Meta", "Microsoft", "Netflix"],
            "difficulty_level": ["Medium", "Medium", "Hard", "Hard", "Hard", "Easy", "Medium", "Hard", "Medium", "Medium"],
            "Answer": [
                "Supervised learning uses labeled data to train models, where the algorithm learns to map inputs to known outputs. Unsupervised learning works with unlabeled data to find patterns or structures without predefined outputs.",
                "Regularization prevents overfitting by adding a penalty term to the loss function, discouraging complex models. Common techniques include L1 (Lasso) and L2 (Ridge) regularization.",
                "Backpropagation is an algorithm for training neural networks that calculates gradients of the loss function with respect to weights using the chain rule, propagating errors backward from output to input layers.",
                "The CAP theorem states that a distributed system cannot simultaneously provide Consistency, Availability, and Partition tolerance. You must choose two out of three properties.",
                "A recommendation system for e-commerce would use collaborative filtering, content-based filtering, or hybrid approaches. It would analyze user behavior, purchase history, and item similarities to suggest relevant products.",
                "REST (Representational State Transfer) is an architectural style for web services. Its principles include client-server architecture, statelessness, cacheability, uniform interface, layered system, and code on demand.",
                "Microservices are an architectural style where applications are built as small, independent services. Advantages include scalability, technology diversity, resilience, and easier deployment.",
                "Concurrency issues in distributed systems can be handled using locks, distributed transactions, eventual consistency models, conflict resolution strategies, and specialized data structures like CRDTs.",
                "Docker is a platform for developing, shipping, and running applications in containers. Containerization packages code and dependencies together, ensuring consistent operation across environments.",
                "A URL shortening service design includes components for URL shortening (hash function), storage (database), redirection service, analytics, and scaling considerations like caching and load balancing."
            ]
        }

        return pd.DataFrame(sample_data)

    def get_user_profile(self) -> Dict[str, Any]:
        """
        Ask the user for profile information used to personalize the interview.

        Prompts on standard input for years of experience, target job role and
        a comma-separated skill list, stores the answers in
        ``self.user_profile`` and returns that dictionary.  Skills are trimmed
        and blank entries are dropped, so an empty answer or a trailing comma
        does not leave an empty skill in the profile.

        Returns:
            Dictionary with the keys ``years_of_experience``, ``job_role`` and
            ``skills`` (a list of strings, possibly empty)
        """
        print("\n--- Welcome to the Mock Interview App ---\n")
        years_exp = input("How many years of experience do you have? ")
        job_role = input("What job role are you interviewing for? ")
        skills = input("List your key skills (comma separated): ")

        # "".split(",") yields [""], so blanks have to be filtered out explicitly.
        skill_list = [skill.strip() for skill in skills.split(",") if skill.strip()]

        self.user_profile = {
            "years_of_experience": years_exp,
            "job_role": job_role,
            "skills": skill_list
        }

        print(f"\nThanks! I'll prepare an interview for a {job_role} position with {years_exp} years of experience.\n")
        return self.user_profile

    def retrieve_relevant_questions(self, n: int = 5, categories: Optional[List[str]] = None,
                               difficulty: Optional[str] = None) -> List[Dict[str, Any]]:
        """
        Retrieve the questions most similar to a query built from the user profile.

        The query text is embedded and compared (cosine similarity) with the
        precomputed question embeddings.  Optional category and difficulty
        filters narrow the candidates first; labels are compared ignoring case
        and surrounding whitespace, so a request for "Medium" matches a bank
        that stores "medium".  If the filters leave no candidates, all
        questions are considered instead.  The retrieval is also reported to
        ``self.evaluator``.

        Args:
            n: Number of questions to retrieve
            categories: Optional list of categories to filter by
            difficulty: Optional difficulty level to filter by

        Returns:
            List of question dicts (one per selected row), most similar first

        Raises:
            KeyError: If ``self.user_profile`` lacks 'job_role',
                'years_of_experience' or 'skills'
        """
        def _label(value: Any) -> str:
            # Normalise a label so matching ignores case and padding.
            return str(value).strip().casefold()

        # Create a query based on user profile
        query = f"Interview questions for {self.user_profile['job_role']} position with {self.user_profile['years_of_experience']} years of experience in {', '.join(self.user_profile['skills'])}"
        if categories:
            query += f" focusing on {', '.join(categories)}"
        if difficulty:
            query += f" at {difficulty} difficulty level"

        print(f"Query for question selection: {query}")

        # Get query embedding
        query_embedding = self.encoder.encode([query])[0]

        # Calculate similarity scores
        similarities = cosine_similarity([query_embedding], self.question_embeddings)[0]

        all_indices = list(range(len(self.question_bank)))
        filtered_indices = all_indices

        # Apply category filter if specified
        if categories:
            wanted_categories = {_label(cat) for cat in categories}
            filtered_indices = [i for i, cat in enumerate(self.question_bank['category'])
                                if _label(cat) in wanted_categories]

        # Apply difficulty filter if specified
        if difficulty and filtered_indices:
            wanted_difficulty = _label(difficulty)
            # Pull the column out once instead of building a row object per candidate.
            levels = self.question_bank['difficulty_level'].tolist()
            filtered_indices = [i for i in filtered_indices
                                if _label(levels[i]) == wanted_difficulty]

        # If we have no matches after filtering, use all questions
        if not filtered_indices:
            filtered_indices = all_indices
            print("No matches found with filters. Using all questions.")

        # Rank the candidates by similarity (stable, so ties keep bank order) and keep the top n
        ranked_indices = sorted(filtered_indices, key=lambda i: similarities[i], reverse=True)
        top_indices = ranked_indices[:n]

        # Get the questions
        selected_questions = []
        questions_col = 'questions' if 'questions' in self.question_bank.columns else 'question'

        for idx in top_indices:
            question_data = self.question_bank.iloc[idx].to_dict()
            # Make sure we have all required fields
            if 'questions' not in question_data and questions_col in self.question_bank.columns:
                question_data['questions'] = question_data[questions_col]
            selected_questions.append(question_data)
            # str() keeps the preview from failing on a non-string cell (e.g. a missing value).
            preview = str(question_data['questions'])[:50]
            print(f"Selected question with similarity score {similarities[idx]:.4f}: {preview}...")

        # Log retrieval metrics
        self.evaluator.log_retrieval_metrics(
            query_embedding=query_embedding,
            retrieved_questions=selected_questions,
            question_embeddings=self.question_embeddings,
            requested_categories=categories,
            requested_difficulty=difficulty
        )

        return selected_questions

    def conduct_interview(self, num_questions: int = 5) -> None:
        """
        Run the interactive interview from question selection to final reports.

        Collects the user profile if it is still empty, asks for optional
        category and difficulty filters, retrieves the questions, and then for
        each question: reads the answer, evaluates it, prints the feedback and
        asks for relevance and fairness ratings.  It finishes by asking for an
        overall satisfaction rating and printing the interview summary and the
        RAG evaluation report.  Answers, evaluations and scores are stored in
        ``self.responses``, ``self.evaluations`` and ``self.scores`` keyed by
        question text.

        Args:
            num_questions: Number of questions to ask
        """
        if not self.user_profile:
            self.get_user_profile()

        # Get category and difficulty preferences
        print("\nWould you like to focus on specific categories?")
        print("Available categories: ML, System Design, Web Development, DevOps, or leave blank for all")
        category_input = input("Enter categories (comma-separated) or press Enter for all: ")
        # Drop blank entries so input like ",ML" or "ML, " still yields a usable
        # filter instead of discarding it or filtering on an empty category.
        categories = [cat.strip() for cat in category_input.split(",") if cat.strip()] or None

        print("\nWould you like to focus on a specific difficulty level?")
        print("Available difficulties: Easy, Medium, Hard, or leave blank for all")
        requested_difficulty = input("Enter difficulty or press Enter for all: ").strip() or None

        print(f"\nRetrieving {num_questions} relevant questions based on your profile...")
        relevant_questions = self.retrieve_relevant_questions(
            num_questions,
            categories=categories,
            difficulty=requested_difficulty
        )

        print("\n--- Starting Interview ---\n")
        # Announce how many questions were actually retrieved; it can be fewer than requested.
        print(f"I'll ask you {len(relevant_questions)} questions relevant to your profile. Please provide detailed answers.\n")

        for i, question_data in enumerate(relevant_questions):
            question = question_data['questions']
            category = question_data.get('category', 'General')
            difficulty = question_data.get('difficulty_level', 'Medium')

            # Handle different case variations for the Answer field
            reference_answer = next(
                (question_data[key] for key in ('Answer', 'answer', 'ANSWER') if key in question_data),
                None
            )
            # An empty CSV cell arrives as NaN rather than None, so treat both as missing.
            if reference_answer is None or pd.isna(reference_answer):
                reference_answer = "No reference answer provided."

            print(f"\nQuestion {i+1} ({category}, Difficulty: {difficulty}):")
            print(question)

            # Get user's response
            response = input("\nYour answer: ")
            self.responses[question] = response

            print("\nEvaluating your response...")
            # Evaluate response
            evaluation, score = self.evaluate_response(question, response, reference_answer)
            self.evaluations[question] = evaluation
            self.scores[question] = score

            # Provide immediate feedback
            print(f"\nEvaluation: {evaluation}")
            print(f"Score: {score}/10")

            # Collect user feedback on question relevance
            relevance_rating = self._ask_rating(
                "\nHow relevant was this question to your job role? (1-5, 5 being most relevant): "
            )
            if relevance_rating is not None:
                self.evaluator.collect_user_feedback(i, relevance_rating=relevance_rating)

            # Collect user feedback on evaluation fairness
            fairness_rating = self._ask_rating("How fair was this evaluation? (1-5, 5 being most fair): ")
            if fairness_rating is not None:
                self.evaluator.collect_user_feedback(i, fairness_rating=fairness_rating)

        # Collect overall satisfaction rating
        overall_rating = self._ask_rating(
            "\nOverall, how satisfied are you with this interview experience? (1-5, 5 being most satisfied): "
        )
        if overall_rating is not None:
            self.evaluator.set_overall_satisfaction(overall_rating)

        # Generate summary at the end
        self.generate_summary()

        # Generate RAG evaluation report
        self.generate_rag_report()

    def _ask_rating(self, prompt: str) -> Optional[int]:
        """Prompt for a 1-5 rating; return it, or None (after a notice) when the input is invalid."""
        raw_rating = input(prompt)
        try:
            rating = int(raw_rating)
        except ValueError:
            rating = None

        if rating is None or not 1 <= rating <= 5:
            print("Invalid rating. Skipping feedback.")
            return None
        return rating

    def evaluate_response(self, question: str, response: str, reference: str) -> Tuple[str, float]:
        """
        Evaluate user's response using GPT-3.5

        Sends the question, reference answer and candidate response to the
        chat model, parses the reply into an evaluation text and a numeric
        score, and logs the result (with the elapsed time) to
        ``self.evaluator``.  If the request itself fails, a fixed fallback
        evaluation with a score of 5.0 is logged and returned instead.

        Args:
            question: The question asked
            response: User's response
            reference: Reference answer from the question bank

        Returns:
            Tuple of (evaluation text, score); the score is within 0-10
        """
        start_time = time.time()

        try:
            prompt = """
            You are evaluating a candidate's response in a technical interview.

            Question: {}

            Reference Answer: {}

            Candidate's Response: {}

            Please evaluate the candidate's response considering the following:
            1. Correctness and accuracy of information
            2. Completeness of the answer
            3. Clarity and communication
            4. Technical depth and understanding

            Provide a brief evaluation (2-3 sentences) and a score out of 10.

            Format your response as:
            Evaluation: [Your evaluation here]
            Score: [Score]/10
            """.format(question, reference, response)

            completion = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": prompt}]
            )

            # Coerce a missing message body to "" so parsing yields its fallback values.
            result = completion.choices[0].message.content or ""
            evaluation_part, score = self._parse_evaluation(result)
        except Exception as e:
            print(f"Error in evaluation: {e}")
            evaluation_part, score = "Unable to evaluate due to an error.", 5.0

        # Calculate time taken
        evaluation_time = time.time() - start_time

        # Log evaluation metrics (successful and failed evaluations alike)
        self.evaluator.log_response_evaluation(
            question=question,
            user_response=response,
            reference_answer=reference,
            evaluation=evaluation_part,
            score=score,
            evaluation_time=evaluation_time
        )

        return evaluation_part, score

    @staticmethod
    def _parse_evaluation(result: str) -> Tuple[str, float]:
        """Split raw model output into (evaluation text, score clamped to 0-10)."""
        import re

        # Evaluation text: everything after "Evaluation:" up to "Score:" (or the end).
        text_match = re.search(
            r"evaluation\s*:\s*(.*?)\s*(?=score\s*:|\Z)", result, re.IGNORECASE | re.DOTALL
        )
        # Score: the first number after "Score:", allowing Markdown emphasis or
        # brackets in between (e.g. "**Score:** 8/10", "Score: [8]/10", "Score: 8 out of 10").
        score_match = re.search(
            r"score\s*:[\s*_\[(]{0,5}(\d+(?:\.\d+)?)", result, re.IGNORECASE
        )

        evaluation_text = text_match.group(1).strip().strip("*").strip() if text_match else ""

        if not evaluation_text and score_match is None:
            return "Unable to parse evaluation correctly.", 5.0

        if not evaluation_text:
            evaluation_text = "Response evaluated based on correctness, completeness, clarity, and technical depth."

        # Missing score falls back to the neutral 5.0; out-of-range values are clamped.
        score = float(score_match.group(1)) if score_match else 5.0
        score = min(max(score, 0.0), 10.0)
        return evaluation_text, score

    def generate_summary(self) -> str:
        """
        Generate a comprehensive summary of the interview

        Returns:
            Summary text
        """
        print("\nGenerating interview summary...")
        # Prepare data for summary
        questions_and_scores = []
        for question, score in self.scores.items():
            questions_and_scores.append(f"Question: {question}\nScore: {score}/10")

        avg_score = sum(self.scores.values()) / len(self.scores)

        # Generate summary using GPT-3.5
        try:
            # Using string formatting instead of f-strings to avoid backslash issues
            prompt = """
            You are an interview coach. Please generate a comprehensive summary of this technical interview.

            Candidate Profile:
            - Job Role: {}
            - Experience: {} years
            - Skills: {}

            Interview Results:
            {}

            Average Score: {:.1f}/10

            Please provide:
            1. Overall performance assessment
            2. Key strengths (at least 3)
            3. Areas for improvement (at least 3)
            4. Recommendations for next steps

            Format your response in a clear, structured way with sections.
            """.format(
                self.user_profile['job_role'],
                self.user_profile['years_of_experience'],
                ', '.join(self.user_profile['skills']),
                '\n\n'.join(questions_and_scores),
                avg_score
            )

            try:
                completion = client.chat.completions.create(
                    model="gpt-3.5-turbo",
                    messages=[{"role": "user", "content": prompt}]
                )

                summary = completion.choices[0].message.content
            except Exception as api_error:
                print(f"Error with API call: {api_error}")
                # Create a simplified summary without API call as fallback
                summary = """
                # INTERVIEW SUMMARY

                ## Overall Performance Assessment
                Your overall score was {:.1f}/10.

                ## Key Strengths
                - You completed the interview process
                - You provided answers to all questions
                - You demonstrated knowledge in your field

                ## Areas for Improvement
                - Work on providing more detailed answers
                - Include more examples in your responses
                - Connect theoretical knowledge with practical applications

                ## Recommendations
                - Review the technical areas covered in this interview
                - Practice explaining complex concepts simply
                - Prepare examples from your experience for common interview questions
                """.format(avg_score)

            print("\n" + "="*50)
            print("INTERVIEW SUMMARY")
            print("="*50)
            print(summary)

            return summary

        except Exception as e:
            print(f"Error generating summary: {e}")
            return "Unable to generate summary due to an error."

        # Add a new method to generate the RAG evaluation report
    def generate_rag_report(self) -> None:
        """
        Print the RAG evaluation report and optionally write it to a file.

        The report text comes from ``self.evaluator``.  After printing it, the
        user is asked whether to save it; on "y" a filename is requested (an
        empty answer selects ``rag_evaluation_report.md``) and the report is
        written there.  A failure while saving is printed, not raised.
        """
        report = self.evaluator.generate_evaluation_report()
        divider = "="*50

        print("\n" + divider)
        print("RAG EVALUATION REPORT")
        print(divider)
        print(report)

        # Ask if user wants to save the report; anything other than "y"/"Y" means no
        answer = input("\nWould you like to save this report to a file? (y/n): ")
        if answer.lower() != 'y':
            return

        filename = input("Enter filename (default: rag_evaluation_report.md): ")
        if not filename:
            filename = "rag_evaluation_report.md"

        try:
            with open(filename, 'w') as f:
                f.write(report)
            print(f"Report saved to {filename}")
        except Exception as e:
            print(f"Error saving report: {e}")


In [8]:
# Main execution
def main():
    """Run one interactive mock-interview session.

    Locates the question-bank CSV, builds a ``MockInterviewer`` from that
    path, collects the user's profile, asks how many questions to pose and
    then conducts the interview.
    """
    print("Mock Interviewer App Started")

    qb_path = _find_question_bank("processed_data (7).csv")

    # Initialize the interviewer
    interviewer = MockInterviewer(qb_path)

    # Get user profile
    interviewer.get_user_profile()

    # Ask for number of questions, then conduct the interview
    num_questions = _ask_num_questions()
    interviewer.conduct_interview(num_questions)


def _find_question_bank(filename, content_dir="/content"):
    """Return a path for the question-bank CSV, probing cwd and then ``content_dir``.

    If the file exists in neither location, the ``content_dir`` path is still
    returned (the same path the interviewer was always given before).
    NOTE(review): the message below assumes ``MockInterviewer`` falls back to a
    sample question bank for a missing file -- confirm against its loader.
    """
    cwd = os.getcwd()
    local_path = os.path.join(cwd, filename)
    if os.path.exists(local_path):
        return local_path

    print(f"{filename} not found in {cwd}")
    print(f"Looking in {content_dir}/ directory...")
    content_path = os.path.join(content_dir, filename)
    if os.path.exists(content_path):
        print(f"Found {filename} in {content_dir}/ directory")
    else:
        print(f"{filename} not found. Will create a sample question bank.")
    return content_path


def _ask_num_questions(default=3, maximum=10):
    """Prompt for the interview length; return a count between 1 and ``maximum``.

    Non-numeric or non-positive input yields ``default``; values above
    ``maximum`` are capped at ``maximum``.
    """
    try:
        requested = int(input("\nHow many questions would you like in this interview? (recommended: 3-5): "))
    except ValueError:
        print(f"Invalid input. Using default of {default} questions.")
        return default

    if requested <= 0:
        print(f"Number must be positive. Using default of {default} questions.")
        return default
    if requested > maximum:
        print(f"Maximum {maximum} questions allowed. Using {maximum} questions.")
        return maximum
    return requested

# Start the interactive session only when this code is executed directly
# (script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

Mock Interviewer App Started
Loaded question bank with 3346 questions.

First few rows of the question bank:
                                           questions           category  \
0  "Can you describe a time when you used data to...  Analytical Skills   
1  "How do you prioritize tasks when analyzing la...  Analytical Skills   

  company difficulty_level                                             Answer  
0   Other           medium  One time I used data to make a decision was wh...  
1   Other           medium  When analyzing large volumes of data, it is im...  

Loading the sentence transformer model...
Model loaded successfully!
Computing embeddings for questions...
Generated 3346 embeddings.

--- Welcome to the Mock Interview App ---

How many years of experience do you have? 4
What job role are you interviewing for? Data Analyst
List your key skills (comma separated): SQL

Thanks! I'll prepare an interview for a Data Analyst position with 4 years of experience.


How many que