In [1]:
!pip install -r requirements.txt

import json
import os
import re
from getpass import getpass
from pathlib import Path

import google.generativeai as genai
import pandas as pd

# Configure the Gemini client without ever writing the secret into the notebook:
# use GOOGLE_API_KEY from the environment when it is set, otherwise prompt for it
# (getpass does not echo the key, so it will not end up in the saved cell output).
if not os.environ.get('GOOGLE_API_KEY'):
    os.environ['GOOGLE_API_KEY'] = getpass('Enter your Gemini API key: ')
genai.configure(api_key=os.environ['GOOGLE_API_KEY'])

# Set base directory.
# The notebook must be started from the project root (the `utils` package is imported
# relative to it), so the working directory is the project directory. This replaces a
# hardcoded absolute home-directory path: joining `Path.cwd()` with an absolute path
# discards the cwd part, so the old expression only worked on one machine.
BASE_DIR = Path.cwd()

print("Dependencies installed, libraries imported, and Gemini API key configured.")

Dependencies installed, libraries imported, and Gemini API key configured.


In [3]:
from utils.preprocess import load_question_bank, load_job_descriptions, extract_keywords, save_preprocessed_data

# Input files and the destination of the preprocessed JSON
QUESTION_BANK_PATH = BASE_DIR.joinpath('data', 'updated_coding_interview_question_bank.csv')
JOB_DESCRIPTIONS_PATH = BASE_DIR.joinpath('data', 'job_descriptions.csv')
OUTPUT_PATH = BASE_DIR.joinpath('utils', 'preprocessed_data.json')

# Column passed to load_question_bank as the category column
CATEGORY_COLUMN = 'category'

# Run the project loaders (defined in utils/preprocess.py, not shown here)
questions = load_question_bank(QUESTION_BANK_PATH, CATEGORY_COLUMN)
jobs = load_job_descriptions(JOB_DESCRIPTIONS_PATH)

if questions is None or jobs is None:
    # Either loader returning None is treated as a preprocessing failure
    print("Failed to preprocess data. Check input files or preprocess.py.")
else:
    # Persist the combined result for the later cells
    save_preprocessed_data(questions, jobs, OUTPUT_PATH)

    # Summary: technical questions are grouped per subcategory, behavioral ones are a flat list
    total_technical = 0
    for subcat in questions['technical']:
        total_technical += len(questions['technical'][subcat])
    total_behavioral = len(questions['behavioral'])

    print(f"Processed {total_technical} technical questions")
    print(f"Processed {total_behavioral} behavioral questions")
    print(f"Processed {len(jobs)} job descriptions")

Total questions in dataset: 1167
Unique questions in dataset: 186
Unique categories in question bank: ['machine learning', 'data science', 'deep learning', 'behavior']
Machine Learning questions: 14
Data Science questions: 3
Deep Learning questions: 114
Behavioral questions: 55
Preprocessed data saved to /Users/moazam_a12/AI-Powered Interview Question Generator/utils/preprocessed_data.json
Processed 131 technical questions
Processed 55 behavioral questions
Processed 997 job descriptions


In [5]:
import json
from pathlib import Path
import random

# Where the preprocessed data lives and where the generated question sets are written
PREPROCESSED_PATH = BASE_DIR.joinpath('utils', 'preprocessed_data.json')
OUTPUT_QUESTIONS_PATH = BASE_DIR.joinpath('output')
OUTPUT_QUESTIONS_PATH.mkdir(exist_ok=True)  # no error if the folder is already there
QUESTION_SETS_PATH = OUTPUT_QUESTIONS_PATH.joinpath('interview_questions.json')

def select_questions(questions, job_keywords, num_technical=5, num_behavioral=3, rng=None):
    """Select relevant questions from the bank based on job keywords.

    A technical subcategory is used when any non-empty keyword appears in its
    name (case-insensitive). Easy/Medium questions are preferred; Hard ones only
    fill the remaining slots. Questions whose difficulty is missing or
    unrecognised are skipped rather than raising.

    Args:
        questions: dict with a 'technical' mapping (subcategory -> list of
            question dicts) and a 'behavioral' list of question dicts.
        job_keywords: iterable of keyword strings for the job; blank or
            non-string entries are ignored.
        num_technical: maximum number of technical questions to return.
        num_behavioral: maximum number of behavioral questions to return.
        rng: optional `random.Random` instance for reproducible sampling;
            defaults to the module-level `random` generator.

    Returns:
        Tuple `(technical_questions, behavioral_questions)` of lists.
    """
    picker = random if rng is None else rng

    # Blank keywords are dropped: '' is a substring of every subcategory name
    # and would otherwise make every category look relevant.
    keywords = [k.lower() for k in (job_keywords or []) if isinstance(k, str) and k.strip()]

    # Collect the questions of every subcategory that a keyword refers to
    technical_questions = []
    for subcat, bank in (questions.get('technical') or {}).items():
        subcat_name = str(subcat).lower()
        if any(keyword in subcat_name for keyword in keywords):
            technical_questions.extend(bank)

    def difficulty_of(question):
        # Tolerates a missing key, None, or a non-string value
        return str(question.get('difficulty') or '').strip().lower()

    # Filter by difficulty (prioritize Easy/Medium for internships)
    easy_medium = [q for q in technical_questions if difficulty_of(q) in ('easy', 'medium')]
    hard = [q for q in technical_questions if difficulty_of(q) == 'hard']

    wanted_technical = max(0, num_technical)  # a negative count would make sample() raise
    selected_technical = picker.sample(easy_medium, min(wanted_technical, len(easy_medium)))
    shortfall = wanted_technical - len(selected_technical)
    if shortfall > 0 and hard:
        selected_technical.extend(picker.sample(hard, min(shortfall, len(hard))))

    # Select behavioral questions
    behavioral_bank = questions.get('behavioral') or []
    wanted_behavioral = min(max(0, num_behavioral), len(behavioral_bank))
    behavioral_questions = picker.sample(behavioral_bank, wanted_behavioral)

    return selected_technical, behavioral_questions

# Read the preprocessed questions/jobs; any failure falls back to an empty structure
try:
    with open(PREPROCESSED_PATH, 'r') as source_file:
        preprocessed_data = json.load(source_file)
except Exception as err:
    print(f"Error loading preprocessed data: {err}")
    preprocessed_data = {'questions': {'technical': {}, 'behavioral': []}, 'jobs': []}

questions = preprocessed_data['questions']
jobs = preprocessed_data['jobs']

# Build one question set per job posting, in the order the jobs were loaded
question_sets = []
for job in jobs:
    title, company, level, job_keywords = (
        job[field] for field in ('job_title', 'company_name', 'seniority_level', 'keywords')
    )

    # Pick questions from the bank that match this job's keywords
    picked_technical, picked_behavioral = select_questions(questions, job_keywords)

    question_sets.append({
        'job_title': title,
        'company_name': company,
        'seniority_level': level,
        'technical_questions': picked_technical[:5],
        'behavioral_questions': picked_behavioral[:3],
    })

# Write every question set to disk
with open(QUESTION_SETS_PATH, 'w') as target_file:
    json.dump(question_sets, target_file, indent=2)
print(f"Question sets saved to {QUESTION_SETS_PATH}")

# Show the first question set for Upper Hand, if there is one
upper_hand_set = next(
    (candidate for candidate in question_sets if candidate['company_name'] == 'Upper Hand'),
    None,
)
if upper_hand_set is not None:
    print("\nSample question set for Upper Hand:")
    print(json.dumps(upper_hand_set, indent=2))

Question sets saved to /Users/moazam_a12/AI-Powered Interview Question Generator/output/interview_questions.json

Sample question set for Upper Hand:
{
  "job_title": "Internship - Machine Learning Engineer & Data Science",
  "company_name": "Upper Hand",
  "seniority_level": "Internship",
  "technical_questions": [
    {
      "id": 1083,
      "question": " How do you preprocess text in NLP",
      "difficulty": "Easy"
    },
    {
      "id": 1163,
      "question": " Describe two ways to visualize features of a CNN in an image classification task",
      "difficulty": "Medium"
    },
    {
      "id": 1146,
      "question": " Why Sigmoid or Tanh is not preferred to be used as the activation function in the hidden layer of the neural network",
      "difficulty": "Medium"
    },
    {
      "id": 1143,
      "question": " What can go wrong if we use a linear activation instead of ReLU",
      "difficulty": "Medium"
    },
    {
      "id": 1107,
      "question": " For infrequent/r

In [38]:
# Cell 4: Define chatbot helper functions
import json
import google.generativeai as genai
import re

def parse_user_input(user_input):
    """Classify a chat message into an intent plus an optional company and role.

    Gemini is asked first; if the call fails or its reply cannot be parsed as a
    JSON object, a keyword-based parser is used instead.

    Args:
        user_input: raw text typed by the user.

    Returns:
        dict with exactly the keys "intent" (lower-case string, "unknown" when
        nothing was recognised), "company" (str or None) and "role" (str or None).
    """
    # Keyword-based fallback
    def keyword_parse(input_text):
        input_lower = input_text.lower()

        def mentions(pattern):
            return re.search(pattern, input_lower) is not None

        # Word boundaries keep 'hi' from firing inside "machine"/"this"/"which".
        # Tips are checked first so that "interview tips" is not swallowed by the
        # 'interview' keyword of the prepare intent.
        if mentions(r"\btips?\b"):
            intent = "interview tips"
        elif mentions(r"\b(?:interview|prep)"):  # prefix match: prepare, prepared, interviews...
            intent = "prepare for interview"
        elif mentions(r"\b(?:more|additional|another)\b"):
            intent = "more questions"
        elif mentions(r"\b(?:hi|hello|hey)\b"):
            intent = "greeting"
        elif mentions(r"\b(?:quit|exit)\b"):
            intent = "quit"
        else:
            intent = "unknown"

        # Extract company
        company = "Upper Hand" if 'upper hand' in input_lower else None

        # Extract role
        role = None
        roles = ['machine learning', 'data science', 'ml engineer', 'data scientist']
        for r in roles:
            if r in input_lower:
                role = r.title()
                break

        return {"intent": intent, "company": company, "role": role}

    def clean_field(value):
        # The model sometimes answers with the strings "None"/"null" instead of JSON null
        if value is None:
            return None
        text = str(value).strip()
        return None if text.lower() in ('', 'none', 'null') else text

    # Nothing to classify: skip the API call entirely
    if not user_input or not user_input.strip():
        return {"intent": "unknown", "company": None, "role": None}

    # Try Gemini API
    try:
        model = genai.GenerativeModel('gemini-2.0-flash')
        prompt = f"""
        Analyze the following user input and extract:
        - Intent: What the user wants (e.g., 'prepare for interview', 'more questions', 'interview tips', 'greeting', 'quit')
        - Company: Any mentioned company name (e.g., 'Upper Hand') or None
        - Role: Any mentioned job role (e.g., 'Machine Learning Engineer', 'Data Science') or None
        Input: "{user_input}"
        Return a JSON object: {{"intent": "string", "company": "string or null", "role": "string or null"}}
        """
        response = model.generate_content(prompt)
        # str.strip('```json\n```') removes a *set of characters*, not the fence, and
        # breaks on any preamble; pull the fenced payload out explicitly instead.
        fenced = re.search(r"```(?:json)?\s*(.*?)\s*```", response.text, re.DOTALL | re.IGNORECASE)
        parsed = json.loads(fenced.group(1) if fenced else response.text)
        if not isinstance(parsed, dict):
            raise ValueError("expected a JSON object from the model")
        # Always hand back the three keys the caller indexes, in a normalised form
        return {
            "intent": (clean_field(parsed.get("intent")) or "unknown").lower(),
            "company": clean_field(parsed.get("company")),
            "role": clean_field(parsed.get("role")),
        }
    except Exception as e:
        if "429" in str(e):
            print("Oops, hit the API limit. Using backup parsing. Check your quota at https://ai.google.dev/gemini-api/docs/rate-limits.")
        return keyword_parse(user_input)

def generate_questions(job_description, keywords, num_technical=0, num_behavioral=0):
    """Generate additional interview questions using the Gemini API.

    When `job_description` is empty, built-in default questions are returned
    without calling the API. Otherwise one request is made per requested
    question type. Any failure stops further requests, but questions that were
    already generated are still returned.

    Args:
        job_description: text of the job posting; empty selects the defaults.
        keywords: iterable of skills to focus on (items are converted to str).
        num_technical: number of technical questions wanted (values <= 0 skip them).
        num_behavioral: number of behavioral questions wanted (values <= 0 skip them).

    Returns:
        Tuple `(technical, behavioral)` of lists of question dicts. Every
        generated dict has 'id', 'question', 'difficulty' and 'category'.
    """
    num_technical = max(0, num_technical)
    num_behavioral = max(0, num_behavioral)

    # Fallback if job_description is empty
    if not job_description:
        default_technical = [
            {"id": "default_1", "question": "Explain the difference between supervised and unsupervised learning.", "difficulty": "Easy", "category": "Machine Learning"},
            {"id": "default_2", "question": "What is overfitting, and how can you prevent it?", "difficulty": "Medium", "category": "Machine Learning"}
        ][:num_technical]
        default_behavioral = [
            {"id": "default_1", "question": "Tell me about a time you worked in a team to solve a problem.", "difficulty": "Easy", "category": "Behavioral"},
            {"id": "default_2", "question": "How do you handle conflicting priorities?", "difficulty": "Easy", "category": "Behavioral"}
        ][:num_behavioral]
        return default_technical, default_behavioral

    def parse_questions(raw_text, id_prefix, limit, default_category):
        """Turn a model reply into at most `limit` well-formed question dicts."""
        # str.strip('```json\n```') strips a character set, not the fence; extract
        # the fenced payload explicitly so a preamble does not break parsing.
        fenced = re.search(r"```(?:json)?\s*(.*?)\s*```", raw_text, re.DOTALL | re.IGNORECASE)
        payload = json.loads(fenced.group(1) if fenced else raw_text)
        if not isinstance(payload, list):
            raise ValueError("expected a JSON list of questions")
        cleaned = []
        for item in payload:
            if len(cleaned) == limit:
                break
            if not isinstance(item, dict) or not item.get('question'):
                continue  # skip entries the caller could not print
            item['id'] = f"{id_prefix}_{len(cleaned)}"
            item.setdefault('difficulty', 'Unknown')
            item.setdefault('category', default_category)
            cleaned.append(item)
        return cleaned

    skills = ', '.join(str(keyword) for keyword in (keywords or []))
    technical_prompt = f"""
    Generate {num_technical} technical interview questions for an intern-level role with the following job description:
    {job_description}
    Focus on skills: {skills}.
    Questions should be Easy or Medium difficulty, concise, and role-specific.
    Format as a JSON list: [{{\"id\": \"generated_X\", \"question\": \"Question text\", \"difficulty\": \"Easy/Medium\", \"category\": \"Appropriate category\"}}]
    """
    behavioral_prompt = f"""
    Generate {num_behavioral} behavioral interview questions for an intern-level role with the following job description:
    {job_description}
    Focus on teamwork, communication, and motivation.
    Format as a JSON list: [{{\"id\": \"generated_X\", \"question\": \"Question text\", \"difficulty\": \"Easy\", \"category\": \"Behavioral\"}}]
    """

    generated_technical = []
    generated_behavioral = []

    try:
        # Built inside the try so a client/configuration error degrades gracefully
        model = genai.GenerativeModel('gemini-2.0-flash')
        if num_technical > 0:
            technical_response = model.generate_content(technical_prompt)
            generated_technical = parse_questions(
                technical_response.text, "generated_technical", num_technical, "Technical")
        if num_behavioral > 0:
            behavioral_response = model.generate_content(behavioral_prompt)
            generated_behavioral = parse_questions(
                behavioral_response.text, "generated_behavioral", num_behavioral, "Behavioral")
    except Exception as e:
        if "429" in str(e):
            print("Oops, hit the API limit. Can't generate new questions right now.")
        else:
            print(f"Could not generate questions: {e}")

    # Whatever was produced before a failure is still returned
    return generated_technical, generated_behavioral

def get_interview_tips(role=None):
    """Return a list of interview tips, generated with the Gemini API.

    Args:
        role: optional job role used to tailor the tips.

    Returns:
        A non-empty list of tip strings. If the API call fails, or the reply
        is not a non-empty JSON list, three built-in default tips are returned.
    """
    default_tips = ["Review common concepts.", "Practice clear communication.", "Prepare questions for the interviewer."]
    role_text = f" for a {role} role" if role else ""
    prompt = f"""
    Provide 3 concise interview tips{role_text}.
    Focus on preparation, communication, and technical skills.
    Format as a JSON list: [\"Tip 1\", \"Tip 2\", \"Tip 3\"]
    """
    try:
        # Built inside the try so a client/configuration error also falls back to defaults
        model = genai.GenerativeModel('gemini-2.0-flash')
        response = model.generate_content(prompt)
        # str.strip('```json\n```') strips a character set, not the fence; extract
        # the fenced payload explicitly so a preamble does not break parsing.
        fenced = re.search(r"```(?:json)?\s*(.*?)\s*```", response.text, re.DOTALL | re.IGNORECASE)
        tips = json.loads(fenced.group(1) if fenced else response.text)
        # The caller iterates the result; a bare string would be printed per character
        if not isinstance(tips, list):
            raise ValueError("expected a JSON list of tips")
        tips = [str(tip).strip() for tip in tips if tip is not None and str(tip).strip()]
        if not tips:
            raise ValueError("model returned no tips")
        return tips
    except Exception as e:
        if "429" in str(e):
            print("Oops, hit the API limit. Using default tips.")
        return default_tips

In [40]:
# Cell 5: Load data and initialize chatbot context
from pathlib import Path
import json

# Define paths
BASE_DIR = Path.cwd()
QUESTION_SETS_PATH = BASE_DIR / 'output' / 'interview_questions.json'
PREPROCESSED_PATH = BASE_DIR / 'utils' / 'preprocessed_data.json'
UPDATED_QUESTION_SETS_PATH = BASE_DIR / 'output' / 'updated_interview_questions.json'


def load_json_or_default(path, label, default):
    """Load a JSON file, reporting the outcome and falling back to `default`.

    Args:
        path: file to read.
        label: lower-case description used in the printed status message.
        default: value returned when the file is missing or is not valid JSON.

    Returns:
        Tuple `(data, ok)`; `ok` is False when `default` was used.
    """
    try:
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, ValueError) as e:  # missing/unreadable file or malformed JSON
        print(f"Error loading {label}: {e}")
        return default, False
    print(f"{label.capitalize()} loaded successfully.")
    return data, True


# Load preprocessed data for job descriptions
preprocessed_data, preprocessed_ok = load_json_or_default(
    PREPROCESSED_PATH, 'preprocessed data', {'jobs': []})
if not isinstance(preprocessed_data, dict):
    # The chatbot indexes preprocessed_data['jobs']; anything else is unusable
    print("Error loading preprocessed data: unexpected JSON structure")
    preprocessed_data, preprocessed_ok = {'jobs': []}, False
preprocessed_data.setdefault('jobs', [])

# Load question sets from Cell 3
question_sets, question_sets_ok = load_json_or_default(
    QUESTION_SETS_PATH, 'question sets', [])
if not isinstance(question_sets, list):
    # The chatbot iterates and appends to question_sets, so it must be a list
    print("Error loading question sets: unexpected JSON structure")
    question_sets, question_sets_ok = [], False

# Initialize context
current_company = None
current_role = None

# Only claim success when both inputs were really loaded
if preprocessed_ok and question_sets_ok:
    print("Data loaded successfully. Ready to run the chatbot in the next cell.")
else:
    print("Some data could not be loaded. The chatbot can still run, but only with generated or default questions.")

Preprocessed data loaded successfully.
Question sets loaded successfully.
Data loaded successfully. Ready to run the chatbot in the next cell.


In [42]:
# Cell 6: Run the interactive chatbot
#
# Reads user messages in a loop, classifies each one with parse_user_input, and
# answers with a stored question set, freshly generated questions, or tips.
# current_company / current_role persist between turns so follow-up messages
# ("more technical questions") do not need to repeat them.

GOODBYE_MESSAGE = "Goodbye! Best of luck with your interviews! 👑"
NEED_INFO_MESSAGE = "\nI need a bit more info! Which company or role are you preparing for? (e.g., 'Upper Hand' or 'Data Science')"
NEXT_STEP_HINT = "What next? Try 'more technical questions', 'more behavioral questions', 'interview tips', or specify a company/role."


def _contains(haystack, needle):
    """Case-insensitive substring test that tolerates missing or non-string values."""
    if not needle or haystack is None:
        return False
    return str(needle).lower() in str(haystack).lower()


def _find_best_match(records, company, role):
    """Return the record that best matches the company and/or role, or None.

    Preference order: company and role both match, then company only, then
    role only. A record is found exactly when a plain "company OR role" scan
    would find one; only which of several candidates wins is affected.
    """
    company_hits = [r for r in records if _contains(r.get('company_name'), company)]
    for record in company_hits:
        if _contains(record.get('job_title'), role):
            return record
    if company_hits:
        return company_hits[0]
    return next((r for r in records if _contains(r.get('job_title'), role)), None)


def _print_questions(heading, items):
    """Print a heading followed by one bullet per question."""
    print(heading)
    for q in items:
        print(f"- {q.get('question', '')} ({q.get('difficulty', 'Unknown')})")


def _print_next_step(subject):
    """Print the context-aware follow-up prompt."""
    print(f"\nYou mentioned {subject}. {NEXT_STEP_HINT}")


print("Welcome to the Interview Prep Chatbot! 👩‍💻")
print("Say something like 'I need to prepare for an Upper Hand interview' or 'exit' to quit.")

while True:
    try:
        user_input = input("You: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or an interrupted kernel ends the chat instead of raising
        print(f"\n{GOODBYE_MESSAGE}")
        break

    if not user_input:
        continue  # nothing typed: ask again without spending an API call

    if user_input.lower() in ['exit', 'quit']:
        print(GOODBYE_MESSAGE)
        break

    # Parse input
    parsed_input = parse_user_input(user_input)
    if not isinstance(parsed_input, dict):
        parsed_input = {}
    intent = parsed_input.get('intent')
    company = parsed_input.get('company')
    role = parsed_input.get('role')

    # The parser can also recognise a wish to leave (e.g. "I want to quit now")
    if intent == 'quit':
        print(GOODBYE_MESSAGE)
        break

    # Update context
    if company:
        current_company = company
    if role:
        current_role = role

    if intent == 'prepare for interview':
        # Ask for details only when nothing is known from this or earlier turns
        if not current_company and not current_role:
            print(NEED_INFO_MESSAGE)
            continue

        # Find matching question set
        matched_set = _find_best_match(question_sets, current_company, current_role)

        if matched_set:
            print(f"\nHere's your question set for {matched_set['company_name']} ({matched_set['job_title']}):")
            print(json.dumps(matched_set, indent=2))
        else:
            print(f"\nNo question set found for '{current_company or current_role}'. Trying to generate new questions...")
            # Find job description
            job = _find_best_match(preprocessed_data['jobs'], current_company, current_role) or {}
            job_description = job.get('combined_text', '')
            keywords = job.get('keywords', [])

            # Generate questions
            gen_technical, gen_behavioral = generate_questions(job_description, keywords, num_technical=5, num_behavioral=3)

            if gen_technical or gen_behavioral:
                new_set = {
                    'job_title': current_role or 'Unknown Role',
                    'company_name': current_company or 'Unknown Company',
                    'seniority_level': job.get('seniority_level', 'Unknown'),
                    'technical_questions': gen_technical[:5],
                    'behavioral_questions': gen_behavioral[:3]
                }
                question_sets.append(new_set)

                # Save updated question sets
                try:
                    # The output folder is only created by an earlier cell; make sure it exists
                    UPDATED_QUESTION_SETS_PATH.parent.mkdir(parents=True, exist_ok=True)
                    with open(UPDATED_QUESTION_SETS_PATH, 'w') as f:
                        json.dump(question_sets, f, indent=2)
                    print(f"New question set saved to {UPDATED_QUESTION_SETS_PATH}")
                except Exception as e:
                    print(f"Oops, couldn't save the new question set: {e}")

                print("\nHere's your generated question set:")
                print(json.dumps(new_set, indent=2))
            else:
                # Fallback to a default question set
                fallback_set = next(
                    (q_set for q_set in question_sets
                     if _contains(q_set.get('job_title'), 'data science')
                     or _contains(q_set.get('job_title'), 'machine learning')),
                    None)
                if fallback_set:
                    print("\nUnable to generate new questions due to API limits. Here's a default Data Science question set:")
                    print(json.dumps(fallback_set, indent=2))
                else:
                    print("\nUnable to generate questions or find a suitable default set. Please try a different company or role.")

        # Context-aware prompt
        _print_next_step(current_company or current_role)

    elif intent == 'more questions':
        # Prompt for company/role if None
        if not current_company and not current_role:
            print(NEED_INFO_MESSAGE)
            continue

        # Determine if technical or behavioral
        num_technical = 2 if 'technical' in user_input.lower() else 0
        num_behavioral = 1 if 'behavioral' in user_input.lower() else 0
        if num_technical == 0 and num_behavioral == 0:
            num_technical, num_behavioral = 2, 1  # Default to both

        # Find job description
        job = _find_best_match(preprocessed_data['jobs'], current_company, current_role) or {}
        job_description = job.get('combined_text', '')
        keywords = job.get('keywords', [])

        # Generate questions
        gen_technical, gen_behavioral = generate_questions(job_description, keywords, num_technical, num_behavioral)
        if gen_technical or gen_behavioral:
            print("\nAdditional questions:")
            if gen_technical:
                _print_questions("Technical:", gen_technical)
            if gen_behavioral:
                _print_questions("Behavioral:", gen_behavioral)
        else:
            # Fallback to existing questions
            matched_set = _find_best_match(question_sets, current_company, current_role)
            if matched_set:
                print("\nUnable to generate new questions due to API limits. Here's some from the existing set:")
                existing_technical = matched_set.get('technical_questions') or []
                existing_behavioral = matched_set.get('behavioral_questions') or []
                if num_technical > 0 and existing_technical:
                    _print_questions("Technical:", existing_technical[:num_technical])
                if num_behavioral > 0 and existing_behavioral:
                    _print_questions("Behavioral:", existing_behavioral[:num_behavioral])
            else:
                print("\nUnable to generate or find existing questions. Please try a different company or role.")

        # Context-aware prompt
        _print_next_step(current_company or current_role)

    elif intent == 'interview tips':
        tips = get_interview_tips(current_role)
        print("\nInterview tips:")
        for tip in tips:
            print(f"- {tip}")

        # Context-aware prompt
        _print_next_step(current_company or current_role or 'an interview')

    elif intent == 'greeting':
        print(f"\nHey there! Ready to prep for an interview? Tell me something like 'I need to prepare for an Upper Hand interview' or 'interview tips'.")
        if current_company or current_role:
            print(f"You mentioned {current_company or current_role}. Want questions or tips for that? (e.g., 'prepare for {current_company or current_role} interview')")

    else:
        print("\nI didn't quite understand. Try something like 'I need to prepare for an Upper Hand interview', 'more questions', or 'interview tips'.")
        if current_company or current_role:
            print(f"You mentioned {current_company or current_role}. Want questions or tips for that? (e.g., 'prepare for {current_company or current_role} interview')")

Welcome to the Interview Prep Chatbot! 👩‍💻
Say something like 'I need to prepare for an Upper Hand interview' or 'exit' to quit.


You:  Hey!



Hey there! Ready to prep for an interview? Tell me something like 'I need to prepare for an Upper Hand interview' or 'interview tips'.


You:  Need prep so bad!



I need a bit more info! Which company or role are you preparing for? (e.g., 'Upper Hand' or 'Data Science')


You:  I applied to sales in Ibex?



I didn't quite understand. Try something like 'I need to prepare for an Upper Hand interview', 'more questions', or 'interview tips'.


You:  need prep!! I'm not prepared for my interview!! HELP ME!!



I need a bit more info! Which company or role are you preparing for? (e.g., 'Upper Hand' or 'Data Science')


You:  Well, it's for an Internship for ML?



No question set found for 'Machine Learning Intern'. Trying to generate new questions...
New question set saved to /Users/moazam_a12/AI-Powered Interview Question Generator/output/updated_interview_questions.json

Here's your generated question set:
{
  "job_title": "Machine Learning Intern",
  "company_name": "Unknown Company",
  "seniority_level": "Unknown",
  "technical_questions": [
    {
      "id": "default_1",
      "question": "Explain the difference between supervised and unsupervised learning.",
      "difficulty": "Easy",
      "category": "Machine Learning"
    },
    {
      "id": "default_2",
      "question": "What is overfitting, and how can you prevent it?",
      "difficulty": "Medium",
      "category": "Machine Learning"
    }
  ],
  "behavioral_questions": [
    {
      "id": "default_1",
      "question": "Tell me about a time you worked in a team to solve a problem.",
      "difficulty": "Easy",
      "category": "Behavioral"
    },
    {
      "id": "default_2"

You:  Well, I'd appreciate any tips I can get?



Interview tips:
- Prepare by deeply understanding common ML algorithms (linear regression, logistic regression, decision trees, etc.) and their underlying assumptions, trade-offs, and practical applications. Practice explaining them simply.
- Communicate your thought process clearly and concisely. When problem-solving, articulate your approach, assumptions, and any challenges you encounter. Don't be afraid to ask clarifying questions.
- Showcase your technical skills through projects. Be ready to discuss the details of your projects, including the data used, the models implemented, the evaluation metrics, and the lessons learned. Emphasize quantifiable results.

You mentioned Machine Learning Intern. What next? Try 'more technical questions', 'more behavioral questions', 'interview tips', or specify a company/role.


You:  exit


Goodbye! Best of luck with your interviews! 👑
