In [None]:
# Install the notebook's Python dependencies, then download the en_core_web_sm
# spaCy model that is loaded further down with spacy.load().
# NOTE(review): no versions are pinned, so a fresh run may resolve different releases.
!pip install -q datasets sentence-transformers faiss-cpu langchain tqdm requests python-dotenv beautifulsoup4 spacy
!python -m spacy download en_core_web_sm

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m57.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m117.8 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import pandas as pd
import numpy as np
import requests
import time
import re
import json
import spacy
from bs4 import BeautifulSoup
from datasets import load_dataset
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import faiss
import pickle
from tqdm import tqdm
import os
from dotenv import load_dotenv

In [None]:

# Load variables from a .env file (if present) into the process environment.
load_dotenv()
# GitHub API token read from the 'github-token' variable; None when it is unset,
# in which case fetch_github_projects() sends unauthenticated requests.
GITHUB_TOKEN = os.getenv('github-token')

In [None]:
# Shared spaCy pipeline; extract_skills() runs it over each cleaned resume.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Download the resume-screening dataset and materialise its 'train' split as a
# pandas DataFrame (`df`), which the processing cells below iterate over.
print("📊 Loading Resume-Screening-Dataset...")
dataset = load_dataset("AzharAli05/Resume-Screening-Dataset")
df = dataset['train'].to_pandas()

# Quick look at the size, the column names and the first few rows.
print(f"✅ Loaded {len(df)} resumes")
print("📋 Dataset columns:", df.columns.tolist())
print("\n🔍 First few rows:")
print(df[['Role', 'Resume', 'Decision']].head())

📊 Loading Resume-Screening-Dataset...
✅ Loaded 10174 resumes
📋 Dataset columns: ['Role', 'Resume', 'Decision', 'Reason_for_decision', 'Job_Description']

🔍 First few rows:
                         Role  \
0       E-commerce Specialist   
1              Game Developer   
2  Human Resources Specialist   
3       E-commerce Specialist   
4       E-commerce Specialist   

                                              Resume Decision  
0  Here's a professional resume for Jason Jones:\...   reject  
1  Here's a professional resume for Ann Marshall:...   select  
2  Here's a professional resume for Patrick Mccla...   reject  
3  Here's a professional resume for Patricia Gray...   select  
4  Here's a professional resume for Amanda Gross:...   reject  


In [None]:
# Drop the label columns from the dataset object.
# Note: `df` was created from the dataset before this call and is not reassigned
# here, so the DataFrame still carries 'Decision' and 'Reason_for_decision'.
dataset = dataset.remove_columns(["Reason_for_decision", "Decision"])

# Show a few examples
print(dataset["train"][0])
print(dataset["train"][1])
print(dataset["train"][2])

{'Role': 'E-commerce Specialist', 'Resume': 'Here\'s a professional resume for Jason Jones:\n\nJason Jones\nE-commerce Specialist\n\nContact Information:\n\n* Email: [jasonjones@email.com](mailto:jasonjones@email.com)\n* Phone: 555-123-4567\n* LinkedIn: linkedin.com/in/jasonjones\n\nSummary:\nResults-driven E-commerce Specialist with 5+ years of experience in inventory management, SEO, online advertising, and analytics. Proven track record of increasing online sales, improving website traffic, and optimizing inventory levels. Skilled in analyzing complex data sets, identifying trends, and making data-driven decisions. Passionate about staying up-to-date with the latest e-commerce trends and technologies.\n\nProfessional Experience:\n\nE-commerce Specialist, XYZ Corporation (2018-Present)\n\n* Managed inventory levels across multiple channels, resulting in a 25% reduction in stockouts and a 15% reduction in overstocking\n* Developed and implemented SEO strategies that increased website 

In [None]:
# Sanity check: size of the train split as reported by the num_rows attribute...
print("Number of rows in train:", dataset["train"].num_rows)


# ...and by len() on the same split.
print("Number of rows in train (using len):", len(dataset["train"]))

Number of rows in train: 10174
Number of rows in train (using len): 10174


In [None]:
def clean_resume_text(text):
    """Normalize raw resume text for skill extraction and embedding.

    Steps, in order:
      1. Replace every ``http...`` URL with the literal placeholder ``[URL]``.
      2. Replace characters outside the allowed set (word characters,
         whitespace and ``.,!?;:@#$%&*()-+/``) with a space.
      3. Blank out phone numbers and e-mail addresses.
      4. Collapse every whitespace run (newlines included) to a single space.

    Args:
        text: Raw resume text. Non-string values (None, NaN, ...) are accepted.

    Returns:
        The cleaned single-line string, or "" for non-string input.
    """
    if not isinstance(text, str):
        return ""

    # Clean the text *between* URLs and re-join with the placeholder, so the
    # brackets of "[URL]" are not themselves stripped as special characters
    # (and URL fragments such as "=..." do not leak back into the text).
    segments = re.split(r'http\S+', text)
    segments = [re.sub(r'[^\w\s.,!?;:@#$%&*()\-+/]', ' ', seg) for seg in segments]
    text = ' [URL] '.join(segments)

    noise_patterns = [
        # Formatted phone numbers: 555-123-4567, (555) 123 4567, +1 555.123.4567
        r'(?<!\w)(?:\+?\d{1,3}[\s.-]?)?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}(?!\w)',
        r'\b\d{10,}\b',                 # Unformatted long digit runs
        r'\b[\w\.-]+@[\w\.-]+\.\w+\b',  # Emails
    ]
    for pattern in noise_patterns:
        text = re.sub(pattern, ' ', text)

    # Normalize whitespace last so the removals above cannot leave double spaces.
    return re.sub(r'\s+', ' ', text).strip()

In [None]:
def extract_skills(text):
    """Extract known technical skills from resume text.

    Single-word keywords are matched against spaCy tokens of the lower-cased
    text. Multi-word or punctuated keywords ("machine learning", "ci/cd") are
    matched as whole phrases anywhere in the text; previously they were found
    only when a spaCy noun chunk happened to equal the phrase exactly.

    Args:
        text: Resume text (typically the output of clean_resume_text).

    Returns:
        Alphabetically sorted list of distinct matched keywords; [] for empty
        or non-string input.
    """
    skills_keywords = {
        'python', 'java', 'javascript', 'react', 'node', 'sql', 'nosql', 'mongodb',
        'aws', 'azure', 'gcp', 'docker', 'kubernetes', 'terraform', 'ansible',
        'tensorflow', 'pytorch', 'keras', 'scikit', 'pandas', 'numpy', 'matplotlib',
        'machine learning', 'deep learning', 'nlp', 'computer vision', 'data science',
        'html', 'css', 'typescript', 'angular', 'vue', 'django', 'flask', 'fastapi',
        'git', 'jenkins', 'ci/cd', 'linux', 'bash', 'shell', 'rest', 'api', 'graphql',
        'tableau', 'powerbi', 'spark', 'hadoop', 'kafka', 'airflow'
    }

    if not isinstance(text, str) or not text.strip():
        return []

    lowered = text.lower()
    single_words = {kw for kw in skills_keywords if kw.isalnum()}
    phrases = skills_keywords - single_words

    # Only token texts are needed, so run just the tokenizer (make_doc) rather
    # than the whole pipeline; the tokens are the same ones nlp() would yield.
    doc = nlp.make_doc(lowered)
    found_skills = {token.text for token in doc if token.text in single_words}

    # Whole-phrase scan for keywords that span several tokens.
    for phrase in phrases:
        if re.search(r'(?<!\w)' + re.escape(phrase) + r'(?!\w)', lowered):
            found_skills.add(phrase)

    # Sorted so the result is reproducible across runs (set order is not).
    return sorted(found_skills)

In [None]:
def extract_links(text):
    """Extract GitHub, portfolio and LinkedIn references from resume text.

    Matching is done on the lower-cased text. Each list is de-duplicated while
    keeping first-seen order, so callers that slice the result (for example
    ``github_usernames[:2]``) get the same entries on every run.

    Args:
        text: Raw resume text. Non-string values are treated as empty.

    Returns:
        Dict with keys 'github_usernames', 'portfolio_links' and
        'linkedin_usernames', each a list of lower-cased strings.
    """
    lowered = text.lower() if isinstance(text, str) else ""

    def _unique(items):
        # dict preserves insertion order, unlike set.
        return list(dict.fromkeys(items))

    github_links = re.findall(r'github\.com/([a-zA-Z0-9-]+)', lowered)
    # Trim sentence/markdown punctuation that \S-style matching drags along,
    # e.g. "(https://me.dev/portfolio)." -> "https://me.dev/portfolio".
    portfolio_links = [
        url.rstrip('.,;:!?)]}>\'"')
        for url in re.findall(r'https?://[^\s]*portfolio[^\s]*', lowered)
    ]
    linkedin_links = re.findall(r'linkedin\.com/in/([a-zA-Z0-9-]+)', lowered)

    return {
        'github_usernames': _unique(github_links),
        'portfolio_links': _unique(portfolio_links),
        'linkedin_usernames': _unique(linkedin_links)
    }

In [None]:
def fetch_github_projects(username, max_repos=3, timeout=10):
    """Fetch a user's most recently updated GitHub repositories and READMEs.

    Args:
        username: GitHub login to look up.
        max_repos: Maximum number of repositories to request.
        timeout: Seconds to wait on each HTTP request before giving up, so a
            stalled connection cannot hang the whole processing loop.

    Returns:
        List of dicts with keys 'username', 'repo_name', 'description',
        'stars', 'language', 'url', 'readme_content' (first 1000 characters)
        and 'has_description'. Empty when the user is not found or a request
        fails; errors are printed, never raised.
    """
    import base64

    headers = {'Authorization': f'token {GITHUB_TOKEN}'} if GITHUB_TOKEN else {}
    url = f"https://api.github.com/users/{username}/repos"

    projects = []

    try:
        response = requests.get(
            url,
            headers=headers,
            params={'per_page': max_repos, 'sort': 'updated'},
            timeout=timeout,
        )

        if response.status_code == 200:
            for repo in response.json():
                description = repo['description'] or "No description available"
                readme_content = ""

                # A missing or broken README must not discard the repository
                # itself (or the repositories that follow it).
                try:
                    readme_url = f"https://api.github.com/repos/{username}/{repo['name']}/readme"
                    readme_response = requests.get(readme_url, headers=headers, timeout=timeout)
                    if readme_response.status_code == 200:
                        readme_data = readme_response.json()
                        readme_content = base64.b64decode(
                            readme_data['content']
                        ).decode('utf-8', errors='replace')
                except (requests.RequestException, KeyError, ValueError) as e:
                    print(f"⚠️ Could not read README for {username}/{repo['name']}: {e}")

                projects.append({
                    'username': username,
                    'repo_name': repo['name'],
                    'description': description,
                    'stars': repo['stargazers_count'],
                    'language': repo['language'],
                    'url': repo['html_url'],
                    'readme_content': readme_content[:1000],
                    'has_description': bool(repo['description'])
                })
        elif response.status_code in (403, 429):
            # Usually rate limiting; report it so empty results are not
            # mistaken for "user has no repositories".
            print(f"⚠️ GitHub API refused request for {username} (HTTP {response.status_code})")

        time.sleep(1) #To avoid GitHub API rate limit

    except Exception as e:
        print(f"❌ Error fetching GitHub data for {username}: {e}")

    return projects

In [None]:
def summarize_project(project):
    """Build a one-line text summary for a fetched GitHub project.

    Projects without a description get a fixed "No description available"
    line. Otherwise the description is used, followed by the first 200
    characters of the README when one is present.
    """
    # Guard clause: nothing but the name is used for undescribed projects.
    if not project['has_description']:
        return f"Project: {project['repo_name']} - No description available"

    parts = [f"Project: {project['repo_name']} - {project['description']}"]

    readme = project['readme_content']
    if readme:
        parts.append(f". README: {readme[:200]}...")

    return "".join(parts)

In [None]:
def process_resumes_with_live_data(df, sample_size=None):
    """Clean each resume, extract skills/links and enrich it with GitHub data.

    Args:
        df: DataFrame with at least the 'Resume' and 'Role' columns.
        sample_size: If truthy, process only a random sample of at most this
            many rows (fixed random_state=42); otherwise process every row.

    Returns:
        DataFrame with one row per input resume and the columns
        'original_index', 'role', 'clean_text', 'skills', 'links',
        'github_projects', 'enhanced_content', 'has_github', 'has_portfolio'
        and 'num_projects'. Rows that fail keep the same schema with empty
        values so downstream code never sees a missing key.
    """
    if sample_size:
        df = df.sample(min(sample_size, len(df)), random_state=42)

    processed_data = []

    print("🔧 Processing resumes with live data...")
    for idx, row in tqdm(df.iterrows(), total=len(df)):
        try:
            clean_text = clean_resume_text(row['Resume'])
            skills = extract_skills(clean_text)
            # Links come from the raw text: cleaning replaces URLs.
            links = extract_links(row['Resume'])

            # Fetch live GitHub data (at most 2 usernames, 2 repos each).
            github_projects = []
            for username in links['github_usernames'][:2]:
                github_projects.extend(fetch_github_projects(username, max_repos=2))
            project_summaries = [summarize_project(p) for p in github_projects]

            # Build the document section by section. A triple-quoted f-string
            # here would bake the source indentation into every embedded chunk.
            enhanced_content = "\n\n".join([
                f"RESUME CONTENT:\n{clean_text}",
                f"EXTRACTED SKILLS:\n{', '.join(skills)}",
                "GITHUB PROJECTS:\n" + (
                    '; '.join(project_summaries) if project_summaries else 'No GitHub projects found'
                ),
                "PORTFOLIO LINKS:\n" + (
                    ', '.join(links['portfolio_links']) if links['portfolio_links'] else 'No portfolio links'
                ),
            ])

            processed_data.append({
                'original_index': idx,  # Use index instead of non-existent 'ID'
                'role': row['Role'],    # Use 'Role' instead of non-existent 'Category'
                'clean_text': clean_text,
                'skills': skills,
                'links': links,
                'github_projects': github_projects,
                'enhanced_content': enhanced_content,
                'has_github': len(links['github_usernames']) > 0,
                'has_portfolio': len(links['portfolio_links']) > 0,
                'num_projects': len(github_projects)
            })

        except Exception as e:
            print(f"❌ Error processing resume at index {idx}: {e}")
            # Append an entry with default values to maintain DataFrame structure
            processed_data.append({
                'original_index': idx,
                'role': row.get('Role'),
                'clean_text': "",
                'skills': [],
                # Same keys as a successful extract_links() result.
                'links': {
                    'github_usernames': [],
                    'portfolio_links': [],
                    'linkedin_usernames': []
                },
                'github_projects': [],
                'enhanced_content': "Error processing resume.",
                'has_github': False,
                'has_portfolio': False,
                'num_projects': 0
            })

    return pd.DataFrame(processed_data)

In [None]:
# Run the pipeline over every resume in `df` (sample_size is left at None).
# The run captured in this notebook took about 1h21m for 10,174 resumes.
print("🚀 Starting comprehensive processing...")
processed_df = process_resumes_with_live_data(df)

# Summary counts taken from the per-resume flag/count columns.
print(f"✅ Successfully processed {len(processed_df)} resumes")
print(f"📊 Stats:")
print(f"   - Resumes with GitHub: {processed_df['has_github'].sum()}")
print(f"   - Resumes with portfolio: {processed_df['has_portfolio'].sum()}")
print(f"   - Total projects fetched: {processed_df['num_projects'].sum()}")

🚀 Starting comprehensive processing...
🔧 Processing resumes with live data...


100%|██████████| 10174/10174 [1:21:06<00:00,  2.09it/s]

✅ Successfully processed 10174 resumes
📊 Stats:
   - Resumes with GitHub: 3474
   - Resumes with portfolio: 6
   - Total projects fetched: 89





In [None]:

# Persist the processed resumes to CSV in the current working directory.
# NOTE(review): 'skills', 'links' and 'github_projects' hold Python lists/dicts;
# presumably they come back as plain strings when this CSV is re-read — confirm
# before relying on the file as a cache.
print("💾 Saving processed data...")
processed_df.to_csv('full_processed_resumes.csv', index=False)
print("✅ Saved full_processed_resumes.csv")

💾 Saving processed data...
✅ Saved full_processed_resumes.csv


In [None]:
def create_enhanced_vector_store(
    processed_df,
    index_path="enhanced_resume_index.bin",
    metadata_path="enhanced_resume_metadata.pkl",
    model_name='all-MiniLM-L6-v2',
    chunk_size=1200,
    chunk_overlap=200,
):
    """Chunk, embed and index the enhanced resume documents with FAISS.

    Args:
        processed_df: Output of process_resumes_with_live_data().
        index_path: Where the FAISS index is written.
        metadata_path: Where the pickled chunk metadata is written.
        model_name: SentenceTransformer model used for the embeddings; the
            same model must be used to embed queries at search time.
        chunk_size: Maximum characters per chunk.
        chunk_overlap: Characters shared between consecutive chunks.

    Returns:
        Tuple ``(index, chunks, processed_df)``. When no chunks could be
        created the tuple is ``(None, [], empty DataFrame)`` and nothing is
        written to disk.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )

    chunks = []
    print("✂️ Creating chunks with enhanced content...")

    for _, row in tqdm(processed_df.iterrows(), total=len(processed_df)):
        try:
            text_chunks = text_splitter.split_text(row['enhanced_content'])
            for i, chunk in enumerate(text_chunks):
                # Resume-level metadata is copied onto every chunk so search
                # results can be filtered without a DataFrame lookup.
                chunks.append({
                    'text': chunk,
                    'source': 'enhanced_resume',
                    'resume_id': row['original_index'],
                    'category': row['role'],
                    'skills': row['skills'],
                    'has_github': row['has_github'],
                    'has_portfolio': row['has_portfolio'],
                    'num_projects': row['num_projects'],
                    'chunk_id': i
                })
        except Exception as e:
            # Report the failure but keep going with the next resume.
            print(f"❌ Error creating chunks for resume at index {row['original_index']}: {e}")

    print(f"✨ Created {len(chunks)} enhanced chunks")

    if not chunks:
        print("⚠️ No chunks created. Skipping embedding and index creation.")
        return None, [], pd.DataFrame() # Return empty values if no chunks

    # Create embeddings
    print("🔨 Creating embeddings...")
    embedder = SentenceTransformer(model_name)
    texts = [chunk['text'] for chunk in chunks]
    # FAISS requires float32 input.
    embeddings = np.asarray(
        embedder.encode(texts, show_progress_bar=True), dtype='float32'
    )

    # Build FAISS index (exact L2 search; vector i corresponds to chunks[i]).
    print("🏗️ Building FAISS index...")
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)

    # Save everything
    faiss.write_index(index, index_path)
    with open(metadata_path, "wb") as f:
        pickle.dump({
            'chunks': chunks,
            'processed_df': processed_df.to_dict('records'),
            'total_chunks': len(chunks),
            'total_resumes': len(processed_df)
        }, f)

    print(f"🎉 Enhanced vector store created with {index.ntotal} vectors!")
    return index, chunks, processed_df

# Create the enhanced vector store
vector_index, all_chunks, processed_resumes = create_enhanced_vector_store(processed_df)

# create_enhanced_vector_store() returns None for the index when no chunks were
# produced, so test for None explicitly instead of relying on the truthiness of
# the FAISS index object.
if vector_index is not None:
    print("\nVector store created successfully. Ready for searching.")
else:
    print("\nFailed to create vector store. Please check for errors in the previous steps.")

✂️ Creating chunks with enhanced content...


100%|██████████| 10174/10174 [00:06<00:00, 1488.46it/s]


✨ Created 51438 enhanced chunks
🔨 Creating embeddings...


Batches:   0%|          | 0/1608 [00:00<?, ?it/s]

🏗️ Building FAISS index...
🎉 Enhanced vector store created with 51438 vectors!

Vector store created successfully. Ready for searching.


In [None]:
def search_enhanced_candidates(query, index, all_chunks, processed_resumes, top_k=5,
                               require_github=False, embedder=None):
    """Search the FAISS index and return one enriched result per resume.

    Args:
        query: Free-text search query.
        index: FAISS index whose vector i corresponds to ``all_chunks[i]``.
        all_chunks: Chunk metadata dicts as built by create_enhanced_vector_store.
        processed_resumes: DataFrame with 'original_index', 'links' and
            'github_projects' columns.
        top_k: Maximum number of distinct resumes to return.
        require_github: When True, skip resumes without a GitHub link.
        embedder: Optional object with an ``encode(list[str])`` method. Pass a
            shared SentenceTransformer to avoid reloading the model on every
            call; when None, 'all-MiniLM-L6-v2' is loaded.

    Returns:
        List of result dicts ordered by increasing distance. The list can be
        shorter than ``top_k`` because only ``top_k * 3`` chunks are retrieved
        before de-duplication and filtering.
    """
    if embedder is None:
        embedder = SentenceTransformer('all-MiniLM-L6-v2')
    query_embedding = embedder.encode([query])

    # Over-fetch: several chunks usually belong to the same resume.
    distances, indices = index.search(np.array(query_embedding).astype('float32'), top_k * 3)

    results = []
    seen_resumes = set()

    for idx, distance in zip(indices[0], distances[0]):
        # FAISS pads missing neighbours with label -1; without the lower-bound
        # check Python's negative indexing would silently return the LAST chunk.
        if idx < 0 or idx >= len(all_chunks):
            continue

        chunk = all_chunks[int(idx)]
        resume_id = chunk['resume_id']

        # Skip duplicates and apply filters
        if resume_id in seen_resumes:
            continue
        if require_github and not chunk['has_github']:
            continue

        # Look up the full processed resume for this chunk.
        resume_data_row = processed_resumes[processed_resumes['original_index'] == resume_id]
        if resume_data_row.empty:
            continue
        resume_data = resume_data_row.iloc[0].to_dict()

        links = resume_data.get('links')
        if not isinstance(links, dict):
            links = {}
        projects = resume_data.get('github_projects')
        if not isinstance(projects, (list, tuple)):
            projects = []

        # Prepare project information
        project_info = []
        for project in projects:
            if project.get('has_description'):
                project_info.append(f"📁 {project.get('repo_name', 'N/A')}: {project.get('description', 'No description available')}")
            else:
                project_info.append(f"📁 {project.get('repo_name', 'N/A')}: No description available")

        results.append({
            # Map L2 distance to (0, 1]; 1.0 means identical vectors.
            'score': float(1 / (1 + distance)),
            'resume_id': resume_id,
            'category': chunk['category'],
            'skills': chunk['skills'],
            'has_github': chunk['has_github'],
            'has_portfolio': chunk['has_portfolio'],
            'project_count': chunk['num_projects'],
            'projects': project_info,
            'excerpt': chunk['text'][:200] + "...",
            'github_usernames': links.get('github_usernames', []),
            'portfolio_links': links.get('portfolio_links', [])
        })

        seen_resumes.add(resume_id)
        if len(results) >= top_k:
            break

    return results

# Smoke-test the search function against a few representative recruiter queries.
print("🧪 Testing enhanced search with live project data...")

test_queries = [
    "Python developer with machine learning projects",
    "React frontend developer with GitHub portfolio",
    "Data scientist with TensorFlow experience",
    "DevOps engineer with Docker and Kubernetes"
]

for query in test_queries:
    print(f"\n🔍 Query: '{query}'")
    print("=" * 70)

    # Top 2 distinct resumes per query, restricted to resumes with a GitHub link.
    results = search_enhanced_candidates(query, vector_index, all_chunks, processed_resumes, top_k=2, require_github=True)

    for i, result in enumerate(results):
        print(f"{i+1}. 🎯 Score: {result['score']:.3f}")
        print(f"   📁 Category: {result['category']}")
        print(f"   ⚡ Skills: {', '.join(result['skills'][:5])}...")
        print(f"   📊 GitHub: {result['project_count']} projects")

        if result['projects']:
            print("   📦 Projects:")
            for project in result['projects'][:2]:  # Show top 2 projects
                print(f"      • {project}")

        print(f"   📝 Excerpt: {result['excerpt']}")
        print()

🧪 Testing enhanced search with live project data...

🔍 Query: 'Python developer with machine learning projects'
1. 🎯 Score: 0.611
   📁 Category: software engineer
   ⚡ Skills: python, linux, mongodb, java, django...
   📊 GitHub: 0 projects
   📝 Excerpt: with cross-functional teams to identify and prioritize project requirements * implemented automated testing and continuous integration tools to improve code quality and efficiency software engineer, d...

2. 🎯 Score: 0.595
   📁 Category: software engineer
   ⚡ Skills: python, api, tensorflow, java, nosql...
   📊 GitHub: 0 projects
   📝 Excerpt: * certified java developer (ocpjp), oracle corporation (2019) * certified python programmer (pcep), python institute (2020) projects: * personal project: developed a web scraper using python and beaut...


🔍 Query: 'React frontend developer with GitHub portfolio'
1. 🎯 Score: 0.619
   📁 Category: Mobile App Developer
   ⚡ Skills: react...
   📊 GitHub: 0 projects
   📝 Excerpt: EXTRACTED SKILLS:
   

In [None]:
# Install Streamlit for the web UI defined below (the other two packages were
# already requested by the first install cell).
# NOTE(review): versions are unpinned.
!pip install -q streamlit sentence-transformers faiss-cpu

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m45.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m41.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import streamlit as st
import pandas as pd
import numpy as np
import faiss
import pickle
import re
from sentence_transformers import SentenceTransformer
from typing import List, Dict, Any

In [None]:
class RecruitRAGSystem:
    """Hybrid (vector + keyword) candidate search over a saved FAISS index.

    The index and the pickled chunk metadata are the files written by
    create_enhanced_vector_store(); vector i of the index corresponds to
    ``self.chunks[i]``.
    """

    # Role/domain shorthand -> extra terms appended to the query.
    _EXPANSION_RULES = {
        r'\bpm\b': 'project manager',
        r'\bdevops\b': 'devops engineer docker kubernetes',
        r'\bml\b': 'machine learning',
        r'\bai\b': 'artificial intelligence',
        r'\bfrontend\b': 'frontend developer react javascript',
        r'\bbackend\b': 'backend developer node python java',
        r'\bfull.?stack\b': 'full stack developer',
        # Prefix match: a trailing \b after "scien" could never match
        # "data science" or "data scientist".
        r'\bdata scien\w*': 'data scientist',
        r'\bux\b': 'user experience designer',
        r'\bui\b': 'user interface designer',
        r'\bweb\b': 'web developer',
        r'\bmobile\b': 'mobile developer ios android',
        r'\bcloud\b': 'aws azure gcp cloud',
        r'\bdatabase\b': 'sql nosql database',
        r'\btest\b': 'qa tester quality assurance',
        r'\bsecurity\b': 'cybersecurity security'
    }

    # Skill name -> related technologies appended to the query.
    _SKILL_EXPANSIONS = {
        r'\bangular\b': 'angularjs javascript typescript',
        r'\breact\b': 'reactjs javascript frontend',
        r'\bvue\b': 'vuejs javascript',
        r'\bpython\b': 'python django flask',
        r'\bjava\b': 'java spring hibernate',
        r'\bnode\b': 'nodejs javascript backend',
        r'\bdocker\b': 'docker containers kubernetes',
        r'\baws\b': 'amazon web services cloud',
        r'\bazure\b': 'microsoft azure cloud',
        r'\btensorflow\b': 'tensorflow machine learning',
        r'\bpytorch\b': 'pytorch machine learning',
        r'\bsql\b': 'sql database mysql postgresql',
        r'\bmongodb\b': 'mongodb nosql database'
    }

    def __init__(self, index_path: str, metadata_path: str):
        """Load the FAISS index, the chunk metadata and the embedding model.

        Args:
            index_path: Path to the FAISS index file.
            metadata_path: Path to the pickled metadata dict; it must contain
                a 'chunks' list.
        """
        self.index = faiss.read_index(index_path)
        # SECURITY: pickle.load can execute arbitrary code. Only point this at
        # metadata files produced by this notebook, never at untrusted input.
        with open(metadata_path, 'rb') as f:
            self.metadata = pickle.load(f)
        self.embedder = SentenceTransformer('all-MiniLM-L6-v2')
        self.chunks = self.metadata['chunks']

    def enhance_query(self, query: str) -> str:
        """Expand a query with rule-based synonyms and related skills.

        This is the lightweight alternative to LLM query rewriting. Rules are
        tested against the query as it grows, so terms added by one rule can
        trigger later rules. Duplicate words are dropped, keeping the first
        occurrence.

        Args:
            query: The user's search text.

        Returns:
            The lower-cased, expanded query.
        """
        enhanced_query = query.lower()

        # General expansions first, then skill-specific ones.
        for rules in (self._EXPANSION_RULES, self._SKILL_EXPANSIONS):
            for pattern, expansion in rules.items():
                if re.search(pattern, enhanced_query, re.IGNORECASE):
                    enhanced_query += " " + expansion

        # Order-preserving de-duplication in linear time.
        return " ".join(dict.fromkeys(enhanced_query.split()))

    def hybrid_search(self, query: str, top_k: int = 10) -> List[Dict]:
        """Retrieve chunks by vector similarity and re-rank with keyword overlap.

        Args:
            query: The user's search text (expanded internally).
            top_k: Number of chunk-level results to return.

        Returns:
            Up to ``top_k`` dicts with keys 'chunk', 'vector_score',
            'keyword_score', 'combined_score' and 'distance', sorted by
            'combined_score' descending.
        """
        # Enhance the query first
        enhanced_query = self.enhance_query(query)
        print(f"Original query: '{query}'")
        print(f"Enhanced query: '{enhanced_query}'")

        # Vector search
        query_embedding = self.embedder.encode([enhanced_query])
        distances, indices = self.index.search(
            np.array(query_embedding).astype('float32'),
            top_k * 3  # Get more results for re-ranking
        )

        query_words = set(enhanced_query.lower().split())
        scored_results = []

        for idx, distance in zip(indices[0], distances[0]):
            # FAISS pads missing neighbours with label -1; a negative index
            # would otherwise silently select the last chunk.
            if idx < 0 or idx >= len(self.chunks):
                continue

            chunk = self.chunks[int(idx)]
            text = chunk['text'].lower()

            # Fraction of query words (longer than 2 chars) found in the chunk.
            keyword_matches = sum(
                1 for word in query_words if len(word) > 2 and word in text
            )
            keyword_score = keyword_matches / len(query_words) if query_words else 0
            distance = float(distance)
            vector_score = 1 / (1 + distance)

            # Combined score (weighted average)
            combined_score = 0.7 * vector_score + 0.3 * keyword_score

            scored_results.append({
                'chunk': chunk,
                'vector_score': vector_score,
                'keyword_score': keyword_score,
                'combined_score': combined_score,
                'distance': distance
            })

        # Sort by combined score and return top results
        scored_results.sort(key=lambda x: x['combined_score'], reverse=True)
        return scored_results[:top_k]

    def group_by_candidate(self, results: List[Dict]) -> Dict[Any, Dict[str, Any]]:
        """Group chunk-level results by the resume they belong to.

        Args:
            results: Output of hybrid_search().

        Returns:
            Mapping resume_id -> {'chunks', 'scores', 'best_score', 'metadata'},
            where 'metadata' is the first chunk's fields without its text.
            Chunks lacking a 'resume_id' are grouped under 'unknown'.
        """
        candidates = {}

        for result in results:
            chunk = result['chunk']
            resume_id = chunk.get('resume_id', 'unknown')

            if resume_id not in candidates:
                candidates[resume_id] = {
                    'chunks': [],
                    'scores': [],
                    'best_score': 0,
                    'metadata': {k: v for k, v in chunk.items() if k != 'text'}
                }

            entry = candidates[resume_id]
            entry['chunks'].append(result)
            entry['scores'].append(result['combined_score'])
            entry['best_score'] = max(entry['best_score'], result['combined_score'])

        return candidates

    def highlight_key_phrases(self, text: str, query: str) -> str:
        """Wrap query words found in ``text`` in ``**BOLD UPPERCASE**``.

        Only words longer than 3 characters are highlighted. All words are
        replaced in a single pass, longest first, so a short word ("java")
        cannot pre-empt a longer one ("javascript") and already-highlighted
        text is never highlighted again.

        Args:
            text: Text to annotate.
            query: Search query whose words should be emphasised.

        Returns:
            The annotated text (unchanged when no word qualifies).
        """
        words = sorted(
            {word for word in query.lower().split() if len(word) > 3},
            key=lambda w: (-len(w), w),
        )
        if not words:
            return text

        pattern = re.compile("|".join(re.escape(w) for w in words), re.IGNORECASE)
        return pattern.sub(lambda m: f"**{m.group(0).upper()}**", text)

    def format_results(self, candidates: Dict[Any, Dict[str, Any]], original_query: str) -> List[Dict]:
        """Turn grouped candidates into display-ready result dicts.

        Args:
            candidates: Output of group_by_candidate().
            original_query: The user's query, used for highlighting.

        Returns:
            One dict per candidate ('resume_id', 'match_score', 'category',
            'skills', 'has_github', 'excerpts', 'metadata'), sorted by
            'match_score' descending. Each candidate carries at most three
            highlighted excerpts.
        """
        formatted_results = []

        for resume_id, candidate_data in candidates.items():
            # Keep the three best-scoring chunks for this candidate.
            top_chunks = sorted(
                candidate_data['chunks'],
                key=lambda x: x['combined_score'],
                reverse=True
            )[:3]

            excerpts = [
                {
                    'text': self.highlight_key_phrases(
                        chunk_result['chunk']['text'], original_query
                    ),
                    'score': chunk_result['combined_score']
                }
                for chunk_result in top_chunks
            ]

            metadata = candidate_data['metadata']
            formatted_results.append({
                'resume_id': resume_id,
                'match_score': candidate_data['best_score'],
                'category': metadata.get('category', 'Unknown'),
                'skills': metadata.get('skills', []),
                'has_github': metadata.get('has_github', False),
                'excerpts': excerpts,
                'metadata': metadata
            })

        # Sort by match score
        formatted_results.sort(key=lambda x: x['match_score'], reverse=True)
        return formatted_results

In [None]:
def initialize_rag_system(index_path="/content/enhanced_resume_index.bin",
                          metadata_path="/content/enhanced_resume_metadata.pkl"):
    """Initialize the RAG system from the saved index and metadata files.

    Args:
        index_path: Path to the FAISS index. The default is the Colab
            location; pass the path that create_enhanced_vector_store() wrote
            to when running with a different working directory.
        metadata_path: Path to the pickled chunk metadata.

    Returns:
        A ready RecruitRAGSystem, or None when loading fails (the error is
        printed rather than raised).
    """
    try:
        rag_system = RecruitRAGSystem(
            index_path=index_path,
            metadata_path=metadata_path
        )
        print("✅ RAG system initialized successfully!")
        return rag_system
    except Exception as e:
        print(f"❌ Error initializing RAG system: {e}")
        return None

# Step 4: Streamlit Web Interface
def create_web_interface():
    """Render the Streamlit search page.

    The page holds a query box, a result-count slider and a GitHub-only
    checkbox. A search runs whenever the query is non-empty (pressing the
    button with an empty query shows a warning instead). The search retrieves
    ``top_k * 3`` chunks, groups them by resume, optionally keeps only resumes
    with a GitHub link, and shows at most ``top_k`` candidates, so fewer than
    ``top_k`` may be displayed.

    Streamlit lays widgets out in call order, so the statement order below is
    part of the page layout.
    """
    st.set_page_config(
        page_title="RecruitRAG - Intelligent Candidate Search",
        page_icon="🔍",
        layout="wide"
    )

    st.title("🔍 RecruitRAG - Intelligent Candidate Search")
    st.markdown("Find the perfect candidates using AI-powered semantic search")

    # Initialize the RAG system once and keep it in the session state so it is
    # not re-created while the key is present.
    if 'rag_system' not in st.session_state:
        with st.spinner("Loading search engine..."):
            st.session_state.rag_system = initialize_rag_system()

    # initialize_rag_system() returns None on failure.
    if st.session_state.rag_system is None:
        st.error("Failed to initialize the search system. Please check if the data files exist.")
        return

    # Search interface: wide query column, narrow options column.
    col1, col2 = st.columns([3, 1])

    with col1:
        query = st.text_input(
            "🔎 Search for candidates:",
            placeholder="e.g., 'Python developer with React experience and machine learning projects'",
            help="Describe the ideal candidate's skills, experience, or projects"
        )

    with col2:
        top_k = st.slider("Number of results:", 1, 20, 5)
        require_github = st.checkbox("Only show candidates with GitHub", value=True)

    # Run when the button is pressed or a query is already present.
    if st.button("🚀 Search", type="primary") or query:
        if query:
            with st.spinner("Searching through thousands of resumes..."):
                # Over-fetch chunks: several can belong to the same resume and
                # some resumes may be filtered out below.
                results = st.session_state.rag_system.hybrid_search(query, top_k * 3)

                # Group by candidate
                candidates = st.session_state.rag_system.group_by_candidate(results)

                # Filter by GitHub if required
                if require_github:
                    candidates = {
                        rid: data for rid, data in candidates.items()
                        if data['metadata'].get('has_github', False)
                    }

                # Format results and keep the best top_k candidates.
                formatted_results = st.session_state.rag_system.format_results(
                    candidates, query
                )[:top_k]

            # Display results
            st.subheader(f"📊 Found {len(formatted_results)} matching candidates")

            for i, result in enumerate(formatted_results):
                with st.expander(
                    f"#{i+1} | {result['category']} | Score: {result['match_score']:.0%} | "
                    f"Skills: {', '.join(result['skills'][:3])}...",
                    expanded=i == 0  # Expand first result
                ):
                    col_a, col_b = st.columns([2, 1])

                    # Left column: highlighted excerpts with their scores.
                    with col_a:
                        st.markdown("**🎯 Most Relevant Experience:**")

                        for excerpt in result['excerpts']:
                            st.markdown(f"**({excerpt['score']:.0%})** {excerpt['text']}")
                            st.markdown("---")

                    # Right column: candidate-level summary.
                    with col_b:
                        st.markdown("**📋 Candidate Details:**")
                        st.markdown(f"**Category:** {result['category']}")
                        st.markdown(f"**Key Skills:** {', '.join(result['skills'][:5])}")
                        st.markdown(f"**GitHub Available:** {'✅ Yes' if result['has_github'] else '❌ No'}")
                        st.markdown(f"**Confidence Score:** {result['match_score']:.0%}")

                        if result['has_github']:
                            st.success("🎯 Strong match with code portfolio!")

            # Show search statistics
            st.markdown("---")
            st.caption(f"*Searched through {len(st.session_state.rag_system.chunks):,} data chunks*")

        else:
            st.warning("Please enter a search query")

In [None]:
def simple_search_interface():
    """Run an interactive, console-style search loop.

    The RAG system is obtained from ``initialize_rag_system()``. When that
    returns ``None`` a hint is printed and the function returns immediately.
    Otherwise the user is prompted repeatedly; each non-empty query is
    expanded with ``enhance_query`` (for display), searched with
    ``hybrid_search(query, 5)``, grouped with ``group_by_candidate`` and
    formatted with ``format_results``; at most three candidates are printed.

    The loop ends when the user types ``quit``, ``exit`` or ``q``, or when the
    prompt is interrupted (Ctrl-C / kernel interrupt) or reaches end of input.

    Returns:
        None
    """
    rag_system = initialize_rag_system()

    if rag_system is None:
        print("Please run the data processing first!")
        return

    print("🔍 RecruitRAG Search Interface")
    print("Type 'quit' to exit")
    print("-" * 50)

    while True:
        try:
            query = input("\n🎯 Enter your search query: ").strip()
        except (KeyboardInterrupt, EOFError):
            # Interrupting the prompt is a normal way to leave an interactive
            # loop in a notebook; exit cleanly instead of showing a traceback.
            print("\n👋 Search session ended")
            break

        if query.lower() in ['quit', 'exit', 'q']:
            break

        # Blank input: just prompt again.
        if not query:
            continue

        print(f"🔧 Enhancing query: '{query}'")
        enhanced_query = rag_system.enhance_query(query)
        print(f"📝 Enhanced to: '{enhanced_query}'")

        print("⚡ Searching...")
        # The raw query is passed on; the enhanced form above is only shown
        # to the user here.
        results = rag_system.hybrid_search(query, 5)
        candidates = rag_system.group_by_candidate(results)
        formatted_results = rag_system.format_results(candidates, query)[:3]

        print(f"\n✅ Found {len(formatted_results)} candidates:")
        print("=" * 80)

        for i, result in enumerate(formatted_results, 1):
            print(f"\n{i}. {result['category']} (Score: {result['match_score']:.0%})")
            print(f"   Skills: {', '.join(result['skills'][:4])}")
            print(f"   GitHub: {'✅' if result['has_github'] else '❌'}")
            print("   Top matches:")

            for excerpt in result['excerpts']:
                # Excerpts are truncated to 100 characters for the console.
                print(f"      • ({excerpt['score']:.0%}) {excerpt['text'][:100]}...")

            print("-" * 80)

In [None]:

 # Start the interactive console search loop; it blocks on input() until the user quits.
 simple_search_interface()

✅ RAG system initialized successfully!
🔍 RecruitRAG Search Interface
Type 'quit' to exit
--------------------------------------------------

🎯 Enter your search query: devops engineer aws docker
🔧 Enhancing query: 'devops engineer aws docker'
📝 Enhanced to: 'devops engineer aws docker kubernetes containers amazon web services cloud'
⚡ Searching...
Original query: 'devops engineer aws docker'
Enhanced query: 'devops engineer aws docker kubernetes containers amazon web services cloud'

✅ Found 3 candidates:

1. Cloud Architect (Score: 65%)
   Skills: ci/cd, python, linux, machine learning
   GitHub: ❌
   Top matches:
      • (65%) macOS Professional Experience: Senior Cloud Architect, ABC Corporation (2018-Present) * Designed and...
--------------------------------------------------------------------------------

2. DevOps Engineer (Score: 65%)
   Skills: terraform, linux, aws, docker
   GitHub: ✅
   Top matches:
      • (65%) and implemented a containerized application using **DOCKER** 

KeyboardInterrupt: Interrupted by user

In [None]:
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
import faiss
import numpy as np
import pickle
import pandas as pd
from tqdm import tqdm

def create_enhanced_vector_store(
    processed_df,
    model_name="all-mpnet-base-v2",
    index_path="enhanced_resume_index.bin",
    metadata_path="enhanced_resume_metadata.pkl",
    chunk_size=1200,
    chunk_overlap=200,
):
    """Create vector store with enhanced content using cosine similarity.

    Each row's ``enhanced_content`` is split into overlapping character
    chunks, every chunk is embedded with normalized embeddings, and the
    vectors are stored in a FAISS inner-product index. The index and a pickle
    with the chunk metadata are written to disk.

    Args:
        processed_df: DataFrame with the columns ``enhanced_content``,
            ``original_index``, ``role``, ``skills``, ``has_github``,
            ``has_portfolio`` and ``num_projects``.
        model_name: SentenceTransformer model used for the embeddings. Queries
            must be embedded with the same model at search time.
        index_path: File the FAISS index is written to.
        metadata_path: File the pickled metadata dict is written to.
        chunk_size: Maximum chunk length in characters.
        chunk_overlap: Number of characters shared by consecutive chunks.

    Returns:
        ``(index, chunks, processed_df)`` on success, or
        ``(None, [], pd.DataFrame())`` when no chunk could be created.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )

    chunks = []
    print("✂️ Creating chunks with enhanced content...")

    for _, row in tqdm(processed_df.iterrows(), total=len(processed_df)):
        try:
            text_chunks = text_splitter.split_text(row['enhanced_content'])
            for i, chunk in enumerate(text_chunks):
                chunk_data = {
                    'text': chunk,
                    'source': 'enhanced_resume',
                    'resume_id': row['original_index'],  # Use 'original_index'
                    'category': row['role'],             # Use 'role'
                    'skills': row['skills'],
                    'has_github': row['has_github'],
                    'has_portfolio': row['has_portfolio'],
                    'num_projects': row['num_projects'],
                    'chunk_id': i
                }
                chunks.append(chunk_data)
        except Exception as e:
            # Best-effort: a bad row is reported and skipped. Use .get() so a
            # missing 'original_index' column cannot raise a second error from
            # inside the handler and hide the original one.
            row_id = row.get('original_index', '<unknown>')
            print(f"❌ Error creating chunks for resume at index {row_id}: {e}")
            continue

    print(f"✨ Created {len(chunks)} enhanced chunks")

    if not chunks:
        print("⚠️ No chunks created. Skipping embedding and index creation.")
        return None, [], pd.DataFrame()

    # Create embeddings
    print("🔨 Creating embeddings...")
    embedder = SentenceTransformer(model_name)
    texts = [chunk['text'] for chunk in chunks]

    # Normalize embeddings for cosine similarity
    embeddings = embedder.encode(texts, show_progress_bar=True, normalize_embeddings=True)

    # Build FAISS index (cosine similarity = inner product on normalized vectors)
    print("🏗️ Building FAISS index with cosine similarity...")
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatIP(dimension)   # Inner Product index
    index.add(np.array(embeddings).astype('float32'))

    # Save everything
    faiss.write_index(index, index_path)
    with open(metadata_path, "wb") as f:
        processed_df_dict = processed_df.to_dict('records') if not processed_df.empty else []
        pickle.dump({
            'chunks': chunks,
            'processed_df': processed_df_dict,
            'total_chunks': len(chunks),
            'total_resumes': len(processed_df)
        }, f)

    print(f"🎉 Enhanced vector store created with {index.ntotal} vectors (cosine similarity).")
    return index, chunks, processed_df


# Create the enhanced vector store
vector_index, all_chunks, processed_resumes = create_enhanced_vector_store(processed_df)

# create_enhanced_vector_store returns None as the index when no chunks were
# produced, so compare against None explicitly rather than relying on the
# truthiness of the index object.
if vector_index is not None:
    print("\n✅ Vector store created successfully. Ready for searching with cosine similarity.")
else:
    print("\n❌ Failed to create vector store. Please check for errors in the previous steps.")


✂️ Creating chunks with enhanced content...


100%|██████████| 10174/10174 [00:05<00:00, 1854.79it/s]


✨ Created 51438 enhanced chunks
🔨 Creating embeddings...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1608 [00:00<?, ?it/s]

🏗️ Building FAISS index with cosine similarity...
🎉 Enhanced vector store created with 51438 vectors (cosine similarity).

✅ Vector store created successfully. Ready for searching with cosine similarity.


In [None]:
def search_resumes(query, index, chunks, top_k=5, model_name="all-mpnet-base-v2", embedder=None):
    """
    Search resumes using FAISS index + cosine similarity.

    Args:
        query: Free-text search query.
        index: Object exposing ``search(vectors, k)`` and returning
            ``(scores, indices)``, e.g. the FAISS index built for ``chunks``.
        chunks: List of chunk dicts; position ``i`` must correspond to vector
            ``i`` of ``index``. Each dict needs ``'resume_id'`` and ``'text'``.
        top_k: Number of nearest chunks to request.
        model_name: SentenceTransformer model to load when ``embedder`` is not
            given. It must be the model the index was built with.
        embedder: Optional, already-loaded encoder with an
            ``encode(texts, normalize_embeddings=True)`` method. Passing one
            avoids reloading the model on every call.

    Returns:
        List of shallow copies of the matching chunk dicts, best first, each
        with an added ``'similarity'`` float. Every hit is also printed.
    """
    # Loading the model is expensive; only do it when the caller did not
    # supply an encoder to reuse.
    if embedder is None:
        embedder = SentenceTransformer(model_name)

    # Encode and normalize query
    query_vec = embedder.encode([query], normalize_embeddings=True)

    # Search in FAISS
    distances, indices = index.search(np.array(query_vec).astype('float32'), top_k)

    results = []
    for rank, (score, idx) in enumerate(zip(distances[0], indices[0])):
        # Negative ids mark "no result" slots; ids beyond the metadata list
        # would mean index and chunks are out of sync. Skip both.
        if idx < 0 or idx >= len(chunks):
            continue
        result = chunks[idx].copy()
        # Inner product of unit vectors = cosine similarity, range [-1, 1].
        result["similarity"] = float(score)
        results.append(result)

        print(f"Rank {rank+1} | Similarity: {score:.2f} | Resume ID: {result['resume_id']}")
        print(f"Snippet: {result['text'][:200]}...\n")

    return results


In [None]:
# Example query: a natural-language description of the desired candidate
query = "machine learning engineer with Python and TensorFlow experience"

# Run search: retrieves the 5 closest chunks; search_resumes also prints each
# hit's rank, similarity, resume ID and a 200-character snippet
results = search_resumes(query, vector_index, all_chunks, top_k=5)


Rank 1 | Similarity: 0.72 | Resume ID: 6826
Snippet: Here s a sample resume for a Machine Learning Engineer with a focus on Python, TensorFlow, Feature Engineering, and Hyperparameter Tuning: Kimberly Miles Machine Learning Engineer Contact Information:...

Rank 2 | Similarity: 0.72 | Resume ID: 5074
Snippet: Experience: Senior Machine Learning Engineer, XYZ Corporation (2018-Present) * Designed and developed predictive models using TensorFlow and PyTorch for a large-scale e-commerce platform, resulting in...

Rank 3 | Similarity: 0.70 | Resume ID: 4075
Snippet: Experience: Machine Learning Engineer, ABC Company (2018-Present) * Developed and deployed multiple AI models using TensorFlow and scikit-learn for predicting customer churn and recommendation systems...

Rank 4 | Similarity: 0.70 | Resume ID: 3563
Snippet: Here is a professional resume for a Machine Learning Engineer candidate: David Russell Contact Information: * Email:   (mailto: ) * Phone: 555-555-5555 * LinkedIn: linkedin.

In [None]:
!pip install -q gradio sentence-transformers faiss-cpu pandas numpy

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m51.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import gradio as gr
import pandas as pd
import numpy as np
import faiss
import pickle
import re
from sentence_transformers import SentenceTransformer
from typing import List, Dict, Any

ModuleNotFoundError: No module named 'faiss'

In [None]:

class EnhancedRecruitRAGSystem:
    """Hybrid (embedding + keyword) candidate search over a prebuilt index.

    The instance holds a FAISS index, the per-chunk metadata that was pickled
    next to it, the processed resume table and a sentence embedder. The index
    is assumed to be an inner-product index over normalized embeddings, so
    that scores are cosine similarities (confirm against the code that built
    the index file).
    """

    # Regex pattern -> extra search terms appended to the query when the
    # pattern occurs in it. Evaluated in insertion order against the growing
    # query, so an expansion can trigger later rules.
    _EXPANSION_RULES = {
        r'\bml\b': 'machine learning',
        r'\bai\b': 'artificial intelligence',
        r'\bpm\b': 'project manager program management',
        r'\bdevops\b': 'devops engineer docker kubernetes ci/cd',
        r'\bds\b': 'data scientist',
        r'\bde\b': 'data engineer',
        r'\bse\b': 'software engineer',
        r'\bfe\b': 'frontend engineer react javascript',
        r'\bbe\b': 'backend engineer python java node',
        r'\bux\b': 'user experience designer',
        r'\bui\b': 'user interface designer',
        r'\bqa\b': 'quality assurance tester',
        r'\baws\b': 'amazon web services ec2 s3 lambda',
        r'\bazure\b': 'microsoft azure cloud',
        r'\bgcp\b': 'google cloud platform',
        r'\bpython\b': 'python django flask pandas numpy',
        r'\bjava\b': 'java spring hibernate',
        r'\bjs\b': 'javascript typescript',
        r'\breact\b': 'reactjs javascript frontend',
        r'\bangular\b': 'angularjs typescript',
        r'\bvue\b': 'vuejs javascript',
        r'\bnode\b': 'nodejs javascript backend',
        r'\bsql\b': 'sql database mysql postgresql',
        r'\bnosql\b': 'mongodb cassandra',
        r'\bdocker\b': 'docker containers',
        r'\bkubernetes\b': 'k8s container orchestration',
        r'\btf\b': 'tensorflow',
        r'\bpt\b': 'pytorch',
        r'\bdl\b': 'deep learning neural networks',
        r'\bnlp\b': 'natural language processing',
        r'\bcv\b': 'computer vision',
    }

    def __init__(self, index_path: str, metadata_path: str):
        """Initialize the enhanced RAG system.

        Args:
            index_path: Path of a FAISS index file.
            metadata_path: Path of a pickle containing a dict with at least
                the keys ``'chunks'`` (list of chunk dicts) and
                ``'processed_df'`` (records of the processed resume table).

        Raises:
            Whatever ``faiss.read_index``, ``open`` or ``pickle.load`` raise
            for missing or unreadable files.
        """
        self.index = faiss.read_index(index_path)
        # The metadata holds all chunk information (text, resume ID, skills,
        # ...) and the original processed DataFrame as records.
        # NOTE(security): pickle.load can execute arbitrary code; only load
        # metadata files produced by a trusted run of this notebook.
        with open(metadata_path, 'rb') as f:
            self.metadata = pickle.load(f)
        # Must be the same model that produced the vectors in the index.
        self.embedder = SentenceTransformer('all-mpnet-base-v2')
        self.chunks = self.metadata['chunks']
        self.processed_df = pd.DataFrame(self.metadata['processed_df'])

    def extract_name_from_resume(self, resume_text: str) -> str:
        """Extract a person name from resume text using simple heuristics.

        Strategy 1 returns the first run of two to four Title Case words that
        starts one of the first 10 lines. Strategy 2 looks in the first 20
        lines for a ``name:`` / ``full name:`` / ``candidate:`` /
        ``applicant:`` label and returns the text after it if it is one to
        four words long.

        Args:
            resume_text: Raw resume text.

        Returns:
            The extracted name, or ``"Candidate"`` when nothing matched.
        """
        # Strategy 1: Look for name patterns at the beginning
        lines = resume_text.split('\n')
        for line in lines[:10]:  # Check first 10 lines
            line = line.strip()
            # Pattern: Title Case words (likely names)
            name_match = re.match(r'^([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)', line)
            if name_match and len(name_match.group(1).split()) <= 4:
                return name_match.group(1)

        # Strategy 2: Look for common name indicators
        for line in lines[:20]:
            if any(indicator in line.lower() for indicator in
                  ['name:', 'full name:', 'candidate:', 'applicant:']):
                # Extract the name part
                name_part = re.split(r'name:|full name:|candidate:|applicant:', line,
                                   flags=re.IGNORECASE)
                if len(name_part) > 1:
                    potential_name = name_part[1].strip()
                    # A bare label such as "Name:" yields an empty string;
                    # keep looking instead of returning a blank name.
                    if potential_name and len(potential_name.split()) <= 4:
                        return potential_name

        # Strategy 3: Fallback to generic name
        return "Candidate"

    def enhance_query(self, query: str) -> str:
        """Lower-case the query and append related terms for known keywords.

        After expansion, duplicate words are removed (first occurrence wins)
        and words of one or two characters are dropped, so an abbreviation
        such as ``ml`` is replaced by its expansion rather than kept.

        Args:
            query: Raw user query.

        Returns:
            The expanded, de-duplicated query as a single space-joined string.
        """
        enhanced_query = query.lower()

        # Apply expansions
        for pattern, expansion in self._EXPANSION_RULES.items():
            if re.search(pattern, enhanced_query, re.IGNORECASE):
                enhanced_query += " " + expansion

        # Remove duplicates and clean
        unique_words = []
        seen_words = set()

        for word in enhanced_query.split():
            if word not in seen_words and len(word) > 2:  # ignore very short words
                unique_words.append(word)
                seen_words.add(word)

        return " ".join(unique_words)

    def hybrid_search(self, query: str, top_k: int = 20) -> List[Dict]:
        """Rank chunks by embedding similarity plus a small keyword boost.

        ``top_k * 2`` nearest chunks are fetched from the index, each gets a
        keyword score (0.1 per occurrence of a query word, capped at 0.3 per
        word and 1.0 in total) and the final score is
        ``similarity + 0.2 * keyword_score``.

        Args:
            query: Raw user query; it is expanded with ``enhance_query``.
            top_k: Maximum number of results to return.

        Returns:
            Up to ``top_k`` dicts with the keys ``'chunk'``,
            ``'cosine_similarity'``, ``'keyword_score'``, ``'combined_score'``
            and ``'chunk_index'``, sorted by ``'combined_score'`` descending.
        """
        enhanced_query = self.enhance_query(query)
        print(f"Enhanced query: {enhanced_query}")

        # Get query embedding (normalized for cosine similarity)
        query_embedding = self.embedder.encode([enhanced_query],
                                             normalize_embeddings=True)

        # Search with cosine similarity (inner product)
        similarities, indices = self.index.search(
            np.array(query_embedding).astype('float32'),
            top_k * 2
        )

        # Enhanced keyword matching
        query_words = set(enhanced_query.lower().split())
        scored_results = []

        for idx, similarity in zip(indices[0], similarities[0]):
            # FAISS pads with -1 when fewer than k vectors exist; a negative
            # id would otherwise index the chunk list from the end.
            if idx < 0 or idx >= len(self.chunks):
                continue

            chunk = self.chunks[idx]
            text = chunk['text'].lower()

            # Enhanced keyword scoring
            keyword_score = 0
            for word in query_words:
                if len(word) > 2:
                    # Count occurrences with weighting
                    count = text.count(word)
                    if count > 0:
                        keyword_score += min(count * 0.1, 0.3)  # Cap per word

            # Normalize keyword score
            keyword_score = min(keyword_score, 1.0)

            # Combine scores (cosine similarity + keyword boost); the sum can
            # exceed 1.0.
            combined_score = similarity + keyword_score * 0.2

            scored_results.append({
                'chunk': chunk,
                'cosine_similarity': similarity,
                'keyword_score': keyword_score,
                'combined_score': combined_score,
                'chunk_index': idx
            })

        # Sort and return top results
        scored_results.sort(key=lambda x: x['combined_score'], reverse=True)
        return scored_results[:top_k]

    def get_candidate_info(self, resume_id):
        """Return the processed-table row for a resume as a dict.

        Args:
            resume_id: Value to look up in the ``original_index`` column.

        Returns:
            The first matching row as a dict, or ``None`` when the table is
            empty or has no such row.
        """
        if not self.processed_df.empty:
            candidate_data = self.processed_df[
                self.processed_df['original_index'] == resume_id
            ]
            if not candidate_data.empty:
                return candidate_data.iloc[0].to_dict()
        return None

    def format_results(self, results: List[Dict], query: str) -> List[Dict]:
        """Collapse chunk-level hits into one entry per candidate.

        Candidates appear in the order of their first hit in ``results``;
        hits whose resume has no row in the processed table are skipped.

        Args:
            results: Output of ``hybrid_search``.
            query: Original user query, used to highlight the excerpts.

        Returns:
            At most 10 dicts with the keys ``'candidate_name'``,
            ``'resume_id'``, ``'match_score'``, ``'role'``, ``'skills'`` (up
            to 8), ``'has_github'``, ``'github_projects'``, ``'excerpts'`` (up
            to 3, each with ``'text'`` and ``'score'``) and
            ``'candidate_info'``.
        """
        formatted_results = []
        seen_candidates = set()

        for result in results:
            chunk = result['chunk']
            resume_id = chunk.get('resume_id')

            if resume_id in seen_candidates:
                continue

            # Get candidate information
            candidate_info = self.get_candidate_info(resume_id)
            if not candidate_info:
                continue

            # Extract name from resume content
            resume_content = candidate_info.get('clean_text', '')
            candidate_name = self.extract_name_from_resume(resume_content)

            # Get top chunks for this candidate
            candidate_chunks = [
                r for r in results
                if r['chunk'].get('resume_id') == resume_id
            ]
            top_chunks = sorted(candidate_chunks,
                              key=lambda x: x['combined_score'],
                              reverse=True)[:3]

            # Prepare excerpts with highlighting
            excerpts = []
            for chunk_result in top_chunks:
                chunk_text = chunk_result['chunk']['text']
                highlighted_text = self.highlight_text(chunk_text, query)
                excerpts.append({
                    'text': highlighted_text,
                    'score': chunk_result['combined_score']
                })

            formatted_results.append({
                'candidate_name': candidate_name,
                'resume_id': resume_id,
                # Score of the candidate's first hit in `results`.
                'match_score': result['combined_score'],
                'role': candidate_info.get('role', 'Unknown'),
                'skills': candidate_info.get('skills', [])[:8],
                'has_github': candidate_info.get('has_github', False),
                'github_projects': candidate_info.get('github_projects', []),
                'excerpts': excerpts,
                'candidate_info': candidate_info
            })

            seen_candidates.add(resume_id)

        return formatted_results[:10]  # Return top 10 candidates

    def highlight_text(self, text: str, query: str) -> str:
        """Wrap whole-word, case-insensitive query matches in ``**...**``.

        Only query words longer than three characters are highlighted, and
        each match is replaced by the upper-cased query word.

        Args:
            text: Text to annotate.
            query: User query whose words are highlighted.

        Returns:
            The annotated text.
        """
        highlighted_text = text
        query_words = set(query.lower().split())

        for word in query_words:
            if len(word) > 3:
                # Use regex for better matching
                pattern = re.compile(rf'\b{re.escape(word)}\b', re.IGNORECASE)
                replacement = f"**{word.upper()}**"
                # A callable replacement is inserted literally; a plain string
                # would have backslashes in the user's word parsed as regex
                # escapes and could raise re.error.
                highlighted_text = pattern.sub(
                    lambda _match, _rep=replacement: _rep,
                    highlighted_text
                )

        return highlighted_text


In [None]:
def initialize_enhanced_system(
    index_path="/content/enhanced_resume_index (1).bin",
    metadata_path="/content/enhanced_resume_metadata (1).pkl",
):
    """Build an ``EnhancedRecruitRAGSystem`` and report success or failure.

    The defaults point at the files as they were named in the Colab session
    this notebook was run in. Pass the paths explicitly to load other files,
    for example the ``enhanced_resume_index.bin`` /
    ``enhanced_resume_metadata.pkl`` pair written by
    ``create_enhanced_vector_store``.

    Args:
        index_path: Path of the FAISS index file.
        metadata_path: Path of the pickled metadata file.

    Returns:
        The initialized system, or ``None`` if construction raised any
        exception (the error is printed, not re-raised).
    """
    try:
        system = EnhancedRecruitRAGSystem(
            index_path=index_path,
            metadata_path=metadata_path
        )
        print("✅ Enhanced RAG system initialized!")
        return system
    except Exception as e:
        # Deliberate best-effort: callers check for None instead of handling
        # file/pickle/model errors themselves.
        print(f"❌ Error: {e}")
        return None

In [None]:
def _render_candidate_card(position, result):
    """Return the HTML card for one formatted search result.

    Args:
        position: 1-based rank shown in the card title.
        result: One dict as produced by ``format_results`` (uses the keys
            ``match_score``, ``candidate_name``, ``role``, ``skills``,
            ``has_github``, ``github_projects``, ``excerpts``, ``resume_id``).

    Returns:
        An HTML fragment (str). All resume-derived text is HTML-escaped.
    """
    import html  # local import: not part of the notebook's import cells

    def _safe(value):
        # Resume content is untrusted text; never inject it as raw HTML.
        return html.escape(str(value))

    def _excerpt_html(text):
        # Excerpts carry markdown-style **WORD** highlight markers, which an
        # HTML component shows literally; turn them into <strong> after
        # escaping the surrounding text.
        return re.sub(r'\*\*(.+?)\*\*', r'<strong>\1</strong>', _safe(text))

    # Calculate star rating (1-5 stars based on score)
    star_rating = min(5, max(1, int(result['match_score'] * 5)))
    stars = "⭐" * star_rating

    skills_html = ', '.join(_safe(skill) for skill in result['skills'])

    if result['has_github']:
        github_status = '✅ Available'
        github_count = f"({len(result['github_projects'])} projects)"
    else:
        github_status = '❌ Not available'
        github_count = ''

    excerpts_html = "".join(
        "<div style='margin: 10px 0; padding: 10px; border-left: 4px solid #2563eb; background: #ffffff; color:#111827;'>"
        f"<strong style='color:#2563eb;'>({excerpt['score']:.0%})</strong> "
        f"{_excerpt_html(excerpt['text'])}</div>"
        for excerpt in result['excerpts']
    )

    return f"""
            <div style='border: 2px solid #e5e7eb; border-radius: 12px; padding: 20px; margin-bottom: 25px; background: white; box-shadow: 0 4px 6px rgba(0,0,0,0.1);'>
                <div style='display: flex; justify-content: space-between; align-items: center; margin-bottom: 15px;'>
                    <h3 style='color: #1f2937; margin: 0; font-size: 1.4em;'>
                        #{position} {_safe(result['candidate_name'])} - {_safe(result['role'])}
                    </h3>
                    <div style='background: #10b981; color: black; padding: 8px 16px; border-radius: 20px; font-weight: bold;'>
                        Match: {result['match_score']:.0%} {stars}
                    </div>
                </div>

                <div style='margin-bottom: 15px;color:black'>
                    <strong style='color:black'>⚡ Skills:</strong> {skills_html}
                </div>

                <div style='margin-bottom: 15px;;color:black'>
                    <strong style='color:black'>📊 GitHub:</strong> {github_status}
                    {github_count}
                </div>

                <div style='background: #f1f5f9; padding: 15px; border-radius: 8px; margin-bottom: 15px;'>
                    <strong style='color:#1e40af;'>🎯 Top Matching Experience:</strong>
                    {excerpts_html}
                </div>

                <div style='font-size: 0.9em; color: #111827;'>
                    Candidate ID: {_safe(result['resume_id'])} |
                    Source: Enhanced RAG Search
                </div>
            </div>
            """


def create_gradio_interface():
    """Create the Gradio search UI.

    Initializes the RAG system with ``initialize_enhanced_system()``. If that
    fails, a minimal app that only shows an error message is returned, so the
    caller can still ``launch()`` the result.

    Returns:
        A launchable ``gr.Blocks`` app.
    """
    # Initialize system
    rag_system = initialize_enhanced_system()
    if rag_system is None:
        # gr.Interface needs explicit inputs/outputs; a small Blocks page is
        # the simplest valid way to surface the message.
        with gr.Blocks(title="RecruitRAG Pro") as fallback:
            gr.Markdown("System not initialized. Please check data files.")
        return fallback

    def search_candidates(query, num_results, require_github):
        """Search callback: returns an HTML string for the results panel."""
        if not query or not query.strip():
            return "Please enter a search query"

        # Slider values may arrive as floats; slicing and the FAISS `k`
        # argument both need an int.
        num_results = int(num_results)

        # Over-fetch chunks because several can belong to one candidate and
        # the GitHub filter may drop some. Note that format_results itself
        # returns at most 10 candidates.
        results = rag_system.hybrid_search(query, num_results * 3)
        formatted_results = rag_system.format_results(results, query)

        # Filter by GitHub if required
        if require_github:
            formatted_results = [r for r in formatted_results if r['has_github']]

        formatted_results = formatted_results[:num_results]

        if not formatted_results:
            return "No matching candidates found. Try a different query."

        # Create HTML output
        html_output = f"""
        <div style='font-family: Arial, sans-serif; max-width: 1200px; margin: 0 auto;'>
            <h2 style='color: #2563eb; margin-bottom: 20px;'>🔍 Search Results: {len(formatted_results)} Candidates Found</h2>
        """

        for i, result in enumerate(formatted_results, 1):
            html_output += _render_candidate_card(i, result)

        html_output += "</div>"
        return html_output

    # Create Gradio interface
    with gr.Blocks(title="RecruitRAG Pro", theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
        # 🚀 RecruitRAG Pro - Advanced Candidate Search
        *Intelligent resume matching with AI-powered semantic search*
        """)

        with gr.Row():
            with gr.Column(scale=3):
                query_input = gr.Textbox(
                    label="🔍 Search Query",
                    placeholder="e.g., 'Senior Python developer with machine learning experience and TensorFlow projects'",
                    lines=2
                )
            with gr.Column(scale=1):
                num_results = gr.Slider(
                    label="Number of Results",
                    minimum=1,
                    maximum=20,
                    value=5,
                    step=1
                )
                github_filter = gr.Checkbox(
                    label="Only show candidates with GitHub",
                    value=True
                )

        search_btn = gr.Button("🚀 Search Candidates", variant="primary")

        with gr.Row():
            output = gr.HTML(
                label="Search Results",
                value="<div style='text-align: center; color: #6b7280; padding: 40px;'>Enter a query to find perfect candidates...</div>"
            )

        # Examples
        gr.Examples(
            examples=[
                ["Machine learning engineer with Python and TensorFlow experience"],
                ["Frontend developer React JavaScript with portfolio"],
                ["DevOps engineer AWS Docker Kubernetes"],
                ["Data scientist with SQL and machine learning projects"],
                ["Full stack developer Python React Node.js"]
            ],
            inputs=query_input
        )

        # The button and pressing Enter in the textbox run the same search.
        search_btn.click(
            fn=search_candidates,
            inputs=[query_input, num_results, github_filter],
            outputs=output
        )

        query_input.submit(
            fn=search_candidates,
            inputs=[query_input, num_results, github_filter],
            outputs=output
        )

    return demo


In [None]:
if __name__ == "__main__":
    # Build the Gradio app and start serving it.
    demo = create_gradio_interface()
    # NOTE(review): server_name="0.0.0.0" presumably listens on all network
    # interfaces, and share=True publishes a public gradio.live URL (see the
    # cell output); confirm this exposure is intended for resume data.
    demo.launch(
        server_name="0.0.0.0",

        share=True  # Optional: creates public link
    )

✅ Enhanced RAG system initialized!
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://ca035c01d38275d74b.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
