In [None]:
%pip install -q fastapi uvicorn pydantic pydantic-settings python-dotenv pdfplumber requests httpx python-multipart jinja2 loguru google-genai sentence-transformers torch numpy scikit-learn faiss-cpu

Note: you may need to restart the kernel to use updated packages.


## Setup

> **Important**: Set your Gemini API key before running:
> ```bash
> export GOOGLE_API_KEY="your-api-key-here"
> ```
> Get your key from https://makersuite.google.com/app/apikey

In [8]:
import os
import sys
import json
import pickle
from pathlib import Path

# Add backend to path
sys.path.insert(0, str(Path.cwd() / 'backend'))

from app.services.ingest import IngestService
from app.models.schemas import ProfileInput

# Create output directory for stored profiles
PROFILE_CACHE_DIR = Path.cwd() / 'profile_cache'
PROFILE_CACHE_DIR.mkdir(exist_ok=True)

print(f"‚úÖ Profile cache directory: {PROFILE_CACHE_DIR}")
print(f"‚úÖ Ready to ingest resume")

‚úÖ Profile cache directory: /Users/mohitbhoir/Git/resume_builder/profile_cache
‚úÖ Ready to ingest resume


In [None]:
# Set your Gemini API key (get it from https://makersuite.google.com/app/apikey)
import os

GEMINI_API_KEY = "YOUR_API_KEY_HERE"  # Replace with your actual API key

if GEMINI_API_KEY != "YOUR_API_KEY_HERE":
    os.environ["GOOGLE_API_KEY"] = GEMINI_API_KEY
    print("‚úÖ Gemini API key configured")
else:
    print("‚ö†Ô∏è  Gemini API key not set. Using regex fallback for section extraction.")
    print("   To enable AI-powered extraction:")
    print("   1. Get your key from: https://makersuite.google.com/app/apikey")
    print("   2. Replace GEMINI_API_KEY above with your actual key")

## Option 1: Ingest from PDF File Path

In [None]:
# Replace with your PDF file path
PDF_FILE_PATH = "/Users/mohitbhoir/Git/resume_builder/resume-2.pdf"

if Path(PDF_FILE_PATH).exists():
    from fastapi import UploadFile
    from io import BytesIO
    
    # Create a mock UploadFile
    class MockUploadFile:
        def __init__(self, file_path):
            self.file_path = Path(file_path)
            self.content_type = "application/pdf"
            self.file = BytesIO(self.file_path.read_bytes())
    
    service = IngestService()
    mock_file = MockUploadFile(PDF_FILE_PATH)
    result = service.ingest_pdf(mock_file)
    
    print(f"‚úÖ PDF ingested successfully")
    print(f"   Sections: {result.sections}")
    print(f"   Chunks: {result.chunks_created}")
    print(f"   Embedding provider: {result.embedding_provider}")
    print(f"\nProfile data:")
    print(f"   Experience items: {len(result.profile.experience)}")
    print(f"   Projects: {len(result.profile.projects)}")
    print(f"   Skills: {len(result.profile.skills)}")
    print(f"   Education: {len(result.profile.education)}")
else:
    print(f"‚ùå PDF file not found at {PDF_FILE_PATH}")
    print(f"   Update PDF_FILE_PATH above with your resume path")

## Option 2: Ingest from URL

In [None]:
import requests
from io import BytesIO

# Replace with your resume PDF URL
PDF_URL = "https://example.com/path/to/resume.pdf"

try:
    # Download PDF from URL
    response = requests.get(PDF_URL, timeout=10)
    response.raise_for_status()
    
    # Create mock upload file
    class MockUploadFile:
        def __init__(self, content):
            self.content_type = "application/pdf"
            self.file = BytesIO(content)
    
    service = IngestService()
    mock_file = MockUploadFile(response.content)
    result = service.ingest_pdf(mock_file)
    
    print(f"‚úÖ PDF from URL ingested successfully")
    print(f"   Sections: {result.sections}")
    print(f"   Chunks: {result.chunks_created}")
    print(f"\nProfile data:")
    print(f"   Experience items: {len(result.profile.experience)}")
    print(f"   Projects: {len(result.profile.projects)}")
    print(f"   Skills: {len(result.profile.skills)}")
    print(f"   Education: {len(result.profile.education)}")
except Exception as e:
    print(f"‚ùå Error downloading PDF: {e}")
    print(f"   Update PDF_URL above with your resume URL")

## Option 3: Ingest from Plaintext

In [11]:
# Paste your resume text here
RESUME_TEXT = """
Mohit Bhoir
Willimantic, CT 06226 | 959-995-0104 | mohitbhoir789@gmail.com
LinkedIn: [linkedin.com/in/mohitbhoir789](https://linkedin.com/in/mohitbhoir789)
GitHub: [github.com/mohitbhoir789](https://github.com/mohitbhoir789)
Portfolio: mohitbhoir789.github.io/portfolio/

Technical Skills

* Programming Languages: Python, Java, JavaScript, C++, R, SQL
* Data Science & ML Frameworks: TensorFlow, PyTorch, Scikit-Learn, Keras, Pandas, NumPy, Matplotlib, SciPy; Machine Learning (Classification, Regression, Clustering, NLP, Time-Series, Data Mining, Statistics)
* Databases & MLOps: PostgreSQL, MySQL, MongoDB; Docker, Airflow, MLflow, CI/CD Pipelines
* Cloud Platforms & Tools: AWS, GitHub, Jupyter, Power BI, Tableau, Hadoop, ETL Tools, PyCharm, MCP, AI Agents

Work Experience

Community Dreams Foundation | Data Science Intern | Sep 2025 - Dec 2025

* Worked on end-to-end data science applications involving data collection, preprocessing, modeling, and deployment for social impact initiatives.
* Processed and analyzed large-scale structured and unstructured datasets (1M+ records) using Python, Pandas, NumPy, and SQL to derive actionable insights.
* Built and evaluated machine learning models including Logistic Regression, Random Forest, XGBoost, and Gradient Boosting for prediction and classification tasks, improving model accuracy by up to 22%.
* Developed data pipelines for cleaning, feature engineering, and transformation, reducing data inconsistencies by 30%.
* Applied statistical analysis and hypothesis testing to identify trends, correlations, and key performance drivers across datasets.
* Implemented model evaluation techniques such as cross-validation, ROC-AUC, precision-recall, and confusion matrices to ensure robustness and reliability.
* Collaborated with cross-functional teams to translate business requirements into scalable data science solutions and dashboards.
* Deployed trained models using Python-based workflows and versioned experiments using MLflow and Git for reproducibility.

Beats by Dre | Data Science & Consumer Insights Extern | Jun 2025 - Aug 2025

* Performed exploratory data analysis and sentiment analysis on over 5,000 Amazon reviews using Python, BeautifulSoup, TextBlob, Pandas, and Seaborn to uncover product sentiment, preferences, and brand positioning.
* Segmented customer survey data using Pandas and NumPy to define user personas based on generation, price sensitivity, and feature prioritization.
* Extracted key demand drivers such as bass-forward sound, battery life, and design aesthetics through polarity scores, frequency distributions, and word cloud visualizations.
* Automated the data scraping pipeline using BeautifulSoup and OxyLabs, reducing manual data collection time by 90% /and improving dataset scale and quality.
* Translated technical findings into data-backed launch recommendations, including product specifications, pricing, and go-to-market strategy.
* Applied skills in data scraping, text preprocessing, EDA, sentiment analysis, and survey segmentation to support product strategy and consumer insight generation.

Amdocs, India | Software Development Engineer | Jul 2021 - Jul 2024

* Automated regression test suites using Selenium and Robot Framework, reducing release cycles by 30% /and boosting deployment efficiency.
* Engineered scalable ServiceNow workflows that eliminated 40% /of repetitive manual tasks, streamlining request processing time by 25%.
* Built real-time dashboards in Power BI, enhancing reporting efficiency and enabling data-driven decision-making for key stakeholders.
* Collaborated with 4+ cross-functional teams to design AI-based automation tools, improving accuracy and team productivity by 20%.
* Integrated regression testing pipelines within development cycles, achieving 95% test accuracy and accelerating QA feedback loops.
* Developed scripts to test APIs for Order Management System (OMS) and Customer Service Provisioning (CSP), ensuring robust backend integration and improving reliability.

Projects

Movie Recommendation Chatbot | Python, RAG, Hugging Face, Pinecone, PostgreSQL, TMDb API

* Developed a semantic-search-based chatbot using a dataset of 343K+ IMDb movies (2000-2024).
* Used Hugging Face embeddings + Pinecone vector DB to enable real-time recommendation retrieval with under 1-second latency.
* Managed metadata for 200K+ unique movie entries via PostgreSQL; enriched data using TMDb API to improve content coverage by 30%.
* Achieved over 92% /accuracy in matching user query intents to relevant movie descriptions through embedding tuning.

Statistical Analysis of Corporate Takeovers | Python, Scikit-learn, Regression, XGBoost

* Analyzed takeover data from 126 U.S. firms over 8 years to identify predictors of acquisition likelihood.
* Built classification models (Poisson regression, Random Forest, XGBoost); logistic regression model achieved AUC = 0.78 and 77% /accuracy.
* Applied scaling and feature selection to reduce model variance by 25% /and improve interpretability.

Cricket Analysis Dashboard | SQL, Tableau, LOD, KPI Metrics

* Created an interactive Tableau dashboard using 1.2M+ ODI ball-by-ball records (2002-2023) to analyze player and team performance.
* Performed SQL-based ETL and implemented LOD calculations to generate 20+ KPIs for match summaries, venue stats, and team trends.
* Improved data cleanliness and consistency by 35% through custom data wrangling scripts.

Education

University of Connecticut | Master of Science in Data Science | Aug 2024 - Present

* GPA: 3.79/4.0
* Relevant Coursework: Statistics, Machine Learning, NLP, Deep Learning, Algorithms, Data Mining, Gen-AI.

University of Mumbai | Bachelor of Engineering in Electronics Engineering | Aug 2017 - Jul 2021

* GPA: 7.78/10
* Relevant Coursework: Operating Systems, Python, Cryptography & System Security, DBMS
"""

if RESUME_TEXT.strip():
    service = IngestService()
    result = service.ingest_text(RESUME_TEXT)
    
    print(f"‚úÖ Text ingested successfully")
    print(f"   Sections: {result.sections}")
    print(f"   Chunks: {result.chunks_created}")
    print(f"\nProfile data:")
    print(f"   Experience items: {len(result.profile.experience)}")
    print(f"   Projects: {len(result.profile.projects)}")
    print(f"   Skills: {len(result.profile.skills)}")
    print(f"   Education: {len(result.profile.education)}")
else:
    print("‚ùå No resume text provided")

‚úÖ Text ingested successfully
   Sections: ['experience', 'projects', 'skills', 'education', 'certifications']
   Chunks: 0

Profile data:
   Experience items: 0
   Projects: 0
   Skills: 0
   Education: 0


## Save Profile and Embeddings Locally

In [None]:
import json
import pickle
from pathlib import Path

# Save the profile to a JSON file
profile_name = "my_profile"  # Change this to a unique name if you have multiple profiles
profile_file = PROFILE_CACHE_DIR / f"{profile_name}_profile.json"
embeddings_file = PROFILE_CACHE_DIR / f"{profile_name}_embeddings.pkl"

# Save profile
with open(profile_file, 'w') as f:
    json.dump(result.profile.dict(), f, indent=2)
print(f"‚úÖ Profile saved to: {profile_file}")

# Save embeddings vector store
vector_store = service.vector_store
with open(embeddings_file, 'wb') as f:
    pickle.dump(vector_store, f)
print(f"‚úÖ Embeddings saved to: {embeddings_file}")

# Create a metadata file
metadata = {
    "profile_name": profile_name,
    "ingest_type": result.ingest_type,
    "sections": result.sections,
    "chunks_created": result.chunks_created,
    "embedding_provider": result.embedding_provider,
}
metadata_file = PROFILE_CACHE_DIR / f"{profile_name}_metadata.json"
with open(metadata_file, 'w') as f:
    json.dump(metadata, f, indent=2)
print(f"‚úÖ Metadata saved to: {metadata_file}")

print(f"\n‚úÖ All profile data saved successfully!")
print(f"\nüìù Profile name: {profile_name}")
print(f"   Use this name in your resume generator")

## View Saved Profiles

In [None]:
import json
from pathlib import Path

print("üìÅ Available saved profiles:\n")

metadata_files = list(PROFILE_CACHE_DIR.glob("*_metadata.json"))

if not metadata_files:
    print("   No saved profiles found")
else:
    for metadata_file in sorted(metadata_files):
        with open(metadata_file) as f:
            metadata = json.load(f)
        
        print(f"üìÑ {metadata['profile_name']}")
        print(f"   Type: {metadata['ingest_type']}")
        print(f"   Sections: {', '.join(metadata['sections'])}")
        print(f"   Chunks: {metadata['chunks_created']}")
        print()

## Load Saved Profile

In [None]:
import json

# Specify which profile to load
profile_to_load = "my_profile"  # Change to your profile name

profile_file = PROFILE_CACHE_DIR / f"{profile_to_load}_profile.json"

if profile_file.exists():
    with open(profile_file) as f:
        loaded_profile = json.load(f)
    
    print(f"‚úÖ Loaded profile: {profile_to_load}")
    print(f"\nExperience ({len(loaded_profile['experience'])} items):")
    for i, exp in enumerate(loaded_profile['experience'][:3], 1):
        print(f"  {i}. {exp[:80]}..." if len(exp) > 80 else f"  {i}. {exp}")
    
    print(f"\nProjects ({len(loaded_profile['projects'])} items):")
    for i, proj in enumerate(loaded_profile['projects'][:3], 1):
        print(f"  {i}. {proj[:80]}..." if len(proj) > 80 else f"  {i}. {proj}")
    
    print(f"\nSkills ({len(loaded_profile['skills'])} items):")
    for i, skill in enumerate(loaded_profile['skills'][:5], 1):
        print(f"  {i}. {skill}")
    
    print(f"\nEducation ({len(loaded_profile['education'])} items):")
    for i, edu in enumerate(loaded_profile['education'], 1):
        print(f"  {i}. {edu}")
else:
    print(f"‚ùå Profile not found: {profile_to_load}")
    print(f"\n   Available profiles:")
    for metadata_file in sorted(PROFILE_CACHE_DIR.glob("*_metadata.json")):
        profile_name = metadata_file.stem.replace("_metadata", "")
        print(f"     - {profile_name}")