<a href="https://colab.research.google.com/github/nikhil1001001/Alma_CV_Checker_Assignment/blob/main/Alma_assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install uvicorn beautifulsoup4 requests faiss-cpu sentence-transformers python-docx pdfplumber -q

## Web Scraping

In [2]:
import requests
from bs4 import BeautifulSoup
import json

# List of URLs to scrape
# scrape_o1a_criteria() iterates over this list in order and fetches each page.
# NOTE(review): the second URL ends in a bare "#" (an empty fragment) — presumably
# left over from copying the link; confirm whether a section anchor was intended.
URLS = [
    "https://www.uscis.gov/working-in-the-united-states/temporary-workers/o-1-visa-individuals-with-extraordinary-ability-or-achievement",
    "https://www.uscis.gov/policy-manual/volume-2-part-m#",
    "https://www.tryalma.com/o-1a-visa-guide"
]

def scrape_o1a_criteria(timeout=10):
    """Scrape candidate O-1A criteria text from every page listed in ``URLS``.

    Each page is downloaded and parsed with BeautifulSoup. The text of every
    ``<h3>`` element is collected; when a page has no ``<h3>`` at all, the text
    of every ``<p>`` element is collected instead. The combined list is written
    to ``o1a_criteria.json`` in the current working directory and a one-line
    summary is printed.

    A page that cannot be fetched (network error, timeout, or a non-200 status)
    is reported and skipped, so one bad URL does not abort the whole run.

    Args:
        timeout: Seconds to wait on each HTTP request before giving up.
            Without a timeout ``requests.get`` can block indefinitely.

    Returns:
        None. The result is the JSON file written to disk.
    """
    criteria = []

    for url in URLS:
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            # Connection failures and timeouts are treated like a bad status:
            # report and move on to the next page.
            print(f"Failed to fetch {url}: {exc}")
            continue

        if response.status_code != 200:
            print(f"Failed to fetch {url}")
            continue

        soup = BeautifulSoup(response.text, "html.parser")

        # Headings first; fall back to paragraphs only when there are no <h3> tags.
        sections = soup.find_all("h3") or soup.find_all("p")

        for section in sections:
            text = section.get_text(strip=True)
            # Elements with no visible text would only add empty "criteria".
            if text:
                criteria.append(text)

    # Save extracted criteria to JSON
    with open("o1a_criteria.json", "w") as f:
        json.dump(criteria, f, indent=4)

    print(f"Scraped {len(criteria)} O-1A criteria items from {len(URLS)} websites.")

# Run the scrape when this cell is executed. The recorded cell output shows the
# guard passes in the notebook, so o1a_criteria.json is (re)written on each run.
if __name__ == "__main__":
    scrape_o1a_criteria()


Scraped 125 O-1A criteria items from 3 websites.


In [3]:
import json
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# Build a FAISS index over the scraped criteria.
# Files touched by this cell (all relative to the working directory):
#   reads  o1a_criteria.json       - the list written by scrape_o1a_criteria()
#   writes o1a_faiss.index         - the serialized FAISS index
#   writes o1a_criteria_list.json  - the same list that was loaded, saved again unchanged

# Load the scraped criteria from JSON
with open("o1a_criteria.json", "r") as f:
    o1a_criteria = json.load(f)

# Load the Sentence Transformer model
model = SentenceTransformer("all-MiniLM-L6-v2")  # Fast and lightweight model

# Convert criteria into embeddings
# assumes encode() returns a 2-D array with one row per input string, in input order — TODO confirm
embeddings = model.encode(o1a_criteria)

# Store in FAISS
dimension = embeddings.shape[1]  # second axis of the encode() output, used as the index width
index = faiss.IndexFlatL2(dimension)  # L2 distance index
index.add(np.array(embeddings))  # Add embeddings to FAISS index

# Save FAISS index
faiss.write_index(index, "o1a_faiss.index")

# Save criteria mapping
# match_criteria() later uses ids returned by the index as positions in this list.
with open("o1a_criteria_list.json", "w") as f:
    json.dump(o1a_criteria, f, indent=4)

print(f"Stored {len(o1a_criteria)} criteria embeddings in FAISS.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Stored 125 criteria embeddings in FAISS.


In [4]:
import faiss
import json
import numpy as np
from sentence_transformers import SentenceTransformer

# Load FAISS index and criteria
# Both files are the ones written by the index-building cell (o1a_faiss.index and
# o1a_criteria_list.json); match_criteria() reads `index`, `o1a_criteria` and
# `model` as module-level globals.
index = faiss.read_index("o1a_faiss.index")
with open("o1a_criteria_list.json", "r") as f:
    o1a_criteria = json.load(f)

# Load embedding model
# NOTE(review): same model name as the one used to build the index; presumably the
# two must stay in sync for query and index vectors to be comparable — confirm.
model = SentenceTransformer("all-MiniLM-L6-v2")

def match_criteria(cv_text, top_k=5):
    """Match CV content against the O-1A criteria using semantic search.

    The CV text is embedded with the module-level ``model`` and the nearest
    vectors are looked up in the module-level FAISS ``index``. Each returned id
    is used as a position in the module-level ``o1a_criteria`` list.

    Args:
        cv_text: The CV text to search with.
        top_k: Maximum number of criteria to return.

    Returns:
        A list of at most ``top_k`` criteria strings, in the order the index
        returned them. The list is shorter than ``top_k`` when the index holds
        fewer vectors, and empty when ``top_k`` is not positive.
    """
    if top_k <= 0:
        return []

    # Encode CV text
    cv_embedding = model.encode([cv_text])

    # Search in FAISS
    _, indices = index.search(np.array(cv_embedding), top_k)

    # FAISS pads missing results with id -1. Used as a Python list index that
    # would silently pick the LAST criterion, so out-of-range ids are dropped.
    matched_criteria = [
        o1a_criteria[idx]
        for idx in indices[0]
        if 0 <= idx < len(o1a_criteria)
    ]

    return matched_criteria


In [5]:
import pdfplumber
import docx

def extract_text(file_path):
    """Extract text from a PDF or DOCX file.

    The file type is chosen from the path's extension, compared
    case-insensitively so ``CV.PDF`` is handled like ``cv.pdf``.

    Args:
        file_path: Path to the file, as a string.

    Returns:
        The text of all pages (PDF) or paragraphs (DOCX) joined with newlines,
        with leading and trailing whitespace removed. An empty string is
        returned for any other extension.
    """
    lowered_path = file_path.lower()

    if lowered_path.endswith(".pdf"):
        with pdfplumber.open(file_path) as pdf:
            # extract_text() may yield None for a page with no extractable text
            # (e.g. a scanned image); treat that as an empty page instead of
            # failing on None + "\n".
            chunks = [page.extract_text() or "" for page in pdf.pages]

    elif lowered_path.endswith(".docx"):
        doc = docx.Document(file_path)
        chunks = [para.text for para in doc.paragraphs]

    else:
        chunks = []

    return "\n".join(chunks).strip()


In [6]:
!pip install gradio -q


In [7]:
import gradio as gr
import torch
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util

# Load Sentence Transformer model
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Define O-1A visa criteria and their descriptions
# Keys are the short labels shown in the results; values are the sentences that get embedded.
# NOTE: this rebinds `o1a_criteria` from the list loaded from o1a_criteria_list.json
# to a dict. match_criteria() indexes that global by integer position, so calling
# it after this cell has run would raise KeyError unless the list is reloaded.
o1a_criteria = {
    "Awards": "The applicant must have received nationally or internationally recognized prizes or awards.",
    "Membership": "The applicant must be a member of associations that require outstanding achievements.",
    "Press": "There must be published material about the applicant in major media.",
    "Judging": "The applicant must have served as a judge of the work of others in their field.",
    "Original Contribution": "The applicant must have made original contributions of major significance.",
    "Scholarly Articles": "The applicant must have authored scholarly articles in professional journals.",
    "Critical Employment": "The applicant must have been employed in a critical or essential capacity.",
    "High Remuneration": "The applicant must have commanded a high salary compared to others in the field."
}

# Convert O-1A criteria into embeddings
# One embedding per criterion, keyed by the same label; computed once here so
# analyze_cv() only has to embed the CV on each call.
criteria_embeddings = {key: model.encode(value, convert_to_tensor=True) for key, value in o1a_criteria.items()}

# Function to analyze CV and match with O-1A criteria
def analyze_cv(cv_text):
    """Score CV text against every O-1A criterion and derive an overall rating.

    The CV is embedded once with the module-level ``model`` and compared, via
    ``util.pytorch_cos_sim``, with each precomputed vector in
    ``criteria_embeddings``. A criterion counts as found only when its
    similarity is strictly greater than 0.5.

    Args:
        cv_text: The CV text to analyze.

    Returns:
        A ``(results, rating)`` tuple. ``results`` maps each criterion label to
        "✔ Found" or "❌ Not Found". ``rating`` is "Low" for up to 3 matches,
        "Medium" for 4 to 6, and "High" for more than 6.
    """
    cv_vector = model.encode(cv_text, convert_to_tensor=True)

    # Decide pass/fail per criterion first; labels and the count follow from it.
    passed = {
        name: util.pytorch_cos_sim(cv_vector, criterion_vector).item() > 0.5
        for name, criterion_vector in criteria_embeddings.items()
    }
    verdicts = {
        name: ("✔ Found" if is_match else "❌ Not Found")
        for name, is_match in passed.items()
    }
    hits = sum(passed.values())

    # Bucket the number of matched criteria into a rating.
    if hits <= 3:
        rating = "Low"
    elif hits <= 6:
        rating = "Medium"
    else:
        rating = "High"

    return verdicts, rating

# Define the Gradio UI
# A single textbox feeds analyze_cv(); the two outputs receive the per-criterion
# results dict and the overall rating string it returns.
interface = gr.Interface(
    fn=analyze_cv,
    inputs=gr.Textbox(lines=10, placeholder="Paste CV text here..."),
    outputs=[gr.JSON(label="Matched Criteria"), gr.Text(label="Visa Qualification Rating")],
    title="O-1A Visa Qualification Checker",
    # The input is a paste-in textbox, so the instructions say "paste" rather than "upload".
    description="Paste the text of a CV and check how well it matches the O-1A visa requirements."
)

# Launch Gradio
interface.launch()


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://25102a1dd2740121a3.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [9]:
!pip install PyPDF2 -q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/232.6 kB[0m [31m2.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [10]:
import gradio as gr
import torch
import PyPDF2  # Import PyPDF2 for PDF text extraction
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util

# Load Sentence Transformer model
# This cell repeats the model/criteria/embedding setup of the earlier text-input
# cell verbatim, so the PDF version can be run on its own.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Define O-1A visa criteria and their descriptions
# Keys are the short labels shown in the results; values are the sentences that get embedded.
# NOTE: this binds `o1a_criteria` to a dict; match_criteria() indexes that global
# by integer position and would raise KeyError if called after this cell has run.
o1a_criteria = {
    "Awards": "The applicant must have received nationally or internationally recognized prizes or awards.",
    "Membership": "The applicant must be a member of associations that require outstanding achievements.",
    "Press": "There must be published material about the applicant in major media.",
    "Judging": "The applicant must have served as a judge of the work of others in their field.",
    "Original Contribution": "The applicant must have made original contributions of major significance.",
    "Scholarly Articles": "The applicant must have authored scholarly articles in professional journals.",
    "Critical Employment": "The applicant must have been employed in a critical or essential capacity.",
    "High Remuneration": "The applicant must have commanded a high salary compared to others in the field."
}

# Convert O-1A criteria into embeddings
# One embedding per criterion, keyed by the same label; analyze_cv() reads this dict.
criteria_embeddings = {key: model.encode(value, convert_to_tensor=True) for key, value in o1a_criteria.items()}

# Function to extract text from PDF using PyPDF2
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, each page followed by a newline.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The concatenated page texts; an empty string when the PDF has no pages.
    """
    with open(pdf_path, "rb") as handle:
        reader = PyPDF2.PdfReader(handle)
        # Pages are read while the file is still open.
        return "".join(page.extract_text() + "\n" for page in reader.pages)

# Function to analyze CV and match with O-1A criteria
def analyze_cv(pdf_file):
    """Extract text from an uploaded PDF CV and score it against the O-1A criteria.

    This redefines the earlier text-input ``analyze_cv`` for the PDF-upload UI.
    The CV text is embedded once with the module-level ``model`` and compared,
    via ``util.pytorch_cos_sim``, with each vector in ``criteria_embeddings``.
    A criterion counts as found only when its similarity is strictly greater
    than 0.5.

    Args:
        pdf_file: The uploaded file as handed over by the Gradio File input:
            either a path string or an object whose ``name`` attribute is the
            path. ``None`` means the form was submitted without a file.

    Returns:
        A ``(results, rating)`` tuple. ``results`` maps each criterion label to
        "✔ Found" or "❌ Not Found". ``rating`` is "Low" for up to 3 matches,
        "Medium" for 4 to 6, and "High" for more than 6.

    Raises:
        gr.Error: If no file was uploaded, so the UI shows a readable message
            instead of an AttributeError traceback.
    """
    if pdf_file is None:
        raise gr.Error("Please upload a CV in PDF format before submitting.")

    # Accept both a plain path string and a file-like wrapper exposing `.name`.
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name

    # Extract text from the uploaded PDF
    cv_text = extract_text_from_pdf(pdf_path)

    # Convert CV text into an embedding
    cv_embedding = model.encode(cv_text, convert_to_tensor=True)

    criteria_results = {}
    matched_count = 0

    for criterion, embedding in criteria_embeddings.items():
        similarity_score = util.pytorch_cos_sim(cv_embedding, embedding).item()

        # Define a similarity threshold (adjustable)
        if similarity_score > 0.5:
            criteria_results[criterion] = "✔ Found"
            matched_count += 1
        else:
            criteria_results[criterion] = "❌ Not Found"

    # Assign a rating based on the number of matched criteria
    rating = "Low" if matched_count <= 3 else "Medium" if matched_count <= 6 else "High"

    return criteria_results, rating

# Define the Gradio UI with PDF upload
# The uploaded file is passed to analyze_cv(); the two outputs receive the
# per-criterion results dict and the overall rating string it returns.
interface = gr.Interface(
    fn=analyze_cv,
    # The label promises "PDF only", so the file picker is restricted to .pdf as well.
    inputs=gr.File(label="Upload CV (PDF only)", file_types=[".pdf"]),
    outputs=[gr.JSON(label="Matched Criteria"), gr.Text(label="Visa Qualification Rating")],
    title="O-1A Visa Qualification Checker",
    description="Upload a CV in PDF format and check how well it matches the O-1A visa requirements."
)

# Launch Gradio
interface.launch()


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://184e471db3a58a30ab.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


