<a href="https://colab.research.google.com/github/meghanagaddalae/G.Meghana_Finalproject_ML_Internship/blob/main/G_Meghana__finalproject_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import re
import io
import tempfile
from typing import List, Tuple, Dict, Any

import pandas as pd
import numpy as np
import streamlit as st

import fitz
import docx2txt

import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

@st.cache_resource(show_spinner=False)
def load_spacy_model():
    """Load the ``en_core_web_sm`` spaCy pipeline.

    Wrapped in ``st.cache_resource`` (spinner disabled) so the loaded
    pipeline object is reused instead of being loaded on every call.
    """
    pipeline = spacy.load("en_core_web_sm")
    return pipeline

@st.cache_resource(show_spinner=False)
def load_sentence_model():
    """Load the ``all-MiniLM-L6-v2`` sentence-transformers model.

    Wrapped in ``st.cache_resource`` (spinner disabled) so the model
    object is reused instead of being re-created on every call.
    """
    model = SentenceTransformer("all-MiniLM-L6-v2")
    return model

# Module-level model instances shared by the functions below:
# `nlp` is used by extract_keywords_from_jd, `embedder` by compute_bert_score.
nlp = load_spacy_model()
embedder = load_sentence_model()

ModuleNotFoundError: No module named 'streamlit'

In [2]:
%pip install streamlit

Collecting streamlit
  Downloading streamlit-1.50.0-py3-none-any.whl.metadata (9.5 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.50.0-py3-none-any.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m63.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m93.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pydeck, streamlit
Successfully installed pydeck-0.9.1 streamlit-1.50.0


In [3]:
def extract_text_from_pdf(file_bytes: bytes) -> str:
    """Extract plain text from a PDF held in memory.

    Opens ``file_bytes`` with ``fitz.open`` as a PDF stream, takes the
    ``"text"`` rendering of every page in iteration order, and joins the
    per-page strings with a newline.
    """
    with fitz.open(stream=file_bytes, filetype="pdf") as pdf:
        page_texts = [pg.get_text("text") for pg in pdf]
    return "\n".join(page_texts)
def extract_text_from_docx(file_bytes: bytes) -> str:
    """Extract plain text from a DOCX document held in memory.

    The bytes are handed to ``docx2txt.process`` through an in-memory
    ``io.BytesIO`` buffer instead of a named temporary file. Re-opening a
    ``NamedTemporaryFile`` by name while it is still open is not portable
    (it fails on Windows), and the round-trip through disk is unnecessary.

    Args:
        file_bytes: Raw content of the ``.docx`` file.

    Returns:
        The extracted text, or ``""`` when docx2txt returns nothing.
    """
    # NOTE(review): relies on docx2txt opening its argument with
    # zipfile.ZipFile, which accepts file-like objects — confirm against
    # the installed docx2txt version.
    return docx2txt.process(io.BytesIO(file_bytes)) or ""
def extract_text_from_txt(file_bytes: bytes) -> str:
    """Decode raw bytes as UTF-8 text on a best-effort basis.

    Uses the ``utf-8-sig`` codec so a leading UTF-8 byte-order mark is
    stripped instead of surviving as ``"\\ufeff"`` at the start of the text
    (where it would defeat the first-line name heuristic applied later).
    Undecodable bytes are dropped.

    Args:
        file_bytes: Raw file content.

    Returns:
        The decoded text. If ``file_bytes`` cannot be decoded at all (for
        example it is not a bytes-like object), ``str(file_bytes)`` is
        returned instead of raising.
    """
    try:
        return file_bytes.decode("utf-8-sig", errors="ignore")
    except Exception:
        # Deliberate best-effort fallback: never fail text extraction here.
        return str(file_bytes)

def extract_text_from_file(uploaded_file) -> Tuple[str, str]:
    """Return ``(filename, extracted_text)`` for an uploaded file object.

    Args:
        uploaded_file: Object exposing a ``name`` attribute and a ``read()``
            method returning the file content as bytes. If it also exposes
            ``seek``, the stream is rewound before reading.

    Returns:
        The original file name and the extracted text. When extraction
        raises, a Streamlit warning is shown and the text is ``""``.
    """
    name = uploaded_file.name

    # Rewind first: a stream that was already consumed would otherwise
    # yield b"" and silently produce an empty document.
    rewind = getattr(uploaded_file, "seek", None)
    if callable(rewind):
        try:
            rewind(0)
        except (OSError, ValueError):
            # Non-seekable or closed stream: read from the current position.
            pass

    data = uploaded_file.read()
    lower = name.lower()

    try:
        if lower.endswith(".pdf"):
            text = extract_text_from_pdf(data)
        elif lower.endswith(".docx"):
            text = extract_text_from_docx(data)
        else:
            # ".txt" and any unrecognised extension are treated as plain text.
            text = extract_text_from_txt(data)
    except Exception as e:
        st.warning(f"❌ Failed to extract text from {name}: {e}")
        text = ""

    return name, text

In [5]:
def clean_text(text: str) -> str:
    """Normalise whitespace in extracted text.

    Steps, in order:
      1. Remove NUL characters. This happens first so that whitespace on
         either side of a removed NUL is still collapsed by the later steps.
      2. Convert ``\\r\\n`` and lone ``\\r`` line endings to ``\\n``.
      3. Collapse runs of three or more newlines into exactly two.
      4. Collapse runs of two or more spaces/tabs into a single space.
      5. Strip leading and trailing whitespace.

    Args:
        text: Raw text; any falsy value (``""``, ``None``) yields ``""``.

    Returns:
        The cleaned text.
    """
    if not text:
        return ""
    txt = text.replace("\x00", "")
    txt = txt.replace("\r\n", "\n").replace("\r", "\n")
    txt = re.sub(r"\n{3,}", "\n\n", txt)
    txt = re.sub(r"[ \t]{2,}", " ", txt)
    return txt.strip()
def extract_contact_info(text: str) -> Dict[str, str]:
    """Pull an e-mail address, a phone number and a candidate name out of text.

    The e-mail and phone number are the first regex matches anywhere in
    ``text``. The name is the first of the first ten non-blank lines that
    has two to four words, starts with an upper-case character and begins
    with at least two capitalised words. Any field that is not found is
    returned as an empty string.
    """
    email_hit = re.search(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", text)
    phone_hit = re.search(r"(\+?\d{1,3}[-.\s]?)?(\(?\d{2,4}\)?[-.\s]?)?\d{6,12}", text)

    def looks_like_name(candidate: str) -> bool:
        word_count = len(candidate.split())
        if word_count < 2 or word_count > 4:
            return False
        if not candidate[0].isupper():
            return False
        return re.match(r"^[A-Z][a-z]+(\s[A-Z][a-z]+)+", candidate) is not None

    stripped_lines = (raw.strip() for raw in text.splitlines())
    non_blank = [ln for ln in stripped_lines if ln]
    name = next((ln for ln in non_blank[:10] if looks_like_name(ln)), "")

    return {
        "email": email_hit.group(0) if email_hit else "",
        "phone": phone_hit.group(0) if phone_hit else "",
        "name": name,
    }

In [6]:
# Lower-case skill vocabulary. extract_keywords_from_jd checks each entry
# against the lower-cased job description, so entries must stay lower-case.
SKILL_KEYWORDS = {
    "python", "java", "c++", "sql", "javascript", "html", "css",
    "machine learning", "deep learning", "nlp", "tensorflow", "pytorch",
    "data analysis", "data science", "pandas", "numpy", "scikit-learn",
    "excel", "powerbi", "tableau", "docker", "kubernetes", "git",
    "cloud", "aws", "azure", "gcp", "linux"
}

def extract_keywords_from_jd(jd_text: str) -> Dict[str, Any]:
    """Extract keywords and known skills from a job description.

    Args:
        jd_text: Job description text.

    Returns:
        A dict with two sorted lists:
          * ``"all_keywords"`` - unique lower-cased tokens tagged NOUN or
            PROPN that are not stop words.
          * ``"skills_matched"`` - entries of ``SKILL_KEYWORDS`` mentioned
            in the job description.

    Skills are matched on token boundaries rather than as raw substrings,
    so "excel" is not found inside "excellent" and "java" is not found
    inside "javascript". A trailing digit is still allowed ("python3",
    "html5"). Both lists are sorted so the result is deterministic.
    """
    jd_lower = jd_text.lower()
    doc = nlp(jd_lower)
    keywords = {
        token.text for token in doc
        if token.pos_ in {"NOUN", "PROPN"} and not token.is_stop
    }

    def _mentions(skill: str) -> bool:
        # Look-arounds instead of \b so skills ending in punctuation
        # (e.g. "c++") still match.
        pattern = rf"(?<![a-z0-9]){re.escape(skill.lower())}(?![a-z])"
        return re.search(pattern, jd_lower) is not None

    skills_found = sorted(skill for skill in SKILL_KEYWORDS if _mentions(skill))

    return {
        "all_keywords": sorted(keywords),
        "skills_matched": skills_found
    }

In [None]:
def compute_tfidf_score(resume_text: str, jd_text: str) -> float:
    """Cosine similarity between TF-IDF vectors of a resume and a JD.

    Args:
        resume_text: Resume text.
        jd_text: Job description text.

    Returns:
        The similarity as a percentage rounded to two decimals, as a plain
        Python ``float``. Returns ``0.0`` when either text is empty or
        whitespace-only, or when no vocabulary is left after stop-word
        removal (``TfidfVectorizer`` raises ``ValueError`` in that case).
    """
    if not (resume_text or "").strip() or not (jd_text or "").strip():
        return 0.0

    vectorizer = TfidfVectorizer(stop_words="english")
    try:
        tfidf_matrix = vectorizer.fit_transform([jd_text, resume_text])
    except ValueError:
        # Empty vocabulary, e.g. both documents contain only stop words.
        return 0.0

    sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
    # float() turns the numpy scalar into the annotated return type.
    return round(float(sim) * 100, 2)
def compute_bert_score(resume_text: str, jd_text: str) -> float:
    """Cosine similarity between sentence embeddings of a resume and a JD.

    Args:
        resume_text: Resume text.
        jd_text: Job description text.

    Returns:
        The similarity as a percentage rounded to two decimals, as a plain
        Python ``float``. Returns ``0.0`` when either text is empty or
        whitespace-only, so a resume with no extractable text is not given
        an embedding-based score.
    """
    if not (resume_text or "").strip() or not (jd_text or "").strip():
        return 0.0

    # Ask for numpy arrays directly; no tensor round-trip is needed for sklearn.
    # NOTE(review): the embedding model presumably truncates long inputs to its
    # maximum sequence length, so only the start of a long resume may count — confirm.
    embeddings = embedder.encode([jd_text, resume_text], convert_to_numpy=True)
    sim = cosine_similarity(embeddings[0:1], embeddings[1:2])[0][0]
    # float() turns the numpy scalar into the annotated return type.
    return round(float(sim) * 100, 2)

def score_candidate(resume_text: str, jd_text: str, jd_skills: List[str]) -> Dict[str, Any]:
    """Score one resume against a job description.

    Args:
        resume_text: Raw resume text (cleaned here with ``clean_text``).
        jd_text: Job description text.
        jd_skills: Skills required by the job description.

    Returns:
        A dict with the candidate's ``name`` ("Unknown" when none was
        detected), ``email``, ``phone``, the three component scores
        (``tfidf_score``, ``bert_score``, ``skill_score``; each a
        percentage), the weighted ``final_score``
        (0.4 TF-IDF + 0.4 embedding + 0.2 skills) and ``skills_matched``,
        the subset of ``jd_skills`` found in the resume.

    Skills are matched on token boundaries rather than as raw substrings,
    so "java" is not credited for "javascript" nor "excel" for "excellent".
    """
    cleaned_resume = clean_text(resume_text)
    contact_info = extract_contact_info(cleaned_resume)

    tfidf_score = compute_tfidf_score(cleaned_resume, jd_text)
    bert_score = compute_bert_score(cleaned_resume, jd_text)

    resume_lower = cleaned_resume.lower()

    def _resume_mentions(skill: str) -> bool:
        # Look-arounds instead of \b so skills ending in punctuation
        # (e.g. "c++") still match; a trailing digit ("python3") is allowed.
        pattern = rf"(?<![a-z0-9]){re.escape(skill.lower())}(?![a-z])"
        return re.search(pattern, resume_lower) is not None

    matched = [s for s in jd_skills if _resume_mentions(s)]
    skill_score = round((len(matched) / len(jd_skills)) * 100, 2) if jd_skills else 0.0
    final_score = round((0.4 * tfidf_score) + (0.4 * bert_score) + (0.2 * skill_score), 2)

    return {
        # extract_contact_info always sets "name" (possibly to ""), so a
        # .get() default would never apply; fall back on any falsy value.
        "name": contact_info.get("name") or "Unknown",
        "email": contact_info.get("email", ""),
        "phone": contact_info.get("phone", ""),
        "tfidf_score": tfidf_score,
        "bert_score": bert_score,
        "skill_score": skill_score,
        "final_score": final_score,
        "skills_matched": matched
    }

In [9]:


def main():
    """Streamlit entry point for the resume screening app.

    Flow:
      1. Read the job description (JD) from an uploaded file or, when the
         file is absent or yields no text, from the pasted text area.
      2. Clean the JD and stop with a warning if nothing usable remains.
      3. Show the skills extracted from the JD in the sidebar.
      4. When "Process Resumes" is pressed with at least one resume
         uploaded, score every resume, show the ranking sorted by
         ``final_score`` (descending), announce the top candidate and offer
         the full results as a CSV download.
    """
    st.set_page_config(page_title="Smart Resume Screening System", layout="wide")
    st.title("📄 Smart Resume Screening System")
    st.markdown("Upload resumes and a job description to rank candidates.")
    st.sidebar.header("Job Description")
    jd_file = st.sidebar.file_uploader("Upload JD file (PDF/DOCX/TXT)", type=["pdf", "docx", "txt"])
    jd_text_area = st.sidebar.text_area("Or paste JD here")

    jd_text = ""
    if jd_file:
        _, jd_text = extract_text_from_file(jd_file)

    # Clean BEFORE the emptiness check so a whitespace-only JD is rejected
    # here instead of reaching the scoring functions, and fall back to the
    # pasted text when the uploaded file produced no text.
    jd_text = clean_text(jd_text) or clean_text(jd_text_area)

    if not jd_text:
        st.warning("⚠ Please upload or paste a job description to continue.")
        return

    jd_keywords = extract_keywords_from_jd(jd_text)

    st.sidebar.subheader("Extracted Skills from JD:")
    st.sidebar.write(", ".join(jd_keywords["skills_matched"]) or "None")

    st.header("Upload Resumes")
    resume_files = st.file_uploader(
        "Upload multiple resumes", type=["pdf", "docx", "txt"], accept_multiple_files=True
    )

    if st.button("Process Resumes") and resume_files:
        results = []

        for file in resume_files:
            fname, text = extract_text_from_file(file)
            candidate = score_candidate(text, jd_text, jd_keywords["skills_matched"])
            candidate["filename"] = fname
            results.append(candidate)

        df = pd.DataFrame(results).sort_values(by="final_score", ascending=False)

        st.subheader("📊 Candidate Ranking")
        st.dataframe(df[["filename", "name", "email", "phone", "final_score", "skills_matched"]])

        # resume_files is non-empty in this branch, so df has at least one row.
        top = df.iloc[0]
        st.success(f"🏆 Top Candidate: *{top['name']}* with score {top['final_score']}")

        csv = df.to_csv(index=False).encode("utf-8")
        st.download_button("📥 Download Results as CSV", data=csv, file_name="resume_screening_results.csv")

    elif not resume_files:
        st.info("ℹ Upload resumes to process.")


if __name__ == "__main__":
    main()

NameError: name 'st' is not defined

In [11]:

# Requires `jd_filename` and `jd_bytes` from the Colab upload cell, so that
# cell must be executed before this one.
#
# Minimal stand-in for an uploaded file: extract_text_from_file only needs a
# `name` attribute and a `read()` method. `read` is stored on the class and
# is therefore called as a bound method, so it must accept `self`.
jd_file_object = type("file_object", (object,), {
    "name": jd_filename,
    "read": lambda self: jd_bytes
})()

_, jd_text = extract_text_from_file(jd_file_object)
cleaned_jd_text = clean_text(jd_text)
jd_keywords = extract_keywords_from_jd(cleaned_jd_text)

print("Job Description processed.")
print("Extracted Skills from JD:", ", ".join(jd_keywords["skills_matched"]) or "None")

NameError: name 'jd_filename' is not defined

In [10]:
from google.colab import files

# Ask for the job description first, then for the resumes. Each call's
# result is used below as a mapping of file name -> file content.
print("Please upload the Job Description file:")
uploaded_jd = files.upload()

# Fail with a clear message instead of a bare StopIteration when the upload
# was cancelled or returned no files.
if not uploaded_jd:
    raise ValueError("No job description file was uploaded.")

# Only the first uploaded file is used as the job description.
jd_filename = next(iter(uploaded_jd))
jd_bytes = uploaded_jd[jd_filename]

print("\nPlease upload the Resume files:")
uploaded_resumes = files.upload()

# (file name, content) pairs, in upload order.
resume_files_data = list(uploaded_resumes.items())

print("\nFiles uploaded successfully.")

Please upload the Job Description file:


KeyboardInterrupt: 