In [2]:
#Install Required Libraries
!pip install pdfplumber spacy pandas
!python -m spacy download en_core_web_sm





[notice] A new release of pip is available: 23.0.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.1/12.8 MB 544.7 kB/s eta 0:00:24
     --------------------------------------- 0.1/12.8 MB 544.7 kB/s eta 0:00:24
     --------------------------------------- 0.1/12.8 MB 544.7 kB/s eta 0:00:24
     --------------------------------------- 0.1/12.8 MB 544.7 kB/s eta 0:00:24
     --------------------------------------- 0.1/12.8 MB 425.3 kB/s eta 0:00:30
      -------------------------------------- 0.2/12.8 MB 512.0 kB/s eta 0:00:25
      -------------------------------------- 0.2/12.8 MB 512.0 kB/s eta 0:00:25
      -------------------------------------- 0.2


[notice] A new release of pip is available: 23.0.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
#Import Libraries and Load SpaCy Model 
import pdfplumber
import spacy
import pandas as pd
import re

# Load SpaCy model
nlp = spacy.load('en_core_web_sm')


In [4]:
#Define Function to Extract Text from PDF
def extract_text_from_pdf(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text()
    return text


In [6]:
#Define Function to Extract Information from Text
def extract_info_from_text(text):
    doc = nlp(text)
    info = {
        'name': None,
        'email': None,
        'phone': None,
        'education': [],
        'experience': [],
        'skills': []
    }

    # Simple heuristic-based extraction
    for ent in doc.ents:
        if ent.label_ == "PERSON" and not info['name']:
            info['name'] = ent.text
        elif ent.label_ == "ORG":
            info['education'].append(ent.text)
        elif ent.label_ == "DATE":
            info['experience'].append(ent.text)

    # Regex for email and phone
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
    phone_pattern = r'\b\d{10}\b'
    
    emails = re.findall(email_pattern, text)
    phones = re.findall(phone_pattern, text)
    
    if emails:
        info['email'] = emails[0]
    if phones:
        info['phone'] = phones[0]

    # Placeholder for skill extraction
    skills = ['Python', 'Java', 'SQL']  # Example skills
    for skill in skills:
        if skill.lower() in text.lower():
            info['skills'].append(skill)

    return info


In [7]:
#Define ATS Class

class ATS:
    def __init__(self):
        self.data = pd.DataFrame(columns=['Name', 'Email', 'Phone', 'Education', 'Experience', 'Skills'])

    def add_candidate(self, info):
        # Convert the candidate info dictionary to a DataFrame
        info_df = pd.DataFrame([info])
        # Concatenate the new candidate DataFrame with the existing DataFrame
        self.data = pd.concat([self.data, info_df], ignore_index=True)

    def rank_candidates(self):
        # Ensure 'Skills' column does not contain NaN values
        self.data['Skills'] = self.data['Skills'].apply(lambda x: x if isinstance(x, list) else [])
        # Example ranking: candidates with more skills listed come first
        self.data['SkillCount'] = self.data['Skills'].apply(lambda x: len(x))
        self.data = self.data.sort_values(by='SkillCount', ascending=False)
        self.data = self.data.drop(columns=['SkillCount'])

    def search_candidates(self, keyword):
        return self.data[self.data.apply(lambda row: row.astype(str).str.contains(keyword, case=False).any(), axis=1)]


In [8]:
#Test the ATS System

# Create an instance of ATS
ats = ATS()

# List of PDF resume file paths (update this with actual file paths on your system)
resume_files = ['lokeshres.pdf', 'naveenres.pdf']

# Extract and add candidate information
for file in resume_files:
    text = extract_text_from_pdf(file)
    info = extract_info_from_text(text)
    ats.add_candidate(info)

# Rank candidates
ats.rank_candidates()

# Print the ranked candidates
print(ats.data)


  Name Email Phone Education Experience Skills            name  \
0  NaN   NaN   NaN       NaN        NaN     []  Android Studio   
1  NaN   NaN   NaN       NaN        NaN     []  Android Studio   

                              email       phone  \
0  lokeshwaran.s.26022004@gmail.com  6380795436   
1       manepallinaveen10@gmail.com  8056147697   

                                           education  \
0  [Bachelor of Technology, Artificial Intelligen...   
1  [AI, Machine Learning Engineering, Education, ...   

                                          experience          skills  
0  [6380795436, 2021-2025, 2021, 91.4 2019, 2022 ...   [Python, SQL]  
1  [March 2024, SECONDARY June 2020 - May 2021, O...  [Python, Java]  


In [11]:
#Search Candidates

# Example search (replace 'Python' with the keyword you want to search for)
keyword = 'Sql'
search_results = ats.search_candidates(keyword)
print(search_results)


  Name Email Phone Education Experience Skills            name  \
0  NaN   NaN   NaN       NaN        NaN     []  Android Studio   

                              email       phone  \
0  lokeshwaran.s.26022004@gmail.com  6380795436   

                                           education  \
0  [Bachelor of Technology, Artificial Intelligen...   

                                          experience         skills  
0  [6380795436, 2021-2025, 2021, 91.4 2019, 2022 ...  [Python, SQL]  
