In [75]:
import PyPDF2
import os
import re
import string
import pandas as pd
import matplotlib.pyplot as plt
from tkinter import *
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


In [76]:
# Load the pretrained BERT tokenizer and encoder once for the whole notebook.
# Both must come from the same checkpoint, so the name is spelled in one place.
_BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(_BERT_CHECKPOINT)
model = BertModel.from_pretrained(_BERT_CHECKPOINT)




In [77]:
def extract_text_from_pdf(candidate_path):
    """Read a PDF and return the text of all its pages, lower-cased.

    Args:
        candidate_path: Filesystem path of the PDF to read.

    Returns:
        The concatenated page text in lower case, or an empty string when
        anything goes wrong (the error is printed, not raised).
    """
    collected = ""
    try:
        with open(candidate_path, 'rb') as handle:
            reader = PyPDF2.PdfReader(handle)
            # Pages are visited in document order and appended as-is.
            for page in reader.pages:
                collected += page.extract_text()
    except Exception as e:
        # Best-effort: callers receive "" instead of an exception.
        print(f"Error reading PDF: {e}")
        return ""
    return collected.lower()


In [78]:
def get_bert_embeddings(text):
    """Embed `text` with the notebook-level BERT `tokenizer` and `model`.

    The input is tokenized with truncation at 512 tokens, run through the
    model with gradients disabled, and the last hidden state is averaged
    over dimension 1 (presumably the token axis - confirm against the
    transformers docs).

    Args:
        text: The string to embed.

    Returns:
        A NumPy array holding the averaged last-hidden-state vector(s).
    """
    encoded = tokenizer(text, return_tensors='pt', max_length=512, truncation=True)
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    pooled = hidden_states.mean(dim=1)
    return pooled.numpy()


In [79]:
# Keyword vocabulary that defines each screening category.
_CATEGORY_TERMS = {
    'Project Management': ['administration', 'agile', 'budget', 'cost', 'direction', 'feasibility analysis',
                           'finance', 'kanban', 'leader', 'leadership', 'management', 'milestones', 'planning',
                           'pmi', 'pmp', 'problem', 'project', 'risk', 'schedule', 'scrum', 'stakeholders'],
    'Data Analytics': ['analytics', 'api', 'aws', 'big data', 'business intelligence', 'clustering', 'code',
                       'coding', 'data', 'database', 'data mining', 'data science', 'deep learning', 'hadoop',
                       'hypothesis test', 'iot', 'internet', 'machine learning', 'modeling', 'nosql', 'nlp',
                       'predictive', 'programming', 'python', 'r', 'sql', 'tableau', 'text mining',
                       'visualization'],
    'Healthcare': ['adverse events', 'care', 'clinic', 'cphq', 'ergonomics', 'healthcare',
                   'health care', 'health', 'hospital', 'human factors', 'medical', 'near misses',
                   'patient', 'reporting system']
}

# category name -> mean keyword embedding. The keyword lists never change
# between resumes, so each category is embedded once and reused.
# NOTE: the cache reflects whichever model was loaded at first use; re-run
# this cell after swapping the model to reset it.
_category_embedding_cache = {}


def _get_category_embedding(category):
    """Return the mean embedding of a category's keywords, computing it only once."""
    if category not in _category_embedding_cache:
        keyword_embeddings = [get_bert_embeddings(keyword) for keyword in _CATEGORY_TERMS[category]]
        _category_embedding_cache[category] = np.mean(keyword_embeddings, axis=0)
    return _category_embedding_cache[category]


def compute_scores(text):
    """Score a resume's text against each keyword category.

    Digits and ASCII punctuation are stripped from `text`, the result is
    embedded with `get_bert_embeddings`, and its cosine similarity to each
    category's mean keyword embedding becomes that category's score.

    Args:
        text: Resume text (the caller passes lower-cased PDF text).

    Returns:
        A DataFrame with columns 'Category' and 'Score', sorted by 'Score'
        in descending order.
    """
    # Remove numbers and punctuation from text
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Get BERT embeddings for the resume text
    text_embedding = get_bert_embeddings(text)

    scores = {}
    for category in _CATEGORY_TERMS:
        similarity = cosine_similarity(text_embedding, _get_category_embedding(category))
        scores[category] = similarity.mean()

    return pd.DataFrame(list(scores.items()), columns=['Category', 'Score']).sort_values(by='Score', ascending=False)


In [80]:
def plot_and_save_pie_chart(summary_df, candidate_name,
                            output_dir='/Users/pallavisharma/Downloads/temp/results/'):
    """Draw a pie chart of category scores and save it as a PNG.

    Args:
        summary_df: DataFrame with 'Category' and 'Score' columns; the first
            row's slice is pulled out of the pie for emphasis.
        candidate_name: Used to build the file name
            '<candidate_name>_summary.png'.
        output_dir: Directory the PNG is written to. It is created if it
            does not exist. Defaults to the original hard-coded location.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Size the explode sequence to the data instead of assuming three rows;
    # matplotlib requires one offset per slice.
    slice_count = len(summary_df)
    explode = ([0.1] + [0] * (slice_count - 1)) if slice_count else []

    # NOTE(review): cosine-similarity scores can in principle be negative,
    # which a pie chart cannot represent - confirm how that should be handled.
    fig, ax = plt.subplots(figsize=(4, 5))
    try:
        ax.pie(summary_df['Score'], labels=summary_df['Category'], explode=explode,
               autopct='%1.0f%%', shadow=True, startangle=90)
        ax.set_title('Summary')
        ax.axis('equal')
        fig.savefig(os.path.join(output_dir, f'{candidate_name}_summary.png'))
    finally:
        # Always release the figure, even if plotting or saving fails.
        plt.close(fig)


In [81]:
def analyze_resumes(path, top_n=2):
    """Score every PDF resume in a directory and return the best file names.

    Each PDF is converted to text, scored per category, charted, and ranked
    by the sum of its category scores.

    Args:
        path: Directory containing the resume PDFs.
        top_n: How many top-ranked file names to return (default 2).

    Returns:
        A list of at most `top_n` PDF file names, highest total score first.
        Empty when the directory holds no readable PDFs.
    """
    score_list = []
    name_list = []

    # Sort the listing: os.listdir order is arbitrary, which would make
    # tie-breaking between equal scores nondeterministic.
    for filename in sorted(os.listdir(path)):
        stem, extension = os.path.splitext(filename)
        if extension.lower() != '.pdf':
            continue

        text = extract_text_from_pdf(os.path.join(path, filename))
        if not text.strip():
            # extract_text_from_pdf returns "" on failure; scoring empty text
            # would rank an unreadable file against real resumes.
            print(f"Skipping {filename}: no text could be extracted")
            continue

        summary_df = compute_scores(text)
        print(summary_df)
        plot_and_save_pie_chart(summary_df, stem)
        name_list.append(filename)
        score_list.append(summary_df['Score'].sum())

    # Stable sort, so equal totals keep their alphabetical order.
    ranked_indices = sorted(range(len(score_list)), key=lambda i: score_list[i], reverse=True)
    return [name_list[i] for i in ranked_indices[:max(top_n, 0)]]


In [82]:
def display_shortlisted_candidates(resume_dir='/Users/pallavisharma/Downloads/temp/resumes/'):
    """Open a summary window listing the shortlisted candidates.

    Runs `analyze_resumes` on `resume_dir`, prints the shortlist, and shows
    one label per shortlisted file in a new window attached to `root`.

    Args:
        resume_dir: Directory of resume PDFs to analyse. Defaults to the
            original hard-coded location so it still works as a zero-argument
            button callback.
    """
    top = Toplevel(root)
    top.title("SUMMARY")
    top.geometry("500x500")
    shortlist = analyze_resumes(resume_dir)
    print(shortlist)
    Label(top, text="SHORTLISTED CANDIDATES").pack()
    for name in shortlist:
        # Drop the extension, then a trailing "Resume" suffix if something is
        # left over. Splitting on "Resume.pdf" gave a blank label for
        # "Resume.pdf" and left ".pdf" on names without that exact suffix.
        display_name = os.path.splitext(name)[0]
        if display_name.endswith("Resume") and display_name != "Resume":
            display_name = display_name[:-len("Resume")]
        Label(top, text=display_name).pack()
    # No nested mainloop() here: this runs as a callback inside root's event
    # loop, which already services the new window.


In [83]:
# Main application window.
root = Tk()
root.title("PROFILE SCREENING")
root.geometry("500x500")

# Three-line banner; each label is packed as soon as it is created so the
# top-to-bottom order is visible at a glance.
label1 = Label(root, text="AI can help solve the biggest problem")
label1.pack()
label2 = Label(root, text="of talent acquisition using")
label2.pack()
label3 = Label(root, text="AI profile screening technology")
label3.pack()

# Clicking "Continue" runs the analysis and opens the summary window.
button = Button(root, text="Continue", command=display_shortlisted_candidates)
button.pack()

# Blocks until the main window is closed.
root.mainloop()


             Category     Score
1      Data Analytics  0.280148
2          Healthcare  0.256402
0  Project Management  0.203859
             Category     Score
1      Data Analytics  0.295448
2          Healthcare  0.259678
0  Project Management  0.210095
['Resume.pdf', 'Resume_Pallavi_Sharma.pdf']
