# In [7]:
import PyPDF2
import pandas as pd
import os

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at *pdf_path* as one string.

    The file is opened in binary mode and parsed with ``PyPDF2.PdfReader``.
    Each page's ``extract_text()`` result is concatenated in page order with
    no separator inserted between pages.
    """
    with open(pdf_path, "rb") as pdf_stream:
        pdf_reader = PyPDF2.PdfReader(pdf_stream)
        # Extract while the file is still open; the reader pulls page data
        # from the underlying stream.
        page_texts = [pdf_page.extract_text() for pdf_page in pdf_reader.pages]
    return "".join(page_texts)

def search_keywords(text, keywords):
    """Count case-insensitive occurrences of each keyword in *text*.

    Matching is plain substring matching on the lower-cased text, so a
    keyword such as ``"SQL"`` is also counted inside ``"MySQL"``.

    Args:
        text: The text to search.
        keywords: Iterable of keyword or phrase strings.

    Returns:
        A dict mapping each keyword (exactly as given) to its number of
        non-overlapping occurrences in *text*. An empty keyword maps to 0.
    """
    # Lower-case the haystack once instead of once per keyword.
    haystack = text.lower()
    return {
        # ``str.count("")`` returns ``len(haystack) + 1``, which would report
        # an empty keyword as a match everywhere; treat it as "no match".
        keyword: haystack.count(keyword.lower()) if keyword else 0
        for keyword in keywords
    }

def analyze_resumes(pdf_folder, keywords):
    """Scan *pdf_folder* for PDF files and tabulate keyword matches.

    Each regular file whose name ends in ``.pdf`` (case-insensitive) is read
    with ``extract_text_from_pdf`` and searched with ``search_keywords``.

    Args:
        pdf_folder: Directory to scan (not recursive).
        keywords: Keywords/phrases to look for in each PDF.

    Returns:
        A ``pandas.DataFrame`` with one row per PDF and the columns
        ``resume_link`` (path to the PDF), ``matched_phrases`` (comma-separated
        keywords found at least once) and ``matched_count`` (total number of
        occurrences across all keywords). The columns are present even when
        no PDF is found.
    """
    columns = ["resume_link", "matched_phrases", "matched_count"]
    data = []

    # os.listdir returns entries in arbitrary order; sort so the report rows
    # are reproducible between runs.
    for pdf_file in sorted(os.listdir(pdf_folder)):
        # Accept ".PDF" as well as ".pdf".
        if not pdf_file.lower().endswith(".pdf"):
            continue
        pdf_path = os.path.join(pdf_folder, pdf_file)
        # Skip directories that merely happen to be named like a PDF.
        if not os.path.isfile(pdf_path):
            continue

        keyword_counts = search_keywords(extract_text_from_pdf(pdf_path), keywords)
        data.append({
            "resume_link": pdf_path,
            "matched_phrases": ", ".join(
                phrase for phrase, count in keyword_counts.items() if count > 0
            ),
            "matched_count": sum(keyword_counts.values()),
        })

    # Passing the columns explicitly keeps the schema stable for an empty result.
    return pd.DataFrame(data, columns=columns)

def generate_excel_report(df, output_path):
    """Write *df* to an Excel workbook and turn resume paths into hyperlinks.

    The frame is written to sheet ``Sheet1`` without its index. Each cell of
    the ``resume_link`` column is then overwritten with a ``file:///`` link to
    the absolute path, displayed as the file's base name.

    Args:
        df: DataFrame containing a ``resume_link`` column of file paths.
        output_path: Destination ``.xlsx`` path.

    Raises:
        KeyError: If *df* has rows but no ``resume_link`` column.
    """
    sheet_name = 'Sheet1'
    link_column = "resume_link"

    # The context manager closes the writer even if writing raises.
    with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
        df.to_excel(writer, index=False, sheet_name=sheet_name)
        worksheet = writer.sheets[sheet_name]

        if len(df):
            # With index=False the sheet column equals the frame's column
            # position, so look it up instead of assuming column 0.
            link_col = df.columns.get_loc(link_column)
            # Iterate positionally: index labels need not be 0..n-1 (e.g.
            # after filtering or sorting). Sheet row 0 is the header.
            for row_num, link in enumerate(df[link_column], start=1):
                file_path = os.path.abspath(link)
                file_name = os.path.basename(file_path)
                # NOTE(review): on POSIX the absolute path already starts with
                # "/", so this yields four slashes - confirm the target
                # spreadsheet application accepts that form.
                worksheet.write_url(row_num, link_col, f'file:///{file_path}', string=file_name)


# --- Run configuration -------------------------------------------------------
# Input folder and output workbook are relative to the working directory.
pdf_folder = "resumes"
output_path = "Analyzed_Resume_output.xlsx"
keywords = [
    "Python",
    "Data Analysis",
    "Machine Learning",
    "SQL",
]

# --- Pipeline: scan the PDFs, then write the Excel report --------------------
df = analyze_resumes(pdf_folder, keywords)
generate_excel_report(df, output_path)

print(f"Excel report generated: {output_path}")

# Output: Excel report generated: Analyzed_Resume_output.xlsx
