<a href="https://colab.research.google.com/github/muneebk98/Smart-Resume-Parser/blob/main/Smart_Resume_Parser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
import zipfile
import os
import re
import json
import logging

path = '/content/resume_samples.zip'
extract = 'extracted_resumes'

#extracting zip folder
logging.info('We are extracting zip folder')
with zipfile.ZipFile(path, 'r') as zip_ref:
    zip_ref.extractall(extract)




txt_files = []
#check for .txt files
logging.info('Only using .txt files')
for root, dirs, files in os.walk(extract):
    for file in files:
      if file.lower().endswith(".txt"):
        full_path = os.path.join(root, file)
        txt_files.append(full_path)

print("Valid resume files:")
for tf in txt_files:
    print(tf)

#pattern to match
email_pattern = r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+'
phone_pattern = r'(\+92-\d{3}-\d{7})|(0\d{3}-\d{7})'
skills_keywords = ["Python", "Java", "C++", "SQL", "Django", "React","Communication", "JavaScript", "Machine Learning","Data Analysis","Node.js"]

data=[]
logging.info('Parsing data')
#parsing
for path in txt_files:
  try:
    with open(path, "r", encoding="utf-8") as f:
      content = f.read()
      resume_info = {
            "name": None,
            "email": None,
            "phone": None,
            "skills": []
        }
      email_match = re.search(email_pattern, content)
      if email_match:
            resume_info["email"] = email_match.group()


      phone_match = re.search(phone_pattern, content)
      if phone_match:
            resume_info["phone"] = phone_match.group()


      found_skills = []
      for skill in skills_keywords:
        if skill.lower() in content.lower():
          found_skills.append(skill)

      resume_info["skills"] = found_skills



      lines = content.strip().splitlines()
      for line in lines:
        if line.strip():
          resume_info["name"] = line.strip()
          break

      if not any([resume_info["name"], resume_info["email"], resume_info["phone"], resume_info["skills"]]):
        logging.warning('File was empty')
        continue
      else: data.append(resume_info)

  except Exception as e:
    logging.error('File not opening')
    print(f"Failed to process {path}: {e}")

# creating .json file
with open("parsed_resumes.json", "w", encoding="utf-8") as json_file:
    json.dump(data, json_file, indent=4)
# printing json file
with open("parsed_resumes.json", "r", encoding="utf-8") as file:
    data_r = json.load(file)

for resume in data_r:
    print(resume)



Valid resume files:
extracted_resumes/resume8.txt
extracted_resumes/resume2.txt
extracted_resumes/resume4.txt
extracted_resumes/resume5.txt
extracted_resumes/resume3.txt
extracted_resumes/resume1.txt
extracted_resumes/resume9.txt
{'name': 'Hina Raza', 'email': 'hina.raza@gmail.com', 'phone': '0332-9876543', 'skills': ['C++', 'Machine Learning', 'Data Analysis']}
{'name': 'Sana Fatima', 'email': 'sana.fatima@xyz.com', 'phone': '+92-321-9876543', 'skills': ['Java', 'C++', 'Communication', 'JavaScript']}
{'name': 'Name: Junaid Akram', 'email': 'j.akram@myemail.com', 'phone': '0345-5678901', 'skills': ['React', 'Node.js']}
{'name': 'Ahmed Shah', 'email': 'ahmed_shah@domain.org', 'phone': '0315-1231234', 'skills': ['Python', 'Django']}
{'name': 'Ali Khan', 'email': 'ali.khan@example.com', 'phone': '+92-300-1234567', 'skills': ['Python', 'Java', 'SQL']}
{'name': 'Just some random text.', 'email': None, 'phone': None, 'skills': []}


In [13]:
import json
#loading data
def load():
    with open("parsed_resumes.json", "r", encoding="utf-8") as file:
        return json.load(file)
#counting total data
def count_data(data):

    return len(data)
#counting skills
def count_skills(data):
    skills_list = ["Python", "Java", "C++", "SQL", "Django", "React", "Communication", "JavaScript", "Machine Learning","Data Analysis", "Node.js"]

    skill_counts = {}
    for resume in data:
        for skill in skills_list:
            if skill in resume["skills"]:
                if skill not in skill_counts:
                    skill_counts[skill] = 1
                else:
                    skill_counts[skill] += 1

    return skills_list, skill_counts
#sorting top 5
def top_5(skill_counts):
    items = list(skill_counts.items())
    items.sort(key=lambda pair: pair[1], reverse=True)
    top_five = items[:5]
    return top_five

#checking for missing resumes
def missing_resumes(data):
    missing = []
    for resume in data:
        if not resume["phone"] or not resume["skills"]:
            missing.append(resume["name"])
    return missing
#creating report
def write_report(total, top_skills, skills_list, skill_counts, missing):
    lines = []
    lines.append("===== Resume Summary Report =====\n")
    lines.append(f"Total resumes parsed: {total}\n")

    lines.append("\nTop 5 most common skills:")
    for skill, count in top_skills:
        lines.append(f"  {skill}: {count}")

    lines.append("\nSkill frequency:")
    for skill in skills_list:
        count = skill_counts.get(skill, 0)
        lines.append(f"  {skill}: {count}")

    lines.append("\nResumes missing phone or skills:")
    if missing:
        for filename in missing:
            lines.append(f"  - {filename}")
    else:
        lines.append("  None")

    lines.append("\n=================================\n")

    report_text = "\n".join(lines)

    with open("summary_report.txt", "w", encoding="utf-8") as report:
        report.write(report_text)

    print(report_text)


def main():
    data = load()
    total = count_data(data)
    skills_list, skill_counts = count_skills(data)
    top_skills = top_5(skill_counts)
    missing = missing_resumes(data)
    write_report(total, top_skills, skills_list, skill_counts, missing)
    print("summary_report.txt created")

main()


===== Resume Summary Report =====

Total resumes parsed: 6


Top 5 most common skills:
  C++: 2
  Java: 2
  Python: 2
  Machine Learning: 1
  Data Analysis: 1

Skill frequency:
  Python: 2
  Java: 2
  C++: 2
  SQL: 1
  Django: 1
  React: 1
  Communication: 1
  JavaScript: 1
  Machine Learning: 1
  Data Analysis: 1
  Node.js: 1

Resumes missing phone or skills:
  - Just some random text.


summary_report.txt created
