# Dummy Data Creation

In [12]:
import os
from fpdf import FPDF
from docx import Document

# Folder that receives the generated sample resumes: "<current working dir>/dummy".
d = os.path.join(os.getcwd(), "dummy")
os.makedirs(d, exist_ok=True)

# Sample resume records. Judging by the values, the single-letter keys stand for:
#   n = name, e = email, p = phone, s = skills, x = experience, u = education.
# The keys are written verbatim as line labels into the generated files.
dt = [
    {
        "n": "Mohammad Hashim",
        "e": "hashimmohammad1375.@gmail.com",
        "p": "+91-9876543210",
        "s": "Python, ML, TensorFlow, NLP",
        "x": "AI/ML Engineer at Microsoft (2y)",
        "u": "B.Tech, IIT Bombay",
    },
    {
        "n": "Akshu",
        "e": "akshu.g@email.com",
        "p": "+91-9123456789",
        "s": "Java, Spring Boot, Microservices, AWS",
        "x": "Software Eng at ABC (3y)",
        "u": "B.Tech, BIts Pilani",
    },
    {
        "n": "Kanak",
        "e": "kanak.k@email.com",
        "p": "+91-8765432109",
        "s": "React, Node.js, MongoDB, Docker",
        "x": "Full Stack Dev at DEF (4y)",
        "u": "B.Tech, NIT Tiruchirappalli",
    },
]

def cpdf(r, f):
    """Render the mapping ``r`` as a simple PDF resume and save it to ``f``.

    The document gets a centered "Resume" title followed by one
    "key: value" line per item of ``r``, in the mapping's iteration order.

    Args:
        r: Mapping of field label to field value.
        f: Destination passed to ``FPDF.output``.
    """
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()
    pdf.set_font("Arial", size=12)

    # Title row, then a vertical gap before the fields.
    pdf.cell(200, 10, txt="Resume", ln=True, align="C")
    pdf.ln(10)

    for line in (f"{label}: {value}" for label, value in r.items()):
        pdf.cell(200, 10, txt=line, ln=True)

    pdf.output(f)

def cdocx(r, f):
    """Render the mapping ``r`` as a simple Word resume and save it to ``f``.

    The document gets a level-1 "Resume" heading followed by one
    "key: value" paragraph per item of ``r``, in the mapping's iteration order.

    Args:
        r: Mapping of field label to field value.
        f: Destination passed to ``Document.save``.
    """
    document = Document()
    document.add_heading("Resume", level=1)

    for label, value in r.items():
        paragraph_text = f"{label}: {value}"
        document.add_paragraph(paragraph_text)

    document.save(f)

# Write one PDF and one DOCX per sample record into the output folder,
# named r<N>.pdf / r<N>.docx with N counting from 1.
pdfs, docs = [], []
for number, record in enumerate(dt, start=1):
    pdf_path = os.path.join(d, f"r{number}.pdf")
    docx_path = os.path.join(d, f"r{number}.docx")
    cpdf(record, pdf_path)
    cdocx(record, docx_path)
    pdfs.append(pdf_path)
    docs.append(docx_path)

# Last expression of the cell: show the generated paths.
pdfs, docs


(['f:\\ML\\Projects\\Resume-Screening\\dummy\\r1.pdf',
  'f:\\ML\\Projects\\Resume-Screening\\dummy\\r2.pdf',
  'f:\\ML\\Projects\\Resume-Screening\\dummy\\r3.pdf'],
 ['f:\\ML\\Projects\\Resume-Screening\\dummy\\r1.docx',
  'f:\\ML\\Projects\\Resume-Screening\\dummy\\r2.docx',
  'f:\\ML\\Projects\\Resume-Screening\\dummy\\r3.docx'])

## Extracting Text from PDF & Docx

In [13]:
import os
import fitz  
from docx import Document

def extract_text_from_pdf(pdf_path):
    """Return the text of all pages of the PDF at ``pdf_path``.

    Each page's text is followed by a newline; the combined result has its
    leading and trailing whitespace stripped.
    """
    with fitz.open(pdf_path) as pdf:
        combined = "".join(page.get_text() + "\n" for page in pdf)
    return combined.strip()

def extract_text_from_docx(docx_path):
    """Return the paragraph texts of the Word file at ``docx_path``.

    Paragraphs are joined with newlines; the combined result has its
    leading and trailing whitespace stripped.
    """
    paragraph_texts = []
    for paragraph in Document(docx_path).paragraphs:
        paragraph_texts.append(paragraph.text)
    return "\n".join(paragraph_texts).strip()

# Extract the text of every PDF/DOCX resume in the folder, keyed by file name.
resume_texts = {}
resume_folder = "dummy"

# os.listdir() returns entries in arbitrary order, so sort them to make the
# dictionary order (and the report printed below) reproducible across runs
# and platforms.
for file in sorted(os.listdir(resume_folder)):
    file_path = os.path.join(resume_folder, file)
    # Compare the extension case-insensitively so "CV.PDF" / "CV.Docx" are
    # not silently skipped. Files with any other extension are ignored.
    lowered_name = file.lower()
    if lowered_name.endswith(".pdf"):
        resume_texts[file] = extract_text_from_pdf(file_path)
    elif lowered_name.endswith(".docx"):
        resume_texts[file] = extract_text_from_docx(file_path)

# Print each extracted text followed by a 50-character "=" separator.
for file, text in resume_texts.items():
    print(f"Extracted Text from {file}:\n{text}\n{'='*50}\n")


Extracted Text from r1.docx:
Resume
n: Mohammad Hashim
e: hashimmohammad1375.@gmail.com
p: +91-9876543210
s: Python, ML, TensorFlow, NLP
x: AI/ML Engineer at Microsoft (2y)
u: B.Tech, IIT Bombay

Extracted Text from r1.pdf:
Resume
n: Mohammad Hashim
e: hashimmohammad1375.@gmail.com
p: +91-9876543210
s: Python, ML, TensorFlow, NLP
x: AI/ML Engineer at Microsoft (2y)
u: B.Tech, IIT Bombay

Extracted Text from r2.docx:
Resume
n: Akshu
e: akshu.g@email.com
p: +91-9123456789
s: Java, Spring Boot, Microservices, AWS
x: Software Eng at ABC (3y)
u: B.Tech, BIts Pilani

Extracted Text from r2.pdf:
Resume
n: Akshu
e: akshu.g@email.com
p: +91-9123456789
s: Java, Spring Boot, Microservices, AWS
x: Software Eng at ABC (3y)
u: B.Tech, BIts Pilani

Extracted Text from r3.docx:
Resume
n: Kanak
e: kanak.k@email.com
p: +91-8765432109
s: React, Node.js, MongoDB, Docker
x: Full Stack Dev at DEF (4y)
u: B.Tech, NIT Tiruchirappalli

Extracted Text from r3.pdf:
Resume
n: Kanak
e: kanak.k@email.com
p: +91-876

## Processing Text

In [26]:
import fitz  
import os
import re

def extract_text_from_pdf(pdf_path):
    """Best-effort extraction of the text of the PDF at ``pdf_path``.

    Each page's text is followed by a newline; the combined result has its
    leading and trailing whitespace stripped. Any exception raised while
    opening or reading the file is printed instead of propagated, and the
    text gathered before the failure (possibly empty) is returned.
    """
    page_chunks = []
    try:
        with fitz.open(pdf_path) as pdf:
            for page in pdf:
                page_chunks.append(page.get_text() + "\n")
    except Exception as exc:
        # Report and carry on so one unreadable file does not stop a batch run.
        print(f"Error reading {pdf_path}: {exc}")
    return "".join(page_chunks).strip()

def preprocess_text(text):
    """Normalize raw resume text into lowercase alphanumeric words.

    Steps:
        1. Lowercase the text.
        2. Delete every character that is not ``a-z``, ``0-9`` or whitespace.
           Characters are deleted, not replaced by a space, so
           "node.js" becomes "nodejs".
        3. Collapse each run of whitespace into a single space.
        4. Strip leading and trailing whitespace.

    Punctuation is removed before whitespace is collapsed. Doing it the other
    way round leaves double spaces wherever a standalone punctuation token
    (for example the dash in "a - b") is deleted.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string; empty if nothing alphanumeric remains.
    """
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

resume_folder = "dummy/"

# Collect the PDF resumes in the folder. os.listdir() returns entries in
# arbitrary order, so sort them to keep the dictionaries and the printed
# report reproducible. The extension is matched case-insensitively so files
# such as "CV.PDF" are not silently skipped.
pdfs = [
    os.path.join(resume_folder, file)
    for file in sorted(os.listdir(resume_folder))
    if file.lower().endswith(".pdf")
]

# Raw text per PDF path, then the normalized text under the same keys.
resume_texts = {pdf: extract_text_from_pdf(pdf) for pdf in pdfs}
preprocessed_resumes = {file: preprocess_text(text) for file, text in resume_texts.items()}

# Preview at most the first 500 characters of each processed text.
# The "..." suffix is printed even when the text is shorter than that.
for file, text in preprocessed_resumes.items():
    print(f"Processed text from {file}:\n{text[:500]}...\n")

Processed text from dummy/r1.pdf:
resume n mohammad hashim e hashimmohammad1375gmailcom p 919876543210 s python ml tensorflow nlp x aiml engineer at microsoft 2y u btech iit bombay...

Processed text from dummy/r2.pdf:
resume n akshu e akshugemailcom p 919123456789 s java spring boot microservices aws x software eng at abc 3y u btech bits pilani...

Processed text from dummy/r3.pdf:
resume n kanak e kanakkemailcom p 918765432109 s react nodejs mongodb docker x full stack dev at def 4y u btech nit tiruchirappalli...

