In [1]:
import pandas as pd
import os
import fitz
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import nltk
from nltk.corpus import stopwords
import re
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [2]:
file_path = r"D:\Resume\java-developer-resume-sample-output.pdf"

In [3]:
def extract_text_from_pdf(pdf_file):
    doc = fitz.open(pdf_file)
    text = ""
    for page in doc:
        text += page.get_text()
    return text.lower() 


In [4]:
text = extract_text_from_pdf(file_path)
print(text)

maria hibisson
maria@novoresume.com @
0123 456 789
linkedin.com/in/maria.hibisson fn
gilhub.com/maria.hibisson
maria.hibisson ! '
highly experienced, solutions-oriented professional with1 0+ years of remarkable background in overseeing all aspects of the software
development life cycle, from extensive analysis and design through execution and maintenance. proficient in utilizinga wide variety of
programs and tools to provide high-quality and cost-effective applications/systems to boost organizational efficiency and productivity.
possess unmatched coding and testing skills to deliver client/business-specific programs.
areasofexpertse
*‹’r z'
i
m s-‹
p’
0
s'*“0
-›
e
workexperence
utilize java, enterprise java bean, java ee,and apache struts web applications to develop fully automated client management
systems forthe efficient maintenance ofclient accounts.
— enforce an innovative approach toimprove the client's web
reporting system, which effectively reduced the financial tracking
analys

## Cleaning Data

In [5]:
# Cleaning the Data
stop_words = set(stopwords.words('english'))
def clean_text(sentence):

    # Cleaning white spaces
    sentence = re.sub(r"\s+", " ", sentence).strip()

    # Removing URLS
    sentence = re.sub(r"https?://\S+|www\.\S+"," ",sentence)
    
    # Removing html tags
    sentence = re.sub(r"<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});"," ",sentence)

    # Removing phone numbers
    sentence = re.sub(r"\b(?:\d{3}[-.\s]??\d{3}[-.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-.\s]??\d{4}|\d{3}[-.\s]??\d{4})\b", " ", sentence)
    
    # Removing emails
    sentence = re.sub(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b", " ", sentence)

    # Keeping only alphanumeric values
    sentence = re.sub(r"[^a-zA-Z\s]", " ", sentence)
    
    
    # Cleaning .com
    sentence = re.sub(r"\.com"," ",sentence)
        
    sentence = sentence.lower()

    #Removing Stop Words
    tokens = ""
    for token in sentence.split():
        if token not in stop_words:
            tokens=tokens+" "+token

    return tokens

In [6]:
clean_text = clean_text(text)

In [7]:
print(clean_text)

 maria hibisson linkedin com maria hibisson fn gilhub com maria hibisson maria hibisson highly experienced solutions oriented professional years remarkable background overseeing aspects software development life cycle extensive analysis design execution maintenance proficient utilizinga wide variety programs tools provide high quality cost effective applications systems boost organizational efficiency productivity possess unmatched coding testing skills deliver client business specific programs areasofexpertse r z p e workexperence utilize java enterprise java bean java ee apache struts web applications develop fully automated client management systems forthe efficient maintenance ofclient accounts enforce innovative approach toimprove client web reporting system effectively reduced financial tracking analysis time supervise leada team juniorjava developers toascertain successful completion key projects company within budget schedule collaborate closely management vendors associated th

## Lemmatization

In [8]:
lemmatizer = WordNetLemmatizer()
def lemmatize_text(text):
    lemmatized_tokens = []
    for token in text.split():
        lemmatized_token = lemmatizer.lemmatize(token)
        lemmatized_tokens.append(lemmatized_token)
    return ' '.join(lemmatized_tokens)
lemmatized_text = lemmatize_text(clean_text)
print(lemmatized_text)

maria hibisson linkedin com maria hibisson fn gilhub com maria hibisson maria hibisson highly experienced solution oriented professional year remarkable background overseeing aspect software development life cycle extensive analysis design execution maintenance proficient utilizinga wide variety program tool provide high quality cost effective application system boost organizational efficiency productivity posse unmatched coding testing skill deliver client business specific program areasofexpertse r z p e workexperence utilize java enterprise java bean java ee apache strut web application develop fully automated client management system forthe efficient maintenance ofclient account enforce innovative approach toimprove client web reporting system effectively reduced financial tracking analysis time supervise leada team juniorjava developer toascertain successful completion key project company within budget schedule collaborate closely management vendor associated third party guarantee

## Tokenization

In [9]:
def tokenize(text):
    return word_tokenize(text)
tokenized_text = tokenize(lemmatized_text)

In [10]:
tagged = nltk.pos_tag(tokenized_text)

In [11]:
noun_tags = ['NN', 'NNS', 'NNP', 'NNPS']
verb_tags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
nouns = [word for word, tag in tagged if tag in noun_tags]
verbs = [word for word, tag in tagged if tag in verb_tags]

In [12]:
print(nouns)
print(verbs)

['maria', 'com', 'maria', 'gilhub', 'com', 'maria', 'hibisson', 'maria', 'solution', 'year', 'background', 'software', 'development', 'life', 'cycle', 'analysis', 'design', 'execution', 'maintenance', 'variety', 'program', 'tool', 'quality', 'cost', 'application', 'system', 'efficiency', 'productivity', 'posse', 'skill', 'client', 'business', 'program', 'areasofexpertse', 'r', 'z', 'p', 'e', 'workexperence', 'java', 'enterprise', 'java', 'java', 'ee', 'apache', 'strut', 'web', 'application', 'client', 'management', 'system', 'maintenance', 'ofclient', 'account', 'enforce', 'approach', 'client', 'system', 'analysis', 'time', 'supervise', 'leada', 'team', 'juniorjava', 'developer', 'toascertain', 'completion', 'key', 'project', 'company', 'budget', 'schedule', 'collaborate', 'management', 'vendor', 'party', 'guarantee', 'functionality', 'performance', 'transaction', 'database', 'management', 'application', 'ofthenew', 'java', 'vendor', 'application', 'system', 'service', 'industry', 'con

## TF-IDF Vectorization

In [17]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(nouns)

# Setting topics to 3
lda = LatentDirichletAllocation(n_components=3)

In [18]:
lda.fit(X)
print(lda.components_)

[[0.33589041 0.3358562  0.3361421  0.33581753 0.33589113 0.33614205
  0.33589039 0.33589039 2.32854536 0.33563642 0.33589042 0.33570416
  1.32816745 0.3358904  1.3281675  0.3358904  0.33589039 0.33585621
  0.33614207 0.33589039 0.33614206 1.32816748 0.3358904  3.32868262
  0.33614204 2.32854535 0.33589041 0.33589038 1.32816749 2.32854531
  0.33614207 1.32816748 1.32816748 0.33614205 0.33589041 1.32816748
  1.32816749 0.33614203 0.33614203 0.33614205 0.33589041 0.33614205
  4.32875277 0.33614204 0.33614206 0.33614205 0.33614207 0.33589039
  2.3285453  0.3358562  4.32875277 0.33589043 1.32816748 1.32816742
  1.32816744 0.33614209 0.33614207 0.33614204 1.3281675  1.32816741
  2.32854535 0.3358904  0.33614205 0.33614205 1.32816747 0.33589043
  0.33614207 0.3359321  1.3281675  0.33585621 1.32816746 0.33589039
  0.33589044 1.32816749 0.33560181 0.33614208 0.33589039 2.32854545
  1.32816748 0.33614203 0.33589043 1.32816744 0.33556679 1.32816747
  0.33570426 0.33589041 0.33614204 0.33614205 0.

## Topics and Associated Words

In [20]:
for idx, topic in enumerate(lda.components_):
    print(f"Topic {idx + 1}: ")
    #Get top 5 words with highest weights for the topic
    top_words_idx = topic.argsort()[-1:][::-1]
    top_words = [vectorizer.get_feature_names_out()[i] for i in top_words_idx]
    print(", ".join(top_words))
    print()

Topic 1: 
java

Topic 2: 
system

Topic 3: 
application

