In [45]:
import PyPDF2 as pdf
from PyPDF2 import PdfReader, PdfWriter
import pandas as pd
import os
import fitz
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import nltk
from nltk.corpus import stopwords
import re
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [16]:
folder_path = "D:\Resume"

In [24]:
def extract_text_from_pdf(pdf_file):
    doc = fitz.open(pdf_file)
    text = ""
    for page in doc:
        text += page.get_text()
    return text.lower() 


In [25]:
texts = []
file_names = []

In [26]:
for file_name in os.listdir(folder_path):
    if file_name.endswith('.pdf'):
        file_path = os.path.join(folder_path, file_name)
        text = extract_text_from_pdf(file_path)
        texts.append(text)
        file_names.append(file_name)

In [27]:
df = pd.DataFrame({'File_Name' : file_names, 'Text' : texts})

## Cleaning the data 
Tokenization, Removing stopwords, lemmatization.

In [80]:
# Cleaning the Data
stop_words = set(stopwords.words('english'))
def clean_text(sentence):

    # Cleaning white spaces
    sentence = re.sub(r"\s+", " ", sentence).strip()

    # Removing URLS
    sentence = re.sub(r"https?://\S+|www\.\S+"," ",sentence)
    
    # Removing html tags
    sentence = re.sub(r"<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});"," ",sentence)

    # Removing phone numbers
    sentence = re.sub(r"\b(?:\d{3}[-.\s]??\d{3}[-.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-.\s]??\d{4}|\d{3}[-.\s]??\d{4})\b", " ", sentence)
    
    # Removing emails
    sentence = re.sub(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b", " ", sentence)

    # Keeping only alphanumeric values
    sentence = re.sub(r"[^a-zA-Z\s]", " ", sentence)
    
    
    # Cleaning .com
    sentence = re.sub(r"\.com"," ",sentence)
        
    sentence = sentence.lower()

    #Removing Stop Words
    tokens = ""
    for token in sentence.split():
        if token not in stop_words:
            tokens=tokens+" "+token

    return tokens

In [81]:
df['Cleaned_Text'] = df['Text'].apply(clean_text)

In [82]:
df

Unnamed: 0,File_Name,Text,Cleaned_Text,Lemmatized_Text,Tokenized_Text
0,academic_resume.pdf,kahan jash\n+91 95865 35559 • kahanjash15@gmai...,kahan jash linkedin industrial experience cyg...,kahan jash 91 95865 35559 linkedin industrial ...,"[kahan, jash, 91, 95865, 35559, linkedin, indu..."
1,Ashish Patel Data Scientist-7.pdf,ashish patel \nsr. data scientist & author \n8...,ashish patel sr data scientist author ml rese...,ashish patel sr data scientist author ml resea...,"[ashish, patel, sr, data, scientist, author, m..."
2,Jimit's Resume_August23_Tech.pdf,page 1 of 2 \njimit vaghela \nsoftware enginee...,page jimit vaghela software engineer experien...,page 1 2 jimit vaghela software engineer exper...,"[page, 1, 2, jimit, vaghela, software, enginee..."
3,Komal_Lamba_Resume.pdf,\nexperienced data science professional with ...,experienced data science professional years e...,experienced data science professional 2 year e...,"[experienced, data, science, professional, 2, ..."
4,malhar resume.pdf,malhar shinde\nmachine learning enthusiast wit...,malhar shinde machine learning enthusiast dem...,malhar shinde machine learning enthusiast demo...,"[malhar, shinde, machine, learning, enthusiast..."
5,Manas's Resume (2).pdf,manas jani \ndata analyst \nversatile professi...,manas jani data analyst versatile professiona...,manas jani data analyst versatile professional...,"[manas, jani, data, analyst, versatile, profes..."
6,Manas's Resume (6).pdf,manas jani \ndata analyst \nversatile professi...,manas jani data analyst versatile professiona...,manas jani data analyst versatile professional...,"[manas, jani, data, analyst, versatile, profes..."
7,Manas's Resume .pdf,manas jani \ndata analyst \nversatile professi...,manas jani data analyst versatile professiona...,manas jani data analyst versatile professional...,"[manas, jani, data, analyst, versatile, profes..."
8,parth resume.pdf,parth parikh \n +91 95585 55477 \n email: ...,parth parikh email education b tech electroni...,parth parikh 91 95585 55477 email education b ...,"[parth, parikh, 91, 95585, 55477, email, educa..."
9,pavan resume updated-2 (1).pdf,pavan gupta\npre final year cse student of gla...,pavan gupta pre final year cse student gla un...,pavan gupta pre final year cse student gla uni...,"[pavan, gupta, pre, final, year, cse, student,..."


## Tokenization

In [63]:
def tokenize(text):
    return word_tokenize(text)
df['Tokenized_Text'] = df['Cleaned_Text'].apply(tokenize)

In [64]:
df.head()

Unnamed: 0,File_Name,Text,Cleaned_Text,Lemmatized_Text,Tokenized_Text
0,academic_resume.pdf,kahan jash\n+91 95865 35559 • kahanjash15@gmai...,kahan jash 91 95865 35559 linkedin industrial...,kahan jash +91 95865 35559 • • linkedin indust...,"[kahan, jash, 91, 95865, 35559, linkedin, indu..."
1,Ashish Patel Data Scientist-7.pdf,ashish patel \nsr. data scientist & author \n8...,ashish patel sr data scientist author ml rese...,ashish patel sr. data scientist & author ml re...,"[ashish, patel, sr, data, scientist, author, m..."
2,Jimit's Resume_August23_Tech.pdf,page 1 of 2 \njimit vaghela \nsoftware enginee...,page 1 2 jimit vaghela software engineer expe...,page 1 2 jimit vaghela software engineer exper...,"[page, 1, 2, jimit, vaghela, software, enginee..."
3,Komal_Lamba_Resume.pdf,\nexperienced data science professional with ...,experienced data science professional 2 years..., experienced data science professional 2 year...,"[experienced, data, science, professional, 2, ..."
4,malhar resume.pdf,malhar shinde\nmachine learning enthusiast wit...,malhar shinde machine learning enthusiast dem...,malhar shinde machine learning enthusiast demo...,"[malhar, shinde, machine, learning, enthusiast..."


## Lemmatization

In [83]:
lemmatizer = WordNetLemmatizer()

In [84]:
def lemmatize_text(text):
    lemmatized_tokens = []
    for token in text.split():
        lemmatized_token = lemmatizer.lemmatize(token)
        lemmatized_tokens.append(lemmatized_token)
    return ' '.join(lemmatized_tokens)

In [85]:
df['Lemmatized_Text'] = df['Cleaned_Text'].apply(lemmatize_text)

In [86]:
df

Unnamed: 0,File_Name,Text,Cleaned_Text,Lemmatized_Text,Tokenized_Text
0,academic_resume.pdf,kahan jash\n+91 95865 35559 • kahanjash15@gmai...,kahan jash linkedin industrial experience cyg...,kahan jash linkedin industrial experience cygn...,"[kahan, jash, 91, 95865, 35559, linkedin, indu..."
1,Ashish Patel Data Scientist-7.pdf,ashish patel \nsr. data scientist & author \n8...,ashish patel sr data scientist author ml rese...,ashish patel sr data scientist author ml resea...,"[ashish, patel, sr, data, scientist, author, m..."
2,Jimit's Resume_August23_Tech.pdf,page 1 of 2 \njimit vaghela \nsoftware enginee...,page jimit vaghela software engineer experien...,page jimit vaghela software engineer experienc...,"[page, 1, 2, jimit, vaghela, software, enginee..."
3,Komal_Lamba_Resume.pdf,\nexperienced data science professional with ...,experienced data science professional years e...,experienced data science professional year exp...,"[experienced, data, science, professional, 2, ..."
4,malhar resume.pdf,malhar shinde\nmachine learning enthusiast wit...,malhar shinde machine learning enthusiast dem...,malhar shinde machine learning enthusiast demo...,"[malhar, shinde, machine, learning, enthusiast..."
5,Manas's Resume (2).pdf,manas jani \ndata analyst \nversatile professi...,manas jani data analyst versatile professiona...,manas jani data analyst versatile professional...,"[manas, jani, data, analyst, versatile, profes..."
6,Manas's Resume (6).pdf,manas jani \ndata analyst \nversatile professi...,manas jani data analyst versatile professiona...,manas jani data analyst versatile professional...,"[manas, jani, data, analyst, versatile, profes..."
7,Manas's Resume .pdf,manas jani \ndata analyst \nversatile professi...,manas jani data analyst versatile professiona...,manas jani data analyst versatile professional...,"[manas, jani, data, analyst, versatile, profes..."
8,parth resume.pdf,parth parikh \n +91 95585 55477 \n email: ...,parth parikh email education b tech electroni...,parth parikh email education b tech electronic...,"[parth, parikh, 91, 95585, 55477, email, educa..."
9,pavan resume updated-2 (1).pdf,pavan gupta\npre final year cse student of gla...,pavan gupta pre final year cse student gla un...,pavan gupta pre final year cse student gla uni...,"[pavan, gupta, pre, final, year, cse, student,..."


## TF-IDF Vectorization

In [90]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['Lemmatized_Text'])

# Setting topics to 3
lda = LatentDirichletAllocation(n_components=1)

In [91]:
lda.fit(X)
print(lda.components_)

[[1.05294086 1.04716047 1.06108586 ... 1.03815392 1.15874706 1.08668693]]


## Topics and Associated Words

In [93]:
for idx, topic in enumerate(lda.components_):
    print(f"Topic {idx + 1}: ")
    #Get top 5 words with highest weights for the topic
    top_words_idx = topic.argsort()[-1:][::-1]
    top_words = [vectorizer.get_feature_names_out()[i] for i in top_words_idx]
    print(", ".join(top_words))
    print()

Topic 1: 
data

