In [1]:
import fitz  # PyMuPDF
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import NMF
import pandas as pd
import os
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
from datetime import datetime

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Download the NLTK resources used by the preprocessing step:
#   stopwords -> stopwords.words('english')
#   punkt / punkt_tab -> word_tokenize()
#   wordnet -> WordNetLemmatizer
# 'punkt_tab' is requested in addition to 'punkt' because recent NLTK
# releases load the tokenizer data from 'punkt_tab'; older releases keep
# using 'punkt', so both are fetched.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\parth.parikh1\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\parth.parikh1\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\parth.parikh1\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# Load the labeled resume data from CSV (later cells read its 'Category'
# and 'Resume' columns).
# The location can be overridden through the RESUME_DATA_CSV environment
# variable so the notebook is not tied to one user's machine; when the
# variable is unset the original path is used.
resume_csv_path = os.environ.get(
    'RESUME_DATA_CSV',
    r'C:\Users\parth.parikh1\Downloads\data.csv',
)
resume_data = pd.read_csv(resume_csv_path)

In [5]:
# Show the distinct values of the 'Category' column (the target labels).
category_column = resume_data['Category']
category_column.unique()

array(['Data Science', 'HR', 'Advocate', 'Arts', 'Web Designing',
       'Mechanical Engineer', 'Sales', 'Health and fitness',
       'Civil Engineer', 'Java Developer', 'Business Analyst',
       'SAP Developer', 'Automation Testing', 'Electrical Engineering',
       'Operations Manager', 'Python Developer', 'DevOps Engineer',
       'Network Security Engineer', 'PMO', 'Database', 'Hadoop',
       'ETL Developer', 'DotNet Developer', 'Blockchain', 'Testing'],
      dtype=object)

In [7]:
# Shared preprocessing resources, built once and reused by preprocess_text:
# a WordNet lemmatizer and the set of English stop words.
lemmatizer = WordNetLemmatizer()
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)

In [8]:
def preprocess_text(text):
    """Normalize raw resume text into a space-separated string of lemmas.

    Steps, in order: lowercase, collapse whitespace, strip URLs, HTML
    tags/entities, e-mail addresses and phone numbers, delete punctuation,
    replace every remaining non-letter character with a space, tokenize,
    drop English stop words and lemmatize.

    Args:
        text: Raw text. ``None`` and NaN (e.g. a missing CSV cell) are
            treated as empty text; any other non-string value is converted
            with ``str()``.

    Returns:
        str: The cleaned tokens joined by single spaces ("" when nothing
        is left).
    """
    # Missing values would otherwise fail on .lower().
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text)

    # Convert text to lowercase
    text = text.lower()
    # Collapse runs of whitespace
    text = re.sub(r"\s+", " ", text).strip()
    # Remove URLs
    text = re.sub(r"https?://\S+|www\.\S+", " ", text)
    # Remove HTML tags and character entities
    text = re.sub(r"<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});", " ", text)
    # Remove e-mail addresses. This has to run BEFORE punctuation is deleted:
    # once '@' and '.' are gone the pattern can no longer match and the
    # address would survive as one long junk token.
    text = re.sub(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b", " ", text)
    # Remove phone numbers
    text = re.sub(r"\b(?:\d{3}[-.\s]??\d{3}[-.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-.\s]??\d{4}|\d{3}[-.\s]??\d{4})\b", " ", text)
    # Delete punctuation (characters are removed, not replaced by a space)
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Keep only letters and whitespace (digits are dropped as well)
    text = re.sub(r"[^a-zA-Z\s]", " ", text)
    # Tokenize, drop stop words, then lemmatize what is left
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # Join tokens back into text
    return ' '.join(tokens)

In [9]:
# Run every raw resume through preprocess_text and store the cleaned text
# in a new column next to the original.
raw_resumes = resume_data['Resume']
resume_data['preprocessed_text'] = raw_resumes.apply(preprocess_text)

In [10]:
# Turn the category names into integer class ids; the fitted encoder is kept
# so predictions can be mapped back to names later.
category_labels = resume_data['Category']
label_encoder = LabelEncoder()
train_labels_encoded = label_encoder.fit_transform(category_labels)

In [11]:
# Feature extraction: TF-IDF term weights reduced to 100 NMF components.
# random_state is fixed (same seed as the random forest below) so the NMF
# initialisation is seeded and the extracted features can be reproduced
# after a kernel restart.
tfidf_vectorizer = TfidfVectorizer()
nmf_model = NMF(n_components=100, random_state=42)  # Adjust n_components as needed
pipeline = Pipeline([
    ('tfidf', tfidf_vectorizer),
    ('nmf', nmf_model)
])
train_embeddings = pipeline.fit_transform(resume_data['preprocessed_text'])

In [12]:
# Fit a random forest (seeded with random_state=42) on the NMF features and
# the encoded category labels.
classifier = RandomForestClassifier(
    random_state=42,
    n_estimators=100,  # number of trees; adjust as needed
)
classifier.fit(X=train_embeddings, y=train_labels_encoded)

In [13]:
def extract_text_from_pdf(file_path):
    """Return the concatenated text of every page of a PDF file.

    Args:
        file_path: Path to the document. The ``.pdf`` extension is matched
            case-insensitively, so ``Resume.PDF`` is accepted as well.

    Returns:
        str: The text of all pages in page order, or "" when the path does
        not end in ``.pdf`` (no other format is handled here).
    """
    if not str(file_path).lower().endswith('.pdf'):
        return ""
    with fitz.open(file_path) as doc:
        # join() builds the result once instead of growing a string per page.
        return "".join(page.get_text() for page in doc)

In [22]:
def predict_domain_from_resume(resume_text):
    """Predict the category name for one resume.

    The text is cleaned with ``preprocess_text``, transformed by the fitted
    ``pipeline``, classified by ``classifier`` and the resulting class id is
    decoded with ``label_encoder``.

    Args:
        resume_text: Raw resume text.

    Returns:
        The decoded label of the single prediction.
    """
    cleaned_text = preprocess_text(resume_text)
    features = pipeline.transform([cleaned_text])
    encoded_prediction = classifier.predict(features)
    decoded_prediction = label_encoder.inverse_transform(encoded_prediction)
    return decoded_prediction[0]

In [23]:
def main():
    """Prompt for a PDF resume path and print its predicted domain.

    Prints "File not found." and returns when the entered path is not an
    existing regular file. When no text can be extracted, a message is
    printed and no prediction is made, because a prediction from empty input
    would be meaningless.
    """
    # Tolerate stray whitespace and surrounding quotes, which are common
    # when a path is pasted into the prompt.
    entered_path = input("Enter the path of the PDF resume: ")
    resume_file_path = entered_path.strip().strip('"\'')
    # isfile() (rather than exists()) also rejects directories.
    if not os.path.isfile(resume_file_path):
        print("File not found.")
        return

    # Extract text from resume
    resume_text = extract_text_from_pdf(resume_file_path)
    if not resume_text.strip():
        print("No text could be extracted from the file; is it a text-based PDF?")
        return

    # Predict domain from resume
    predicted_domain = predict_domain_from_resume(resume_text)

    print("Predicted Domain:", predicted_domain)

In [27]:
# Run the interactive prompt only when this code is the entry point.
is_entry_point = __name__ == "__main__"
if is_entry_point:
    main()

Predicted Domain: Data Science
