In [5]:
import fitz
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
import joblib

# Shared NLTK resources for text preprocessing: a lemmatizer instance and the
# English stop-word list held as a set for fast membership tests.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def preprocess_text(text):
    """Normalize raw text into a space-joined string of lemmatized tokens.

    Steps, in order: lowercase; collapse whitespace; remove URLs, e-mail
    addresses, HTML tags/entities and phone-number-like digit groups; delete
    punctuation; replace any remaining non-letter characters with spaces;
    tokenize; drop English stop words; lemmatize each token.

    Args:
        text: Raw input string.

    Returns:
        The cleaned tokens joined by single spaces (empty string when no
        token survives).
    """
    text = text.lower()
    text = re.sub(r"\s+", " ", text).strip()
    text = re.sub(r"https?://\S+|www\.\S+", " ", text)
    # E-mail addresses must be removed while "@" and "." are still present.
    # Once punctuation is deleted below, the pattern can no longer match and
    # an address would survive as one glued-together junk token.
    # NOTE(review): text fed to an already-trained model should be cleaned the
    # same way as its training data -- confirm the training code strips
    # e-mails as well.
    text = re.sub(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b", " ", text)
    text = re.sub(r"<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});", " ", text)
    text = re.sub(r"\b(?:\d{3}[-.\s]??\d{3}[-.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-.\s]??\d{4}|\d{3}[-.\s]??\d{4})\b", " ", text)
    # Punctuation is deleted (not replaced by a space), so words separated
    # only by punctuation are merged into a single token.
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r"[^a-zA-Z\s]", " ", text)
    tokens = [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text)
        if word not in stop_words
    ]
    return ' '.join(tokens)


# # Define the TF-IDF vectorizer
# tfidf_vectorizer = TfidfVectorizer()
# # Define the StandardScaler
# scaler = StandardScaler(with_mean=False)
# # Define the MLPClassifier
# mlp_classifier = MLPClassifier(hidden_layer_sizes=(100,), activation='logistic', solver='adam', alpha=0.01, random_state=42)

# # Define the pipeline with the TF-IDF vectorizer, StandardScaler, and MLPClassifier
# best_nn_pipeline = Pipeline([
#     ('tfidf', tfidf_vectorizer),
#     ('scaler', scaler),
#     ('clf', mlp_classifier)
# ])

# TF-IDF -> Multinomial Naive Bayes pipeline. The objects are only constructed
# here; nothing in this section fits them.
tfidf_vectorizer = TfidfVectorizer()

# alpha=0.1 is the smoothing value the original notes describe as the best
# parameter found.
best_nb_classifier = MultinomialNB(alpha=0.1)

best_nb_pipeline = Pipeline(steps=[
    ("tfidf", tfidf_vectorizer),
    ("clf", best_nb_classifier),
])


# Load the fitted TF-IDF vectorizer
#tfidf_vectorizer = joblib.load(r'D:\HR-Analytics-Final\notebook\tfidf_vectorizer_final.pkl')

# Trained model used for prediction; it is called below with preprocessed text.
# NOTE: joblib.load unpickles the file, so only load artifacts from a trusted source.
loaded_model = joblib.load(r'D:\HR-Analytics-Final\models\best_nb_pipeline.pkl')

# Label encoder that maps the model's numeric predictions back to domain names
label_encoder = joblib.load(r'D:\HR-Analytics-Final\models\label_encoder_final.pkl')

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Location of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A single string made of each page's ``get_text()`` output joined with
        no separator; an empty string if the document has no pages.
    """
    with fitz.open(pdf_path) as doc:
        # Join once instead of growing a string with += on every page.
        return "".join(page.get_text() for page in doc)

# Path to the PDF file to classify (hard-coded local path)
pdf_path = r'D:\HR-Analytics-Final\src\uploads\Vaishali_Panchal_CV.pdf'

# Extract the raw text of all pages from the PDF
pdf_text = extract_text_from_pdf(pdf_path)

# Clean the extracted text with preprocess_text before feeding it to the model
preprocessed_pdf_text = preprocess_text(pdf_text)

# Manual TF-IDF transform, left disabled: the preprocessed text is passed
# straight to loaded_model.predict below instead.
#pdf_embeddings = tfidf_vectorizer.transform([preprocessed_pdf_text])

# Predict on a one-element list (a single document); the result holds one
# encoded label for that document.
predicted_domain_number = loaded_model.predict([preprocessed_pdf_text])

# Map the encoded label back to its domain name via the label encoder
predicted_domain_name = label_encoder.inverse_transform(predicted_domain_number)

# Prints the whole one-element result rather than the bare label string
print("Predicted Domain:", predicted_domain_name)


Predicted Domain: ['Web Designing']
