In [10]:
import pdftotext
import logging
import spacy
import joblib
import logging
import numpy as np

In [11]:
# Configure the root logger to emit INFO-and-above records formatted as
# "<timestamp> - <level name> - <message>", then create the logger used by
# this notebook.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

In [12]:
# Category name -> integer class index. The index of each category is its
# position in the tuple below, so the order of the entries is significant.
# NOTE(review): presumably these indices mirror the label encoding used when
# the classifier was trained -- confirm against the training code.
class_mapping = {
    label: index
    for index, label in enumerate((
        'ACCOUNTANT',
        'ADVOCATE',
        'AGRICULTURE',
        'APPAREL',
        'ARTS',
        'AUTOMOBILE',
        'AVIATION',
        'BANKING',
        'BPO',
        'BUSINESS-DEVELOPMENT',
        'CHEF',
        'CONSTRUCTION',
        'CONSULTANT',
        'DESIGNER',
        'DIGITAL-MEDIA',
        'ENGINEERING',
        'FINANCE',
        'FITNESS',
        'HEALTHCARE',
        'HR',
        'INFORMATION-TECHNOLOGY',
        'PUBLIC-RELATIONS',
        'SALES',
        'TEACHER',
    ))
}

In [13]:
# Load the persisted classifier, the persisted vectorizer, and the spaCy
# English pipeline used for preprocessing.
# The .pkl paths are relative, so they resolve against the kernel's current
# working directory.
# NOTE(review): joblib.load unpickles its input, which can execute arbitrary
# code -- only load artifacts from a trusted source.
# NOTE(review): the "_xg" suffix suggests an XGBoost model -- confirm.
model = joblib.load("../../app/AIModel/domain/model_xg.pkl")
vectorizer = joblib.load("../../app/AIModel/domain/vector_xg.pkl")
nlp = spacy.load("en_core_web_lg")

In [14]:
def remove_stopwords_and_lemmatize(text):
    """Normalize ``text`` and return its kept lemmas joined by single spaces.

    Runs of whitespace are collapsed to one space and the result is
    lowercased before being handed to the module-level ``nlp`` pipeline.
    Tokens flagged as stop words or as punctuation are discarded; the
    ``lemma_`` of every remaining token is kept in document order.

    Args:
        text: Raw input string.

    Returns:
        A single string of lemmas separated by spaces (empty if no token
        is kept).
    """
    # str.split() with no arguments already ignores leading/trailing
    # whitespace, so this also trims the text.
    normalized = ' '.join(text.split()).lower()

    lemmas = []
    for token in nlp(normalized):
        if token.is_stop or token.is_punct:
            continue
        lemmas.append(token.lemma_)
    return ' '.join(lemmas)

In [15]:
def extract_text_from_pdf(pdf_file):
    """Return the text of the PDF at ``pdf_file`` as one string.

    The file is opened in binary mode and parsed with ``pdftotext.PDF``;
    the strings yielded by the resulting object are joined with a blank
    line (``"\\n\\n"``) between them.

    Args:
        pdf_file: Path of the PDF to read.

    Returns:
        The joined text of the document.
    """
    with open(pdf_file, "rb") as handle:
        pages = pdftotext.PDF(handle)
    page_separator = "\n\n"
    return page_separator.join(pages)

In [16]:
def process_pdf_and_classify(pdf_file):
    """Run the PDF at ``pdf_file`` through the classification pipeline.

    The text is extracted with ``extract_text_from_pdf``, cleaned with
    ``remove_stopwords_and_lemmatize``, vectorized with the module-level
    ``vectorizer`` and scored with the module-level ``model``.

    A warning is logged when no text survives preprocessing (for example
    a PDF from which no text could be extracted): the probabilities
    returned in that case are computed from an empty document and do not
    reflect the file's content.

    Args:
        pdf_file: Path of the PDF to classify.

    Returns:
        Whatever ``model.predict_proba`` returns for the single-document
        batch; callers in this notebook read row 0 of it.
    """
    text = extract_text_from_pdf(pdf_file)
    preprocessed_text = remove_stopwords_and_lemmatize(text)
    if not preprocessed_text:
        # Still classify (callers expect a result), but make the
        # garbage-in situation visible instead of failing silently.
        logger.warning(
            "No usable text extracted from %s; predicted probabilities "
            "are not meaningful.",
            pdf_file,
        )
    # The vectorizer is given a batch containing this one document.
    features = vectorizer.transform(np.array([preprocessed_text]))
    return model.predict_proba(features)

In [17]:
def top_probabilities(probabilities, top_n=5, mapping=None):
    """Print the most probable classes for a single prediction.

    Args:
        probabilities: 2-D array-like of class probabilities; only row 0
            is read.
        top_n: Maximum number of classes to print (default 5).
        mapping: Optional dict of class name -> class index. Defaults to
            the module-level ``class_mapping``.

    Returns:
        None. The ranking is written to standard output, one
        ``NAME: xx.xx%`` line per class, highest probability first.

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")
    if mapping is None:
        mapping = class_mapping

    # Invert the mapping once instead of scanning it for every printed
    # class. setdefault keeps the first name seen for an index, matching
    # the previous "first match wins" lookup.
    index_to_name = {}
    for name, index in mapping.items():
        index_to_name.setdefault(index, name)

    row = np.asarray(probabilities[0])
    top_indices = np.argsort(row)[::-1][:top_n]

    # Report the number of classes actually shown; this equals top_n
    # whenever the model has at least top_n classes.
    print(f"Top {len(top_indices)} probabilidades:")
    for idx in top_indices:
        idx = int(idx)
        # An index missing from the mapping is labelled UNKNOWN(<index>)
        # instead of raising IndexError.
        class_name = index_to_name.get(idx, f"UNKNOWN({idx})")
        prob_percent = row[idx] * 100
        print(f"{class_name}: {prob_percent:.2f}%")

In [38]:
# Classify a sample PDF. The path is relative, so it resolves against the
# kernel's current working directory.
pdf_file = 'TEACHER.pdf'
probabilities = process_pdf_and_classify(pdf_file)

In [39]:
# Print the five highest-probability categories for the PDF classified above.
top_probabilities(probabilities) 

Top 5 probabilidades:
TEACHER: 99.90%
BUSINESS-DEVELOPMENT: 0.02%
AGRICULTURE: 0.01%
SALES: 0.01%
ARTS: 0.01%
