In [54]:
!pip install PyPDF2

import os
import pandas as pd
from pdfminer.high_level import extract_text
import PyPDF2



## Exception handling

In [68]:
def extract_text_from_pdf(pdf_path):
    try:
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            text = ""
            for page in reader.pages:
                text += page.extract_text()
            return text
    except Exception as e:
        print(f"Error extracting from {pdf_path}: {e}")
        return None


## Define File Paths

In [69]:
#folder paths
base_folder_path = "C:\\Users\\matte\\Desktop\\ML-AI\\Untitled Folder\\data\\data\\data"
categories = ["HR", "Designer", "Information-Technology", "Teacher", "Advocate", "Business-Development", "Healthcare"]

## Loop Through File Path and create dictionary

In [70]:
all_data = []

In [71]:
for category in categories:
    print(f'Adding all pdf text from {category}')
    folder_path = os.path.join(base_folder_path, category.upper())
    for file_name in os.listdir(folder_path):
        full_path = os.path.join(folder_path, file_name)
        text = extract_text_from_pdf(full_path)
        if text:
            all_data.append({
                'Category': category,
                'FileName': file_name,
                'Text': text.strip()
            })

Adding all pdf text from HR
Adding all pdf text from Designer
Adding all pdf text from Information-Technology
Adding all pdf text from Teacher
Adding all pdf text from Advocate
Adding all pdf text from Business-Development
Adding all pdf text from Healthcare


## Create the Data Frame

In [85]:
df = pd.DataFrame(all_data)

In [105]:
print(df['Text'])


0      HR PERSONNEL ASSISTANT\nSummary\nI am a U.S. c...
1      HR BENEFITS/LEAVE COORDINATOR\nSummary\n13 yea...
2      HR MANAGER\nSummary\nHuman Resources Manager w...
3      HR GENERALIST\nSummary\nDedicated and focused ...
4      HR EMPLOYEE RELATIONS SPECIALIST\nSummary\nDed...
                             ...                        
386    PERSONAL HEALTHCARE ASSISTANT\nProfessional Su...
387    OCCUPATIONAL HEALTH NURSE COORDINATOR\nProfess...
388    KEY ACCOUNT MANAGER\nSummary\nAccomplished pha...
389    TEACHER ASSISTANT\nSummary\nSeeking a challeng...
390    OFFICE MANAGER/MANAGING DIRECTOR\nProfessional...
Name: Text, Length: 391, dtype: object


## Create csv file

In [97]:
# Save to a CSV if needed
df.to_csv("extracted_resume_texts_4.csv", index=False)

# Begin NLP Pre-Processing

Before training a model, the data needs be preprocessed:

Tokenization: Breaking the text into individual words or tokens.
Stop words removal: Removing common words that do not contribute much to the content.
Vectorization: Converting text data into a numerical format using techniques like TF-IDF.

## Import sklearn to do the heavy lifting

In [88]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [89]:
# From above, use the 'df' as the DataFrame
X = df['Text']  # Extracting the resume content as features
y = df['Category']  # Extracting the job categories as labels

## Splitting the data 

In [91]:
# Splitting the data into a training set and a test set (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Begin Pre-Processing

In [92]:
# Vectorizing the text data using TF-IDF (this includes tokenization, stop words removal, and vectorization)
# - stop_words='english': Removes common English words (like 'and', 'the', etc.) that don't contain useful information
# - max_features=5000: Limits the vector length to the top 5000 terms by frequency
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_transformed = vectorizer.fit_transform(X_train)  # Fit to the training data and transform it
X_test_transformed = vectorizer.transform(X_test)  # Transform the test data

## Training a classifier using Logistic Regression

In [93]:
# - max_iter=1000: Maximum number of iterations for the optimization algorithm to converge
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_transformed, y_train)

In [94]:
# Predicting job categories on the test set
y_pred = clf.predict(X_test_transformed)

## Printing the Evaluation of the model

In [95]:
# Accuracy gives a simple ratio of correct predictions to total predictions
print("Accuracy:", accuracy_score(y_test, y_pred))
# Classification report provides a breakdown of the model's performance for each category (precision, recall, f1-score)
print(classification_report(y_test, y_pred, zero_division=1))

Accuracy: 0.7341772151898734
                        precision    recall  f1-score   support

              Advocate       0.41      0.78      0.54         9
  Business-Development       0.72      0.76      0.74        17
              Designer       0.87      0.96      0.92        28
                    HR       1.00      0.00      0.00         3
            Healthcare       1.00      0.00      0.00         6
Information-Technology       0.80      0.73      0.76        11
               Teacher       1.00      0.60      0.75         5

              accuracy                           0.73        79
             macro avg       0.83      0.55      0.53        79
          weighted avg       0.80      0.73      0.70        79

