In [16]:
import pandas as pd
import numpy as np
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV

In [8]:
# Load and preprocess data
resume_data = pd.read_csv(r'D:\HR-Analytics-Final\notebook\undersampled_data.csv')

In [9]:
# Function for text preprocessing
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [10]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"\s+", " ", text).strip()
    text = re.sub(r"https?://\S+|www\.\S+", " ", text)
    text = re.sub(r"<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});", " ", text)
    text = re.sub(r"\b(?:\d{3}[-.\s]??\d{3}[-.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-.\s]??\d{4}|\d{3}[-.\s]??\d{4})\b", " ", text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b", " ", text)
    text = re.sub(r"[^a-zA-Z\s]", " ", text)
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

In [11]:
resume_data['preprocessed_text'] = resume_data['Resume'].apply(preprocess_text)

In [12]:
# Encode labels
label_encoder = LabelEncoder()
resume_data['Category_encoded'] = label_encoder.fit_transform(resume_data['Category'])

In [13]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(
    resume_data['preprocessed_text'], 
    resume_data['Category_encoded'], 
    test_size=0.2, 
    random_state=42, 
    stratify=resume_data['Category_encoded']
)

In [14]:
# Define TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

In [36]:
# Define parameters for each classifier
rf_params = {
    'clf__n_estimators': [50, 100, 200],
    'clf__max_depth': [None, 10, 20],
    'clf__min_samples_split': [2, 5, 10],
    'clf__min_samples_leaf': [1, 2, 4]
}

svm_params = {
    'clf__C': [0.1, 1, 10],
    'clf__gamma': ['scale', 'auto'],
    'clf__kernel': ['linear', 'rbf']
}

nb_params = {
    'clf__alpha': [0.1, 0.5, 1.0]
}

logreg_params = {
    'clf__C': [0.1, 1, 10],
    'clf__penalty': ['l1', 'l2']
}

gb_params = {
    'clf__learning_rate': [0.1, 0.05, 0.01],
    'clf__max_depth': [3, 5, 10],
    'clf__min_samples_split': [2, 5, 10]
}

nn_params = {
    'clf__hidden_layer_sizes': [(50,), (100,), (50, 50)],
    'clf__activation': ['relu', 'logistic', 'tanh', 'softmax'],
    'clf__solver': ['adam', 'sgd'],  # 'adam' often works well for large datasets, 'sgd' for smaller datasets
    'clf__alpha': [0.0001, 0.001, 0.01]
}

In [37]:
# Define classifier pipelines with TF-IDF vectorizer
classifier_pipelines = {
    'Random Forest': Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('clf', RandomForestClassifier(random_state=42))
    ]),
    'SVM': Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('clf', SVC(random_state=42))
    ]),
    'Multinomial Naive Bayes': Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('clf', MultinomialNB())
    ]),
    'Logistic Regression': Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('clf', LogisticRegression(random_state=42))
    ]),
    'Neural Network': Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('scaler', StandardScaler(with_mean=False)),
        ('clf', MLPClassifier(random_state=42))
    ])
}


In [38]:
from sklearn.metrics import log_loss

# Perform grid search cross-validation for each classifier
for name, pipeline in classifier_pipelines.items():
    print("Training", name)
    if name == 'SVM':
        grid_search = GridSearchCV(pipeline, svm_params, cv=3, scoring='accuracy', n_jobs=-1)
    elif name == 'Random Forest':
        grid_search = GridSearchCV(pipeline, rf_params, cv=3, scoring='accuracy', n_jobs=-1)
    elif name == 'Multinomial Naive Bayes':
        grid_search = GridSearchCV(pipeline, nb_params, cv=3, scoring='accuracy', n_jobs=-1)
    elif name == 'Logistic Regression':
        grid_search = GridSearchCV(pipeline, logreg_params, cv=3, scoring='accuracy', n_jobs=-1)
    elif name == 'Neural Network':
        grid_search = GridSearchCV(pipeline, nn_params, cv=3, scoring='accuracy', n_jobs=-1)
    else:
        continue
    
    grid_search.fit(X_train, y_train)
    print("Best parameters:", grid_search.best_params_)
    print("Best cross-validation accuracy:", grid_search.best_score_)

    # Calculate loss on the test set
    if hasattr(grid_search.best_estimator_, 'predict_proba'):
        y_pred_proba = grid_search.predict_proba(X_test)
        loss = log_loss(y_test, y_pred_proba)
        print("Test loss:", loss)

    y_pred = grid_search.predict(X_test)
    print("Test accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:")
    print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))
    print("\n")


Training Random Forest
Best parameters: {'clf__max_depth': 20, 'clf__min_samples_leaf': 1, 'clf__min_samples_split': 2, 'clf__n_estimators': 200}
Best cross-validation accuracy: 0.9903902964584854
Test loss: 0.5800176092115225
Test accuracy: 1.0
Classification Report:
                           precision    recall  f1-score   support

                 Advocate       1.00      1.00      1.00         4
                     Arts       1.00      1.00      1.00         4
       Automation Testing       1.00      1.00      1.00         4
               Blockchain       1.00      1.00      1.00         4
         Business Analyst       1.00      1.00      1.00         4
           Civil Engineer       1.00      1.00      1.00         4
             Data Analyst       1.00      1.00      1.00         4
             Data Science       1.00      1.00      1.00         4
                 Database       1.00      1.00      1.00         4
          DevOps Engineer       1.00      1.00      1.00    

9 fits failed out of a total of 18.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
9 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\parth.parikh1\AppData\Local\anaconda3\envs\hrenv\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\parth.parikh1\AppData\Local\anaconda3\envs\hrenv\Lib\site-packages\sklearn\pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "c:\Users\parth.parikh1\AppData\Local\anaconda3\envs\hrenv\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual

Best parameters: {'clf__C': 10, 'clf__penalty': 'l2'}
Best cross-validation accuracy: 0.983178674451743
Test loss: 0.2357708639841659
Test accuracy: 0.9903846153846154
Classification Report:
                           precision    recall  f1-score   support

                 Advocate       1.00      1.00      1.00         4
                     Arts       1.00      1.00      1.00         4
       Automation Testing       1.00      1.00      1.00         4
               Blockchain       1.00      1.00      1.00         4
         Business Analyst       1.00      1.00      1.00         4
           Civil Engineer       1.00      1.00      1.00         4
             Data Analyst       1.00      1.00      1.00         4
             Data Science       0.80      1.00      0.89         4
                 Database       1.00      0.75      0.86         4
          DevOps Engineer       1.00      1.00      1.00         4
         DotNet Developer       1.00      1.00      1.00         4
    

54 fits failed out of a total of 216.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\parth.parikh1\AppData\Local\anaconda3\envs\hrenv\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\parth.parikh1\AppData\Local\anaconda3\envs\hrenv\Lib\site-packages\sklearn\pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "c:\Users\parth.parikh1\AppData\Local\anaconda3\envs\hrenv\Lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py", line 747, in fit
    self._validate_params()
  File "c:\Users\p

Best parameters: {'clf__activation': 'logistic', 'clf__alpha': 0.01, 'clf__hidden_layer_sizes': (100,), 'clf__solver': 'adam'}
Best cross-validation accuracy: 0.9422201369339311
Test loss: 0.09968510223376012
Test accuracy: 0.9903846153846154
Classification Report:
                           precision    recall  f1-score   support

                 Advocate       1.00      1.00      1.00         4
                     Arts       1.00      1.00      1.00         4
       Automation Testing       1.00      1.00      1.00         4
               Blockchain       1.00      1.00      1.00         4
         Business Analyst       1.00      1.00      1.00         4
           Civil Engineer       1.00      1.00      1.00         4
             Data Analyst       1.00      1.00      1.00         4
             Data Science       0.80      1.00      0.89         4
                 Database       1.00      0.75      0.86         4
          DevOps Engineer       1.00      1.00      1.00       

In [35]:
resume_data.head()

Unnamed: 0,Resume,Category,preprocessed_text,Category_encoded
0,"TECHNICAL QUALIFICATIONS: â?¢ Windows, Ms. Off...",Advocate,technical qualification window m officeeducati...,0
1,"Education Details \n B.Com, LL.B., Universit...",Advocate,education detail bcom llb university clacutta ...,0
2,Education Details \n LLB. Dibrugarh Universi...,Advocate,education detail llb dibrugarh university advo...,0
3,Education Details \nNovember 2016 to January 2...,Advocate,education detail november january llm master l...,0
4,SKILLS â?¢ Knows English as native speaker (IE...,Advocate,skill know english native speaker ielts overal...,0


In [49]:
import joblib
# Define the pipeline with the best parameters for the Neural Network
best_nn_pipeline = Pipeline([
    ('scaler', StandardScaler(with_mean=False)),
    ('clf', MLPClassifier(hidden_layer_sizes=(100,), activation='logistic', solver='adam', alpha=0.01, random_state=42))
])

# Print the pipeline
print(best_nn_pipeline)


Pipeline(steps=[('scaler', StandardScaler(with_mean=False)),
                ('clf',
                 MLPClassifier(activation='logistic', alpha=0.01,
                               random_state=42))])


In [50]:

# Save the pipeline with the best parameters
joblib.dump(best_nn_pipeline, 'best_nn_pipeline.pkl')

['best_nn_pipeline.pkl']

The best performing model with 0.099 loss and 0.99 ccuracy is Neural Network with Multilayer Percepton and parameters as 
Best parameters: {'clf__activation': 'logistic', 'clf__alpha': 0.01, 'clf__hidden_layer_sizes': (100,), 'clf__solver': 'adam'}

In [51]:
# Save the label encoder object to a file
joblib.dump(label_encoder, 'label_encoder_final.pkl')

['label_encoder_final.pkl']

In [52]:
# Define TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()
# Fit the TF-IDF vectorizer with your training data
tfidf_vectorizer.fit(X_train)

# Save the fitted TF-IDF vectorizer to a file
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer_final.pkl')

['tfidf_vectorizer_final.pkl']

In [55]:
import joblib
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier

# Define the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Define the StandardScaler
scaler = StandardScaler(with_mean=False)

# Define the MLPClassifier
mlp_classifier = MLPClassifier(hidden_layer_sizes=(100,), activation='logistic', solver='adam', alpha=0.01, random_state=42)

# Define the pipeline with the TF-IDF vectorizer, StandardScaler, and MLPClassifier
best_nn_pipeline = Pipeline([
    ('tfidf', tfidf_vectorizer),
    ('scaler', scaler),
    ('clf', mlp_classifier)
])

# Fit the pipeline with training data
best_nn_pipeline.fit(X_train, y_train)


In [56]:
# Save the pipeline with the best parameters
joblib.dump(best_nn_pipeline, 'best_nn_pipeline.pkl')

['best_nn_pipeline.pkl']

In [58]:
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Define the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Define the Multinomial Naive Bayes classifier with the best parameters
best_nb_classifier = MultinomialNB(alpha=0.1)

# Define the pipeline with the TF-IDF vectorizer and Multinomial Naive Bayes classifier
best_nb_pipeline = Pipeline([
    ('tfidf', tfidf_vectorizer),
    ('clf', best_nb_classifier)
])

# Fit the pipeline with training data
best_nb_pipeline.fit(X_train, y_train)




In [59]:
# Save the trained pipeline with the best parameters
joblib.dump(best_nb_pipeline, 'best_nb_pipeline.pkl')

['best_nb_pipeline.pkl']