#Import Libraries and Load Dataset

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from transformers import BertTokenizer, BertModel
import torch
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import warnings
warnings.filterwarnings("ignore")

In [5]:
# Load the cleaned dataset
file_path = "combined_cleaned_dataset.csv"  # Update with your file name
combined_df = pd.read_csv(file_path)

In [6]:
combined_df.head()

Unnamed: 0,ID,Name,Role,Transcript,Resume,decision,Reason for decision,Job Description,Transcript_length,Resume_length,Job_Description_length,Transcript_Resume_Score,Transcript_JobDescription_Score,Resume_JobDescription_Score,Final_Match_Score
0,uppaup,alice smith,software engineer,heres a simulated interview for a software eng...,heres a sample resume for alice smith\n\nalice...,rejected,unsatisfactory references or background check,here is a comprehensive job description for a ...,3101,2599,3857,0.128843,0.159541,0.267942,0.185442
1,uppaup,hank brown,software engineer,heres a simulated interview for a software eng...,heres a resume for hank brown a selected candi...,selected,growth mindset and adaptability,here is a job description for a software engin...,4965,2634,1066,0.268331,0.190271,0.268425,0.242342
2,uppaup,bob jones,data scientist,heres a simulated interview for a data scienti...,heres a sample resume for bob jones who applie...,rejected,inadequate communication or interpersonal skills,here is a comprehensive job description for a ...,2803,3050,3105,0.311734,0.285505,0.466292,0.354511
3,uppaup,bob miller,software engineer,heres a simulated interview for a software eng...,heres a sample resume for bob miller a softwar...,selected,strong cultural fit,here is a sample job description for a softwar...,4119,2491,3354,0.226898,0.259491,0.326965,0.271118
4,uppaup,ivy jones,data engineer,heres a simulated interview for a data enginee...,heres a sample resume for ivy jones\n\nivy jon...,rejected,lack of relevant skills or experience,here is a sample job description for a data en...,3321,2864,3843,0.361991,0.420489,0.588236,0.456905


In [7]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3174 entries, 0 to 3173
Data columns (total 15 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   ID                               3174 non-null   object 
 1   Name                             3174 non-null   object 
 2   Role                             3174 non-null   object 
 3   Transcript                       3174 non-null   object 
 4   Resume                           3174 non-null   object 
 5   decision                         3174 non-null   object 
 6   Reason for decision              3174 non-null   object 
 7   Job Description                  3174 non-null   object 
 8   Transcript_length                3174 non-null   int64  
 9   Resume_length                    3174 non-null   int64  
 10  Job_Description_length           3174 non-null   int64  
 11  Transcript_Resume_Score          3174 non-null   float64
 12  Transcript_JobDescri

#Generate BERT Embeddings

In [8]:
# Initialize BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

#Purpose:
 Initializes the BERT tokenizer and model for generating embeddings.
#Explanation:
BertTokenizer is used to tokenize the text data.

BertModel is the pre-trained BERT model that will be used to generate embeddings.

In [9]:
def get_bert_embeddings(text_column):
    """
    Generate BERT embeddings for a given text column.
    """
    embeddings = []
    for text in text_column:
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
        outputs = bert_model(**inputs)
        cls_embedding = outputs.last_hidden_state[:, 0, :].detach().numpy()
        embeddings.append(cls_embedding.flatten())
    return np.array(embeddings)

#Bert Embeddings for Transcript

In [11]:
# Generate embeddings
combined_df['Transcript_embeddings'] = list(get_bert_embeddings(combined_df['Transcript']))

#Bert Embeddings for Resume

In [12]:
combined_df['Resume_embeddings'] = list(get_bert_embeddings(combined_df['Resume']))

#Bert Embeddings for Job_Description

In [13]:
combined_df['Job_Description_embeddings'] = list(get_bert_embeddings(combined_df['Job Description']))

In [18]:
combined_df.head()

Unnamed: 0,ID,Name,Role,Transcript,Resume,decision,Reason for decision,Job Description,Transcript_length,Resume_length,Job_Description_length,Transcript_Resume_Score,Transcript_JobDescription_Score,Resume_JobDescription_Score,Final_Match_Score,Transcript_embeddings,Resume_embeddings,Job_Description_embeddings
0,uppaup,alice smith,software engineer,heres a simulated interview for a software eng...,heres a sample resume for alice smith\n\nalice...,rejected,unsatisfactory references or background check,here is a comprehensive job description for a ...,3101,2599,3857,0.128843,0.159541,0.267942,0.185442,"[-0.12769948, 0.23564614, -0.24298234, -0.1760...","[-0.60704345, 0.21568073, -0.24985889, -0.1235...","[-0.5358736, -0.48269117, 0.21164827, -0.26815..."
1,uppaup,hank brown,software engineer,heres a simulated interview for a software eng...,heres a resume for hank brown a selected candi...,selected,growth mindset and adaptability,here is a job description for a software engin...,4965,2634,1066,0.268331,0.190271,0.268425,0.242342,"[-0.40148726, 0.009195297, -0.2979158, -0.1014...","[-0.9606815, -0.13537142, -0.20789696, -0.1510...","[-0.14280689, -0.18455483, -0.14317629, 0.2373..."
2,uppaup,bob jones,data scientist,heres a simulated interview for a data scienti...,heres a sample resume for bob jones who applie...,rejected,inadequate communication or interpersonal skills,here is a comprehensive job description for a ...,2803,3050,3105,0.311734,0.285505,0.466292,0.354511,"[-0.3487926, 0.020128839, -0.2336443, -0.16934...","[-0.9852158, -0.068383835, 0.0065283687, -0.18...","[-0.63903207, -0.117160745, 0.24371581, 0.2260..."
3,uppaup,bob miller,software engineer,heres a simulated interview for a software eng...,heres a sample resume for bob miller a softwar...,selected,strong cultural fit,here is a sample job description for a softwar...,4119,2491,3354,0.226898,0.259491,0.326965,0.271118,"[-0.5011333, 0.124364235, -0.27339292, -0.0982...","[-0.7354975, 0.3179848, -0.035682846, -0.12026...","[-0.7920371, 0.15553382, -0.26356754, 0.256035..."
4,uppaup,ivy jones,data engineer,heres a simulated interview for a data enginee...,heres a sample resume for ivy jones\n\nivy jon...,rejected,lack of relevant skills or experience,here is a sample job description for a data en...,3321,2864,3843,0.361991,0.420489,0.588236,0.456905,"[-0.2763126, 0.10006799, -0.15044409, -0.14078...","[-0.6419997, 0.09423823, 0.046364304, 0.057823...","[-0.60751843, 0.102332525, 0.26001558, 0.10051..."


In [19]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3174 entries, 0 to 3173
Data columns (total 18 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   ID                               3174 non-null   object 
 1   Name                             3174 non-null   object 
 2   Role                             3174 non-null   object 
 3   Transcript                       3174 non-null   object 
 4   Resume                           3174 non-null   object 
 5   decision                         3174 non-null   object 
 6   Reason for decision              3174 non-null   object 
 7   Job Description                  3174 non-null   object 
 8   Transcript_length                3174 non-null   int64  
 9   Resume_length                    3174 non-null   int64  
 10  Job_Description_length           3174 non-null   int64  
 11  Transcript_Resume_Score          3174 non-null   float64
 12  Transcript_JobDescri

In [49]:
# Assuming 'data' is your pandas DataFrame
unique_decisions = combined_df['decision'].unique()
print("Unique values in 'decision' column:", unique_decisions)

Unique values in 'decision' column: ['rejected' 'selected' 'reject' 'select']


In [51]:
# Assuming 'data' is your pandas DataFrame
combined_df['decision'] = combined_df['decision'].replace({'select': 'selected', 'reject': 'rejected'})

In [60]:
# Assuming 'data' is your pandas DataFrame
unique_decisions = combined_df['decision'].unique()
print("Unique values in 'decision' column:", unique_decisions)

Unique values in 'decision' column: ['rejected' 'selected']


#Combine Features and Train-Test Split

In [52]:
# Combine embeddings and features
handcrafted_features = ['Transcript_length', 'Resume_length', 'Job_Description_length']
X_embeddings = np.hstack([
    np.vstack(df['Transcript_embeddings']),
    np.vstack(df['Resume_embeddings']),
    np.vstack(df['Job_Description_embeddings']),
    combined_df[handcrafted_features].values
])

In [53]:
# Encode target variable
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['decision'])

In [54]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_embeddings, y, test_size=0.2, random_state=42, stratify=y)

In [57]:
print(np.unique(y_test))

[0 1]


#Random Forest Model

In [59]:
# Train Random Forest
rf = RandomForestClassifier(random_state=42, n_estimators=100)
rf.fit(X_train, y_train)

# Evaluate Random Forest
y_pred_rf = rf.predict(X_test)
y_prob_rf = rf.predict_proba(X_test)[:, 1]

print("Random Forest - Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Random Forest - ROC AUC Score:", roc_auc_score(y_test, y_prob_rf))

Random Forest - Accuracy: 0.7984251968503937
Random Forest - ROC AUC Score: 0.9120363939436033


#Artificial Neural Network (ANN)

In [61]:
import tensorflow as tf
from tensorflow.keras.optimizers import Adam

# Define ANN model
ann = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

In [62]:
# Compile the ANN
ann.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

In [67]:
# Train the ANN
history = ann.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.2, verbose=1)

Epoch 1/100
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.7890 - loss: 0.3806 - val_accuracy: 0.8031 - val_loss: 0.3234
Epoch 2/100
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.8085 - loss: 0.3359 - val_accuracy: 0.8268 - val_loss: 0.3127
Epoch 3/100
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.8047 - loss: 0.3454 - val_accuracy: 0.8287 - val_loss: 0.3124
Epoch 4/100
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.8126 - loss: 0.3297 - val_accuracy: 0.8307 - val_loss: 0.3057
Epoch 5/100
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.8272 - loss: 0.3149 - val_accuracy: 0.8130 - val_loss: 0.3690
Epoch 6/100
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.7755 - loss: 0.3782 - val_accuracy: 0.8051 - val_loss: 0.3251
Epoch 7/100
[1m64/64[0m [32m━━━

In [68]:
# Evaluate the ANN
y_prob_ann = ann.predict(X_test).flatten()
y_pred_ann = (y_prob_ann > 0.5).astype(int)

print("ANN - Accuracy:", accuracy_score(y_test, y_pred_ann))
print("ANN - ROC AUC Score:", roc_auc_score(y_test, y_prob_ann))

[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
ANN - Accuracy: 0.8251968503937008
ANN - ROC AUC Score: 0.9303127418490663


#Combine Random Forest and ANN Predictions

In [69]:
# Combine predictions from ANN and Random Forest
combined_predictions = (y_prob_rf + y_prob_ann) / 2
combined_pred_labels = (combined_predictions > 0.5).astype(int)

# Evaluate combined model
print("Combined Model - Accuracy:", accuracy_score(y_test, combined_pred_labels))
print("Combined Model - ROC AUC Score:", roc_auc_score(y_test, combined_predictions))


Combined Model - Accuracy: 0.8062992125984252
Combined Model - ROC AUC Score: 0.9314339293155796


In [71]:
# Save the combined dataset with embeddings for future use
df.to_csv("combined_dataset_with_embeddings.csv", index=False)
print("Dataset with embeddings saved as 'combined_dataset_with_embeddings.xlsx'")


Dataset with embeddings saved as 'combined_dataset_with_embeddings.xlsx'
