In [None]:
from backend import config
import os
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [None]:
# Read the cleaned e-mail dataset (CSV) from the project's data directory.
cleaned_csv_path = os.path.join(
    config.DATADIR, 'dataset', 'final', 'cleaned_email_dataset.csv'
)
data_frame = pd.read_csv(cleaned_csv_path)
# Preview the first five rows (head() defaults to n=5).
data_frame.head()

Unnamed: 0,messages,label
0,thank applying weve received application revie...,0
1,application successfully submitted team review...,0
2,weve received job application hiring team revi...,0
3,thank applying application review touch shortl...,0
4,application received appreciate interest revie...,0


In [12]:
# Shuffle the rows of the data frame.
# frac=1 returns every row in a new random order; the fixed random_state makes
# that order identical on every Restart & Run All, so downstream results can be
# reproduced. The original index labels are kept (no reset_index).
# Note: this cell rebinds `data_frame`, so re-running it on its own reshuffles
# the already-shuffled frame; run it once from a fresh kernel.
data_frame = data_frame.sample(frac=1, random_state=42)
data_frame.head(5)

Unnamed: 0,messages,label
89,complete assessmentplease complete assessment ...,1
314,careful review decided pursue candidates role,3
291,regret inform chosen move forward applicants,3
88,next step complete assessmentcomplete attached...,1
178,upcoming interviewyour interview set please re...,2


In [4]:
# Instantiate the sentence-embedding model used to vectorise every message.
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
model_embedding = SentenceTransformer(EMBEDDING_MODEL_NAME)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [13]:
# Create another data frame with an extra 'embeddings' column.
# All messages are encoded in a single batched call instead of one
# model.encode() call per row via Series.apply; encode() accepts a list of
# sentences and batches them internally, which is considerably faster.
message_texts = data_frame['messages'].tolist()
message_vectors = model_embedding.encode(message_texts)

data_frame_embedding = data_frame.copy()
# list(...) turns the 2-D result into one 1-D vector per row. Assigning a plain
# list is positional, so the vectors stay aligned with the (shuffled) rows
# regardless of the index labels.
data_frame_embedding['embeddings'] = list(message_vectors)
data_frame_embedding.head()

Unnamed: 0,messages,label,embeddings
89,complete assessmentplease complete assessment ...,1,"[-0.06800712, 0.011791748, 0.0048056487, 0.029..."
314,careful review decided pursue candidates role,3,"[-0.01601289, 0.03447098, -0.027226582, 0.0244..."
291,regret inform chosen move forward applicants,3,"[-0.0044419556, 0.052336972, 0.07175572, 0.007..."
88,next step complete assessmentcomplete attached...,1,"[-0.035205007, -0.004156081, 0.0038351587, 0.0..."
178,upcoming interviewyour interview set please re...,2,"[-0.10671389, 0.10529863, 0.028867992, 0.00845..."


In [29]:
# Separate the features (embedding vectors) from the targets (labels) as plain
# Python lists, ready to be split into train and test sets.
X = data_frame_embedding['embeddings'].to_list()
y = data_frame_embedding['label'].to_list()
# Show the container types of both objects.
(type(X), type(y))

(list, list)

In [30]:
from sklearn.model_selection import train_test_split

# Split into train & test sets: 20% of the samples are held out for testing.
# stratify=y keeps the label proportions the same in both subsets, and the
# fixed random_state makes the split (and therefore every metric computed from
# it) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [34]:
# Training a Logistic Regression model on the embedding vectors.
from sklearn.linear_model import LogisticRegression

# The `multi_class` argument is deprecated in recent scikit-learn releases and
# is therefore no longer passed. With the 'lbfgs' solver the default behaviour
# already fits a multinomial model when the target has more than two classes,
# so the fitted model is the same. max_iter is raised from the default so the
# solver has room to converge.
model_LR = LogisticRegression(max_iter=1000, solver='lbfgs')
model_LR.fit(X_train, y_train)



In [35]:
from sklearn import metrics

# Run the trained classifier on the held-out test split.
test_prediction = model_LR.predict(X_test)

# Accuracy plus macro-averaged precision and recall (macro = unweighted mean
# over the classes).
# NOTE(review): if every score comes out as a perfect 1.0, check whether
# near-identical messages ended up on both sides of the split - TODO confirm.
accuracy = metrics.accuracy_score(y_test, test_prediction)
macro_precision = metrics.precision_score(y_test, test_prediction, average='macro')
macro_recall = metrics.recall_score(y_test, test_prediction, average='macro')

print("Logistic Regression Accuracy:", accuracy)
print("Logistic Regression Precision (macro):", macro_precision)
print("Logistic Regression Recall (macro):", macro_recall)

Logistic Regression Accuracy: 1.0
Logistic Regression Precision (macro): 1.0
Logistic Regression Recall (macro): 1.0


In [56]:
# Prediction on actual emails
# Human-readable category names, keyed by the integer class label (0..3).
label_mappings = dict(enumerate([
    'Applied Jobs',
    'Assessments',
    'Interview Scheduled',
    'Rejections',
]))

# Sample real-world e-mail (an application acknowledgement) to classify.
email_message = """Hi Nilay,

We've received your application for R32881 - Autonomy Intern- ML/AI position and are excited about your interest in moving the world forward at Oshkosh! We're currently reviewing all applications for this role and will be in touch as soon as possible.

In the meantime, we encourage you to explore how we're making a difference at www.oshkoshcorp.com.

We appreciate you considering us for your next career opportunity!

Thank you,

Oshkosh Talent Acquisition Team"""

In [57]:
# Embed the sample e-mail and wrap the vector in a list, giving the
# one-sample-per-row input that the classifier's predict() expects.
# NOTE(review): the training messages look pre-cleaned (lower-cased, stop words
# and punctuation stripped), whereas this raw e-mail is encoded as-is. That
# mismatch may explain a wrong prediction for this message - confirm, and apply
# the same cleaning step here before encoding.
email_vector = model_embedding.encode(email_message)
email_message_embeddings = [email_vector]
email_message_embeddings

[array([-4.29931432e-02, -3.42442058e-02,  3.85792069e-02, -2.54681539e-02,
         5.86832426e-02, -4.38545235e-02, -1.26685472e-02,  4.30225506e-02,
        -8.52764174e-02, -1.82065163e-02, -8.63190517e-02, -1.75612997e-02,
         5.44540398e-03,  1.87922716e-02, -1.05846589e-02,  5.18034622e-02,
         3.76903340e-02, -7.49456957e-02, -8.10540393e-02, -6.53143227e-02,
        -1.43497041e-03, -3.82996164e-02,  4.15813476e-02, -1.24139851e-02,
        -1.71025656e-02,  4.70296927e-02,  4.97256666e-02, -3.21901706e-03,
        -2.02992111e-02, -5.48825301e-02,  2.72424687e-02, -1.40126999e-02,
         6.21387586e-02, -1.18452478e-02,  9.04961750e-02,  1.17827542e-01,
         2.42999606e-02, -8.64721015e-02,  2.13957168e-02,  1.77384131e-02,
        -1.23494286e-02, -6.67191669e-02, -4.74820286e-03, -3.47712799e-03,
        -5.83515642e-03, -1.13718761e-02, -6.71023056e-02, -4.30286787e-02,
         4.05420922e-02,  4.79223691e-02, -5.40285856e-02, -1.10059172e-01,
         3.4

In [58]:
# Classify the embedded e-mail; predict() returns one label per input row, so
# take the first (only) entry and translate it into its category name.
predicted_labels = model_LR.predict(email_message_embeddings)
category_prediction = predicted_labels[0]
label_mappings[category_prediction]

'Rejections'