# **Resume ChatBot**

In [226]:
import sqlite3
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,VotingClassifier
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.preprocessing import StandardScaler
import joblib

In [2]:
import nltk
from nltk.tokenize import word_tokenize
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from scipy.stats import pearsonr

In [3]:
from sklearn.model_selection import GridSearchCV

In [4]:
data = pd.read_csv('resume_screening_dataset.csv')

In [5]:
conn = sqlite3.connect('resume.db')

In [6]:
data.to_sql('Resume',conn,if_exists='replace')

5000

In [7]:
df = pd.read_sql('SELECT * FROM Resume',conn)

In [8]:
df.head()

Unnamed: 0,index,resume_text,job_description,qualified
0,0,"AI Researcher with skills in Cybersecurity, Et...",Looking for a AI Researcher with experience in...,1
1,1,"Data Scientist with skills in Python, Machine ...",Looking for a Data Scientist with experience i...,0
2,2,"Data Scientist with skills in SQL, PostgreSQL,...",Looking for a Data Scientist with experience i...,1
3,3,Cybersecurity Analyst with skills in Cybersecu...,Looking for a Cybersecurity Analyst with exper...,1
4,4,"IT Support Specialist with skills in Java, Spr...",Looking for a IT Support Specialist with exper...,0


## Data Preprocessing

1) **LowerCase**

In [9]:
def to_lowercase(text):
    df[text] = df[text].str.lower()

In [10]:
to_lowercase('resume_text')
to_lowercase('job_description')

2) **Remove Special Characters**

In [11]:
def clean_text(text):
    cleaned_text = re.sub(r'[^a-zA-Z\s]','',text)
    return  cleaned_text

In [12]:
df['resume_text'] = df['resume_text'].apply(clean_text)
df['job_description'] = df['job_description'].apply(clean_text)

3) **Tokenization**

In [13]:
from nltk.tokenize import word_tokenize

nltk.download('punkt_tab')

def tokenize_text(text):
    return word_tokenize(text)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\madha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [14]:
df['resume_text_cleaned'] = df['resume_text'].apply(tokenize_text)
df['job_description_cleaned'] = df['job_description'].apply(tokenize_text)

In [15]:
df.head()

Unnamed: 0,index,resume_text,job_description,qualified,resume_text_cleaned,job_description_cleaned
0,0,ai researcher with skills in cybersecurity eth...,looking for a ai researcher with experience in...,1,"[ai, researcher, with, skills, in, cybersecuri...","[looking, for, a, ai, researcher, with, experi..."
1,1,data scientist with skills in python machine l...,looking for a data scientist with experience i...,0,"[data, scientist, with, skills, in, python, ma...","[looking, for, a, data, scientist, with, exper..."
2,2,data scientist with skills in sql postgresql m...,looking for a data scientist with experience i...,1,"[data, scientist, with, skills, in, sql, postg...","[looking, for, a, data, scientist, with, exper..."
3,3,cybersecurity analyst with skills in cybersecu...,looking for a cybersecurity analyst with exper...,1,"[cybersecurity, analyst, with, skills, in, cyb...","[looking, for, a, cybersecurity, analyst, with..."
4,4,it support specialist with skills in java spri...,looking for a it support specialist with exper...,0,"[it, support, specialist, with, skills, in, ja...","[looking, for, a, it, support, specialist, wit..."


In [16]:
from nltk.corpus import stopwords

nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\madha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
def remove_stopwords(text):
    filtered_words = [word for word in text if word.lower() not in stop_words]
    return ' '.join(filtered_words)

In [18]:
df['resume_text_cleaned_2'] = df['resume_text_cleaned'].apply(remove_stopwords)
df['job_description_cleaned_2'] = df['job_description_cleaned'].apply(remove_stopwords)

In [19]:
df['resume_text_cleaned_2'] = df['resume_text_cleaned_2'].apply(lambda x: x.split())

In [20]:
df['job_description_cleaned_2'] = df['job_description_cleaned_2'].apply(lambda x: x.split())

In [21]:
df.head()

Unnamed: 0,index,resume_text,job_description,qualified,resume_text_cleaned,job_description_cleaned,resume_text_cleaned_2,job_description_cleaned_2
0,0,ai researcher with skills in cybersecurity eth...,looking for a ai researcher with experience in...,1,"[ai, researcher, with, skills, in, cybersecuri...","[looking, for, a, ai, researcher, with, experi...","[ai, researcher, skills, cybersecurity, ethica...","[looking, ai, researcher, experience, cybersec..."
1,1,data scientist with skills in python machine l...,looking for a data scientist with experience i...,0,"[data, scientist, with, skills, in, python, ma...","[looking, for, a, data, scientist, with, exper...","[data, scientist, skills, python, machine, lea...","[looking, data, scientist, experience, linux, ..."
2,2,data scientist with skills in sql postgresql m...,looking for a data scientist with experience i...,1,"[data, scientist, with, skills, in, sql, postg...","[looking, for, a, data, scientist, with, exper...","[data, scientist, skills, sql, postgresql, mys...","[looking, data, scientist, experience, sql, po..."
3,3,cybersecurity analyst with skills in cybersecu...,looking for a cybersecurity analyst with exper...,1,"[cybersecurity, analyst, with, skills, in, cyb...","[looking, for, a, cybersecurity, analyst, with...","[cybersecurity, analyst, skills, cybersecurity...","[looking, cybersecurity, analyst, experience, ..."
4,4,it support specialist with skills in java spri...,looking for a it support specialist with exper...,0,"[it, support, specialist, with, skills, in, ja...","[looking, for, a, it, support, specialist, wit...","[support, specialist, skills, java, spring, bo...","[looking, support, specialist, experience, jav..."


In [22]:
df_resume = df[['resume_text_cleaned_2','job_description_cleaned_2','qualified']]

In [23]:
df_resume.head()

Unnamed: 0,resume_text_cleaned_2,job_description_cleaned_2,qualified
0,"[ai, researcher, skills, cybersecurity, ethica...","[looking, ai, researcher, experience, cybersec...",1
1,"[data, scientist, skills, python, machine, lea...","[looking, data, scientist, experience, linux, ...",0
2,"[data, scientist, skills, sql, postgresql, mys...","[looking, data, scientist, experience, sql, po...",1
3,"[cybersecurity, analyst, skills, cybersecurity...","[looking, cybersecurity, analyst, experience, ...",1
4,"[support, specialist, skills, java, spring, bo...","[looking, support, specialist, experience, jav...",0


## Text embeddings

In [24]:
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [25]:
from torch.nn.functional import cosine_similarity

In [26]:
import torch

In [27]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [None]:
df_resume['resume_text_cleaned_3'] = df_resume['resume_text_cleaned_2'].apply(lambda x: model.encode(x,convert_to_tensor=True))
df_resume['job_description_cleaned_3'] = df_resume['job_description_cleaned_2'].apply(lambda x: model.encode(x,convert_to_tensor=True))

In [29]:
df_resume.head()

Unnamed: 0,resume_text_cleaned_2,job_description_cleaned_2,qualified,resume_text_cleaned_3,job_description_cleaned_3
0,"[ai, researcher, skills, cybersecurity, ethica...","[looking, ai, researcher, experience, cybersec...",1,"[[tensor(-0.0365), tensor(-0.0152), tensor(0.0...","[[tensor(-0.0874), tensor(-0.0195), tensor(-0...."
1,"[data, scientist, skills, python, machine, lea...","[looking, data, scientist, experience, linux, ...",0,"[[tensor(0.0262), tensor(0.0674), tensor(-0.01...","[[tensor(-0.0874), tensor(-0.0195), tensor(-0...."
2,"[data, scientist, skills, sql, postgresql, mys...","[looking, data, scientist, experience, sql, po...",1,"[[tensor(0.0262), tensor(0.0674), tensor(-0.01...","[[tensor(-0.0874), tensor(-0.0195), tensor(-0...."
3,"[cybersecurity, analyst, skills, cybersecurity...","[looking, cybersecurity, analyst, experience, ...",1,"[[tensor(-0.0408), tensor(-0.0063), tensor(-0....","[[tensor(-0.0874), tensor(-0.0195), tensor(-0...."
4,"[support, specialist, skills, java, spring, bo...","[looking, support, specialist, experience, jav...",0,"[[tensor(-0.0943), tensor(0.0330), tensor(0.01...","[[tensor(-0.0874), tensor(-0.0195), tensor(-0...."


In [30]:
def calculate_similarity(resume_embedding, job_embedding):
    
    resume_embedding = torch.tensor(resume_embedding) if not isinstance(resume_embedding, torch.Tensor) else resume_embedding
    job_embedding = torch.tensor(job_embedding) if not isinstance(job_embedding, torch.Tensor) else job_embedding

    resume_embedding = resume_embedding.view(-1)
    job_embedding = job_embedding.view(-1)

    
    print(f"Resume Embedding Shape: {resume_embedding.shape}")
    print(f"Job Embedding Shape: {job_embedding.shape}")

    if resume_embedding.shape != job_embedding.shape:
        min_size = min(resume_embedding.shape[0], job_embedding.shape[0])
        resume_embedding = resume_embedding[:min_size]
        job_embedding = job_embedding[:min_size]

    similarity = cosine_similarity(resume_embedding.unsqueeze(0), job_embedding.unsqueeze(0))  
    return similarity.item() 



In [None]:
df_resume['similarity_score'] = df_resume.apply(
    lambda row: calculate_similarity(row['resume_text_cleaned_3'], row['job_description_cleaned_3']),
    axis=1
)

In [258]:
df_resume['Selected'] = df_resume['similarity_score'].apply(lambda x: 'Yes' if x>0.52 else 'No' )

In [259]:
df_selected = df_resume[df_resume['Selected']=='Yes']

In [260]:
df_resume['Selected_Numeric'] = df_resume['Selected'].map({'Yes':1,'No':0})

In [261]:
accuracy = accuracy_score(df_resume['qualified'],df_resume['Selected_Numeric'])
print(f'Model Accuracy: {accuracy*100:.2f}%')

Model Accuracy: 65.74%


## Model Building

In [262]:
X = np.asarray(df_resume['Selected_Numeric']).reshape(-1,1)
y = np.asarray(df_resume['qualified']).reshape(-1,1)

In [263]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=42)

In [264]:
model_rf = RandomForestClassifier(n_estimators=100,max_depth=5,random_state=42)

In [265]:
model_rf.fit(X_train,y_train)

  return fit_method(estimator, *args, **kwargs)


In [266]:
ypred_rf = model_rf.predict(X_test)

In [267]:
accuracy_rf = accuracy_score(y_test,ypred_rf)
print(f'Model Accuracy: {accuracy_rf*100:.2f}%')

Model Accuracy: 64.13%


In [268]:
joblib.dump(model_rf,'random_forest_model_2.pkl')

['random_forest_model_2.pkl']

2) **SVM**

In [269]:
model_svm = svm.SVC(C=1,kernel='rbf')

In [270]:
model_svm.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


In [271]:
ypred_svm = model_svm.predict(X_test)

In [272]:
accuracy_svm = accuracy_score(y_test,ypred_svm)
print(f'Model Accuracy: {accuracy_svm*100:.2f}%')

Model Accuracy: 64.13%


In [273]:
# joblib.dump(model_svm,'resume_screening_model.pkl')

3) **XGBoost**

In [274]:
model_xg = XGBClassifier(n_estimators=100,max_depth=5,learning_rate=0.1)

In [275]:
model_xg.fit(X_train,y_train)

In [276]:
ypred_xg = model_xg.predict(X_test)

In [277]:
accuracy_xg = accuracy_score(y_test,ypred_xg)
print(f'Model Accuracy: {accuracy_xg*100:.2f}%')

Model Accuracy: 64.13%


4) **Logistic Regression**

In [278]:
model_lr = LogisticRegression(C=0.1,solver='liblinear')

In [279]:
model_lr.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


In [280]:
ypred_lr = model_lr.predict(X_test)

In [281]:
accuracy_lr = accuracy_score(y_test,ypred_lr)
print(f'Model Accuracy: {accuracy_lr*100:.2f}%')

Model Accuracy: 63.67%


5) **Combined Model**

In [282]:
ensemble = VotingClassifier(estimators=[('model_rf',model_rf),('model_lr',model_lr),('model_xg',model_xg)],voting='soft')

In [283]:
ensemble.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [284]:
ypred_en = ensemble.predict(X_test)

In [285]:
accuracy_en = accuracy_score(y_test,ypred_en)
print(f'Model Accuracy: {accuracy_en*100:.2f}%')

Model Accuracy: 64.13%
