In [28]:
import re # for cleaning Resume_str
import pandas as pd

### Load file

In [29]:
# Location of the resume dataset; a relative path, so it is resolved against
# the notebook's working directory.
file_path = 'Resume.csv'
# Load the whole CSV into a DataFrame that the rest of the notebook works on.
df = pd.read_csv(file_path)

### Data observation

In [30]:
# Show which columns the loaded file provides.
df.columns

Index(['ID', 'Resume_str', 'Resume_html', 'Category'], dtype='object')

In [31]:
# Remove the Resume_html column; the rest of the notebook only reads ID,
# Resume_str and Category.
# errors='ignore' keeps this cell re-runnable: because the result is assigned
# back to `df`, a second execution would otherwise raise KeyError on the
# already-removed column.
df = df.drop(columns=['Resume_html'], errors='ignore')
df.columns

Index(['ID', 'Resume_str', 'Category'], dtype='object')

In [32]:
# Preview the first five rows.
df.head(5)

Unnamed: 0,ID,Resume_str,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...",HR
2,33176873,HR DIRECTOR Summary Over 2...,HR
3,27018550,HR SPECIALIST Summary Dedica...,HR
4,17812897,HR MANAGER Skill Highlights ...,HR


In [33]:
# Look at the raw text of the resume stored under index label 0.
df.Resume_str[0]

"         HR ADMINISTRATOR/MARKETING ASSOCIATE\n\nHR ADMINISTRATOR       Summary     Dedicated Customer Service Manager with 15+ years of experience in Hospitality and Customer Service Management.   Respected builder and leader of customer-focused teams; strives to instill a shared, enthusiastic commitment to customer service.         Highlights         Focused on customer satisfaction  Team management  Marketing savvy  Conflict resolution techniques     Training and development  Skilled multi-tasker  Client relations specialist           Accomplishments      Missouri DOT Supervisor Training Certification  Certified by IHG in Customer Loyalty and Marketing by Segment   Hilton Worldwide General Manager Training Certification  Accomplished Trainer for cross server hospitality systems such as    Hilton OnQ  ,   Micros    Opera PMS   , Fidelio    OPERA    Reservation System (ORS) ,   Holidex    Completed courses and seminars in customer service, sales strategies, inventory control, loss pr

In [34]:
# Summarise column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2484 entries, 0 to 2483
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ID          2484 non-null   int64 
 1   Resume_str  2484 non-null   object
 2   Category    2484 non-null   object
dtypes: int64(1), object(2)
memory usage: 58.3+ KB


In [35]:
# Group the resumes by their Category label; df_gb is reused further down when
# per-category average embeddings are computed.
df_gb = df.groupby('Category')
# Report how many distinct categories exist and how many resumes each one has.
print('Number of Category: {}'.format(df_gb.ngroups))
print(df_gb.size())

Number of Category: 24
Category
ACCOUNTANT                118
ADVOCATE                  118
AGRICULTURE                63
APPAREL                    97
ARTS                      103
AUTOMOBILE                 36
AVIATION                  117
BANKING                   115
BPO                        22
BUSINESS-DEVELOPMENT      120
CHEF                      118
CONSTRUCTION              112
CONSULTANT                115
DESIGNER                  107
DIGITAL-MEDIA              96
ENGINEERING               118
FINANCE                   118
FITNESS                   117
HEALTHCARE                115
HR                        110
INFORMATION-TECHNOLOGY    120
PUBLIC-RELATIONS          111
SALES                     116
TEACHER                   102
dtype: int64


### Preprocess data

In [36]:
def clean_spaces(s):
    """Trim surrounding whitespace and squeeze runs of spaces into one space.

    Only the literal space character is collapsed inside the string; interior
    tabs and newlines are left exactly as they are.

    Args:
        s: The string to clean.

    Returns:
        The cleaned string.
    """
    trimmed = s.strip()
    return re.sub('[ ]+', ' ', trimmed)

# Todo:
# add more preprocessing functions to preprocessor

def preprocessor(df):
    """Clean the ``Resume_str`` column of ``df`` and return the frame.

    Each value of the column is passed through ``clean_spaces``. The column is
    overwritten on the frame that was passed in, so the caller's object is
    modified; that same frame is returned.

    Args:
        df: DataFrame with a ``Resume_str`` column.

    Returns:
        The same DataFrame, with ``Resume_str`` cleaned.
    """
    cleaned_text = df['Resume_str'].apply(clean_spaces)
    df['Resume_str'] = cleaned_text

    return df

In [37]:
# Run the text clean-up on the resumes; preprocessor also updates the frame in place.
df = preprocessor(df)

### Map Resume_str to an embedding (vector)

In [38]:
import torch
from sentence_transformers import SentenceTransformer, util

In [39]:
# Load the pretrained sentence-embedding model by name.
model = SentenceTransformer('all-distilroberta-v1')
# Set the model's max_seq_length to 512 — presumably the per-input token limit
# beyond which text is truncated; confirm against the library documentation.
model.max_seq_length = 512

# Corpus with resumes (one string per row of df, in row order)
Resume_corpus = df['Resume_str'].tolist()

# Calculate the embedding for every resume_str
corpus_embeddings = model.encode(Resume_corpus)
print(corpus_embeddings.shape)

(2484, 768)


### Apply k-Means clustering on the embeddings

In [40]:
import numpy as np
from sklearn.cluster import KMeans

# Use one cluster per distinct Category value so the clustering can be
# compared against the labelled categories.
num_clusters = df.groupby('Category').ngroups # 24
# k-means starts from randomly initialised centroids; fixing random_state makes
# the cluster assignments reproducible across kernel restarts.
clustering_model = KMeans(n_clusters=num_clusters, random_state=42)

clustering_model.fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_ # Get the clustered label for each embedding
print(cluster_assignment.shape)

# clustered_resumes[c] collects the resume texts (not the embeddings) assigned
# to cluster c; sentence_id indexes Resume_corpus and corpus_embeddings alike.
clustered_resumes = [[] for _ in range(num_clusters)]
for sentence_id, cluster_id in enumerate(cluster_assignment):
    clustered_resumes[cluster_id].append(Resume_corpus[sentence_id])

(2484,)


In [41]:
# Report how many resumes landed in every cluster, numbering clusters from 1.
print('Number of resumes in each cluster')
for position, members in enumerate(clustered_resumes, start=1):
    print('Cluster {}: {}'.format(position, len(members)))

Number of resumes in each cluster
Cluster 1: 47
Cluster 2: 82
Cluster 3: 41
Cluster 4: 190
Cluster 5: 204
Cluster 6: 133
Cluster 7: 131
Cluster 8: 105
Cluster 9: 116
Cluster 10: 98
Cluster 11: 137
Cluster 12: 53
Cluster 13: 107
Cluster 14: 86
Cluster 15: 95
Cluster 16: 92
Cluster 17: 143
Cluster 18: 105
Cluster 19: 103
Cluster 20: 72
Cluster 21: 34
Cluster 22: 118
Cluster 23: 100
Cluster 24: 92


In [42]:
# Todo:
# 1. Visualize the clustered result
# 2. Try applying other clustering methods
# 3. Other analyses that can be performed on the clusters, e.g. topic modeling?

### Find the cross domain (category) resumes

In [43]:
def get_avg_embeddings(df_gb, group_name):
    """Embed every resume of one group and return the group's mean embedding.

    Encoding is done with the module-level ``model``. The group name and the
    shapes of both tensors are printed as progress output.

    Args:
        df_gb: Grouped frame; ``get_group`` is called on it and the selected
            group must expose a ``Resume_str`` column.
        group_name: Key of the group to select.

    Returns:
        The mean of the group's embeddings taken over dim 0, with that
        dimension kept (``keepdim=True``), so the result has a single row.
    """
    print('Group name: {}'.format(group_name))

    # Resume texts that belong to the requested group
    texts = df_gb.get_group(group_name)['Resume_str'].tolist()

    embeddings = model.encode(texts, convert_to_tensor=True)
    print('Shape of group_corpus_embeddings: {}'.format(embeddings.shape))

    mean_embedding = embeddings.mean(dim=0, keepdim=True)
    print('Shape of group_avg_embeddings: {}'.format(mean_embedding.shape))

    return mean_embedding

In [44]:
def search_resumeID(query_embedding, corpus_embeddings, top_k):
    """Print the IDs and scores of the resumes most similar to a query embedding.

    Reads the resume IDs from the module-level ``df``. This assumes the rows
    of ``corpus_embeddings`` are in the same order as the rows of ``df``
    (verify whenever ``df`` is filtered or re-sorted after encoding).

    Args:
        query_embedding: Embedding to search with; only the hits of the first
            query are reported.
        corpus_embeddings: Embeddings to search in, one per resume.
        top_k: Number of best matches to print.

    Returns:
        None. One line per hit is printed: the resume ID followed by its score.
    """
    # semantic_search returns one hit list per query; only the first is used.
    hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=top_k)[0]
    resume_ids = df['ID']
    for hit in hits:
        # corpus_id is a position inside corpus_embeddings, not an index label,
        # so look the ID up positionally; label lookup only matches while df
        # keeps a default RangeIndex.
        print(resume_ids.iloc[hit['corpus_id']], "(Score: {:.4f})".format(hit['score']))

In [45]:
# Mean embedding of each of the two categories to be combined below.
ACCNT_avg_embedding = get_avg_embeddings(df_gb=df_gb, group_name='ACCOUNTANT')
IT_avg_embedding = get_avg_embeddings(df_gb=df_gb, group_name='INFORMATION-TECHNOLOGY')

Group name: ACCOUNTANT
Shape of group_corpus_embeddings: torch.Size([118, 768])
Shape of group_avg_embeddings: torch.Size([1, 768])
Group name: INFORMATION-TECHNOLOGY
Shape of group_corpus_embeddings: torch.Size([120, 768])
Shape of group_avg_embeddings: torch.Size([1, 768])


In [46]:
# Use the midpoint of the two category means as the query, then list the five
# resumes whose embeddings score highest against it.
query_embedding = (ACCNT_avg_embedding + IT_avg_embedding) / 2
search_resumeID(query_embedding, corpus_embeddings, top_k=5)

27637576 (Score: 0.8978)
11441764 (Score: 0.8915)
23636277 (Score: 0.8828)
22492537 (Score: 0.8826)
25127518 (Score: 0.8812)


In [47]:
# Todo:
# 1. Cross 3 or more domains (categories)
# 2. Find qualitative examples (we can show in report/presentation)
# 3. Other vector (embedding) operations to perform, i.e., other tasks