In [1]:
#Install nltk package
#pip install nltk

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import string
import re
from collections import Counter
import nltk
from scipy.sparse import csr_matrix
import os

In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zhang\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\zhang\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
#read data
ldata = pd.read_csv('_large_data2.csv')

In [4]:
test_data = pd.read_csv('test_data.csv')

In [5]:
ldata['index No.'] = range(len(ldata))

In [6]:
# Combine 'index No.' and 'job_name' columns to create the combined index column
ldata['combined_index'] = [f"{index}_{name}" for name, index in zip(ldata['job_name'], ldata['index No.'])]

In [7]:
#filter data - Use job descriptions using engligh only:
en_prob_mask = ldata['en_prob'] >= 0.9
ldata = ldata[en_prob_mask]

In [8]:
# data scientist jobs are included 4x the amount. Only keeping 1/4 of them
less_ds_mask = ~((ldata['job_name'] == "Data Scientist") & (ldata['job_index'] > 25))
ldata = ldata[less_ds_mask]
ldata.shape[0]

119127

In [9]:
ldata.head()

Unnamed: 0,index,job_name,job_index,description,extracted_text,tokenized_text,en_prob,index No.,combined_index
0,0,Accountant,0,"<div class=""jobsearch-jobDescriptionText"" id=""...",sfd professional corporation is a small but ex...,"['sfd', 'professional', 'corporation', 'is', '...",1.0,0,0_Accountant
1,1,Accountant,1,"<div class=""jobsearch-jobDescriptionText"" id=""...",job no: 512135 brand: flight centre travel gro...,"['job', 'no', 'brand', 'flight', 'centre', 'tr...",1.0,1,1_Accountant
2,2,Accountant,2,"<div class=""jobsearch-jobDescriptionText"" id=""...","the future you want is within reach. at pcl, y...","['future', 'you', 'want', 'is', 'within', 'rea...",1.0,2,2_Accountant
3,3,Accountant,3,"<div class=""jobsearch-jobDescriptionText"" id=""...",junior accountant an enthusiastic and motivate...,"['junior', 'accountant', 'an', 'enthusiastic',...",1.0,3,3_Accountant
4,4,Accountant,4,"<div class=""jobsearch-jobDescriptionText"" id=""...",az accounting firm is a professional cpa firm ...,"['az', 'accounting', 'firm', 'is', 'profession...",1.0,4,4_Accountant


In [10]:
# Only use jobs with 50+ postings
value_counts = ldata['job_name'].value_counts()

jobs_50_count = value_counts[value_counts > 50].index
jobs_mask = ldata['job_name'].isin(list(jobs_50_count))

ldata = ldata[jobs_mask]
ldata['job_name'].value_counts()

job_name
Administrator             2454
Administration Manager    2451
Accountant                2432
Administration Staff      2355
Accounts Manager          2348
                          ... 
Homeopath                   68
Reprographic Assistant      60
Maths Teacher               56
Cartographer                55
Horse Dealer                55
Name: count, Length: 162, dtype: int64

In [11]:
# Load NLTK stopwords
stop_words = set(stopwords.words('english'))
#remove words such as is, to ...
additional_stop_words = {'is', 'to', 'with', 'and', 'for', 'in', 'on', 'of', 'are', 'we', 'you','apply','resume','experience','etc.','a','an'}

In [12]:
def preprocess_text(text):
    # Tokenize the text
    tokens = re.findall(r'\b\w+\b', text.lower())
    # Remove stop words and additional stop words
    tokens = [token for token in tokens if token not in stop_words and token not in additional_stop_words]
    # Join tokens back into a string
    return ' '.join(tokens)

In [13]:
# Add a new column with tokenized text but with filler words removed
ldata['clean_tokenized_text'] = ldata['tokenized_text'].apply(preprocess_text)

In [21]:
# Load the .npy file
indices = np.load('_train_data_index.npy')
testindices = np.load('_test_data_index.npy')

train_data = ldata[ldata['index'].isin(indices)]
test_data = ldata[ldata['index'].isin(testindices)]

In [22]:
test_data

Unnamed: 0,index,job_name,job_index,description,extracted_text,tokenized_text,en_prob,index No.,combined_index,clean_tokenized_text
5,5,Accountant,5,"<div class=""jobsearch-jobDescriptionText"" id=""...",position available: accountant about us segway...,"['position', 'available', 'accountant', 'about...",1.0,5,5_Accountant,position available accountant us segway powers...
14,14,Accountant,14,"<div class=""jobsearch-jobDescriptionText"" id=""...",assisting senior accountants in the preparatio...,"['assisting', 'senior', 'accountants', 'prepar...",1.0,14,14_Accountant,assisting senior accountants preparation month...
16,16,Accountant,16,"<div class=""jobsearch-jobDescriptionText"" id=""...",junior accountant role: to provide accounting ...,"['junior', 'accountant', 'role', 'provide', 'a...",1.0,16,16_Accountant,junior accountant role provide accounting supp...
17,17,Accountant,17,"<div class=""jobsearch-jobDescriptionText"" id=""...",staff accountant stern cohen llp is currently ...,"['staff', 'accountant', 'stern', 'cohen', 'llp...",1.0,17,17_Accountant,staff accountant stern cohen llp currently rec...
19,19,Accountant,19,"<div class=""jobsearch-jobDescriptionText"" id=""...","choose local, choose welch llp welch llp is id...","['choose', 'local', 'choose', 'welch', 'llp', ...",1.0,19,19_Accountant,choose local choose welch llp welch llp ideal ...
...,...,...,...,...,...,...,...,...,...,...
135589,135589,Tree Surgeon,335,"<section class=""cached-bot-fjv-skeletonstyle__...",company overview: whether you are experienced ...,"['company', 'overview', 'whether', 'you', 'are...",1.0,135589,135589_Tree Surgeon,company overview whether experienced green ind...
135590,135590,Tree Surgeon,336,"<section class=""cached-bot-fjv-skeletonstyle__...",overview : if you enjoy working outdoors in a ...,"['overview', 'if', 'you', 'enjoy', 'working', ...",1.0,135590,135590_Tree Surgeon,overview enjoy working outdoors professional s...
135592,135592,Tree Surgeon,338,"<section class=""cached-bot-fjv-skeletonstyle__...",what are you looking for in an arboricultural ...,"['what', 'are', 'you', 'looking', 'an', 'arbor...",1.0,135592,135592_Tree Surgeon,looking arboricultural career want make lastin...
135593,135593,Tree Surgeon,339,"<section class=""cached-bot-fjv-skeletonstyle__...",overview : if you enjoy working outdoors in a...,"['overview', 'if', 'you', 'enjoy', 'working', ...",1.0,135593,135593_Tree Surgeon,overview enjoy working outdoors professional s...


In [23]:
# Create TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the text data
tfidf_matrix = vectorizer.fit_transform(train_data['clean_tokenized_text'])

In [24]:
def find_similar_jobs(input_description, df, vectorizer, top_n=10):
    # Preprocess the input job description
    input_description_update = input_description.replace('\n', ' ')
    clean_text = preprocess_text(input_description_update)
    
    # Vectorize the input job description summary
    input_vector = vectorizer.transform([clean_text])
    
    # Compute cosine similarity with original job descriptions' summaries
    similarities = cosine_similarity(input_vector, tfidf_matrix)[0]
    
    # Get indices of top N similar job descriptions
    top_indices = similarities.argsort()[-top_n:][::-1]
    
    # Retrieve top N similar job descriptions from original database
    similar_jobs = df.iloc[top_indices][['index','job_name', 'clean_tokenized_text', 'extracted_text']]
    
    # Add similarity score to the DataFrame
    similar_jobs['similarity_score'] = similarities[top_indices]
    
    return similar_jobs

In [25]:
# Test
input_description = "An Accountant, or Senior Accounting Professional, is responsible for performing complex accounting needs for their employer. Their duties include assuming a leadership role to guide accounting personnel in daily tasks, reviewing financial data to make forecasting decisions and communicating with company Executives about financial strategies"
similar_jobs = find_similar_jobs(input_description, train_data, vectorizer)
print(similar_jobs)

         index            job_name  \
26582    26582   Advertising Agent   
9437      9437      Accounts Staff   
105023  105023          Roadworker   
6861      6861    Accounts Manager   
26643    26643   Advertising Agent   
9180      9180      Accounts Staff   
104958  104958          Roadworker   
26604    26604   Advertising Agent   
26894    26894   Advertising Agent   
2847      2847  Accounts Assistant   

                                     clean_tokenized_text  \
26582   partner saas company security industry short h...   
9437    position summary new logo sales account execut...   
105023  position summary new logo sales account execut...   
6861    position summary new logo sales account execut...   
26643   strategic account executive partnerships saas ...   
9180    strategic account executive partnerships saas ...   
104958  job brief sales director job description looki...   
26604   pronavigator pronavigator insurance technology...   
26894   koda brands like work ha

In [26]:
def find_similar_jobscompare(input_description, df, vectorizer, top_n=10):
    # Preprocess the input job description
    input_description_update = input_description.replace('\n', ' ')
    clean_text = preprocess_text(input_description_update)
    
    # Vectorize the input job description summary
    input_vector = vectorizer.transform([clean_text])
    
    # Compute cosine similarity with original job descriptions' summaries
    similarities = cosine_similarity(input_vector, tfidf_matrix)[0]
    
    # Get indices of top N similar job descriptions
    top_indices = similarities.argsort()[-top_n:][::-1]
    
    # Retrieve top N similar job descriptions from original database
    similar_jobs = df.iloc[top_indices][['index','job_name', 'clean_tokenized_text', 'extracted_text']]
    
    # Add similarity score to the DataFrame
    similar_jobs['similarity_score'] = similarities[top_indices]
    
    return similar_jobs

In [None]:
print(test_data.loc[121274, 'extracted_text'])

In [59]:
# Define the number of top similar jobs to consider
top_n = 10

# Iterate over each row in 'test_data' and find similar jobs in 'train_data'
for i, row in test_data[23727:23728].iterrows():
    # Extract job description from 'test_data'
    input_description = row['extracted_text']
    
    # Find similar jobs in 'train_data'
    similar_jobs = find_similar_jobscompare(input_description, train_data, vectorizer, top_n=top_n)
    
    # Extract top N similar job names
    similar_job_names = similar_jobs['job_name'].tolist()
    
    # Add the top N similar job names as individual columns in 'test_data'
    for j in range(top_n):
        column_name = f'similar_job_{j+1}'
        test_data.at[i, column_name] = similar_job_names[j]

In [62]:
# Initialize a list to store the counts
job_name_counts = []

# Iterate over each row in the DataFrame
for index, row in test_data.iterrows():
    # Count how many job titles match the job_name
    count = sum(row[f'similar_job_{i}'] == row['job_name'] for i in range(1, 11))
    # Append the count to the list
    job_name_counts.append(count)

# Add the counts as a new column to the DataFrame
test_data['job_name_count'] = job_name_counts

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['job_name_count'] = job_name_counts


In [5]:
# Initialize a list to store the counts
job_name_counts_first_5 = []

# Iterate over each row in the DataFrame
for index, row in test_data.iterrows():
    # Count how many job titles match the job_name in the first 5 columns
    count = sum(row[f'similar_job_{i}'] == row['job_name'] for i in range(1, 6))
    # Append the count to the list
    job_name_counts_first_5.append(count)

# Add the counts as a new column to the DataFrame
test_data['job_name_count_first_5'] = job_name_counts_first_5

In [64]:
#Add a column to show % of the job_name that is the same as the test data
test_data['job_name_count_normalized'] = test_data['job_name_count'] / 10

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['job_name_count_normalized'] = test_data['job_name_count'] / 10


In [6]:
#Add a column to show % of the job_name that is the same as the test data for first 5 columns only
test_data['job_name_count_normalized_first_5'] = test_data['job_name_count_first_5'] / 5

In [7]:
test_data

Unnamed: 0.1,Unnamed: 0,index,job_name,job_index,description,extracted_text,tokenized_text,en_prob,index No.,combined_index,...,similar_job_5,similar_job_6,similar_job_7,similar_job_8,similar_job_9,similar_job_10,job_name_count,job_name_count_normalized,job_name_count_first_5,job_name_count_normalized_first_5
0,5,5,Accountant,5,"<div class=""jobsearch-jobDescriptionText"" id=""...",position available: accountant about us segway...,"['position', 'available', 'accountant', 'about...",1.0,5,5_Accountant,...,Accountant,Accountant,Accountant,Accountant,Accountant,Accountant,10,1.0,5,1.0
1,14,14,Accountant,14,"<div class=""jobsearch-jobDescriptionText"" id=""...",assisting senior accountants in the preparatio...,"['assisting', 'senior', 'accountants', 'prepar...",1.0,14,14_Accountant,...,Accountant,Accountant,Accountant,Accountant,Accountant,Accountant,10,1.0,5,1.0
2,16,16,Accountant,16,"<div class=""jobsearch-jobDescriptionText"" id=""...",junior accountant role: to provide accounting ...,"['junior', 'accountant', 'role', 'provide', 'a...",1.0,16,16_Accountant,...,Accountant,Accountant,Accountant,Accountant,Accountant,Accountant,9,0.9,4,0.8
3,17,17,Accountant,17,"<div class=""jobsearch-jobDescriptionText"" id=""...",staff accountant stern cohen llp is currently ...,"['staff', 'accountant', 'stern', 'cohen', 'llp...",1.0,17,17_Accountant,...,Accountant,Accountant,Accountant,Accountant,Accountant,Accountant,10,1.0,5,1.0
4,19,19,Accountant,19,"<div class=""jobsearch-jobDescriptionText"" id=""...","choose local, choose welch llp welch llp is id...","['choose', 'local', 'choose', 'welch', 'llp', ...",1.0,19,19_Accountant,...,Accountant,Accountant,Accounts Clerk,Accountant,Accountant,Accountant,9,0.9,5,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23723,135589,135589,Tree Surgeon,335,"<section class=""cached-bot-fjv-skeletonstyle__...",company overview: whether you are experienced ...,"['company', 'overview', 'whether', 'you', 'are...",1.0,135589,135589_Tree Surgeon,...,Tree Surgeon,Market Gardener,Tree Surgeon,Market Gardener,Tree Surgeon,Market Gardener,4,0.4,2,0.4
23724,135590,135590,Tree Surgeon,336,"<section class=""cached-bot-fjv-skeletonstyle__...",overview : if you enjoy working outdoors in a ...,"['overview', 'if', 'you', 'enjoy', 'working', ...",1.0,135590,135590_Tree Surgeon,...,Tree Surgeon,Tree Surgeon,Tree Surgeon,Tree Surgeon,Tree Surgeon,Tree Surgeon,10,1.0,5,1.0
23725,135592,135592,Tree Surgeon,338,"<section class=""cached-bot-fjv-skeletonstyle__...",what are you looking for in an arboricultural ...,"['what', 'are', 'you', 'looking', 'an', 'arbor...",1.0,135592,135592_Tree Surgeon,...,Tree Surgeon,Tree Surgeon,Tree Surgeon,Tree Surgeon,Tree Surgeon,Tree Surgeon,10,1.0,5,1.0
23726,135593,135593,Tree Surgeon,339,"<section class=""cached-bot-fjv-skeletonstyle__...",overview : if you enjoy working outdoors in a...,"['overview', 'if', 'you', 'enjoy', 'working', ...",1.0,135593,135593_Tree Surgeon,...,Tree Surgeon,Tree Surgeon,Tree Surgeon,Tree Surgeon,Tree Surgeon,Tree Surgeon,10,1.0,5,1.0


In [68]:
#calculate total % of job title that as the same as test data
percentage = test_data['job_name_count'].sum()/(len(test_data) * 10)
print(percentage)

0.3852916385704653


In [8]:
#calculate total % of job title that as the same as test data
percentage_first_5 = test_data['job_name_count_first_5'].sum()/(len(test_data) * 5)
print(percentage_first_5)

0.42223533378287254


In [9]:
test_data.to_csv('test_data.csv')