In [1]:
#Install nltk package
#pip install nltk

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import string
import re
from collections import Counter
import nltk
from scipy.sparse import csr_matrix
import os

In [2]:
# Fetch the NLTK stop-word corpus; it is read later via stopwords.words('english')
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zhang\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Fetch the 'punkt' tokenizer data.
# NOTE(review): presumably intended for word_tokenize (imported above); no cell
# visible here calls it — confirm this download is still needed.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\zhang\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Read the job-postings table; the path is relative to the notebook's working directory
ldata = pd.read_csv('_large_data2.csv')

In [5]:
# Positional row number (0..n-1), assigned before any of the filtering cells run,
# so each posting keeps the same number after rows are dropped.
ldata['index No.'] = range(len(ldata))

In [6]:
# Build a per-posting label of the form "<index No.>_<job_name>"
ldata['combined_index'] = [
    f"{row_no}_{job}"
    for row_no, job in zip(ldata['index No.'], ldata['job_name'])
]

In [7]:
# Filter data: keep only postings whose English probability is at least 0.9
en_prob_mask = ldata['en_prob'].ge(0.9)
ldata = ldata.loc[en_prob_mask]

In [8]:
# "Data Scientist" postings are over-represented (about 4x per the original note),
# so only those with job_index <= 25 are kept; every other job title is untouched.
less_ds_mask = ~(ldata['job_name'].eq("Data Scientist") & ldata['job_index'].gt(25))
ldata = ldata.loc[less_ds_mask]
len(ldata)

119127

In [32]:
# NOTE(review): this preview carries execution count 32, i.e. it was run after the
# later cells; that is why its output already shows 'clean_tokenized_text', a column
# that is only created further down. On a fresh Run All the column will be absent here.
ldata.head()

Unnamed: 0,index,job_name,job_index,description,extracted_text,tokenized_text,en_prob,index No.,combined_index,clean_tokenized_text
0,0,Accountant,0,"<div class=""jobsearch-jobDescriptionText"" id=""...",sfd professional corporation is a small but ex...,"['sfd', 'professional', 'corporation', 'is', '...",1.0,0,0_Accountant,sfd professional corporation small expanding a...
1,1,Accountant,1,"<div class=""jobsearch-jobDescriptionText"" id=""...",job no: 512135 brand: flight centre travel gro...,"['job', 'no', 'brand', 'flight', 'centre', 'tr...",1.0,1,1_Accountant,job brand flight centre travel group work type...
2,2,Accountant,2,"<div class=""jobsearch-jobDescriptionText"" id=""...","the future you want is within reach. at pcl, y...","['future', 'you', 'want', 'is', 'within', 'rea...",1.0,2,2_Accountant,future want within reach pcl personal professi...
3,3,Accountant,3,"<div class=""jobsearch-jobDescriptionText"" id=""...",junior accountant an enthusiastic and motivate...,"['junior', 'accountant', 'an', 'enthusiastic',...",1.0,3,3_Accountant,junior accountant enthusiastic motivated indiv...
4,4,Accountant,4,"<div class=""jobsearch-jobDescriptionText"" id=""...",az accounting firm is a professional cpa firm ...,"['az', 'accounting', 'firm', 'is', 'profession...",1.0,4,4_Accountant,az accounting firm professional cpa firm offer...


In [9]:
# Keep only job titles that have strictly more than 50 postings
value_counts = ldata['job_name'].value_counts()

jobs_50_count = value_counts.loc[value_counts > 50].index
jobs_mask = ldata['job_name'].isin(jobs_50_count.tolist())

ldata = ldata.loc[jobs_mask]
# Show the remaining postings per job title
ldata['job_name'].value_counts()

job_name
Administrator             2454
Administration Manager    2451
Accountant                2432
Administration Staff      2355
Accounts Manager          2348
                          ... 
Homeopath                   68
Reprographic Assistant      60
Maths Teacher               56
Cartographer                55
Horse Dealer                55
Name: count, Length: 162, dtype: int64

In [10]:
# Load NLTK's English stop-word list
stop_words = set(stopwords.words('english'))
# Extra filler / boilerplate words to drop on top of the NLTK list.
# preprocess_text tokenizes with r'\b\w+\b', so a token never contains punctuation:
# the original 'etc.' entry could therefore never match. Plain 'etc' is added so the
# word is actually removed; the dotted form is kept only so the set stays a superset.
additional_stop_words = {'is', 'to', 'with', 'and', 'for', 'in', 'on', 'of', 'are', 'we', 'you','apply','resume','experience','etc.','etc','a','an'}

In [11]:
def preprocess_text(text):
    """Lower-case `text`, split it into word tokens and drop stop words.

    Parameters
    ----------
    text : str or None or float NaN
        Raw text (or the string form of a token list). Missing values, i.e.
        ``None`` or a float NaN as pandas uses for empty cells, are treated
        as empty text instead of raising on ``.lower()``.

    Returns
    -------
    str
        The surviving tokens joined by single spaces; ``''`` when nothing is left.

    Notes
    -----
    Reads the module-level sets ``stop_words`` and ``additional_stop_words``.
    """
    # Missing cell (None, or NaN which is the only float not equal to itself)
    if text is None or (isinstance(text, float) and text != text):
        return ''
    # Tokenize: runs of word characters only, so punctuation never survives
    tokens = re.findall(r'\b\w+\b', text.lower())
    # Remove stop words and additional stop words
    tokens = [token for token in tokens if token not in stop_words and token not in additional_stop_words]
    # Join tokens back into a string
    return ' '.join(tokens)

In [12]:
# Derive the cleaned text column: 'tokenized_text' run through preprocess_text
# (lower-cased, re-tokenized, stop/filler words removed, re-joined with spaces)
ldata['clean_tokenized_text'] = ldata['tokenized_text'].map(preprocess_text)

In [13]:
# Create TF-IDF vectorizer (library defaults; vocabulary size is not limited here)
vectorizer = TfidfVectorizer()

# Fit and transform the text data.
# Row i of tfidf_matrix corresponds to row i (by position) of ldata at this point;
# find_similar_jobs reads tfidf_matrix as a global and relies on that alignment,
# so this cell must be re-run whenever ldata's rows change.
tfidf_matrix = vectorizer.fit_transform(ldata['clean_tokenized_text'])

In [14]:
def find_similar_jobs(input_description, df, vectorizer, top_n=15, doc_matrix=None):
    """Return the postings in `df` most similar to a free-text job description.

    Parameters
    ----------
    input_description : str
        Raw description text. Newlines are flattened to spaces and the text is
        cleaned with `preprocess_text` before being vectorized.
    df : pandas.DataFrame
        Postings table. Must contain the columns 'index', 'job_name',
        'clean_tokenized_text' and 'extracted_text', and its rows must be in the
        same positional order as the rows of the TF-IDF matrix.
    vectorizer : object
        Already-fitted vectorizer; only its `.transform` method is used.
    top_n : int, default 15
        Maximum number of postings to return. 0 returns an empty frame.
    doc_matrix : matrix, optional
        TF-IDF matrix of the corpus (one row per row of `df`). When omitted,
        the module-level `tfidf_matrix` is used, as in the original version.

    Returns
    -------
    pandas.DataFrame
        The selected columns of the best-matching rows plus a
        'similarity_score' column, ordered from most to least similar.

    Raises
    ------
    ValueError
        If `top_n` is negative, or if the matrix and `df` have different
        numbers of rows (positions could then not be mapped back to postings).
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    # Fall back to the notebook-level matrix for backward compatibility
    if doc_matrix is None:
        doc_matrix = tfidf_matrix
    if doc_matrix.shape[0] != len(df):
        raise ValueError(
            f"TF-IDF matrix has {doc_matrix.shape[0]} rows but df has {len(df)}; "
            "refit the vectorizer on the current frame"
        )

    # Preprocess the input job description
    input_description_update = input_description.replace('\n', ' ')
    clean_text = preprocess_text(input_description_update)

    # Vectorize the input job description
    input_vector = vectorizer.transform([clean_text])

    # Cosine similarity of the query against every posting (1-D array)
    similarities = cosine_similarity(input_vector, doc_matrix)[0]

    # Reverse the full ascending order first, then slice: same result as
    # argsort()[-top_n:][::-1] for top_n > 0, but top_n == 0 now yields nothing
    # (the original [-0:] slice returned every row).
    top_indices = similarities.argsort()[::-1][:top_n]

    # Positional lookup; .copy() so the new column is written to an independent frame
    similar_jobs = df.iloc[top_indices][['index','job_name', 'clean_tokenized_text', 'extracted_text']].copy()

    # Add similarity score to the DataFrame
    similar_jobs['similarity_score'] = similarities[top_indices]

    return similar_jobs

In [31]:
# Test
#input_description = "An Accountant, or Senior Accounting Professional, is responsible for performing complex accounting needs for their employer. Their duties include assuming a leadership role to guide accounting personnel in daily tasks, reviewing financial data to make forecasting decisions and communicating with company Executives about financial strategies"
#input_description = """Serve as lead data strategist to identify and 
#integrate new datasets that can be leveraged through our product capabilities, 
#and work closely with the engineering team in the development of data products Execute analytical experiments to help solve problems across various domains and industries Identify relevant data sources and sets to mine for client business needs, and collect large structured and unstructured datasets and variables Devise and utilize algorithms and models to mine big-data stores; perform data and error analysis to improve models; clean and validate data for uniformity and accuracy Analyze data for trends and patterns, and interpret data with clear objectives in mind Implement analytical models in production by collaborating with software developers and machine-learning engineers"""
#input_description = """Generate Leads: Proactively identify and pursue potential clients through various channels such as cold calling, email campaigns, networking events, and social media outreach.

#Qualify Prospects: Assess potential clients' needs, budgets, timelines, and decision-making processes to determine their suitability as qualified leads.

#Conduct Product Demonstrations: Present the features and benefits of our SAAS solutions to potential clients through personalized demos, webinars, and presentations.

#Build Relationships: Cultivate strong relationships with leads and existing clients to understand their pain points, provide tailored solutions, and maintain long-term partnerships.

#Handle Objections: Address any concerns or objections raised by prospects regarding our SAAS offerings, and effectively communicate the value proposition to overcome hesitations.

#Negotiate Contracts: Collaborate with clients to negotiate pricing, terms, and contracts, ensuring a mutually beneficial agreement that aligns with both their needs and our company's goals.

#Track Sales Pipeline: Maintain accurate records of all sales activities, including leads, opportunities, communications, and deals, using CRM software to track progress and forecast future sales.

#Stay Updated: Stay abreast of industry trends, competitive offerings, and market developments to effectively position our SAAS solutions and maintain a competitive edge.

#Collaborate with Team: Work closely with marketing, product development, and customer success teams to share insights, gather feedback, and ensure a cohesive approach to client acquisition and retention.

#Achieve Sales Targets: Meet or exceed monthly, quarterly, and annual sales targets by effectively managing time, resources, and priorities to drive revenue growth for the company.

#Provide Feedback: Gather feedback from clients regarding product satisfaction, user experience, and feature requests to contribute to ongoing product enhancements and improvements.

#Continuous Learning: Continuously improve sales skills and product knowledge through training sessions, workshops, and self-study to enhance performance"""
#input_description = """I want to be an actor"""
# Active test input: a multi-paragraph sales-role description (the commented-out
# strings above are alternative inputs). find_similar_jobs flattens the newlines itself.
input_description = """Generate Leads: Proactively identify and pursue potential clients through various channels such as cold calling, email campaigns, networking events, and social media outreach.
 
Qualify Prospects: Assess potential clients' needs, budgets, timelines, and decision-making processes to determine their suitability as qualified leads.
 
Conduct Product Demonstrations: Present the features and benefits of our SAAS solutions to potential clients through personalized demos, webinars, and presentations.
 
Build Relationships: Cultivate strong relationships with leads and existing clients to understand their pain points, provide tailored solutions, and maintain long-term partnerships.
 
Handle Objections: Address any concerns or objections raised by prospects regarding our SAAS offerings, and effectively communicate the value proposition to overcome hesitations.
 
Negotiate Contracts: Collaborate with clients to negotiate pricing, terms, and contracts, ensuring a mutually beneficial agreement that aligns with both their needs and our company's goals.
 
Track Sales Pipeline: Maintain accurate records of all sales activities, including leads, opportunities, communications, and deals, using CRM software to track progress and forecast future sales.
 
Stay Updated: Stay abreast of industry trends, competitive offerings, and market developments to effectively position our SAAS solutions and maintain a competitive edge.
 
Collaborate with Team: Work closely with marketing, product development, and customer success teams to share insights, gather feedback, and ensure a cohesive approach to client acquisition and retention.
 
Achieve Sales Targets: Meet or exceed monthly, quarterly, and annual sales targets by effectively managing time, resources, and priorities to drive revenue growth for the company.
 
Provide Feedback: Gather feedback from clients regarding product satisfaction, user experience, and feature requests to contribute to ongoing product enhancements and improvements.
 
Continuous Learning: Continuously improve sales skills and product knowledge through training sessions, workshops, and self-study to enhance performance and adapt to evolving market dynamics."""
# Rank the corpus against the description using the default top_n (15)
similar_jobs = find_similar_jobs(input_description, ldata, vectorizer)
# print() gives the plain-text table shown below
print(similar_jobs)

         index            job_name  \
32183    32183               Agent   
26582    26582   Advertising Agent   
105023  105023          Roadworker   
6861      6861    Accounts Manager   
9437      9437      Accounts Staff   
2571      2571  Accounts Assistant   
9180      9180      Accounts Staff   
26643    26643   Advertising Agent   
104958  104958          Roadworker   
26894    26894   Advertising Agent   
9133      9133      Accounts Staff   
26604    26604   Advertising Agent   
9204      9204      Accounts Staff   
99597    99597     Revenue Officer   
9444      9444      Accounts Staff   

                                     clean_tokenized_text  \
32183   partner saas company security industry short h...   
26582   partner saas company security industry short h...   
105023  position summary new logo sales account execut...   
6861    position summary new logo sales account execut...   
9437    position summary new logo sales account execut...   
2571    position summary 

In [30]:
# Spot-check the raw text of one posting. .loc looks the row up by index label,
# not by position; 121274 is a hard-coded label and raises KeyError if that row
# was filtered out. NOTE(review): presumably picked from an earlier result table — confirm.
print(ldata.loc[121274, 'extracted_text'])

summary: provide professional assistance to internal and external customers having hardware- and  problems with their supported desktop, laptop or peripherals. able to resolve local area networking issues to ensure connectivity to the corporate network, and work with the infrastructure and messaging teams as necessary to resolve incidents. support is provided in a timely manner in accordance to published slas while maintaining a high level of customer satisfactions. position duties and responsibilities provide  technical support for supported desktops, laptops, and peripherals. this includes the following activities: maintaining standard software configurations, including troubleshooting, loading and configuring software images, supported applications and drivers may participate in it projects installing, supporting and troubleshooting approved desktop software onsite & via sccm performing planned installs, maintenance, moves, adds and changes provide support services to employees with

In [16]:
# Function to compute cosine similarity for each group
def compute_group_similarity(group, max_features=200):
    """Return the pairwise cosine-similarity matrix of the postings in `group`.

    Parameters
    ----------
    group : pandas.DataFrame
        Postings of a single job title; must contain 'clean_tokenized_text'.
    max_features : int or None, default 200
        Vocabulary size limit passed to TfidfVectorizer (previously a constant
        hard-coded inside the function). The vectorizer is fitted on this
        group only, so scores are not comparable across groups.

    Returns
    -------
    numpy.ndarray
        Square array with one row and one column per posting, in the row
        order of `group`.
    """
    # Fit a fresh vectorizer on this group's text only
    vectorizer = TfidfVectorizer(max_features=max_features)

    # fit_transform already returns a sparse matrix, which cosine_similarity
    # accepts directly, so the former csr_matrix re-wrap was a no-op and is gone.
    tfidf_matrix_group = vectorizer.fit_transform(group['clean_tokenized_text'])

    # Compute cosine similarity of every posting against every other posting
    similarity_matrix_group = cosine_similarity(tfidf_matrix_group)

    return similarity_matrix_group

In [17]:
# Group the dataset by 'job_name'
grouped = ldata.groupby('job_name')

# Directory to save the CSV files
output_dir = "individual_matrices"

# Create the directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Loop over each group and write one similarity matrix per job title
for name, group in grouped:
    # Compute cosine similarity for the group
    similarity_matrix_group = compute_group_similarity(group)

    # Create DataFrame for the group with combined_index as index and columns
    group_df = pd.DataFrame(similarity_matrix_group, columns=group['combined_index'], index=group['combined_index'])

    # The job title is used as the file name: replace path separators and the
    # characters Windows forbids in file names so a title such as "A/B" cannot
    # point into a missing sub-directory or fail to open. Titles without such
    # characters produce exactly the same file name as before.
    safe_name = re.sub(r'[<>:"/\\|?*]', '_', str(name))

    # Save the DataFrame as a CSV file
    output_file = os.path.join(output_dir, f"{safe_name}.csv")
    group_df.to_csv(output_file)

In [None]:
"""#Code to directly combine data --- did not work due to insufficient memory
# Group the dataset by 'job_name'
grouped = ldata.groupby('job_name')

# List to store DataFrames for each group
group_dfs = []

# Loop over each group
for name, group in grouped:
    # Compute cosine similarity for the group
    similarity_matrix_group = compute_group_similarity(group)
    
    # Create DataFrame for the group with combined_index as index and columns
    group_df = pd.DataFrame(similarity_matrix_group, columns=group['combined_index'], index=group['combined_index'])
    
    # Add the DataFrame to the list
    group_dfs.append(group_df)

# Concatenate DataFrames along the rows to combine all groups
combined_similarity_df = pd.concat(group_dfs)

# Reset the index of the combined DataFrame
combined_similarity_df.reset_index(drop=True, inplace=True)

In [None]:
"""#To generate whole cosine similarity matrix --- did not work due to insufficient memory
# Set parameters for TfidfVectorizer
max_features = 10000  # Limit the vocabulary size

# Create TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=max_features)

# Fit and transform the text data
tfidf_matrix = vectorizer.fit_transform(ldata['clean_tokenized_text'])

# Convert to a dense array if not already dense
if not isinstance(tfidf_matrix, np.ndarray):
    tfidf_matrix = tfidf_matrix.toarray()

# Compute cosine similarity using batch processing
batch_size = 10000
n_samples = tfidf_matrix.shape[0]
similarity_matrix = np.zeros((n_samples, n_samples))

for i in range(0, n_samples, batch_size):
    start = i
    end = min(i + batch_size, n_samples)
    similarity_matrix[start:end] = cosine_similarity(tfidf_matrix[start:end], tfidf_matrix)

# Convert similarity matrix to DataFrame for inspection
similarity_df = pd.DataFrame(similarity_matrix, columns=mdata['job_name'], index=mdata['job_name'])