It is an open secret that resumes are screened based on keywords. Further more, to improve one's chance of success it is important to address direct qualities in the job listing. Since hand customizing a resume for each position is ideal, it would help to know how to create a template that already addresses common keywords while potentially leaving room open for the specifics.

The goal of this project is to calculate the TF-IDF values of words occuring across numerous Software Development positions in order to find the highest ranked words. Doing so will provide insight into the common keywords found across this documents and provides the user with a word bank that they can potentially utilize when constructing resumes.

# Import Libraries

Import libraries and download necessary files

In [5]:
import nltk
import numpy as np
import re
import requests
import time


from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from prettytable import PrettyTable
from selenium import webdriver
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
from webdriver_manager.firefox import GeckoDriverManager

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/burntiger/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/burntiger/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Define Functions

In [18]:
def load_list(path: str) -> list:
    """This method is designed to load file which contains the linkedin job urls and return a list containing
       these urls.
    
        Parameters
    ----------
    path : string
        Contains the relative path to the file containing the linkedin job urls


    Returns
    -------
    list_of_jobs : list
        returns a list of urls extracted from the opened file
        
    """
    
    list_of_jobs = list()
    
    with open(path) as file:
        for line in file:
            list_of_jobs.append(line.strip())
    
    return list_of_jobs     

def scrape_list_sel(jobs: list) -> list:
    """This method uses selenium in order to open each url and extract text from each job description.
    
        
        Parameters
    ----------
    jobs : list
        A list that contains the url of each job posting


    Returns
    -------
    jobs_HTML : list
        Returns a list of strings, each of which is the entirety of the raw unprocessed text from each job description.
        """
    
    driver = webdriver.Firefox(executable_path=GeckoDriverManager().install())
    jobs_HTML = list()
    print("Scraping jobs:")
    with tqdm(total = len(jobs) ) as pbar:
        for job in jobs:
            driver.get(job)
            src = driver.page_source
            soup = BeautifulSoup(src, 'lxml')
            intro = soup.find('div', {'class': 'show-more-less-html__markup show-more-less-html__markup--clamp-after-5'})
            jobs_HTML.append(intro)
            time.sleep(3) ##set to 3 to prevent getting rate limited by Linkedin
            pbar.update()
    driver.close()
    return jobs_HTML

def scrape_list_get(jobs: list) -> list:
    """This method uses selenium in order to open each url and extract text from each job description.
    
        
        Parameters
    ----------
    jobs : list
        A list that contains the url of each job posting


    Returns
    -------
    jobs_HTML : list
        Returns a list of strings, each of which is the entirety of the raw unprocessed text from each job description.
        """
    
    jobs_HTML = list()
    print("Scraping jobs:")
    with tqdm(total = len(jobs) ) as pbar:
        for job in jobs:
            src = requests.get(job)
            soup = BeautifulSoup(src.text, 'lxml')
            intro = soup.find('div', {'class': 'show-more-less-html__markup show-more-less-html__markup--clamp-after-5'})
            jobs_HTML.append(intro)
            time.sleep(1) ##set to 1 to prevent getting rate limited by Linkedin
            pbar.update()
    return jobs_HTML

def remove_stopwords(listing: str) -> str:
    """This method is designed to remove all stopwords as defined by NLTK.
        
        Parameters
    ----------
    listing : string
        A string that needs to have stopwords removed


    Returns
    -------
    filtered_sentence : string
        A string with the stopwords removed
    """
    stop_words = set(stopwords.words('english'))
  
    word_tokens = word_tokenize(listing)

    filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]

    filtered_sentence = ""

    for w in word_tokens:
        if w not in stop_words:
            filtered_sentence += w + " "

    return filtered_sentence

def preprocess_text(jobs_HTML: list) -> list:
    """Method used to preprocess the scraped text. Converts to lower, removes html tags,
    removes anything that is not alphanumeric, and removes the stopwords.
    
        
        Parameters
    ----------
    jobs_HTML : list
        A list of strings, each of which is the entirety of the raw unprocessed text from each job description.


    Returns
    -------
    list
        a list of strings used that are the header columns
        """
    
    tags = re.compile('<.*?>')
    jobs_processed = list()
    for job in jobs_HTML:
        job = str(job).lower()
        job = re.sub(tags, ' ', job)
        job = re.sub(r'[^\w\s]', '', job)
        job = remove_stopwords(job)
        jobs_processed.append(job)
    return jobs_processed

def calculate_TFIDF_uni(listings: list) -> list:
    """Calculatues the TFIDF value for all unigrams.
        
        Parameters
    ----------
    listing : list
        A list of preprocessed job postings to calculate TF-IDF on


    Returns
    -------
    tfidf_feature_names : list
        A list of the top 50 ranked unigrams
        """
    
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english', analyzer='word', ngram_range=(1,1))
    tfidf = tfidf_vectorizer.fit_transform(listings)
    importance = np.argsort(np.asarray(tfidf.sum(axis=0)).ravel())[::-1]
    tfidf_feature_names = np.array(tfidf_vectorizer.get_feature_names_out())
    return tfidf_feature_names[importance[:100]]

def calculate_TFIDF_bi(listings: list) -> list:
    """Calculatues the TFIDF value for all bigrams.
        
        Parameters
    ----------
    listing : list
        A list of preprocessed job postings to calculate TF-IDF on


    Returns
    -------
    tfidf_feature_names : list
        A list of the top 50 ranked bigrams
        """
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english', analyzer='word', ngram_range=(2,2))
    tfidf = tfidf_vectorizer.fit_transform(listings)
    importance = np.argsort(np.asarray(tfidf.sum(axis=0)).ravel())[::-1]
    tfidf_feature_names = np.array(tfidf_vectorizer.get_feature_names_out())
    return tfidf_feature_names[importance[:100]]

def calculate_TFIDF_both(listings: list) -> list:
    """Calculatues the TFIDF value for unigrams and bigrams combined.
        
        Parameters
    ----------
    listing : list
        A list of preprocessed job postings to calculate TF-IDF on


    Returns
    -------
    tfidf_feature_names : list
        A list of the top 50 ranked unigrams and bigrams
        """
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english', analyzer='word', ngram_range=(1,2))
    tfidf = tfidf_vectorizer.fit_transform(listings)
    importance = np.argsort(np.asarray(tfidf.sum(axis=0)).ravel())[::-1]
    tfidf_feature_names = np.array(tfidf_vectorizer.get_feature_names_out())
    return tfidf_feature_names[importance[:100]]

# Calculate TFIDF

In this section we open the file containing the urls to the job listings, extract the text from the urls, preprocess that text, and then calculate TF-IDF on the corpus.

In [19]:
list_of_jobs = load_list('../data/JobsList.txt')
jobs_HTML = scrape_list_get(list_of_jobs)
preprocessed_jobs = preprocess_text(jobs_HTML)
TFIDF_uni = calculate_TFIDF_uni(preprocessed_jobs)
TFIDF_bi = calculate_TFIDF_bi(preprocessed_jobs)
TFIDF_both = calculate_TFIDF_both(preprocessed_jobs)

Scraping jobs:


100%|███████████████████████████████████████████| 31/31 [00:55<00:00,  1.78s/it]


# Display Results
In the following section we display the results of our experiment

In [21]:
y = PrettyTable()
y.title = 'List of top 100 TFIDF value unigrams'
y.field_names = ["Ranking", "Unigram", " Ranking ", " Unigram ",]
for x in range(50):
    y.add_row([x+1, TFIDF_uni[x], x+51, TFIDF_uni[x+25]])
    
print(y)

+-------------------------------------------------------+
|          List of top 100 TFIDF value unigrams         |
+---------+--------------+-----------+------------------+
| Ranking |   Unigram    |  Ranking  |     Unigram      |
+---------+--------------+-----------+------------------+
|    1    |   software   |     51    |     building     |
|    2    |     team     |     52    |       java       |
|    3    |  experience  |     53    |    including     |
|    4    |     work     |     54    |       job        |
|    5    | development  |     55    |     looking      |
|    6    | engineering  |     56    |     projects     |
|    7    |     data     |     57    |       role       |
|    8    |     new      |     58    |     services     |
|    9    |  engineers   |     59    |     science      |
|    10   |    design    |     60    |     company      |
|    11   | application  |     61    |     mission      |
|    12   |   product    |     62    |   requirements   |
|    13   |   

In [22]:
y = PrettyTable()
y.title = 'List of top 50 TFIDF value bigrams'
y.field_names = ["Ranking Col", "Bigram", " Ranking Col ", " Bigram "]
for x in range(50):
    y.add_row([x+1, TFIDF_bi[x], x+51, TFIDF_bi[x+25]])
    
print(y)

+-----------------------------------------------------------------------------------+
|                         List of top 50 TFIDF value bigrams                        |
+-------------+--------------------------+---------------+--------------------------+
| Ranking Col |          Bigram          |  Ranking Col  |          Bigram          |
+-------------+--------------------------+---------------+--------------------------+
|      1      |   software development   |       51      |      best practices      |
|      2      |     computer science     |       52      |    experience working    |
|      3      |    software engineer     |       53      |     years experience     |
|      4      |     engineering team     |       54      |       work closely       |
|      5      |       team members       |       55      |      medical dental      |
|      6      |        test plans        |       56      |      dental vision       |
|      7      |      united states       |       57   

In [23]:
y = PrettyTable()
y.title = 'List of top 50 TFIDF value of both unigrams and bigrams'
y.field_names = ["Ranking", "n-Gram", " Ranking ", " n-Gram "]
for x in range(50):
    y.add_row([x+1, TFIDF_both[x], x+51, TFIDF_both[x+25]])
    
print(y)

+-------------------------------------------------------------------+
|      List of top 50 TFIDF value of both unigrams and bigrams      |
+---------+----------------------+-----------+----------------------+
| Ranking |        n-Gram        |  Ranking  |        n-Gram        |
+---------+----------------------+-----------+----------------------+
|    1    |       software       |     51    |         job          |
|    2    |         team         |     52    | software development |
|    3    |      experience      |     53    |         java         |
|    4    |         work         |     54    |     applications     |
|    5    |     development      |     55    |      knowledge       |
|    6    |         data         |     56    |         role         |
|    7    |     engineering      |     57    |       company        |
|    8    |         new          |     58    |       services       |
|    9    |      engineers       |     59    |       projects       |
|    10   |       pr