# Import Necessary Libraries

In [27]:
#spacy
import spacy
from spacy.pipeline import EntityRuler
from spacy.lang.en import English
from spacy.tokens import Doc

#gensim
import gensim
from gensim import corpora

#Visualization
from spacy import displacy
from wordcloud import WordCloud
import plotly.express as px
import matplotlib.pyplot as plt

#nltk
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download(['stopwords','wordnet'])
from nltk.tokenize import word_tokenize

#Data loading/ Data manipulation
import pandas as pd
import numpy as np

# Ensure required NLTK data is downloaded
nltk.download('punkt')
nltk.download('stopwords')

# Import json library
import json

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Anadu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Anadu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Anadu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Anadu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Import the CSV file

In [15]:
# Location of the raw LinkedIn postings export, relative to the working directory.
file_path = 'data/Linkedin Job Postings (2023-2024)/postings.csv'
# Parse the CSV into a DataFrame using pandas' default settings.
data = pd.read_csv(filepath_or_buffer=file_path)

# Display the first few rows
# print(data.head())



In [16]:
# Wrap `data` in a DataFrame bound to `df`; the cells that follow rename,
# trim and extend this frame.
df = pd.DataFrame(data=data)

# Preview the top five rows as plain text.
print(df.head(n=5))



     job_id            company_name   
0    921716   Corcoran Sawyer Smith  \
1   1829192                     NaN   
2  10998357  The National Exemplar    
3  23221523  Abrams Fensterman, LLP   
4  35982263                     NaN   

                                               title   
0                              Marketing Coordinator  \
1                  Mental Health Therapist/Counselor   
2                        Assitant Restaurant Manager   
3  Senior Elder Law / Trusts and Estates Associat...   
4                                 Service Technician   

                                         description  max_salary pay_period   
0  Job descriptionA leading real estate firm in N...        20.0     HOURLY  \
1  At Aspen Therapy and Wellness , we are committ...        50.0     HOURLY   
2  The National Exemplar is accepting application...     65000.0     YEARLY   
3  Senior Associate Attorney - Elder Law / Trusts...    175000.0     YEARLY   
4  Looking for HVAC service tech 

In [17]:
# Rename the "title" column to "roles".
# rename() returns a new frame, which is bound back to `df`; this avoids
# mutating the existing object in place (inplace=True), which pandas style
# guidance discourages and which makes notebook state harder to reason about.
df = df.rename(columns={"title": "roles"})
print(df.head())



     job_id            company_name   
0    921716   Corcoran Sawyer Smith  \
1   1829192                     NaN   
2  10998357  The National Exemplar    
3  23221523  Abrams Fensterman, LLP   
4  35982263                     NaN   

                                               roles   
0                              Marketing Coordinator  \
1                  Mental Health Therapist/Counselor   
2                        Assitant Restaurant Manager   
3  Senior Elder Law / Trusts and Estates Associat...   
4                                 Service Technician   

                                         description  max_salary pay_period   
0  Job descriptionA leading real estate firm in N...        20.0     HOURLY  \
1  At Aspen Therapy and Wellness , we are committ...        50.0     HOURLY   
2  The National Exemplar is accepting application...     65000.0     YEARLY   
3  Senior Associate Attorney - Elder Law / Trusts...    175000.0     YEARLY   
4  Looking for HVAC service tech 

In [20]:
# Keep only the columns we need: "roles" (the former "title" column),
# "description" and "skills_desc".
# .copy() makes `df` an independent frame rather than a selection of the wider
# one, so assigning a new column to it later does not trigger pandas'
# SettingWithCopyWarning.
df = df[["roles", "description", "skills_desc"]].copy()
print(df.head())

                                               roles   
0                              Marketing Coordinator  \
1                  Mental Health Therapist/Counselor   
2                        Assitant Restaurant Manager   
3  Senior Elder Law / Trusts and Estates Associat...   
4                                 Service Technician   

                                         description   
0  Job descriptionA leading real estate firm in N...  \
1  At Aspen Therapy and Wellness , we are committ...   
2  The National Exemplar is accepting application...   
3  Senior Associate Attorney - Elder Law / Trusts...   
4  Looking for HVAC service tech with experience ...   

                                         skills_desc  
0  Requirements: \n\nWe are seeking a College or ...  
1                                                NaN  
2  We are currently accepting resumes for FOH - A...  
3  This position requires a baseline understandin...  
4                                                Na

In [21]:
# Combine the description and skills_desc columns into one text field.
# A single space is inserted between the two parts so the last word of the
# description and the first word of skills_desc are not fused into one token.
# NaN propagates through string concatenation, so a row that is missing either
# column still ends up NaN in combined_skills_desc.
df['combined_skills_desc'] = df['description'] + ' ' + df['skills_desc']
print(df.head())


                                               roles   
0                              Marketing Coordinator  \
1                  Mental Health Therapist/Counselor   
2                        Assitant Restaurant Manager   
3  Senior Elder Law / Trusts and Estates Associat...   
4                                 Service Technician   

                                         description   
0  Job descriptionA leading real estate firm in N...  \
1  At Aspen Therapy and Wellness , we are committ...   
2  The National Exemplar is accepting application...   
3  Senior Associate Attorney - Elder Law / Trusts...   
4  Looking for HVAC service tech with experience ...   

                                         skills_desc   
0  Requirements: \n\nWe are seeking a College or ...  \
1                                                NaN   
2  We are currently accepting resumes for FOH - A...   
3  This position requires a baseline understandin...   
4                                             

In [22]:
# Remove the two source text columns (description, skills_desc) and keep
# working with the slimmer frame.
df = df.drop(['description', 'skills_desc'], axis=1)
print(df.head())

                                               roles   
0                              Marketing Coordinator  \
1                  Mental Health Therapist/Counselor   
2                        Assitant Restaurant Manager   
3  Senior Elder Law / Trusts and Estates Associat...   
4                                 Service Technician   

                                combined_skills_desc  
0  Job descriptionA leading real estate firm in N...  
1                                                NaN  
2  The National Exemplar is accepting application...  
3  Senior Associate Attorney - Elder Law / Trusts...  
4                                                NaN  


In [25]:
# Discard every row that contains a missing value and renumber the surviving
# rows from 0 in the same step, throwing the old index away.
df = df.dropna().reset_index(drop=True)

# Report (rows, columns) of what is left.
print(df.shape)

print(df.head())

(2439, 2)
                                               roles   
0                              Marketing Coordinator  \
1                        Assitant Restaurant Manager   
2  Senior Elder Law / Trusts and Estates Associat...   
3                              Respiratory Therapist   
4                                     Worship Leader   

                                combined_skills_desc  
0  Job descriptionA leading real estate firm in N...  
1  The National Exemplar is accepting application...  
2  Senior Associate Attorney - Elder Law / Trusts...  
3  At Children’s, the region’s only full-service ...  
4  It is an exciting time to be a part of our chu...  


In [None]:
# add all roles into src/role_patterns.jsonl file with all lowercase
# create a new file named role_patterns.jsonl
# open the file in write mode
# with open('role_patterns.jsonl', 'w') as f:
#     # iterate over the rows of the df
#     for index, row in df.iterrows():
#         # create a dictionary with the index and the role
#         role = {"label": "ROLE", "pattern": row["roles"].lower()}
#         # write the role to the file
#         f.write(json.dumps(role) + '\n')



In [30]:
# Persist the frame as CSV, leaving the row index out of the file.
df.to_csv(
    path_or_buf='data/Linkedin Job Postings (2023-2024)/postings_combined_desc.csv',
    index=False,
)

In [44]:
# The large English model must be installed first; if it is missing, run:
#   !python -m spacy download en_core_web_lg

# JSONL file with the entity-ruler patterns.
skill_pattern_path = "skill_patterns_lowercase.jsonl"
nlp = spacy.load("en_core_web_lg")

# Test of the spaCy library

In [45]:
# Attach an entity ruler to the pipeline, or reuse the one that is already
# there. nlp.add_pipe raises if a component named "entity_ruler" already
# exists, so the guard lets this cell be re-run without reloading the model.
if nlp.has_pipe("entity_ruler"):
    ruler = nlp.get_pipe("entity_ruler")
else:
    ruler = nlp.add_pipe("entity_ruler")
# Load the ruler's patterns from the file at skill_pattern_path.
ruler.from_disk(skill_pattern_path)
# Show the resulting pipeline component names as the cell output.
nlp.pipe_names

['tok2vec',
 'tagger',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner',
 'entity_ruler']

We will create Python functions that extract the skills (and hard skills) mentioned in a job description and collect them into a list, plus a small helper that removes duplicates. Later we apply these functions to our dataset to create new features called `skills` and `hard_skills`. This will help us visualize trends and patterns within the dataset.

In [46]:
def get_skills(text):
    """Return the text of every entity labelled "SKILL" found in ``text``.

    The text is run through the module-level ``nlp`` pipeline and the
    entities in ``doc.ents`` are filtered by label.

    Args:
        text: String to analyse.

    Returns:
        list: Matching entity texts in the order they appear in ``doc.ents``.
        Duplicates are kept; the list is empty when nothing matches.
    """
    doc = nlp(text)
    return [ent.text for ent in doc.ents if ent.label_ == "SKILL"]

def get_hard_skills(text):
    """Return the text of every entity labelled "HARD SKILL" found in ``text``.

    The text is run through the module-level ``nlp`` pipeline and the
    entities in ``doc.ents`` are filtered by label.

    Args:
        text: String to analyse.

    Returns:
        list: Matching entity texts in the order they appear in ``doc.ents``.
        Duplicates are kept; the list is empty when nothing matches.
    """
    doc = nlp(text)
    return [ent.text for ent in doc.ents if ent.label_ == "HARD SKILL"]

def unique_skills(x):
    """Return the distinct items of ``x`` as a list, in first-seen order.

    ``dict.fromkeys`` is used instead of ``set`` so the result has a stable,
    reproducible order (set iteration order for strings can differ from one
    interpreter run to the next).

    Args:
        x: Iterable of hashable items, e.g. a list of skill strings.

    Returns:
        list: Each distinct item once, ordered by first occurrence in ``x``.
    """
    return list(dict.fromkeys(x))

# Cleaning Job Description Text using regex and nltk

In [47]:
# Clean the 'combined_skills_desc' column in three vectorised passes:
#   1. collapse every run of whitespace into a single space,
#   2. replace every run of non-letter characters (digits, punctuation, ...)
#      with a single space,
#   3. lowercase the result.
# The .str accessor applies each step to the whole column and passes missing
# values through instead of raising, unlike a per-row re.sub call.
df['combined_skills_desc'] = (
    df['combined_skills_desc']
    .str.replace(r'\s+', ' ', regex=True)
    .str.replace('[^A-Za-z]+', ' ', regex=True)
    .str.lower()
)

# English stopword set from NLTK. It is only built here; this cell does not
# strip stopwords from the text.
stop_words = set(stopwords.words('english'))

print(df.head())

                                               roles   
0                              Marketing Coordinator  \
1                        Assitant Restaurant Manager   
2  Senior Elder Law / Trusts and Estates Associat...   
3                              Respiratory Therapist   
4                                     Worship Leader   

                                combined_skills_desc   
0  job descriptiona leading real estate firm in n...  \
1  the national exemplar is accepting application...   
2  senior associate attorney elder law trusts and...   
3  at children s the region s only full service p...   
4  it is an exciting time to be a part of our chu...   

                                              skills       hard_skills  
0  [marketing, design, advertising, graphic desig...        [maintain]  
1  [organization, communication, teamwork, custom...                []  
2  [marketing, problem solving, search engine, an...                []  
3  [support, schedule, professiona

In [48]:
# `cleaned_JD` is a second name for the same frame as `df` (no copy is made).
cleaned_JD = df
print(cleaned_JD.head())

# Write the cleaned frame to disk without the index column.
cleaned_JD.to_csv(
    path_or_buf="data/Linkedin Job Postings (2023-2024)/cleaned_JD.csv",
    index=False,
)

                                               roles   
0                              Marketing Coordinator  \
1                        Assitant Restaurant Manager   
2  Senior Elder Law / Trusts and Estates Associat...   
3                              Respiratory Therapist   
4                                     Worship Leader   

                                combined_skills_desc   
0  job descriptiona leading real estate firm in n...  \
1  the national exemplar is accepting application...   
2  senior associate attorney elder law trusts and...   
3  at children s the region s only full service p...   
4  it is an exciting time to be a part of our chu...   

                                              skills       hard_skills  
0  [marketing, design, advertising, graphic desig...        [maintain]  
1  [organization, communication, teamwork, custom...                []  
2  [marketing, problem solving, search engine, an...                []  
3  [support, schedule, professiona

In [49]:
# `processed` refers to the same frame as `df`, so the new column is visible
# under both names.
processed = df

# Extract the SKILL entities from each cleaned description, then de-duplicate
# each row's list.
processed['skills'] = (
    processed['combined_skills_desc']
    .apply(get_skills)
    .apply(unique_skills)
)

print(processed.head())


                                               roles   
0                              Marketing Coordinator  \
1                        Assitant Restaurant Manager   
2  Senior Elder Law / Trusts and Estates Associat...   
3                              Respiratory Therapist   
4                                     Worship Leader   

                                combined_skills_desc   
0  job descriptiona leading real estate firm in n...  \
1  the national exemplar is accepting application...   
2  senior associate attorney elder law trusts and...   
3  at children s the region s only full service p...   
4  it is an exciting time to be a part of our chu...   

                                              skills       hard_skills  
0  [marketing, business, advertising, graphic des...        [maintain]  
1         [organization, teamwork, customer service]                []  
2                   [problem solving, search engine]                []  
3               [support, schedule

In [50]:
# Extract the HARD SKILL entities from each cleaned description, then
# de-duplicate each row's list.
processed['hard_skills'] = (
    processed['combined_skills_desc']
    .apply(get_hard_skills)
    .apply(unique_skills)
)

print(processed.head())


                                               roles   
0                              Marketing Coordinator  \
1                        Assitant Restaurant Manager   
2  Senior Elder Law / Trusts and Estates Associat...   
3                              Respiratory Therapist   
4                                     Worship Leader   

                                combined_skills_desc   
0  job descriptiona leading real estate firm in n...  \
1  the national exemplar is accepting application...   
2  senior associate attorney elder law trusts and...   
3  at children s the region s only full service p...   
4  it is an exciting time to be a part of our chu...   

                                              skills   
0  [marketing, business, advertising, graphic des...  \
1         [organization, teamwork, customer service]   
2                   [problem solving, search engine]   
3               [support, schedule, professionalism]   
4  [team building, graphic design, google, twi

In [51]:
# Export the processed frame to CSV, leaving the row index out of the file.
processed.to_csv(
    path_or_buf="data/Linkedin Job Postings (2023-2024)/cleaned_JD_with_skills.csv",
    index=False,
)