# Import Necessary Libraries

In [70]:
#spacy
import spacy
from spacy.pipeline import EntityRuler
from spacy.lang.en import English
from spacy.tokens import Doc

#gensim
import gensim
from gensim import corpora

#Visualization
from spacy import displacy
from wordcloud import WordCloud
import plotly.express as px
import matplotlib.pyplot as plt

#nltk
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download(['stopwords','wordnet'])
from nltk.tokenize import word_tokenize

#Data loading/ Data manipulation
import pandas as pd
import numpy as np

# Ensure required NLTK data is downloaded
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Anadu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Anadu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Anadu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Anadu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Import the CSV file

In [71]:
file_path = 'data/Resume Dataset/Resume/Resume.csv'
data = pd.read_csv(file_path)

# Display the first few rows
# print(data.head())



In [72]:
# put data into a df
df = pd.DataFrame(data)

# Display the first few rows
print(df.head())

# Export the data to another CSV file
# df.to_csv('data/Resume Dataset/Resume/Resume_clean.csv', index=False)


         ID                                         Resume_str   
0  16852973           HR ADMINISTRATOR/MARKETING ASSOCIATE\...  \
1  22323967           HR SPECIALIST, US HR OPERATIONS      ...   
2  33176873           HR DIRECTOR       Summary      Over 2...   
3  27018550           HR SPECIALIST       Summary    Dedica...   
4  17812897           HR MANAGER         Skill Highlights  ...   

                                         Resume_html Category  
0  <div class="fontsize fontface vmargins hmargin...       HR  
1  <div class="fontsize fontface vmargins hmargin...       HR  
2  <div class="fontsize fontface vmargins hmargin...       HR  
3  <div class="fontsize fontface vmargins hmargin...       HR  
4  <div class="fontsize fontface vmargins hmargin...       HR  


# Clean the data

In [73]:
# Remove the Resume_html column
df.drop('Resume_html', axis=1, inplace=True)

# Remove the ID column
df.drop('ID', axis=1, inplace=True)

# Display the first few rows
# print(df.head())

In [74]:
print(df.shape)

# Find distinct categories in Category column
print(df['Category'].value_counts())


# Display the first few rows
# print(df.head())

# Reset the index
df = df.reset_index(drop=True)

# See the shape of the data
print(df.shape)




(2484, 2)
Category
INFORMATION-TECHNOLOGY    120
BUSINESS-DEVELOPMENT      120
FINANCE                   118
ADVOCATE                  118
ACCOUNTANT                118
ENGINEERING               118
CHEF                      118
AVIATION                  117
FITNESS                   117
SALES                     116
BANKING                   115
HEALTHCARE                115
CONSULTANT                115
CONSTRUCTION              112
PUBLIC-RELATIONS          111
HR                        110
DESIGNER                  107
ARTS                      103
TEACHER                   102
APPAREL                    97
DIGITAL-MEDIA              96
AGRICULTURE                63
AUTOMOBILE                 36
BPO                        22
Name: count, dtype: int64
(2484, 2)


In [75]:
# Remove commas from the 'Resume_str' column
df['Resume_str'] = df['Resume_str'].str.replace('\n', ' ', regex=True)
df['Resume_str'] = df['Resume_str'].str.replace('\r', ' ', regex=True)
df['Resume_str'] = df['Resume_str'].str.strip()

df['Resume_str'] = df['Resume_str'].str.replace('"', '', regex=True)
df['Resume_str'] = df['Resume_str'].str.replace(',', ' ', regex=True)

df['Resume_str'] = df['Resume_str'].fillna('').astype(str)

# Remove rows where all values are NaN
df = df.dropna(how='all')

# Optionally remove rows where a specific column (e.g., 'Resume_str') is empty
df = df[df['Resume_str'].str.strip().astype(bool)]

# Display the first few rows
# print(df.head())

# Display size
print(df.shape)


(2483, 2)


In [83]:
# Export the data to another CSV file
df.to_csv('data/Resume Dataset/Resume/Resume_Clean.csv', index=False)

In [86]:
# !python -m spacy download en_core_web_lg

nlp = spacy.load("en_core_web_lg")
skill_pattern_path = "skill_patterns.jsonl"

In [87]:
ruler = nlp.add_pipe("entity_ruler")
ruler.from_disk(skill_pattern_path)
nlp.pipe_names

['tok2vec',
 'tagger',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner',
 'entity_ruler']

We will create two python functions to extract all the skills within a resume and create an array containing all the skills. Later we are going to apply this function to our dataset and create a new feature called skill. This will help us visualize trends and patterns within the dataset

In [88]:
def get_skills(text):
    doc = nlp(text)
    myset = []
    subset = []
    for ent in doc.ents:
        if ent.label_ == "SKILL":
            subset.append(ent.text)
    myset.append(subset)
    return subset


def unique_skills(x):
    return list(set(x))

# Cleaning Resume Text using nltk

In [91]:
# Go through each row of the 'Resume_str' column and clean the text
df['Resume_str'] = df['Resume_str'].apply(lambda x: re.sub(r'\s+', ' ', x))

# Remove special characters and digits
df['Resume_str'] = df['Resume_str'].apply(lambda x: re.sub('[^A-Za-z]+', ' ', x))

# Remove stopwords
stop_words = set(stopwords.words('english'))
df['Resume_str'] = df['Resume_str'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words]))

# Show the first few rows
print(df.head())


                                          Resume_str Category
0  HR ADMINISTRATOR MARKETING ASSOCIATE HR ADMINI...       HR
1  HR SPECIALIST US HR OPERATIONS Summary Versati...       HR
2  HR DIRECTOR Summary years experience recruitin...       HR
3  HR SPECIALIST Summary Dedicated Driven Dynamic...       HR
4  HR MANAGER Skill Highlights HR SKILLS HR Depar...       HR


In [92]:
# Create a CSV for the Processed Data
df.to_csv('data/Resume Dataset/Resume/Resume_Processed.csv', index=False)



In [95]:
processed = df

processed["skills"] = processed["Resume_str"].str.lower().apply(get_skills)
processed["skills"] = processed["skills"].apply(unique_skills)

# Display the first few rows
print(processed.head())

                                          Resume_str Category   
0  HR ADMINISTRATOR MARKETING ASSOCIATE HR ADMINI...       HR  \
1  HR SPECIALIST US HR OPERATIONS Summary Versati...       HR   
2  HR DIRECTOR Summary years experience recruitin...       HR   
3  HR SPECIALIST Summary Dedicated Driven Dynamic...       HR   
4  HR MANAGER Skill Highlights HR SKILLS HR Depar...       HR   

                                              skills  
0  [commerce, documentation, conflict resolution,...  
1  [communication, training, feedback, marketing,...  
2  [training, monitoring, security, information m...  
3  [communication, training, monitoring, document...  
4  [training, decision making, conflict resolutio...  


In [99]:
# Save the data to a new CSV file
processed.to_csv("data/Resume Dataset/Resume/Resume_With_Skills.csv", index=False)

In [None]:
sent = nlp(
    """

    """
)
displacy.render(sent, style="ent", jupyter=True)

In [1]:
patterns = df.Category.unique()
for a in patterns:
    ruler.add_patterns([{"label": "Job-Category", "pattern": a}])


# options=[{"ents": "Job-Category", "colors": "#ff3232"},{"ents": "SKILL", "colors": "#56c426"}]
colors = {
    "Job-Category": "linear-gradient(90deg, #aa9cfc, #fc9ce7)",
    "SKILL": "linear-gradient(90deg, #9BE15D, #00E3AE)",
    "ORG": "#ffd966",
    "PERSON": "#e06666",
    "GPE": "#9fc5e8",
    "DATE": "#c27ba0",
    "ORDINAL": "#674ea7",
    "PRODUCT": "#f9cb9c",
}
options = {
    "ents": [
        "Job-Category",
        "SKILL",
        "ORG",
        "PERSON",
        "GPE",
        "DATE",
        "ORDINAL",
        "PRODUCT",
    ],
    "colors": colors,
}
displacy.render(sent, style="ent", jupyter=True, options=options)

NameError: name 'df' is not defined