In [2]:
import json
import re
import pandas as pd

In [3]:
path = '../data/backup.json'

In [4]:
# Combine all patterns into a dictionary
ALL_PATTERNS = {
    'Not Entry Level': 'NOT_ENTRY_LEVEL_PATTERNS',
    'Entry Level': 'ENTRY_LEVEL_PATTERNS'
}

ALL_PATTERNS.items()

dict_items([('Not Entry Level', 'NOT_ENTRY_LEVEL_PATTERNS'), ('Entry Level', 'ENTRY_LEVEL_PATTERNS')])

In [5]:
# Opening JSON file
f = open(path)
 
# returns JSON object as
# a dictionary
data = json.load(f)

In [6]:
main_key = [key for key in data.keys() if isinstance(key, str)][0]
jobs_dict = data[main_key]['jobs']

In [7]:
df = pd.DataFrame.from_dict(jobs_dict, orient='index')
df = df.drop(columns=['__collections__'])

In [8]:
# Getting small subset of dictionary items

sub_dict = {}
max_count = df.shape[0]
count = 0
for key, value in jobs_dict.items():
    if count < max_count:
        sub_dict[key] = value
        count += 1
    else:
        break

sub_df = pd.DataFrame.from_dict(sub_dict, orient='index')
sub_df = sub_df.drop(columns=['__collections__'])

In [9]:
# Set the display option to show all rows (replace 100 with the desired number)
pd.set_option('display.max_rows', 1000)

# Display the first 100 rows of the DataFrame
sub_df.shape

(32713, 5)

In [10]:
# Define patterns

# ENTRY LEVEL PATTERNS
pattern_new_grad = r'\b(?:new\s*(?:\d{4})?\s*grad(?:uate)?s?|recent\s*grad(?:uate)?s?)\b'
pattern_entry_level = r'\b(?:entry[-\s]?level|junior|beginner)\b'
pattern_entrylevelexperience = r'\b0-2\s*years\b'
pattern_sales_associate = r'\b(?:sales\s*associate|sales\s*rep(?:resentative)?|retail\s*sales\s*associate)\b'
# pattern_high_school_equivalent = r'\b(?:high\s*school\s*diploma(?:\s*or\s*equivalent\s*life\s*experiences)?)\b'
pattern_high_school_equivalent = r'\b(?:High\s*School\s*Diploma(?:\s*or\s*equivalent\s*life\s*experiences)?)\b'
pattern_volunteer = r'\b(?:volunteer(?:ed|ing)?|unpaid\s*work|community\s*service)\b'
pattern_experience_range = r'\b(?:0\s*to\s*\d+\s*(?:years|yrs?)?\s*(?:of\s*)?(?:previous|relevant)?\s*experience)\b'
pattern_sixteen_requirement = r'\b(?:must\s*be\s*16\s*years\s*of\s*age)\b'
pattern_bachelors_preferred = r'\b[Bb]achelor[\'’]?s?\s+degree\s*(?:is\s*)?preferred\b'



# NOT ENTRY LEVEL PATTERNS
pattern_management = r'\b(?:managerial)\b'
pattern_required_experience = r'\b(?:experience[-\s]?required|experience[-\s]?is[-\s]?required)\b'
pattern_lead_manage_team = r'\b(?:lead\s+and\s+manage\s+a\s+team)\b'
pattern_experience = r'\b(?:\d+\s*\+?\s*years\s+of\s+related\s+work\s+experience)\b'
pattern_senior_job_title = r'\b(?:director|manager|senior|head|lead|supervisor|sr|president)\b'
pattern_roman_numerals_job_title = r'\b(?:II|III|IV)\b'
# pattern_degree_and_experience = r'(\s*degree)\s*and\s*(\d+\s*(?:\+)?\s*years\s*of\s*experience)'
pattern_general_experience = r'\b\d+\s*\+\s*years\s*(?:of\s*)?(?:experience\s*)?(?:in\s*)?(?:an?\s*)?.*?\bposition\b'
pattern_relevant_experience = r'\b(?:\d+|\b(?:one|two|three|four|five|six|seven|eight|nine|ten)\b|\d+\s*\+)\s*(?:\(|-)?\s*(?:\d+)\s*(?:years|yrs?)\s*(?:of\s*)?(?:industry|relevant|related)\s+experience\b'

sub_df['label'] = -1  # Default value for Uncertainty 

# NOTE: Job_Title Patterns should hold priority - e.g. Senior/Director = 0 vs. Entry Level = 1 
# UPDATE NOT ENTRY LEVEL ROLES - 'Label' = 0 for management-related job details

######### JOB DETAILS
### NOT ENTRY LEVEL 
sub_df.loc[
    sub_df['job_details'].str.contains(pattern_management, case=False, regex=True) |
    sub_df['job_details'].str.contains(pattern_required_experience, case=False, regex=True)|
    sub_df['job_details'].str.contains(pattern_experience, case=False, regex=True)|
    sub_df['job_details'].str.contains(pattern_general_experience, case=False, regex=True)|
    sub_df['job_details'].str.contains(pattern_relevant_experience, case=False, regex=True)|
    sub_df['job_details'].str.contains(pattern_lead_manage_team, case=False, regex=True),
#     sub_df['job_details'].str.contains(pattern_degree_and_experience, case=False, regex=True),
    'label'
] = 0

### ENTRY LEVEL
# Update 'Label' based on patterns
sub_df.loc[
    sub_df['job_details'].str.contains(pattern_new_grad, flags=re.IGNORECASE) |
    sub_df['job_details'].str.contains(pattern_entrylevelexperience, case=False, regex=True)|
    sub_df['job_details'].str.contains(pattern_entry_level, case=False, regex=True)|
    sub_df['job_details'].str.contains(pattern_experience_range, case=False, regex=True)|
    sub_df['job_details'].str.contains(pattern_sixteen_requirement, case=False, regex=True)|
    sub_df['job_details'].str.contains(pattern_bachelors_preferred, case=False, regex=True)|
    sub_df['job_details'].str.contains(pattern_high_school_equivalent, case=False, regex=True),
    'label'
] = 1


############## JOB TITLE 
# Entry Level
sub_df.loc[
    sub_df['job_title'].str.contains(pattern_sales_associate, case=False, regex=True)|
    sub_df['job_title'].str.contains(pattern_volunteer, case=False, regex=True)|
    sub_df['job_title'].str.contains(pattern_new_grad, case=False, regex=True)|
    sub_df['job_title'].str.contains(pattern_entry_level, case=False, regex=True),
    'label'
] = 1

# NOT ENTRY 
sub_df.loc[
    sub_df['job_title'].str.contains(pattern_senior_job_title, case=False, regex=True)|
    sub_df['job_title'].str.contains(pattern_roman_numerals_job_title, case=False, regex=True),
    'label'
] = 0


import re

#Populate Matched String Column 

jobdetails_patterns = [
    (pattern_new_grad, 'New Grad'),
    (pattern_entrylevelexperience, 'Entry Level with Experience'),
    (pattern_entry_level, 'Entry Level Job Descrption'),
    (pattern_required_experience, 'Required Experience'),
    (pattern_lead_manage_team, 'Lead and Manage a Team'),
    (pattern_experience, 'Years of Experience'),
    (pattern_management,"Managerial"),
    (pattern_experience_range, '0 - x Years of Experience'),
    (pattern_high_school_equivalent, 'High School Degree Only'),
    (pattern_sixteen_requirement, 'Minimum 16 Years Age'),
    (pattern_general_experience,'Previous Management Experience'),
    (pattern_relevant_experience, 'Required Previous Industry Experience')
#     (pattern_degree_and_experience, 'Degree and x Number of Years Experience')
]

jobtitle_patterns = [
    (pattern_senior_job_title, "Not Entry-Level Job Title"),
    (pattern_sales_associate, "Sales Associate or Sales Rep Job Title"),
    (pattern_entry_level, 'Entry-Level Job Title'),
    (pattern_roman_numerals_job_title, 'II+ Role Title'),
    (pattern_volunteer, 'Volunteer Job Title'),
]

# Add a column to store the matched pattern
sub_df['label_pattern'] = ''
sub_df['matched_pattern'] = ''


###### MANUAL INPUTS

sub_df.loc[sub_df['job_id'] == '2988474723', 'label'] = 1
sub_df.loc[sub_df['job_id'] == '2988474723','label_pattern'] = 'Manual Input - MA'


# Iterate through jobdetails patterns and update the 'Label' and 'Matched_Pattern' columns
for pattern, label in jobdetails_patterns:
    matched_indices = sub_df['job_details'].str.contains(pattern, case=False, regex=True)
    sub_df.loc[matched_indices, 'label_pattern'] = label
    sub_df.loc[matched_indices, 'matched_pattern'] = pattern

# Iterate through jobtitle patterns and update the 'Label' and 'Matched_Pattern' columns
for pattern, label in jobtitle_patterns:
    matched_indices = sub_df['job_title'].str.contains(pattern, case=False, regex=True)
    sub_df.loc[matched_indices, 'label_pattern'] = label
    sub_df.loc[matched_indices, 'matched_pattern'] = pattern

########### SPecial Condditions
# Condition: Supervisor in job_title and 'high school diploma' in job_details
supervisor_high_school_condition = (
    sub_df['job_title'].str.contains(r'supervisor|manager', case=False, regex=True) &
    sub_df['job_details'].str.contains(r'high\s*school\s*diploma', case=False, regex=True)
)

# # Update 'Label' for the specified condition
# sub_df.loc[supervisor_high_school_condition, 'label'] = 1
# sub_df.loc[supervisor_high_school_condition, 'label_pattern'] = 'Supervisor in Title with High School Diploma Descr.'
# sub_df.loc[supervisor_high_school_condition, 'matched_pattern'] = supervisor_high_school_condition


In [14]:
# Display the DataFrame
sub_df['label'] = -1
sub_df.iloc[0]['job_details']

'\n \n              About the job\n             \n \n   \n Seeking Associate Dentist:   Charpentier Family Dentistry is a thriving, privately owned dental practice, located in New Iberia, LA. We are searching for a full-time associate dentist to complement our dental practice Monday-Fridays. What we have to offer: -We see 70-90 New Patients a month; we provide the patients, the marketing and the management hassle-free to you. -We have 8 dental suites fully stocked with cutting edge technology including: 3 iTero scanners, Nitrous Oxide, AI Technology, Fotona Laser, CBCT, intra-oral cameras and so much more! -Make our home, your home! For the right fit, there will be an opportunity to buy into our practice. Income Potential -Unlimited earning potential -A performing dentist should expect an average of $250,000- $300,000 annually. Requirements: -DDS or DMD from an accredited university -Quick leaner, with a desire to continue to grow -Great communication skills -Eager team player, ready t

In [158]:
sub_df['job_details'] = sub_df['job_details'].str.replace('about the job', '', case=False).str.replace('\n', '')
sub_df

Unnamed: 0,job_details,job_id,company_name,li_level,job_title,label,label_pattern,matched_pattern
46353838,Seeking Assoc...,46353838,,,Associate Dentist,1,New Grad,\b(?:new\s*(?:\d{4})?\s*grad(?:uate)?s?|recent...
284401413,Casa Soñada K...,284401413,,,Bilingual Real Estate Agent,-1,,
309773287,We are lookin...,309773287,Streamline Sports Physical Therapy,,Physical Therapy Receptionist,0,Required Experience,\b(?:experience[-\s]?required|experience[-\s]?...
341779123,Are you a str...,341779123,MireGroup CPAs,,Tax Manager/Tax Strategist,0,Not Entry-Level Job Title,\b(?:director|manager|senior|head|lead|supervi...
639835117,Chef at Max's...,639835117,,,Chef,-1,,
1097131631,Customer Serv...,1097131631,,,Wine Specialist/Customer Service,0,Managerial,\b(?:managerial)\b
1269228965,If you are se...,1269228965,KOBY KARP DOCTORS EYE INSTITUTE,,Ophthalmologist,1,New Grad,\b(?:new\s*(?:\d{4})?\s*grad(?:uate)?s?|recent...
1502270687,Position: Pri...,1502270687,Foundation House,,Primary Therapist,0,Years of Experience,\b(?:\d+\s*\+?\s*years\s+of\s+related\s+work\s...
1717784213,OUR COMPANY ...,1717784213,"Mauna Lani, Auberge Resorts Collection",4.0,Director of Finance,0,Not Entry-Level Job Title,\b(?:director|manager|senior|head|lead|supervi...
2197937316,Description ...,2197937316,Kimball Midwest,,Entry-Level Outside Sales Representative,1,Entry-Level Job Title,\b(?:entry[-\s]?level|junior|beginner)\b


In [16]:
sub_df.shape

(32713, 8)

In [15]:
sub_df.to_json('data_091123.json', orient='index')

In [163]:
# Opening JSON file
f = open('sample_data_081623.json')
 
# returns JSON object as
# a dictionary
data = json.load(f)

In [164]:
df = pd.DataFrame.from_dict(data, orient='index')
df

Unnamed: 0,job_details,job_id,company_name,li_level,job_title,label,label_pattern,matched_pattern
46353838,Seeking Assoc...,46353838,,,Associate Dentist,1,New Grad,\b(?:new\s*(?:\d{4})?\s*grad(?:uate)?s?|recent...
284401413,Casa Soñada K...,284401413,,,Bilingual Real Estate Agent,-1,,
309773287,We are lookin...,309773287,Streamline Sports Physical Therapy,,Physical Therapy Receptionist,0,Required Experience,\b(?:experience[-\s]?required|experience[-\s]?...
341779123,Are you a str...,341779123,MireGroup CPAs,,Tax Manager/Tax Strategist,0,Not Entry-Level Job Title,\b(?:director|manager|senior|head|lead|supervi...
639835117,Chef at Max's...,639835117,,,Chef,-1,,
1097131631,Customer Serv...,1097131631,,,Wine Specialist/Customer Service,0,Managerial,\b(?:managerial)\b
1269228965,If you are se...,1269228965,KOBY KARP DOCTORS EYE INSTITUTE,,Ophthalmologist,1,New Grad,\b(?:new\s*(?:\d{4})?\s*grad(?:uate)?s?|recent...
1502270687,Position: Pri...,1502270687,Foundation House,,Primary Therapist,0,Years of Experience,\b(?:\d+\s*\+?\s*years\s+of\s+related\s+work\s...
1717784213,OUR COMPANY ...,1717784213,"Mauna Lani, Auberge Resorts Collection",4.0,Director of Finance,0,Not Entry-Level Job Title,\b(?:director|manager|senior|head|lead|supervi...
2197937316,Description ...,2197937316,Kimball Midwest,,Entry-Level Outside Sales Representative,1,Entry-Level Job Title,\b(?:entry[-\s]?level|junior|beginner)\b


In [186]:
sub_df.loc[sub_df['job_id'] == '3691305864']['job_details'].iloc[0]

'\n \n              About the job\n             \n \n   \n Every great story has a new beginning, and yours starts here. Welcome to Warner Bros. Discovery… the stuff dreams are made of. Who We Are… When we say, “the stuff dreams are made of,” we’re not just referring to the world of wizards, dragons and superheroes, or even to the wonders of Planet Earth. Behind WBD’s vast portfolio of iconic content and beloved brands, are the   storytellers   bringing our characters to life, the   creators   bringing them to your living rooms and the   dreamers   creating what’s next… From brilliant creatives, to technology trailblazers, across the globe, WBD offers career defining opportunities, thoughtfully curated benefits, and the tools to explore and grow into your best selves. Here you are supported, here you are celebrated, here you can thrive. We are the now and the next. The power behind the people building the future. We are born from the spirit of innovation. We are created from the idea t

In [74]:
sub_df.loc[sub_df['job_id'] == '2988474723','label_pattern'] = ' test'

In [183]:
label_frequencies = sub_df['label'].value_counts()

for job_title, frequency in label_frequencies.items():
    print(f"Label: {job_title} | Frequency: {frequency}")

Label: 0 | Frequency: 13452
Label: -1 | Frequency: 12637
Label: 1 | Frequency: 6624


In [157]:
job_title_frequencies = df['job_title'].value_counts()

for job_title, frequency in job_title_frequencies.items():
    print(f"Job Title: {job_title} | Frequency: {frequency}")

Job Title: OPERATIONS ASSISTANT MANAGER | Frequency: 208
Job Title: Retail Sales Associate | Frequency: 205
Job Title: Assistant Store Manager | Frequency: 178
Job Title: Store Manager | Frequency: 171
Job Title: Sales Associate | Frequency: 139
Job Title: SALES FLOOR ASSOCIATE | Frequency: 130
Job Title: ASSISTANT STORE MANAGER | Frequency: 106
Job Title: Electrical Engineer | Frequency: 102
Job Title: Project Manager | Frequency: 86
Job Title: Outside Sales Representative | Frequency: 85
Job Title: Sales Representative | Frequency: 84
Job Title: Production Supervisor | Frequency: 82
Job Title: Sales Associate - Spencer's | Frequency: 79
Job Title: Financial Advisor | Frequency: 75
Job Title: Merchandising Sales Associate | Frequency: 73
Job Title: Market Manager | Frequency: 72
Job Title: Business Development Manager | Frequency: 72
Job Title: Customer Service Representative | Frequency: 71
Job Title: Network Engineer | Frequency: 64
Job Title: Assistant Manager | Frequency: 62
Job T

In [197]:
job_title_frequencies = sub_df['label'].value_counts()

for job_title, frequency in job_title_frequencies.items():
    print(f"Label: {job_title} | Frequency: {frequency}")

Label: 0 | Frequency: 13452
Label: -1 | Frequency: 12637
Label: 1 | Frequency: 6624
