# Split Dataset

In [1]:
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


## Read Data

In [2]:
# Read the processed jobs CSV into a DataFrame.
df_jobs_processed = pd.read_csv(
    filepath_or_buffer='data/jobs_processed.csv',
)
# A bare name as the last expression makes the notebook render a preview.
df_jobs_processed

Unnamed: 0,job_id,title,location,location_state,description,formatted_work_type,formatted_experience_level,remote_allowed,company_industries,company_country,...,extracted_experience_requirement,extracted_skill_requirement,extracted_education_requirement,extracted_certification_requirement,extracted_min_salary,extracted_max_salary,extracted_salary,extracted_pay_period,extracted_datetime,extracted_normalized_salary
0,921716,Marketing Coordinator,"Princeton, NJ",NJ,Job descriptionA leading real estate firm in N...,Full-time,Unknown,0.0,Real Estate,US,...,Marketing: 1 year (Preferred)Graphic design: 2...,"Please, be proficient in Adobe Creative Cloud ...",,,,20.0,18.0,HOURLY,2025-04-17 16:02:34,
1,1829192,Mental Health Therapist/Counselor,"Fort Collins, CO",CO,"At Aspen Therapy and Wellness , we are committ...",Full-time,Unknown,0.0,Unknown,Unknown,...,,Conducting intake assessmentsDeveloping and im...,A graduate level psychological counseling-rela...,,,,30.0,HOURLY,2025-04-17 16:03:04,
2,10998357,Assitant Restaurant Manager,"Cincinnati, OH",OH,The National Exemplar is accepting application...,Full-time,Unknown,0.0,Restaurants,US,...,,,,,,,,,2025-04-17 15:51:48,
3,23221523,Senior Elder Law / Trusts and Estates Associat...,"New Hyde Park, NY",NY,Senior Associate Attorney - Elder Law / Trusts...,Full-time,Unknown,0.0,Law Practice,US,...,10-15 years of experienceExperience with vario...,Strong analytical and problem-solving skillsAb...,Juris Doctor degree (J.D.) from an accredited ...,,140000.0,175000.0,,,2025-04-16 23:12:49,
4,35982263,Service Technician,"Burlington, IA",IA,Looking for HVAC service tech with experience ...,Full-time,Unknown,0.0,Unknown,Unknown,...,Minimum 5 yrs. on the job with mechanical lice...,,,,,,,,2025-04-15 20:13:33,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119217,3906267117,Title IX/Investigations Attorney,"Walnut Creek, CA",CA,Our Walnut Creek office is currently seeking a...,Full-time,Mid-Senior level,0.0,Law Practice,US,...,five (5) to seven (7) years practicing as an a...,"Strong desire to learn, passion to work with p...",,,,,,,2025-04-17 00:34:26,
119218,3906267126,"Staff Software Engineer, ML Serving Platform",United States,Unknown,About Pinterest:\n\nMillions of people across ...,Full-time,Mid-Senior level,1.0,Software Development,US,...,Hands-on experience building large-scale ML us...,"Fluency in Python and C++, familiarity with at...",,,148049.0,304496.0,,YEARLY,2025-04-16 21:18:39,226272.5
119219,3906267131,"Account Executive, Oregon/Washington","Spokane, WA",WA,Company Overview\n\nEPS Learning is a leading ...,Full-time,Mid-Senior level,1.0,Education Administration Programs,US,...,Possess 2 to 4 years of prior experience selli...,Exhibit strong analytical and organizational s...,"Hold a bachelor's degree in education, busines...",,,,,,2025-04-16 21:26:39,
119220,3906267195,Business Development Manager,"Texas, United States",Unknown,The Business Development Manager is a 'hunter'...,Full-time,Unknown,1.0,Industrial Machinery Manufacturing,GB,...,Established relationships working with oil & g...,Decision MakingCommercial AwarenessCustomer Kn...,Some college or technical background would be ...,,,,,,2025-04-13 16:47:31,


## Split Data

In [3]:
# Carve a fixed-size test dataset for evaluation out of the rows that have a
# salary label ('normalized_salary' is not null). Every remaining row, labelled
# or not, goes into the train dataset.
TEST_SIZE = 10000   # number of labelled rows held out for evaluation
RANDOM_STATE = 42   # fixed seed so the same rows are sampled on every run

# Use one boolean mask for both halves so they are exact complements of each
# other, even if the index were ever to contain duplicate labels.
has_salary = df_jobs_processed['normalized_salary'].notna()
df_jobs_with_salary = df_jobs_processed[has_salary]
df_jobs_without_salary = df_jobs_processed[~has_salary]

# sample() draws without replacement and raises ValueError when fewer than
# TEST_SIZE labelled rows are available.
df_jobs_test = df_jobs_with_salary.sample(TEST_SIZE, random_state=RANDOM_STATE)
df_jobs_train_with_salary = df_jobs_with_salary.drop(df_jobs_test.index)
df_jobs_train = pd.concat([df_jobs_train_with_salary, df_jobs_without_salary])

# Invariants: test and train must not share rows, and together they must
# account for every input row.
assert df_jobs_test.index.intersection(df_jobs_train.index).empty, \
    "test and train datasets share rows"
assert len(df_jobs_test) + len(df_jobs_train) == len(df_jobs_processed), \
    "test and train datasets do not cover the input exactly"

print(f"Labelled dataset size: {df_jobs_with_salary.shape}")
print(f"Test dataset size: {df_jobs_test.shape}")
print(f"Train labelled dataset size: {df_jobs_train_with_salary.shape}")
print(f"Train dataset size: {df_jobs_train.shape}")

Labelled dataset size: (34705, 29)
Test dataset size: (10000, 29)
Train labelled dataset size: (24705, 29)
Train dataset size: (109222, 29)


In [4]:
# Preview the sampled test dataset (rows keep their original index labels).
df_jobs_test

Unnamed: 0,job_id,title,location,location_state,description,formatted_work_type,formatted_experience_level,remote_allowed,company_industries,company_country,...,extracted_experience_requirement,extracted_skill_requirement,extracted_education_requirement,extracted_certification_requirement,extracted_min_salary,extracted_max_salary,extracted_salary,extracted_pay_period,extracted_datetime,extracted_normalized_salary
110061,3905816324,Assembler Technician,"Roseville, CA",CA,Job Description\n\nDo you have experience and ...,Contract,Entry level,0.0,Staffing and Recruiting,US,...,,Basic hand tools knowledge preferredMust be ab...,,,,,18.0,HOURLY,2025-04-17 00:27:41,
85114,3904391512,"Supervisor, Molecular Laboratory","Philadelphia, PA",PA,Please use Google Chrome or Mozilla Firefox wh...,Full-time,Unknown,0.0,Non-profit Organizations,US,...,Minimum 3 years of experience in a CLIA certif...,"Ability to resolve complex problems, often inv...","Bachelor’s degree in Biology, Medical Technolo...","ASCP certification (BB, SBB or MB) or equivale...",90000.0,110000.0,,YEARLY,2025-04-15 21:14:58,100000.0
97319,3905214437,Sales Development Representative,"Canton, GA",GA,Our client is an award winning security softwa...,Full-time,Associate,0.0,Staffing and Recruiting,US,...,At least 1 - 2 years' of relevant work experience,Excellent written and verbal communication ski...,Bachelor's degree in Business or a related field,,,,,,2025-04-14 00:53:02,
34867,3895543148,Local Operations Manager,"Canyon Lake, TX",TX,Why Vacasa\n\nWe started with just one home an...,Full-time,Mid-Senior level,0.0,Hospitality,US,...,,Technical computer skills required on all form...,,,52900.0,67441.0,,YEARLY,2025-04-16 01:12:09,60170.5
75631,3903808993,Business Development Specialist,"Houston, TX",TX,Business Development Professional Opportunity!...,Full-time,Associate,0.0,Staffing and Recruiting,GB,...,2 to 5 years of experience,"Experience with B2B Corporate Sales, Lead Gene...",Bachelor’s degree required.,,100000.0,120000.0,,,2025-04-17 00:27:41,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94944,3904967050,"Hospitalist Opportunity in Fresno, CA","Fresno, CA",CA,Saint Agnes Medical Center (SAMC) has an excel...,Full-time,Mid-Senior level,0.0,Staffing and Recruiting,US,...,,Interested candidates must possess a valid Sta...,,,337000.0,400000.0,,YEARLY,2025-04-13 01:16:01,368500.0
110017,3905813873,"IP and Bankruptcy Legal Assistant, AmLaw100","Parsippany, NJ",NJ,Position: Litigation Legal AssistantLaw Firm: ...,Full-time,Mid-Senior level,0.0,Staffing and Recruiting,US,...,Minimum of 3+ years of legal secretarial or as...,CLIENT BILLING EXPERIENCE NECESSARY,Associate's degree preferred,Notary Public is a plus,70000.0,85000.0,,,2025-04-16 01:58:29,
50916,3901480962,Universal Banker,"Crosby, ND",ND,"Job Description\n\nThe Bank of Tioga, an affil...",Full-time,Mid-Senior level,0.0,Financial Services,US,...,One year of work experience in a financial ins...,Assist with daily operating activities such as...,Possess a high school diploma or GED certifica...,Possess or obtain Notary Certification within ...,,,18.0,HOURLY,2025-04-16 19:09:30,
24458,3891070961,"Casualty Represented Senior Desk Adjuster, Per...","Denver, CO",CO,Job Description SummarySenior level role that ...,Full-time,Mid-Senior level,1.0,Insurance,US,...,Demonstrated experience handling 1st and 3rd p...,Demonstrated experience providing customer-foc...,,Obtain any required state specific property ca...,67000.0,111000.0,,YEARLY,2025-04-12 17:14:16,89000.0


In [5]:
# Write the test dataset to disk; index=False leaves the row index out of the file.
df_jobs_test.to_csv(
    path_or_buf='data/jobs_test.csv',
    index=False,
)

In [6]:
# Preview the train dataset (labelled rows not in the test set, plus all unlabelled rows).
df_jobs_train

Unnamed: 0,job_id,title,location,location_state,description,formatted_work_type,formatted_experience_level,remote_allowed,company_industries,company_country,...,extracted_experience_requirement,extracted_skill_requirement,extracted_education_requirement,extracted_certification_requirement,extracted_min_salary,extracted_max_salary,extracted_salary,extracted_pay_period,extracted_datetime,extracted_normalized_salary
1,1829192,Mental Health Therapist/Counselor,"Fort Collins, CO",CO,"At Aspen Therapy and Wellness , we are committ...",Full-time,Unknown,0.0,Unknown,Unknown,...,,Conducting intake assessmentsDeveloping and im...,A graduate level psychological counseling-rela...,,,,30.0,HOURLY,2025-04-17 16:03:04,
2,10998357,Assitant Restaurant Manager,"Cincinnati, OH",OH,The National Exemplar is accepting application...,Full-time,Unknown,0.0,Restaurants,US,...,,,,,,,,,2025-04-17 15:51:48,
3,23221523,Senior Elder Law / Trusts and Estates Associat...,"New Hyde Park, NY",NY,Senior Associate Attorney - Elder Law / Trusts...,Full-time,Unknown,0.0,Law Practice,US,...,10-15 years of experienceExperience with vario...,Strong analytical and problem-solving skillsAb...,Juris Doctor degree (J.D.) from an accredited ...,,140000.0,175000.0,,,2025-04-16 23:12:49,
5,91700727,Economic Development and Planning Intern,"Raleigh, NC",NC,Job summary:The Economic Development & Plannin...,Internship,Unknown,0.0,Non-profit Organizations,US,...,,"Strong interest in economic development, city ...",Currently enrolled in a graduate or undergradu...,,14.0,20.0,,,2025-04-16 00:43:02,
17,103860943,Customer Service / Reservationist,"Providence, RI",RI,Sentinel Limousine of East Providence RI is a ...,Part-time,Unknown,0.0,Unknown,Unknown,...,,Advanced customer service and communication sk...,,,,,,,2025-04-17 15:53:24,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119214,3906266217,Senior Frontend/App Developer,United States,Unknown,The Dyrt is the largest digital camping platfo...,Full-time,Mid-Senior level,1.0,Software Development,US,...,Have at least 4 years professional experience ...,Have at least 4 years professional experience ...,,,,,,,2025-04-16 21:06:08,
119215,3906266248,"Account Manager, Client Success",United States,Unknown,GoodRx is America’s healthcare marketplace. Ea...,Full-time,Mid-Senior level,1.0,Hospitals and Health Care,US,...,5+ years of professional experience in Pharmac...,Responsible for the day-to-day client relation...,,,72000.0,154000.0,96000.0,YEARLY,2025-04-14 19:10:38,113000.0
119218,3906267126,"Staff Software Engineer, ML Serving Platform",United States,Unknown,About Pinterest:\n\nMillions of people across ...,Full-time,Mid-Senior level,1.0,Software Development,US,...,Hands-on experience building large-scale ML us...,"Fluency in Python and C++, familiarity with at...",,,148049.0,304496.0,,YEARLY,2025-04-16 21:18:39,226272.5
119219,3906267131,"Account Executive, Oregon/Washington","Spokane, WA",WA,Company Overview\n\nEPS Learning is a leading ...,Full-time,Mid-Senior level,1.0,Education Administration Programs,US,...,Possess 2 to 4 years of prior experience selli...,Exhibit strong analytical and organizational s...,"Hold a bachelor's degree in education, busines...",,,,,,2025-04-16 21:26:39,


In [7]:
# Write the train dataset to disk; index=False leaves the row index out of the file.
df_jobs_train.to_csv(
    path_or_buf='data/jobs_train.csv',
    index=False,
)