# Import Libraries and Load Dataset

In [1]:
import pandas as pd
from datasets import Dataset
import re

# Read the raw job-postings CSV and discard its redundant 'index' column
job_postings = pd.read_csv(
    r"C:\Users\DELL\Desktop\Regonet_project\Uncleaned_DS_jobs.csv"
).drop(columns='index')


def _squash_whitespace(value):
    """Collapse every whitespace run in a string to one space and trim both ends.

    Non-string values are handed back untouched.
    """
    if not isinstance(value, str):
        return value
    return re.sub(r'\s+', ' ', value).strip()


# Normalise whitespace in every text (object-dtype) column
for text_col in job_postings.select_dtypes(include='object').columns:
    job_postings[text_col] = job_postings[text_col].map(_squash_whitespace)

# Give the columns underscore-separated labels, then put the job titles in title case
job_postings = (
    job_postings
    .rename(columns={
        'Job Title': 'job_title',
        'Salary Estimate': 'Salary_estimate',
        'Job Description': 'Job_description',
        'Company Name': 'Company_name',
        'Type of ownership': 'Type_of_ownership',
        'Size': 'Size (No_of_employees)',
    })
    .assign(job_title=lambda frame: frame['job_title'].str.title())
)

job_postings



Unnamed: 0,job_title,Salary_estimate,Job_description,Rating,Company_name,Location,Headquarters,Size (No_of_employees),Founded,Type_of_ownership,Industry,Sector,Revenue,Competitors
0,Sr Data Scientist,$137K-$171K (Glassdoor est.),Description The Senior Data Scientist is respo...,3.1,Healthfirst 3.1,"New York, NY","New York, NY",1001 to 5000 employees,1993,Nonprofit Organization,Insurance Carriers,Insurance,Unknown / Non-Applicable,"EmblemHealth, UnitedHealth Group, Aetna"
1,Data Scientist,$137K-$171K (Glassdoor est.),"Secure our Nation, Ignite your Future Join the...",4.2,ManTech 4.2,"Chantilly, VA","Herndon, VA",5001 to 10000 employees,1968,Company - Public,Research & Development,Business Services,$1 to $2 billion (USD),-1
2,Data Scientist,$137K-$171K (Glassdoor est.),Overview Analysis Group is one of the largest ...,3.8,Analysis Group 3.8,"Boston, MA","Boston, MA",1001 to 5000 employees,1981,Private Practice / Firm,Consulting,Business Services,$100 to $500 million (USD),-1
3,Data Scientist,$137K-$171K (Glassdoor est.),JOB DESCRIPTION: Do you have a passion for Dat...,3.5,INFICON 3.5,"Newton, MA","Bad Ragaz, Switzerland",501 to 1000 employees,2000,Company - Public,Electrical & Electronic Manufacturing,Manufacturing,$100 to $500 million (USD),"MKS Instruments, Pfeiffer Vacuum, Agilent Tech..."
4,Data Scientist,$137K-$171K (Glassdoor est.),Data Scientist Affinity Solutions / Marketing ...,2.9,Affinity Solutions 2.9,"New York, NY","New York, NY",51 to 200 employees,1998,Company - Private,Advertising & Marketing,Business Services,Unknown / Non-Applicable,"Commerce Signals, Cardlytics, Yodlee"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,Data Scientist,$105K-$167K (Glassdoor est.),Summary We’re looking for a data scientist to ...,3.6,TRANZACT 3.6,"Fort Lee, NJ","Fort Lee, NJ",1001 to 5000 employees,1989,Company - Private,Advertising & Marketing,Business Services,Unknown / Non-Applicable,-1
668,Data Scientist,$105K-$167K (Glassdoor est.),Job Description Become a thought leader within...,-1.0,JKGT,"San Francisco, CA",-1,-1,-1,-1,-1,-1,-1,-1
669,Data Scientist,$105K-$167K (Glassdoor est.),Join a thriving company that is changing the w...,-1.0,AccessHope,"Irwindale, CA",-1,-1,-1,-1,-1,-1,-1,-1
670,Data Scientist,$105K-$167K (Glassdoor est.),100 Remote Opportunity As an AINLP Data Scient...,5.0,ChaTeck Incorporated 5.0,"San Francisco, CA","Santa Clara, CA",1 to 50 employees,-1,Company - Private,Advertising & Marketing,Business Services,$1 to $5 million (USD),-1


# Handling Missing Values in Job Posting Dataset

In [2]:
# The dataset marks missing values with -1 (numeric) or '-1' (text).
# Map each affected column to the value that should stand in for that marker.
replacement_map = {
    'Location': 'Unknown',
    'Size (No_of_employees)': 'Unknown',
    'Founded': pd.NA,  # a year: keep it as a missing number rather than a text label
    'Type_of_ownership': 'Unknown',
    'Industry': 'Unknown',
    'Sector': 'Unknown',
    'Revenue': 'Unknown',
    'Competitors': 'Unknown',
    'Headquarters': 'Unknown',
}

# Swap the marker for the per-column replacement; columns not present in the
# DataFrame are skipped.
for column, replacement in replacement_map.items():
    if column in job_postings.columns:
        job_postings[column] = job_postings[column].replace([-1, '-1'], replacement)

# Mixing integer years with pd.NA can leave 'Founded' as a generic object
# column; cast it to the nullable Int64 dtype so the years stay numeric while
# the missing entries remain <NA>.
if 'Founded' in job_postings.columns:
    job_postings['Founded'] = job_postings['Founded'].astype('Int64')

# Collapse "Unknown / Non-Applicable" (any casing or spacing around the slash)
# in the Revenue column to the single label 'Unknown'
job_postings['Revenue'] = job_postings['Revenue'].str.replace(r'unknown\s*/\s*non-applicable', 'Unknown', flags=re.IGNORECASE, regex=True)

# NOTE(review): 'Rating' is not in replacement_map, so any -1.0 placeholder
# there is left as-is by this cell — confirm whether it is handled elsewhere.

job_postings



Unnamed: 0,job_title,Salary_estimate,Job_description,Rating,Company_name,Location,Headquarters,Size (No_of_employees),Founded,Type_of_ownership,Industry,Sector,Revenue,Competitors
0,Sr Data Scientist,$137K-$171K (Glassdoor est.),Description The Senior Data Scientist is respo...,3.1,Healthfirst 3.1,"New York, NY","New York, NY",1001 to 5000 employees,1993,Nonprofit Organization,Insurance Carriers,Insurance,Unknown,"EmblemHealth, UnitedHealth Group, Aetna"
1,Data Scientist,$137K-$171K (Glassdoor est.),"Secure our Nation, Ignite your Future Join the...",4.2,ManTech 4.2,"Chantilly, VA","Herndon, VA",5001 to 10000 employees,1968,Company - Public,Research & Development,Business Services,$1 to $2 billion (USD),Unknown
2,Data Scientist,$137K-$171K (Glassdoor est.),Overview Analysis Group is one of the largest ...,3.8,Analysis Group 3.8,"Boston, MA","Boston, MA",1001 to 5000 employees,1981,Private Practice / Firm,Consulting,Business Services,$100 to $500 million (USD),Unknown
3,Data Scientist,$137K-$171K (Glassdoor est.),JOB DESCRIPTION: Do you have a passion for Dat...,3.5,INFICON 3.5,"Newton, MA","Bad Ragaz, Switzerland",501 to 1000 employees,2000,Company - Public,Electrical & Electronic Manufacturing,Manufacturing,$100 to $500 million (USD),"MKS Instruments, Pfeiffer Vacuum, Agilent Tech..."
4,Data Scientist,$137K-$171K (Glassdoor est.),Data Scientist Affinity Solutions / Marketing ...,2.9,Affinity Solutions 2.9,"New York, NY","New York, NY",51 to 200 employees,1998,Company - Private,Advertising & Marketing,Business Services,Unknown,"Commerce Signals, Cardlytics, Yodlee"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,Data Scientist,$105K-$167K (Glassdoor est.),Summary We’re looking for a data scientist to ...,3.6,TRANZACT 3.6,"Fort Lee, NJ","Fort Lee, NJ",1001 to 5000 employees,1989,Company - Private,Advertising & Marketing,Business Services,Unknown,Unknown
668,Data Scientist,$105K-$167K (Glassdoor est.),Job Description Become a thought leader within...,-1.0,JKGT,"San Francisco, CA",Unknown,Unknown,,Unknown,Unknown,Unknown,Unknown,Unknown
669,Data Scientist,$105K-$167K (Glassdoor est.),Join a thriving company that is changing the w...,-1.0,AccessHope,"Irwindale, CA",Unknown,Unknown,,Unknown,Unknown,Unknown,Unknown,Unknown
670,Data Scientist,$105K-$167K (Glassdoor est.),100 Remote Opportunity As an AINLP Data Scient...,5.0,ChaTeck Incorporated 5.0,"San Francisco, CA","Santa Clara, CA",1 to 50 employees,,Company - Private,Advertising & Marketing,Business Services,$1 to $5 million (USD),Unknown
