# Import Librabries and Loading Dataset

In [1]:
import pandas as pd
from datasets import Dataset
import re

# Loading dataset
job_postings = pd.read_csv(r"C:\Users\DELL\Desktop\Regonet_project\Uncleaned_DS_jobs.csv")

# Dropping unnecessary columns - index column
job_postings = job_postings.drop(columns='index')

# Removing extra leading and trailing spaces from text-based columns
for col in job_postings.select_dtypes(include='object'):
    job_postings[col] = job_postings[col].map(lambda x: re.sub(r'\s+', ' ', x).strip() if isinstance(x,str) else x)
    
# Renaming columns to comply with Python Naming Convention
job_postings.rename(columns={'Job Title': 'job_title','Salary Estimate': 'Salary_estimate', 'Job Description': 'Job_description',
                             'Company Name': 'Company_name', 'Type of ownership': 'Type_of_ownership', 'Size':'Size (No_of_employees)'}, inplace=True)
    
# Converting the Job_title coulmn to title case
job_postings['job_title'] = job_postings['job_title'].str.title()

job_postings



Unnamed: 0,job_title,Salary_estimate,Job_description,Rating,Company_name,Location,Headquarters,Size (No_of_employees),Founded,Type_of_ownership,Industry,Sector,Revenue,Competitors
0,Sr Data Scientist,$137K-$171K (Glassdoor est.),Description The Senior Data Scientist is respo...,3.1,Healthfirst 3.1,"New York, NY","New York, NY",1001 to 5000 employees,1993,Nonprofit Organization,Insurance Carriers,Insurance,Unknown / Non-Applicable,"EmblemHealth, UnitedHealth Group, Aetna"
1,Data Scientist,$137K-$171K (Glassdoor est.),"Secure our Nation, Ignite your Future Join the...",4.2,ManTech 4.2,"Chantilly, VA","Herndon, VA",5001 to 10000 employees,1968,Company - Public,Research & Development,Business Services,$1 to $2 billion (USD),-1
2,Data Scientist,$137K-$171K (Glassdoor est.),Overview Analysis Group is one of the largest ...,3.8,Analysis Group 3.8,"Boston, MA","Boston, MA",1001 to 5000 employees,1981,Private Practice / Firm,Consulting,Business Services,$100 to $500 million (USD),-1
3,Data Scientist,$137K-$171K (Glassdoor est.),JOB DESCRIPTION: Do you have a passion for Dat...,3.5,INFICON 3.5,"Newton, MA","Bad Ragaz, Switzerland",501 to 1000 employees,2000,Company - Public,Electrical & Electronic Manufacturing,Manufacturing,$100 to $500 million (USD),"MKS Instruments, Pfeiffer Vacuum, Agilent Tech..."
4,Data Scientist,$137K-$171K (Glassdoor est.),Data Scientist Affinity Solutions / Marketing ...,2.9,Affinity Solutions 2.9,"New York, NY","New York, NY",51 to 200 employees,1998,Company - Private,Advertising & Marketing,Business Services,Unknown / Non-Applicable,"Commerce Signals, Cardlytics, Yodlee"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,Data Scientist,$105K-$167K (Glassdoor est.),Summary We’re looking for a data scientist to ...,3.6,TRANZACT 3.6,"Fort Lee, NJ","Fort Lee, NJ",1001 to 5000 employees,1989,Company - Private,Advertising & Marketing,Business Services,Unknown / Non-Applicable,-1
668,Data Scientist,$105K-$167K (Glassdoor est.),Job Description Become a thought leader within...,-1.0,JKGT,"San Francisco, CA",-1,-1,-1,-1,-1,-1,-1,-1
669,Data Scientist,$105K-$167K (Glassdoor est.),Join a thriving company that is changing the w...,-1.0,AccessHope,"Irwindale, CA",-1,-1,-1,-1,-1,-1,-1,-1
670,Data Scientist,$105K-$167K (Glassdoor est.),100 Remote Opportunity As an AINLP Data Scient...,5.0,ChaTeck Incorporated 5.0,"San Francisco, CA","Santa Clara, CA",1 to 50 employees,-1,Company - Private,Advertising & Marketing,Business Services,$1 to $5 million (USD),-1


# Separating Company Name and Rating 

In [2]:
# Printing the raw values on Company Name Column
for val in job_postings['Company_name'].head(10):
    print(repr(val))

'Healthfirst 3.1'
'ManTech 4.2'
'Analysis Group 3.8'
'INFICON 3.5'
'Affinity Solutions 2.9'
'HG Insights 4.2'
'Novartis 3.9'
'iRobot 3.5'
'Intuit - Data 4.4'
'XSELL Technologies 3.6'


In [3]:
# Extracting the company name from the 'Company_name' column
job_postings['Company_name'] = job_postings['Company_name'].str.extract(r'^(.*)\s\d+(\.\d+)?$')[0]

print(job_postings['Company_name'])

0               Healthfirst
1                   ManTech
2            Analysis Group
3                   INFICON
4        Affinity Solutions
               ...         
667                TRANZACT
668                     NaN
669                     NaN
670    ChaTeck Incorporated
671           1-800-Flowers
Name: Company_name, Length: 672, dtype: object


# Removing Extra Text from Salary Estimate Column and Extrating Minimum and Maximum Salary Column From the Salary Estimate Column

In [4]:
# Extracting salary information from the 'Salary_estimate' column
# Remove text like "(Glassdoor est.)" from the salary string
job_postings['Salary_estimate'] = job_postings['Salary_estimate'].str.replace(r'\s*\(.*\)', '', regex=True)

# Extract the min and max salary values and convert to integers
salary_range = job_postings['Salary_estimate'].str.extract(r'\$(\d+)K-\$(\d+)K')

# Convert to numeric and multiply by 1000
job_postings['min_salary'] = salary_range[0].astype(int) * 1000
job_postings['max_salary'] = salary_range[1].astype(int) * 1000

job_postings


Unnamed: 0,job_title,Salary_estimate,Job_description,Rating,Company_name,Location,Headquarters,Size (No_of_employees),Founded,Type_of_ownership,Industry,Sector,Revenue,Competitors,min_salary,max_salary
0,Sr Data Scientist,$137K-$171K,Description The Senior Data Scientist is respo...,3.1,Healthfirst,"New York, NY","New York, NY",1001 to 5000 employees,1993,Nonprofit Organization,Insurance Carriers,Insurance,Unknown / Non-Applicable,"EmblemHealth, UnitedHealth Group, Aetna",137000,171000
1,Data Scientist,$137K-$171K,"Secure our Nation, Ignite your Future Join the...",4.2,ManTech,"Chantilly, VA","Herndon, VA",5001 to 10000 employees,1968,Company - Public,Research & Development,Business Services,$1 to $2 billion (USD),-1,137000,171000
2,Data Scientist,$137K-$171K,Overview Analysis Group is one of the largest ...,3.8,Analysis Group,"Boston, MA","Boston, MA",1001 to 5000 employees,1981,Private Practice / Firm,Consulting,Business Services,$100 to $500 million (USD),-1,137000,171000
3,Data Scientist,$137K-$171K,JOB DESCRIPTION: Do you have a passion for Dat...,3.5,INFICON,"Newton, MA","Bad Ragaz, Switzerland",501 to 1000 employees,2000,Company - Public,Electrical & Electronic Manufacturing,Manufacturing,$100 to $500 million (USD),"MKS Instruments, Pfeiffer Vacuum, Agilent Tech...",137000,171000
4,Data Scientist,$137K-$171K,Data Scientist Affinity Solutions / Marketing ...,2.9,Affinity Solutions,"New York, NY","New York, NY",51 to 200 employees,1998,Company - Private,Advertising & Marketing,Business Services,Unknown / Non-Applicable,"Commerce Signals, Cardlytics, Yodlee",137000,171000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,Data Scientist,$105K-$167K,Summary We’re looking for a data scientist to ...,3.6,TRANZACT,"Fort Lee, NJ","Fort Lee, NJ",1001 to 5000 employees,1989,Company - Private,Advertising & Marketing,Business Services,Unknown / Non-Applicable,-1,105000,167000
668,Data Scientist,$105K-$167K,Job Description Become a thought leader within...,-1.0,,"San Francisco, CA",-1,-1,-1,-1,-1,-1,-1,-1,105000,167000
669,Data Scientist,$105K-$167K,Join a thriving company that is changing the w...,-1.0,,"Irwindale, CA",-1,-1,-1,-1,-1,-1,-1,-1,105000,167000
670,Data Scientist,$105K-$167K,100 Remote Opportunity As an AINLP Data Scient...,5.0,ChaTeck Incorporated,"San Francisco, CA","Santa Clara, CA",1 to 50 employees,-1,Company - Private,Advertising & Marketing,Business Services,$1 to $5 million (USD),-1,105000,167000


# Handling Missing Values in Job Posting Dataset

In [5]:
# Check for and replace missing values in our dataset with appropriate values
# Mapping of columns to appropriate replacement values for -1
replacement_map = {
    'Location': 'Unknown',
    'Size (No_of_employees)': 'Unknown',
    'Founded': pd.NA, 
    'Type_of_ownership': 'Unknown',
    'Industry': 'Unknown',
    'Sector': 'Unknown',
    'Revenue': 'Unknown',
    'Competitors': 'Unknown',
    'Headquarters': 'Unknown',
    
}
# Replace missing values in the DataFrame
for column, replacement in replacement_map.items():
    if column in job_postings.columns:
        job_postings[column] = job_postings[column].replace([-1,'-1'], replacement)
        
# Replacing the Unknow/Non-Applicable in the Revenue Column with only Unknown value
job_postings['Revenue'] = job_postings['Revenue'].str.replace(r'unknown\s*/\s*non-applicable', 'Unknown', flags=re.IGNORECASE, regex=True)


       
job_postings



Unnamed: 0,job_title,Salary_estimate,Job_description,Rating,Company_name,Location,Headquarters,Size (No_of_employees),Founded,Type_of_ownership,Industry,Sector,Revenue,Competitors,min_salary,max_salary
0,Sr Data Scientist,$137K-$171K,Description The Senior Data Scientist is respo...,3.1,Healthfirst,"New York, NY","New York, NY",1001 to 5000 employees,1993,Nonprofit Organization,Insurance Carriers,Insurance,Unknown,"EmblemHealth, UnitedHealth Group, Aetna",137000,171000
1,Data Scientist,$137K-$171K,"Secure our Nation, Ignite your Future Join the...",4.2,ManTech,"Chantilly, VA","Herndon, VA",5001 to 10000 employees,1968,Company - Public,Research & Development,Business Services,$1 to $2 billion (USD),Unknown,137000,171000
2,Data Scientist,$137K-$171K,Overview Analysis Group is one of the largest ...,3.8,Analysis Group,"Boston, MA","Boston, MA",1001 to 5000 employees,1981,Private Practice / Firm,Consulting,Business Services,$100 to $500 million (USD),Unknown,137000,171000
3,Data Scientist,$137K-$171K,JOB DESCRIPTION: Do you have a passion for Dat...,3.5,INFICON,"Newton, MA","Bad Ragaz, Switzerland",501 to 1000 employees,2000,Company - Public,Electrical & Electronic Manufacturing,Manufacturing,$100 to $500 million (USD),"MKS Instruments, Pfeiffer Vacuum, Agilent Tech...",137000,171000
4,Data Scientist,$137K-$171K,Data Scientist Affinity Solutions / Marketing ...,2.9,Affinity Solutions,"New York, NY","New York, NY",51 to 200 employees,1998,Company - Private,Advertising & Marketing,Business Services,Unknown,"Commerce Signals, Cardlytics, Yodlee",137000,171000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,Data Scientist,$105K-$167K,Summary We’re looking for a data scientist to ...,3.6,TRANZACT,"Fort Lee, NJ","Fort Lee, NJ",1001 to 5000 employees,1989,Company - Private,Advertising & Marketing,Business Services,Unknown,Unknown,105000,167000
668,Data Scientist,$105K-$167K,Job Description Become a thought leader within...,-1.0,,"San Francisco, CA",Unknown,Unknown,,Unknown,Unknown,Unknown,Unknown,Unknown,105000,167000
669,Data Scientist,$105K-$167K,Join a thriving company that is changing the w...,-1.0,,"Irwindale, CA",Unknown,Unknown,,Unknown,Unknown,Unknown,Unknown,Unknown,105000,167000
670,Data Scientist,$105K-$167K,100 Remote Opportunity As an AINLP Data Scient...,5.0,ChaTeck Incorporated,"San Francisco, CA","Santa Clara, CA",1 to 50 employees,,Company - Private,Advertising & Marketing,Business Services,$1 to $5 million (USD),Unknown,105000,167000


# Format Standardization

## Standardizing the Size, Revenue and Location Columns

In [6]:
# checking the raw data in the Size, Revenue, and Location Columns
for index, row in job_postings[['Size (No_of_employees)', 'Revenue', 'Location']].head(10).iterrows():
    print(f"Size (No_of_employees): {repr(row['Size (No_of_employees)'])}, Revenue: {repr(row['Revenue'])}, Location: {repr(row['Location'])}")


Size (No_of_employees): '1001 to 5000 employees', Revenue: 'Unknown', Location: 'New York, NY'
Size (No_of_employees): '5001 to 10000 employees', Revenue: '$1 to $2 billion (USD)', Location: 'Chantilly, VA'
Size (No_of_employees): '1001 to 5000 employees', Revenue: '$100 to $500 million (USD)', Location: 'Boston, MA'
Size (No_of_employees): '501 to 1000 employees', Revenue: '$100 to $500 million (USD)', Location: 'Newton, MA'
Size (No_of_employees): '51 to 200 employees', Revenue: 'Unknown', Location: 'New York, NY'
Size (No_of_employees): '51 to 200 employees', Revenue: 'Unknown', Location: 'Santa Barbara, CA'
Size (No_of_employees): '10000+ employees', Revenue: '$10+ billion (USD)', Location: 'Cambridge, MA'
Size (No_of_employees): '1001 to 5000 employees', Revenue: '$1 to $2 billion (USD)', Location: 'Bedford, MA'
Size (No_of_employees): '5001 to 10000 employees', Revenue: '$2 to $5 billion (USD)', Location: 'San Diego, CA'
Size (No_of_employees): '51 to 200 employees', Revenue: 'Un

## Converting the Size Comun to a Comsistent Format

In [7]:
## Converting the Size Comun to a Comsistent Format.
job_postings['Size (No_of_employees)'] = job_postings['Size (No_of_employees)'] \
    .str.replace('employees', '', case=False) \
    .str.replace('to', '-', case=False) \
    .str.replace(r'[^\d\-]', '', regex=True) \
    .str.strip()

job_postings['Size (No_of_employees)']

0       1001-5000
1      5001-10000
2       1001-5000
3        501-1000
4          51-200
          ...    
667     1001-5000
668              
669              
670          1-50
671     1001-5000
Name: Size (No_of_employees), Length: 672, dtype: object

## Coverting the Revenue Column to a Consistent Format

In [8]:

def clean_revenue(value):
    if pd.isna(value):
        return None

    value = value.lower()

    if "Unknown" in value in value:
        return "Unknown"

    # Handle "Less than $X million"
    match = re.search(r'less than \$([\d,.]+) million', value)
    if match:
        return f"<{float(match.group(1).replace(',', '')) * 1_000_000:.0f}"

    # Handle "$X to $Y billion"
    match = re.search(r'\$([\d,.]+)\s*to\s*\$([\d,.]+)\s*billion', value)
    if match:
        low = float(match.group(1).replace(',', '')) * 1_000_000_000
        high = float(match.group(2).replace(',', '')) * 1_000_000_000
        return f"{int(low)}-{int(high)}"

    # Handle "$X to $Y million"
    match = re.search(r'\$([\d,.]+)\s*to\s*\$([\d,.]+)\s*million', value)
    if match:
        low = float(match.group(1).replace(',', '')) * 1_000_000
        high = float(match.group(2).replace(',', '')) * 1_000_000
        return f"{int(low)}-{int(high)}"

    # Handle "$X+ billion"
    match = re.search(r'\$([\d,.]+)\+?\s*billion', value)
    if match:
        low = float(match.group(1).replace(',', '')) * 1_000_000_000
        return f"{int(low)}+"

    # Handle "$X+ million"
    match = re.search(r'\$([\d,.]+)\+?\s*million', value)
    if match:
        low = float(match.group(1).replace(',', '')) * 1_000_000
        return f"{int(low)}+"

    return value  # fallback, in case of unusual format

# Apply it to your column
job_postings['Revenue'] = job_postings['Revenue'].apply(clean_revenue)

job_postings['Revenue']


0                    unknown
1      1000000000-2000000000
2        100000000-500000000
3        100000000-500000000
4                    unknown
               ...          
667                  unknown
668                  unknown
669                  unknown
670          1000000-5000000
671    1000000000-2000000000
Name: Revenue, Length: 672, dtype: object

## Converting the Location Column to a Consistent Format

In [9]:
job_postings['Location'] = job_postings['Location'].str.strip().str.replace(r'\s*,\s*', ', ', regex=True)

job_postings['Location'] 


0           New York, NY
1          Chantilly, VA
2             Boston, MA
3             Newton, MA
4           New York, NY
             ...        
667         Fort Lee, NJ
668    San Francisco, CA
669        Irwindale, CA
670    San Francisco, CA
671         New York, NY
Name: Location, Length: 672, dtype: object

In [10]:
job_postings.sample(10)

Unnamed: 0,job_title,Salary_estimate,Job_description,Rating,Company_name,Location,Headquarters,Size (No_of_employees),Founded,Type_of_ownership,Industry,Sector,Revenue,Competitors,min_salary,max_salary
399,Applied Computer Scientist,$110K-$163K,STR is hiring an Applied Computer Scientist to...,4.5,Systems & Technology Research,"Woburn, MA","Woburn, MA",201-500,2010.0,Company - Private,Aerospace & Defense,Aerospace & Defense,100000000-500000000,Unknown,110000,163000
329,Data Scientist,$79K-$147K,Description: Position Requires a Top Secret (T...,-1.0,,"Reston, VA","Leesburg, VA",51-200,,Company - Private,IT Services,Information Technology,unknown,Unknown,79000,147000
288,Data Scientist,$141K-$225K,"About Joby Located in Northern California, the...",4.3,Joby Aviation,"San Carlos, CA","Santa Cruz, CA",51-200,,Company - Private,Unknown,Unknown,unknown,Unknown,141000,225000
241,Senior Research Statistician- Data Scientist,$90K-$124K,Acuity is seeking a Senior Research Statistici...,4.8,Acuity Insurance,"Sheboygan, WI","Sheboygan, WI",1001-5000,1925.0,Company - Private,Insurance Carriers,Insurance,1000000000-2000000000,Unknown,90000,124000
412,Data Scientist,$124K-$198K,Job Description Are you an experienced Data Sc...,5.0,PROPRIUS,"Sunnyvale, CA","London, United Kingdom",1-50,,Company - Private,Enterprise Software & Network Solutions,Information Technology,unknown,Unknown,124000,198000
522,"Manager, Field Application Scientist, Southeast",$212K-$331K,"At 10x Genomics, accelerating our understandin...",4.2,10x Genomics,"Raleigh, NC","Pleasanton, CA",501-1000,2012.0,Company - Public,Biotech & Pharmaceuticals,Biotech & Pharmaceuticals,unknown,Unknown,212000,331000
603,Data Scientist - Ts/Sci Fsp Or Ci Required,$80K-$132K,US Citizenship Required and (TS/SCI with FSP o...,5.0,Phoenix Operations Group,"Annapolis Junction, MD","Woodbine, MD",1-50,2011.0,Company - Private,IT Services,Information Technology,1000000-5000000,Unknown,80000,132000
665,Data Scientist,$105K-$167K,Criterion Systems seeks a Data Scientist to su...,3.8,"Criterion Systems, Inc.","Vienna, VA","Vienna, VA",201-500,2005.0,Company - Private,IT Services,Information Technology,50000000-100000000,Unknown,105000,167000
29,Data Scientist,$137K-$171K,Formation provides personalization for the lar...,2.8,Formation,"San Francisco, CA","San Francisco, CA",51-200,2015.0,Company - Private,Enterprise Software & Network Solutions,Information Technology,unknown,Unknown,137000,171000
561,Data Scientist Technical Specialist,$128K-$201K,Overview Technology is constantly changing and...,3.4,Peraton,"Chantilly, VA","Herndon, VA",1001-5000,2017.0,Company - Private,Aerospace & Defense,Aerospace & Defense,1000000000-2000000000,Unknown,128000,201000


# Final Dataset Inspection

In [11]:
job_postings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 672 entries, 0 to 671
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   job_title               672 non-null    object 
 1   Salary_estimate         672 non-null    object 
 2   Job_description         672 non-null    object 
 3   Rating                  672 non-null    float64
 4   Company_name            622 non-null    object 
 5   Location                672 non-null    object 
 6   Headquarters            672 non-null    object 
 7   Size (No_of_employees)  672 non-null    object 
 8   Founded                 554 non-null    object 
 9   Type_of_ownership       672 non-null    object 
 10  Industry                672 non-null    object 
 11  Sector                  672 non-null    object 
 12  Revenue                 672 non-null    object 
 13  Competitors             672 non-null    object 
 14  min_salary              672 non-null    in

In [12]:
job_postings.describe()

Unnamed: 0,Rating,min_salary,max_salary
count,672.0,672.0,672.0
mean,3.518601,99196.428571,148130.952381
std,1.410329,33009.958111,48035.110051
min,-1.0,31000.0,56000.0
25%,3.3,79000.0,119000.0
50%,3.8,91000.0,133000.0
75%,4.3,122000.0,165000.0
max,5.0,212000.0,331000.0
