In [19]:
import pandas as pd

# Load the raw JobStreet export. The path is relative to the notebook's
# working directory -- the same convention the final export cell uses
# ('../data/cleaned/...') -- instead of an absolute per-user Windows path, so
# the notebook can be re-run from any checkout of the project.
df = pd.read_csv('../data/raw/jobstreet_all_job_dataset.csv')

# Quick sanity check: dimensions, column names and the first few rows.
print(df.shape)
print(df.columns.tolist())
print(df.head())

(69024, 11)
['job_id', 'job_title', 'company', 'descriptions', 'location', 'category', 'subcategory', 'role', 'type', 'salary', 'listingDate']
       job_id                                     job_title  \
0  74630583.0              Procurement Executive (Contract)   
1  74660602.0                  Account Executive/ Assistant   
2  74655679.0  Data Analyst - Asset Management, SPX Express   
3  74657624.0                              Service Engineer   
4  74679363.0                          Purchasing Executive   

                                  company  \
0   Coca-Cola Bottlers (Malaysia) Sdn Bhd   
1      Acoustic & Lighting System Sdn Bhd   
2          Shopee Mobile Malaysia Sdn Bhd   
3             Sun Medical Systems Sdn Bhd   
4  Magnet Security & Automation Sdn. Bhd.   

                                        descriptions         location  \
0  Position Purpose\nManage aspects of procuremen...  Negeri Sembilan   
1  We are looking for a Account Executive/ Assist...         

In [20]:
# Missing-value count for every column
print(df.isna().sum())

# Frequency of each raw job-type label
print(df['type'].value_counts())

# The ten most common locations
print(df['location'].value_counts().iloc[:10])

# Salary coverage: postings without vs. with a salary string
print(len(df) - df['salary'].count(), "jobs have NO salary")
print(df['salary'].count(), "jobs HAVE salary")

job_id              0
job_title           0
company             0
descriptions        0
location            0
category            0
subcategory         0
role             2252
type                0
salary          37430
listingDate         0
dtype: int64
type
Full time                                      53790
['Full time']                                   9030
Contract/Temp                                   4976
['Contract/Temp']                                584
Part time                                        289
Casual/Vacation                                  251
['Part time']                                     78
['Casual/Vacation']                               22
['Contract/Temp', 'Full time']                     2
['Full time', 'Part time']                         1
['Contract/Temp', 'Full time', 'Part time']        1
Name: count, dtype: int64
location
Kuala Lumpur                15390
Petaling                     9746
Johor Bahru District         4480
Selangor            

In [21]:
import ast

def clean_type(val):
    """Normalise a raw job-type value to a single plain label.

    Some rows store the type as a stringified Python list, e.g.
    "['Full time']" or "['Contract/Temp', 'Full time']", while others
    already hold the bare label ("Full time").

    Parameters
    ----------
    val : object
        Raw cell value from the ``type`` column.

    Returns
    -------
    object
        The first element when ``val`` is the string form of a non-empty
        list; otherwise ``val`` unchanged (plain labels, empty lists,
        NaN/None and any other non-string value).
    """
    # Only a string can hold a stringified list; pass everything else through.
    if not isinstance(val, str):
        return val

    try:
        parsed = ast.literal_eval(val)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Plain labels such as 'Full time' or 'Contract/Temp' are not valid
        # Python literals. Catch only the errors literal_eval documents so
        # that KeyboardInterrupt and genuine bugs are not silently swallowed.
        return val

    # Multi-type postings keep only their first listed type.
    if isinstance(parsed, list) and parsed:
        return parsed[0]
    return val  # already clean (or an empty list): return as is

# Element-wise normalisation of the raw job-type labels
df['type_clean'] = df['type'].map(clean_type)

# The list-style variants should now be folded into the plain labels
print(df['type_clean'].value_counts())

type_clean
Full time          62821
Contract/Temp       5563
Part time            367
Casual/Vacation      273
Name: count, dtype: int64


In [22]:
import re

def extract_salary(val, min_amount=100):
    """Parse a free-text salary string into a ``(low, high)`` pair.

    Thousands separators are stripped, every number in the text is read
    (any decimal fraction is discarded rather than being mistaken for a
    second figure), and amounts below ``min_amount`` are ignored.

    Parameters
    ----------
    val : str or missing
        Raw salary text, e.g. ``"RM 2,800 – RM 3,200 per month"``.
        Non-string, non-missing values are parsed via ``str(val)``.
    min_amount : int, default 100
        Smallest value accepted as a salary figure. The default matches the
        previous "at least three digits" rule.

    Returns
    -------
    tuple
        ``(low, high)`` as ints with ``low <= high``, taken from the first
        two accepted amounts; the same amount twice when only one is found;
        ``(None, None)`` when the value is missing or holds no usable number.

    Notes
    -----
    NOTE(review): the pay period in the text ("per month", "per hour",
    "per year", ...) is not inspected, so callers should confirm that all
    parsed values share one unit before averaging them.
    """
    if pd.isna(val):
        return None, None

    # Remove thousands separators so "2,800" reads as one number.
    text = str(val).replace(',', '')

    # Capture only the integer part; an optional ".dd" tail is consumed by
    # the match so that fraction digits never become a separate token.
    amounts = [int(whole) for whole in re.findall(r'(\d+)(?:\.\d+)?', text)]
    amounts = [amount for amount in amounts if amount >= min_amount]

    if not amounts:
        return None, None
    if len(amounts) == 1:
        return amounts[0], amounts[0]

    # Guarantee low <= high even if the text lists the bounds in reverse.
    low, high = sorted(amounts[:2])
    return low, high

# Parse every posting, then split the (low, high) pairs into two columns.
parsed_bounds = df['salary'].apply(extract_salary)
df['salary_min'], df['salary_max'] = zip(*parsed_bounds)

# Midpoint of the advertised range
df['salary_avg'] = df['salary_min'].add(df['salary_max']).div(2)

# Spot-check the parsed values against the raw strings
salary_columns = ['salary', 'salary_min', 'salary_max', 'salary_avg']
print(df[salary_columns].dropna().head(10))
print("\nAverage salary across all jobs with salary listed:")
print(f"RM {df['salary_avg'].mean():,.0f} per month")

                           salary  salary_min  salary_max  salary_avg
1   RM 2,800 – RM 3,200 per month      2800.0      3200.0      3000.0
3   RM 3,000 – RM 3,500 per month      3000.0      3500.0      3250.0
4   RM 2,800 – RM 3,500 per month      2800.0      3500.0      3150.0
6   RM 3,000 – RM 4,500 per month      3000.0      4500.0      3750.0
9   RM 2,000 – RM 3,000 per month      2000.0      3000.0      2500.0
10  RM 3,000 – RM 3,200 per month      3000.0      3200.0      3100.0
12  RM 1,600 – RM 2,000 per month      1600.0      2000.0      1800.0
14  RM 3,500 – RM 4,000 per month      3500.0      4000.0      3750.0
15  RM 3,000 – RM 4,000 per month      3000.0      4000.0      3500.0
18  RM 3,500 – RM 5,000 per month      3500.0      5000.0      4250.0

Average salary across all jobs with salary listed:
RM 4,777 per month


In [23]:
# Mean salary per category. job_count is the number of postings in the
# category that have a parsed salary (count() skips NaN), and categories
# with fewer than 50 such postings are dropped as too small to trust.
salary_by_category = (
    df.groupby('category')['salary_avg']
    .agg(avg_salary='mean', job_count='count')
    .reset_index()
    .loc[lambda table: table['job_count'] >= 50]
    .sort_values('avg_salary', ascending=False)
)

print(salary_by_category.to_string(index=False))

                              category   avg_salary  job_count
              CEO & General Management 22741.101695         59
                Real Estate & Property  6992.121069        318
Information & Communication Technology  6422.375369       2708
                 Consulting & Strategy  5844.116883         77
          Banking & Financial Services  5784.311966        702
                                 Sales  5708.160523       2984
                  Science & Technology  5616.800000        195
                          Construction  5424.653892        835
                           Engineering  5126.113754       2712
                  Healthcare & Medical  4996.263303        545
         Human Resources & Recruitment  4891.383681       1728
                                 Legal  4794.037143        175
                            Accounting  4670.759358       5637
            Mining, Resources & Energy  4658.125000         56
            Marketing & Communications  4537.791387    

In [24]:
# Mean salary for the ten locations with the most postings, highest first
top_locations = df['location'].value_counts().iloc[:10].index
salary_by_location = (
    df.loc[df['location'].isin(top_locations)]
    .groupby('location')['salary_avg']
    .mean()
    .sort_values(ascending=False)
)

print(salary_by_location.round())

location
Penang                      7216.0
Kuala Lumpur City Centre    6217.0
Kuala Lumpur                5468.0
Selangor                    4971.0
Johor Bahru District        4701.0
Petaling                    4612.0
Seberang Perai              4510.0
Klang District              4422.0
Penang Island               4404.0
Shah Alam/Subang            4324.0
Name: salary_avg, dtype: float64


In [25]:
# The 20 companies with the most postings overall
top_companies = df['company'].value_counts().iloc[:20]
print(top_companies)

# Narrow to the ICT category. Note that `data_jobs` holds every posting in
# 'Information & Communication Technology', not only data-specific roles.
data_jobs = df.loc[df['category'].eq('Information & Communication Technology')]
print(f"\nTotal ICT jobs: {len(data_jobs)}")
print(f"Average ICT salary: RM {data_jobs['salary_avg'].mean():,.0f}")
print("\nTop companies hiring ICT roles:")
print(data_jobs['company'].value_counts().iloc[:15])

company
Private Advertiser                                    2230
AGENSI PEKERJAAN JS STAFFING SERVICES SDN BHD          288
Agensi Pekerjaan PERSOLKELLY Malaysia Sdn Bhd          239
Agensi Pekerjaan Hays (Malaysia) Sdn Bhd               238
RHB Banking Group                                      228
Michael Page International (Malaysia) Sdn Bhd          223
Intel Technology Sdn. Bhd.                             218
DKSH Malaysia Sdn Bhd                                  194
PERSOLKELLY Workforce Solutions Malaysia Sdn Bhd       191
Sunway Berhad                                          174
Standard Chartered Bank                                166
AmBank Group                                           162
SEEK                                                   148
Ambition Group Malaysia Sdn Bhd                        134
Malayan Banking Berhad (Maybank)                       133
Huawei Technologies (Malaysia) Sdn. Bhd                130
Flash Express                                   

In [26]:
# Build the analysis-ready frame: keep only the columns used downstream and
# copy so later edits never write back into `df`.
df_clean = df[['job_id', 'job_title', 'company', 'location', 'category', 
               'subcategory', 'role', 'type_clean', 'salary_min', 
               'salary_max', 'salary_avg', 'listingDate']].copy()

# rename for clarity
df_clean = df_clean.rename(columns={'type_clean': 'job_type'})

# Parse the listing date and derive month helpers for time-based grouping.
# NOTE(review): the parsed column is timezone-aware (UTC per the dtype
# printout), so `month` follows UTC rather than local Malaysian time --
# confirm that is acceptable for postings near a month boundary.
df_clean['listingDate'] = pd.to_datetime(df_clean['listingDate'])
df_clean['month'] = df_clean['listingDate'].dt.month
df_clean['month_name'] = df_clean['listingDate'].dt.strftime('%B')

# Write next to the other cleaned export using a path relative to the
# notebook's working directory (same convention as
# '../data/cleaned/jobstreet_companies.csv') instead of an absolute per-user
# Windows path, so the notebook is portable across machines.
df_clean.to_csv('../data/cleaned/jobstreet_cleaned.csv', index=False)
print(f"Saved! Shape: {df_clean.shape}")
print(df_clean.dtypes)

Saved! Shape: (69024, 14)
job_id                     float64
job_title                   object
company                     object
location                    object
category                    object
subcategory                 object
role                        object
job_type                    object
salary_min                 float64
salary_max                 float64
salary_avg                 float64
listingDate    datetime64[ns, UTC]
month                        int32
month_name                  object
dtype: object


In [35]:
# Substrings that mark a poster as a recruiter, staffing agency or anonymous
# advertiser rather than a direct employer.
agency_keywords = [
    'agensi', 'staffing', 'recruitment', 'michael page',
    'hays', 'persolkelly', 'manpower', 'ambition',
    'seek', 'private advertiser', 'tribehired', 'private',
    'sdn bhd staffing', 'pekerjaan', 'outsource',
    'consulting', 'headhunt', 'talent', 'hr solutions',
    'workforce', 'executive search', 'placement',
]

# One alternation pattern, matched case-insensitively anywhere in the name
# (so AGENSI / Agensi / agensi all hit). Rows with a missing company name
# count as "not an agency" (na=False) and are therefore kept.
agency_pattern = '|'.join(agency_keywords)
is_agency = df['company'].str.contains(agency_pattern, case=False, na=False)
mask = ~is_agency
df_companies = df.loc[mask]

# Inspect the largest remaining posters to spot agencies still slipping through
print(df_companies['company'].value_counts().head(20))
print(f"\nTotal companies remaining: {df_companies['company'].nunique()}")

company
RHB Banking Group                                     228
Intel Technology Sdn. Bhd.                            218
DKSH Malaysia Sdn Bhd                                 194
Sunway Berhad                                         174
Standard Chartered Bank                               166
AmBank Group                                          162
Malayan Banking Berhad (Maybank)                      133
Huawei Technologies (Malaysia) Sdn. Bhd               130
Flash Express                                         130
China Communications Construction (ECRL) Sdn. Bhd.    128
ExxonMobil Malaysia                                   125
CBRE                                                  118
TDCX Malaysia                                         118
Teleperformance Malaysia Sdn Bhd                      118
MumsMe Sdn Bhd                                        115
Marriott International                                112
Shopee Mobile Malaysia Sdn Bhd                        106
Averis

In [36]:
# Persist the agency-filtered postings alongside the other cleaned data
companies_csv = '../data/cleaned/jobstreet_companies.csv'
df_companies.to_csv(companies_csv, index=False)
print(f"Saved! {len(df_companies)} rows")

Saved! 63216 rows
