In [20]:
import pandas as pd
import numpy as np
import re
from datetime import datetime
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix

In [22]:
# Update this path to match your dataset location in Kaggle
INPUT_PATH = "/kaggle/input/dataset-bdjobs/bdjobs.com dataset 2025 (July - September) - bdjobs_scraped_data.csv"

df_raw = pd.read_csv(INPUT_PATH)
print("Raw shape:", df_raw.shape)
df_raw.head(3)

Raw shape: (5548, 18)


Unnamed: 0,Title,Job Link,Job ID,Job Category ID,Company Name,Promotion Text,Location,Experience Required,Deadline,Vacancy,Age,Salary,Published,Additional Requirements,Education,Remuneration Package,Employment Status,Gender
0,Manager - Compliance & Inventory,https://jobs.bdjobs.com/jobdetails/?id=1393429...,1393429,1,Lal Teer Livestock Ltd.,,Dhaka,At least 5 year(s),6 Sep2025,--,At least 35 years,Negotiable,07 Aug 2025,Age At least 35 years; Excellent Communication...,Master of Business Administration (MBA) in Acc...,"Mobile bill, Tour allowance, Provident fund; S...",Full Time,
1,Deputy Manager/ Manager – Accounts,https://jobs.bdjobs.com/jobdetails/?id=1393396...,1393396,1,SQ Group of Companies,,Dhaka,4 to 6 year(s),6 Sep2025,--,28 to 45 years,Negotiable,07 Aug 2025,Age 28 to 45 years,Master of Business Administration (MBA) in Acc...,Mobile bill; Salary Review: Yearly; Festival B...,Full Time,
2,Senior Accountant,https://jobs.bdjobs.com/jobdetails/?id=1393327...,1393327,1,A Reputed Apartment & Developers Company,,Chattogram,At least 5 year(s),5 Sep2025,1,,Negotiable,06 Aug 2025,,Needs to have experience in working in real es...,,Full Time,


In [23]:
df = df_raw.copy()

# Normalize column names to snake_case for easy coding
df.columns = (
    df.columns.str.strip()
              .str.replace(r"[^\w\s]+", "", regex=True)
              .str.replace(r"\s+", "_", regex=True)
              .str.lower()
)

print(sorted(df.columns))
df.head(3)

['additional_requirements', 'age', 'company_name', 'deadline', 'education', 'employment_status', 'experience_required', 'gender', 'job_category_id', 'job_id', 'job_link', 'location', 'promotion_text', 'published', 'remuneration_package', 'salary', 'title', 'vacancy']


Unnamed: 0,title,job_link,job_id,job_category_id,company_name,promotion_text,location,experience_required,deadline,vacancy,age,salary,published,additional_requirements,education,remuneration_package,employment_status,gender
0,Manager - Compliance & Inventory,https://jobs.bdjobs.com/jobdetails/?id=1393429...,1393429,1,Lal Teer Livestock Ltd.,,Dhaka,At least 5 year(s),6 Sep2025,--,At least 35 years,Negotiable,07 Aug 2025,Age At least 35 years; Excellent Communication...,Master of Business Administration (MBA) in Acc...,"Mobile bill, Tour allowance, Provident fund; S...",Full Time,
1,Deputy Manager/ Manager – Accounts,https://jobs.bdjobs.com/jobdetails/?id=1393396...,1393396,1,SQ Group of Companies,,Dhaka,4 to 6 year(s),6 Sep2025,--,28 to 45 years,Negotiable,07 Aug 2025,Age 28 to 45 years,Master of Business Administration (MBA) in Acc...,Mobile bill; Salary Review: Yearly; Festival B...,Full Time,
2,Senior Accountant,https://jobs.bdjobs.com/jobdetails/?id=1393327...,1393327,1,A Reputed Apartment & Developers Company,,Chattogram,At least 5 year(s),5 Sep2025,1,,Negotiable,06 Aug 2025,,Needs to have experience in working in real es...,,Full Time,


In [24]:
# These are identifiers or >90% missing
to_drop = [c for c in ["job_link", "job_id", "promotion_text"] if c in df.columns]
df = df.drop(columns=to_drop, errors="ignore")

print("After drop:", df.shape)

After drop: (5548, 15)


In [25]:
def clean_text(s):
    if pd.isna(s): 
        return np.nan
    s = str(s)
    s = re.sub(r"\s+", " ", s)           # collapse whitespace
    s = s.replace("\u00a0", " ").strip() # nbsp -> space
    return s

text_cols = [
    "title", "company_name", "location", "experience_required", "deadline",
    "vacancy", "age", "salary", "published", "additional_requirements",
    "education", "remuneration_package", "employment_status", "gender"
]
for c in text_cols:
    if c in df.columns:
        df[c] = df[c].map(clean_text)

In [28]:
NUM_RE = r"(?:(?:\d{1,3}(?:,\d{3})+)|\d+)(?:\.\d+)?"

def _to_float(x):
    if x is None or (isinstance(x, float) and np.isnan(x)):
        return np.nan
    x = re.sub(r"[^\d.]", "", str(x))
    return float(x) if x else np.nan

# Experience like: "2 to 4 year(s)", "At least 3 year(s)", "Not required"
def parse_experience(s):
    if not isinstance(s, str) or s.strip() == "":
        return (np.nan, np.nan)
    s_low = s.lower()
    if "not required" in s_low or "n/a" in s_low:
        return (0.0, 0.0)
    rng = re.findall(NUM_RE, s_low)
    vals = [float(v.replace(",", "")) for v in rng]
    if "at least" in s_low and vals:
        return (vals[0], np.nan)
    if "up to" in s_low and vals:
        return (0.0, vals[0])
    if len(vals) >= 2:
        return (min(vals[0], vals[1]), max(vals[0], vals[1]))
    if len(vals) == 1:
        return (vals[0], vals[0])
    return (np.nan, np.nan)

# Age like: "Age 24 to 35 years", "At most 30", "At least 22"
def parse_age(s):
    if not isinstance(s, str) or s.strip() == "":
        return (np.nan, np.nan)
    s_low = s.lower()
    rng = re.findall(NUM_RE, s_low)
    vals = [int(float(v.replace(",", ""))) for v in rng]
    if "at least" in s_low and vals:
        return (vals[0], np.nan)
    if "at most" in s_low and vals:
        return (np.nan, vals[0])
    if len(vals) >= 2:
        return (min(vals[0], vals[1]), max(vals[0], vals[1]))
    if len(vals) == 1:
        return (vals[0], vals[0])
    return (np.nan, np.nan)

# Salary like: "Tk. 30,000 - 50,000 (Monthly)", "Negotiable", "৳ 25,000"
# Returns monthly BDT min/max
def parse_salary(s):
    if not isinstance(s, str) or s.strip() == "":
        return (np.nan, np.nan, "unknown")
    s_low = s.lower()
    if "negotiable" in s_low:
        return (np.nan, np.nan, "negotiable")
    # detect period
    period = "monthly"
    if "year" in s_low:
        period = "yearly"
    elif "hour" in s_low:
        period = "hourly"
    elif "day" in s_low:
        period = "daily"

    nums = [float(v.replace(",", "")) for v in re.findall(NUM_RE, s_low)]
    if len(nums) >= 2:
        lo, hi = sorted(nums[:2])
    elif len(nums) == 1:
        lo, hi = nums[0], nums[0]
    else:
        return (np.nan, np.nan, period)

    # Convert to monthly BDT if necessary (very rough; adjust if you have clear rules)
    if period == "yearly":
        lo, hi = lo/12.0, hi/12.0
        period = "monthly"
    elif period == "daily":
        lo, hi = lo*26, hi*26
        period = "monthly"
    elif period == "hourly":
        lo, hi = lo*8*26, hi*8*26
        period = "monthly"
    return (lo, hi, period)

def parse_vacancy(s):
    if not isinstance(s, str) or s.strip() == "":
        return np.nan
    if "not specific" in s.lower():
        return 1.0
    nums = re.findall(NUM_RE, s)
    return float(nums[0].replace(",", "")) if nums else np.nan

In [30]:
# Experience
if "experience_required" in df.columns:
    exp_parsed = df["experience_required"].map(parse_experience)
    df["min_exp_yrs"] = exp_parsed.map(lambda x: x[0])
    df["max_exp_yrs"] = exp_parsed.map(lambda x: x[1])

# Age
if "age" in df.columns:
    age_parsed = df["age"].map(parse_age)
    df["min_age"] = age_parsed.map(lambda x: x[0])
    df["max_age"] = age_parsed.map(lambda x: x[1])

# Salary
if "salary" in df.columns:
    sal_parsed = df["salary"].map(parse_salary)
    df["min_salary_bdt_mo"] = sal_parsed.map(lambda x: x[0])
    df["max_salary_bdt_mo"] = sal_parsed.map(lambda x: x[1])
    df["salary_period_norm"] = sal_parsed.map(lambda x: x[2])

# Vacancy
if "vacancy" in df.columns:
    df["vacancy_n"] = df["vacancy"].map(parse_vacancy)

df[["min_exp_yrs","max_exp_yrs","min_age","max_age","min_salary_bdt_mo","max_salary_bdt_mo","vacancy_n"]].head(5)

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,min_exp_yrs,max_exp_yrs,min_age,max_age,min_salary_bdt_mo,max_salary_bdt_mo,vacancy_n
0,5.0,,35.0,,,,
1,4.0,6.0,28.0,45.0,,,
2,5.0,,,,,,1.0
3,2.0,6.0,23.0,35.0,,,5.0
4,2.0,5.0,25.0,35.0,25000.0,30000.0,2.0


In [31]:
# Gender flags (many NaNs)
def gender_flags(s):
    s = s.lower() if isinstance(s, str) else ""
    return pd.Series({
        "male_required": 1 if "male" in s else 0,
        "female_required": 1 if "female" in s else 0
    })

if "gender" in df.columns:
    g = df["gender"].map(gender_flags)
    df = pd.concat([df, g], axis=1)

# Employment status normalized
if "employment_status" in df.columns:
    def norm_status(s):
        if not isinstance(s, str): return "unknown"
        s = s.lower()
        if "full" in s: return "full_time"
        if "part" in s: return "part_time"
        if "contract" in s: return "contract"
        if "intern" in s: return "intern"
        return "other"
    df["employment_status_norm"] = df["employment_status"].map(norm_status)

# Location: take first token (city/district) for high-cardinality control
if "location" in df.columns:
    df["location_main"] = df["location"].map(lambda s: s.split(",")[0].strip() if isinstance(s, str) and "," in s else s)


In [11]:
def has_kw(s, *kws):
    if not isinstance(s, str): return 0
    s = s.lower()
    return int(any(kw in s for kw in kws))

# Education keywords
df["edu_bachelor"] = df.get("education", np.nan).map(lambda s: has_kw(s, "bachelor", "bsc", "ba", "bbs"))
df["edu_master"]   = df.get("education", np.nan).map(lambda s: has_kw(s, "master", "msc", "ma", "mba"))
df["edu_cse"]      = df.get("education", np.nan).map(lambda s: has_kw(s, "cse", "computer science"))
df["edu_diploma"]  = df.get("education", np.nan).map(lambda s: has_kw(s, "diploma"))

# Perks from remuneration_package
rp = df.get("remuneration_package", np.nan)
df["perk_bonus"]       = rp.map(lambda s: has_kw(s, "bonus", "festival bonus", "performance bonus"))
df["perk_insurance"]   = rp.map(lambda s: has_kw(s, "insurance", "health", "medical"))
df["perk_pf"]          = rp.map(lambda s: has_kw(s, "provident fund"))
df["perk_gratuity"]    = rp.map(lambda s: has_kw(s, "gratuity"))
df["perk_mobile_bill"] = rp.map(lambda s: has_kw(s, "mobile bill"))

In [12]:
# Experience buckets
def exp_bucket(row):
    lo = row["min_exp_yrs"]
    hi = row["max_exp_yrs"]
    x = lo if pd.notna(lo) else hi
    if pd.isna(x): return "unknown"
    if x <= 1: return "entry"
    if x <= 4: return "mid"
    return "senior"

df["exp_bucket"] = df.apply(exp_bucket, axis=1)

# Simple imputations (tweak if you prefer model-based imputers)
for c in ["min_exp_yrs","max_exp_yrs","min_age","max_age","vacancy_n","posting_age_days","days_until_deadline"]:
    if c in df.columns:
        df[c] = df[c].fillna(df[c].median())

for c in ["employment_status_norm","location_main","salary_period_norm"]:
    if c in df.columns:
        df[c] = df[c].fillna("unknown")

In [13]:
cat_cols = [c for c in ["employment_status_norm", "exp_bucket", "salary_period_norm"] if c in df.columns]

# Optional: include location if you cap to Top-N to avoid explosion
TOP_N_LOCATIONS = 25
if "location_main" in df.columns:
    top_loc = df["location_main"].value_counts().nlargest(TOP_N_LOCATIONS).index
    df["location_capped"] = np.where(df["location_main"].isin(top_loc), df["location_main"], "other")
    cat_cols.append("location_capped")

ohe = OneHotEncoder(handle_unknown="ignore", sparse=True)
X_cat = ohe.fit_transform(df[cat_cols]) if cat_cols else csr_matrix((len(df), 0))

print("Categorical dims:", X_cat.shape)

Categorical dims: (5548, 39)




In [14]:
cat_cols = [c for c in ["employment_status_norm", "exp_bucket", "salary_period_norm"] if c in df.columns]

# Optional: include location if you cap to Top-N to avoid explosion
TOP_N_LOCATIONS = 25
if "location_main" in df.columns:
    top_loc = df["location_main"].value_counts().nlargest(TOP_N_LOCATIONS).index
    df["location_capped"] = np.where(df["location_main"].isin(top_loc), df["location_main"], "other")
    cat_cols.append("location_capped")

ohe = OneHotEncoder(handle_unknown="ignore", sparse=True)
X_cat = ohe.fit_transform(df[cat_cols]) if cat_cols else csr_matrix((len(df), 0))

print("Categorical dims:", X_cat.shape)

Categorical dims: (5548, 39)




In [15]:
num_cols = [
    "min_exp_yrs","max_exp_yrs","min_age","max_age",
    "min_salary_bdt_mo","max_salary_bdt_mo","vacancy_n",
    "posting_age_days","days_until_deadline",
    "male_required","female_required",
    "edu_bachelor","edu_master","edu_cse","edu_diploma",
    "perk_bonus","perk_insurance","perk_pf","perk_gratuity","perk_mobile_bill"
]
num_cols = [c for c in num_cols if c in df.columns]

X_num = csr_matrix(df[num_cols].fillna(0.0).values) if num_cols else csr_matrix((len(df), 0))
print("Numeric dims:", X_num.shape)

Numeric dims: (5548, 16)


In [16]:
# Toggle to include TF‑IDF features from short fields
USE_TFIDF = True

text_for_tfidf = []
if USE_TFIDF:
    # Combine compact textual columns (avoid extremely long blobs)
    cols = [c for c in ["title","additional_requirements","education"] if c in df.columns]
    def join_text(row):
        parts = [str(row[c]) for c in cols if pd.notna(row[c])]
        return " | ".join(parts) if parts else ""
    text_for_tfidf = [join_text(row) for _, row in df.iterrows()]

    tfidf = TfidfVectorizer(max_features=2000, ngram_range=(1,2), stop_words="english")
    X_txt = tfidf.fit_transform(text_for_tfidf)
else:
    X_txt = csr_matrix((len(df), 0))

print("Text dims:", X_txt.shape)

Text dims: (5548, 2000)


In [17]:
# Scale numeric block only (keeps sparsity via inplace replacement afterwards)
scaler = MinMaxScaler()
if X_num.shape[1] > 0:
    X_num_dense = X_num.toarray()
    X_num_scaled = scaler.fit_transform(X_num_dense)
    X_num = csr_matrix(X_num_scaled)

# Final X = [numeric | categorical | text]
from scipy.sparse import hstack
X = hstack([X_num, X_cat, X_txt]).tocsr()
print("Final feature matrix:", X.shape)

# Optional target (if available)
y = df["job_category_id"] if "job_category_id" in df.columns else None
if y is not None:
    print("Target distribution (head):")
    print(y.value_counts().head())

Final feature matrix: (5548, 2055)
Target distribution (head):
job_category_id
9    397
6    396
4    393
5    393
1    377
Name: count, dtype: int64


In [18]:
OUTPUT_PREFIX = "/kaggle/working/bdjobs_ml_ready"

# Cleaned tabular (no high-cardinality OHE exploded columns; safe to inspect)
save_cols = sorted(set(
    ["title","company_name","location_main","employment_status_norm","exp_bucket","job_category_id",
     "min_exp_yrs","max_exp_yrs","min_age","max_age","min_salary_bdt_mo","max_salary_bdt_mo",
     "vacancy_n","posting_age_days","days_until_deadline","male_required","female_required",
     "edu_bachelor","edu_master","edu_cse","edu_diploma","perk_bonus","perk_insurance","perk_pf","perk_gratuity","perk_mobile_bill"]
).intersection(df.columns))

df_out = df[save_cols].copy()
df_out.to_csv(f"{OUTPUT_PREFIX}_clean.csv", index=False)

# Sparse matrices & encoders (if you want to train later in a separate notebook)
import joblib
joblib.dump(X, f"{OUTPUT_PREFIX}_X_sparse.joblib")
if y is not None:
    joblib.dump(y.values, f"{OUTPUT_PREFIX}_y.npy")
joblib.dump(ohe, f"{OUTPUT_PREFIX}_ohe.joblib")
joblib.dump(scaler, f"{OUTPUT_PREFIX}_scaler.joblib")
if "tfidf" in locals():
    joblib.dump(tfidf, f"{OUTPUT_PREFIX}_tfidf.joblib")

print("Saved:")
print(f"- Clean CSV: {OUTPUT_PREFIX}_clean.csv")
print(f"- Sparse X:  {OUTPUT_PREFIX}_X_sparse.joblib")
if y is not None:
    print(f"- y target:  {OUTPUT_PREFIX}_y.npy")

Saved:
- Clean CSV: /kaggle/working/bdjobs_ml_ready_clean.csv
- Sparse X:  /kaggle/working/bdjobs_ml_ready_X_sparse.joblib
- y target:  /kaggle/working/bdjobs_ml_ready_y.npy


In [26]:
import pandas as pd
import numpy as np
import re

In [27]:
file_path = "/kaggle/input/jobsdatast/bdjobs.csv"

df = pd.read_csv(file_path)
print(df.shape)
df.head()

(5548, 20)


  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,Company Name,Employment Status,Experience Cat,job_category_id,Location,Max Age,Max Experience,Max Salary,Min Age,Min Experience,Min Salary,Bonus,Gratuity,Insurance,Mobile Bill,Provident Fund,job title,vacancy,Degree,In Dhaka
0,Lal Teer Livestock Ltd.,full_time,senior,1,Dhaka,35.0,5.0,,35.0,5.0,,1,0,0,1,1,Manager - Compliance & Inventory,2.0,MBA,True
1,SQ Group of Companies,full_time,mid,1,Dhaka,45.0,6.0,,28.0,4.0,,1,0,0,1,0,Deputy Manager/ Manager – Accounts,2.0,CA,True
2,A Reputed Apartment & Developers Company,full_time,senior,1,Chattogram,35.0,5.0,,25.0,5.0,,0,0,0,0,0,Senior Accountant,1.0,Other,False
3,MUNIA OVERSEAS (RL-2452),full_time,mid,1,Uttara Sector 17,35.0,6.0,,23.0,2.0,,1,0,0,1,0,ACCOUNTS,5.0,Other,False
4,Sino Bangladesh Trade International Ltd,full_time,mid,1,Banani,35.0,5.0,30000.0,25.0,2.0,25000.0,1,0,0,1,0,Accountant & Finance Officer,2.0,BCom,False


In [28]:
# Some files include trailing spaces (e.g., "Max Salary ")
df.columns = [c.strip() for c in df.columns]
df.columns

Index(['Company Name', 'Employment Status', 'Experience Cat',
       'job_category_id', 'Location', 'Max Age', 'Max Experience',
       'Max Salary', 'Min Age', 'Min Experience', 'Min Salary', 'Bonus',
       'Gratuity', 'Insurance', 'Mobile Bill', 'Provident Fund', 'job title',
       'vacancy', 'Degree', 'In Dhaka'],
      dtype='object')

In [29]:
def normalize_whitespace(s):
    if pd.isna(s):
        return s
    s = str(s).strip()
    s = re.sub(r"\s+", " ", s)
    return s

text_cols = ["Company Name", "Employment Status", "Experience Cat", "Location", "job title", "Degree"]
for c in text_cols:
    if c in df.columns:
        df[c] = df[c].apply(normalize_whitespace)

In [30]:
df.head(10)

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,Company Name,Employment Status,Experience Cat,job_category_id,Location,Max Age,Max Experience,Max Salary,Min Age,Min Experience,Min Salary,Bonus,Gratuity,Insurance,Mobile Bill,Provident Fund,job title,vacancy,Degree,In Dhaka
0,Lal Teer Livestock Ltd.,full_time,senior,1,Dhaka,35.0,5.0,,35.0,5.0,,1,0,0,1,1,Manager - Compliance & Inventory,2.0,MBA,True
1,SQ Group of Companies,full_time,mid,1,Dhaka,45.0,6.0,,28.0,4.0,,1,0,0,1,0,Deputy Manager/ Manager – Accounts,2.0,CA,True
2,A Reputed Apartment & Developers Company,full_time,senior,1,Chattogram,35.0,5.0,,25.0,5.0,,0,0,0,0,0,Senior Accountant,1.0,Other,False
3,MUNIA OVERSEAS (RL-2452),full_time,mid,1,Uttara Sector 17,35.0,6.0,,23.0,2.0,,1,0,0,1,0,ACCOUNTS,5.0,Other,False
4,Sino Bangladesh Trade International Ltd,full_time,mid,1,Banani,35.0,5.0,30000.0,25.0,2.0,25000.0,1,0,0,1,0,Accountant & Finance Officer,2.0,BCom,False
5,CavinKare (Bangladesh) Private Limited,full_time,mid,1,Demra,40.0,5.0,,25.0,3.0,,1,1,1,1,0,Officer (Finance & Accounts),1.0,Unknown,False
6,Ocean Group,full_time,senior,1,Chattogram,35.0,5.0,,25.0,5.0,,1,0,0,0,0,Accounts Officer,2.0,Other,True
7,Linde Bangladesh Limited,full_time,senior,1,Dhaka,35.0,5.0,,25.0,5.0,,1,1,1,0,1,Senior Executive - Finance,1.0,Other,True
8,A Reputed Group of Companies,full_time,senior,1,Chattogram,50.0,5.0,,35.0,12.0,,1,1,0,1,1,Head of Internal Audit (CTG Regional),1.0,CA,False
9,Initiative for Right View (IRV),full_time,mid,1,Khulna Sadar,40.0,5.0,,25.0,2.0,,0,0,0,0,0,Accountant,1.0,Other,False


In [31]:
# Ensure numeric
for col in ["Min Salary", "Max Salary"]:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="coerce")

if "Min Salary" in df.columns and "Max Salary" in df.columns:
    # Compute average salary from min & max (when available)
    df["avg_salary"] = df[["Min Salary", "Max Salary"]].mean(axis=1, skipna=True)

    # Fill missing avg_salary with mean of existing values
    mean_salary = df["avg_salary"].mean(skipna=True)
    df["avg_salary"] = df["avg_salary"].fillna(mean_salary)
else:
    df["avg_salary"] = np.nan

In [32]:
df

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,Company Name,Employment Status,Experience Cat,job_category_id,Location,Max Age,Max Experience,Max Salary,Min Age,Min Experience,...,Bonus,Gratuity,Insurance,Mobile Bill,Provident Fund,job title,vacancy,Degree,In Dhaka,avg_salary
0,Lal Teer Livestock Ltd.,full_time,senior,1,Dhaka,35.0,5.0,,35.0,5.0,...,1,0,0,1,1,Manager - Compliance & Inventory,2.0,MBA,True,33750.002525
1,SQ Group of Companies,full_time,mid,1,Dhaka,45.0,6.0,,28.0,4.0,...,1,0,0,1,0,Deputy Manager/ Manager – Accounts,2.0,CA,True,33750.002525
2,A Reputed Apartment & Developers Company,full_time,senior,1,Chattogram,35.0,5.0,,25.0,5.0,...,0,0,0,0,0,Senior Accountant,1.0,Other,False,33750.002525
3,MUNIA OVERSEAS (RL-2452),full_time,mid,1,Uttara Sector 17,35.0,6.0,,23.0,2.0,...,1,0,0,1,0,ACCOUNTS,5.0,Other,False,33750.002525
4,Sino Bangladesh Trade International Ltd,full_time,mid,1,Banani,35.0,5.0,30000.0,25.0,2.0,...,1,0,0,1,0,Accountant & Finance Officer,2.0,BCom,False,27500.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5543,Amin Physiotherapy & Fitness Center,full_time,unknown,92,Sylhet,33.0,5.0,25000.0,22.0,3.0,...,0,0,0,0,0,Medical Technologist / Physiotherapist,4.0,CA,False,20000.000000
5544,E-Learning And Earning Ltd (Barishal Branch),full_time,entry,92,Barishal Sadar,35.0,5.0,7000.0,18.0,1.0,...,1,0,0,0,0,অফিস সহায়ক (পুরুষ),1.0,Other,False,7000.000000
5545,Techno Health Bangladesh,full_time,entry,92,Uttara,35.0,3.0,30000.0,25.0,1.0,...,0,0,0,0,0,Clinical Physiotherapist,10.0,Other,False,25000.000000
5546,ASPC ManipulationTherapy centre,full_time,unknown,92,Mohammadpur,35.0,5.0,20000.0,25.0,3.0,...,1,0,0,0,0,Medical Technologist (Physiotherapy),5.0,CA,False,18000.000000


In [33]:
# Ensure numeric
for col in ["Min Salary", "Max Salary"]:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="coerce")

if "Min Salary" in df.columns and "Max Salary" in df.columns:
    # Compute average salary
    df["avg_salary"] = df[["Min Salary", "Max Salary"]].mean(axis=1, skipna=True)

    # Compute mean salary (excluding NaN)
    mean_salary = df["avg_salary"].mean(skipna=True)

    # Fill missing avg_salary with dataset mean
    df["avg_salary"] = df["avg_salary"].fillna(mean_salary)

    # Round to 2 decimals
    df["avg_salary"] = df["avg_salary"].round(2)
else:
    df["avg_salary"] = np.nan

In [34]:
df

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,Company Name,Employment Status,Experience Cat,job_category_id,Location,Max Age,Max Experience,Max Salary,Min Age,Min Experience,...,Bonus,Gratuity,Insurance,Mobile Bill,Provident Fund,job title,vacancy,Degree,In Dhaka,avg_salary
0,Lal Teer Livestock Ltd.,full_time,senior,1,Dhaka,35.0,5.0,,35.0,5.0,...,1,0,0,1,1,Manager - Compliance & Inventory,2.0,MBA,True,33750.0
1,SQ Group of Companies,full_time,mid,1,Dhaka,45.0,6.0,,28.0,4.0,...,1,0,0,1,0,Deputy Manager/ Manager – Accounts,2.0,CA,True,33750.0
2,A Reputed Apartment & Developers Company,full_time,senior,1,Chattogram,35.0,5.0,,25.0,5.0,...,0,0,0,0,0,Senior Accountant,1.0,Other,False,33750.0
3,MUNIA OVERSEAS (RL-2452),full_time,mid,1,Uttara Sector 17,35.0,6.0,,23.0,2.0,...,1,0,0,1,0,ACCOUNTS,5.0,Other,False,33750.0
4,Sino Bangladesh Trade International Ltd,full_time,mid,1,Banani,35.0,5.0,30000.0,25.0,2.0,...,1,0,0,1,0,Accountant & Finance Officer,2.0,BCom,False,27500.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5543,Amin Physiotherapy & Fitness Center,full_time,unknown,92,Sylhet,33.0,5.0,25000.0,22.0,3.0,...,0,0,0,0,0,Medical Technologist / Physiotherapist,4.0,CA,False,20000.0
5544,E-Learning And Earning Ltd (Barishal Branch),full_time,entry,92,Barishal Sadar,35.0,5.0,7000.0,18.0,1.0,...,1,0,0,0,0,অফিস সহায়ক (পুরুষ),1.0,Other,False,7000.0
5545,Techno Health Bangladesh,full_time,entry,92,Uttara,35.0,3.0,30000.0,25.0,1.0,...,0,0,0,0,0,Clinical Physiotherapist,10.0,Other,False,25000.0
5546,ASPC ManipulationTherapy centre,full_time,unknown,92,Mohammadpur,35.0,5.0,20000.0,25.0,3.0,...,1,0,0,0,0,Medical Technologist (Physiotherapy),5.0,CA,False,18000.0


In [35]:
degree_map = {
    r"\b(m\.?b\.?a|master|ms|msc|m\.sc)\b": "masters",
    r"\b(b\.?b\.?a|b\.?s|bsc|b\.sc|bachelor|bcom|b\.com|bba|b\.a)\b": "bachelors",
    r"\b(phd|doctorate)\b": "phd",
    r"\b(acca|cfa|cma|icab|ca)\b": "professional",
    r"\b(hsc|diploma)\b": "diploma"
}
def normalize_degree(val):
    if pd.isna(val):
        return np.nan
    s = str(val).lower()
    for pattern, label in degree_map.items():
        if re.search(pattern, s):
            return label
    tokens = {"mba":"masters","bcom":"bachelors","bba":"bachelors","bsc":"bachelors","msc":"masters"}
    return tokens.get(s, "other")

df["Degree"] = df["Degree"].apply(normalize_degree) if "Degree" in df.columns else np.nan


In [36]:
df

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,Company Name,Employment Status,Experience Cat,job_category_id,Location,Max Age,Max Experience,Max Salary,Min Age,Min Experience,...,Bonus,Gratuity,Insurance,Mobile Bill,Provident Fund,job title,vacancy,Degree,In Dhaka,avg_salary
0,Lal Teer Livestock Ltd.,full_time,senior,1,Dhaka,35.0,5.0,,35.0,5.0,...,1,0,0,1,1,Manager - Compliance & Inventory,2.0,masters,True,33750.0
1,SQ Group of Companies,full_time,mid,1,Dhaka,45.0,6.0,,28.0,4.0,...,1,0,0,1,0,Deputy Manager/ Manager – Accounts,2.0,professional,True,33750.0
2,A Reputed Apartment & Developers Company,full_time,senior,1,Chattogram,35.0,5.0,,25.0,5.0,...,0,0,0,0,0,Senior Accountant,1.0,other,False,33750.0
3,MUNIA OVERSEAS (RL-2452),full_time,mid,1,Uttara Sector 17,35.0,6.0,,23.0,2.0,...,1,0,0,1,0,ACCOUNTS,5.0,other,False,33750.0
4,Sino Bangladesh Trade International Ltd,full_time,mid,1,Banani,35.0,5.0,30000.0,25.0,2.0,...,1,0,0,1,0,Accountant & Finance Officer,2.0,bachelors,False,27500.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5543,Amin Physiotherapy & Fitness Center,full_time,unknown,92,Sylhet,33.0,5.0,25000.0,22.0,3.0,...,0,0,0,0,0,Medical Technologist / Physiotherapist,4.0,professional,False,20000.0
5544,E-Learning And Earning Ltd (Barishal Branch),full_time,entry,92,Barishal Sadar,35.0,5.0,7000.0,18.0,1.0,...,1,0,0,0,0,অফিস সহায়ক (পুরুষ),1.0,other,False,7000.0
5545,Techno Health Bangladesh,full_time,entry,92,Uttara,35.0,3.0,30000.0,25.0,1.0,...,0,0,0,0,0,Clinical Physiotherapist,10.0,other,False,25000.0
5546,ASPC ManipulationTherapy centre,full_time,unknown,92,Mohammadpur,35.0,5.0,20000.0,25.0,3.0,...,1,0,0,0,0,Medical Technologist (Physiotherapy),5.0,professional,False,18000.0


In [37]:
df['Degree']

0            masters
1       professional
2              other
3              other
4          bachelors
            ...     
5543    professional
5544           other
5545           other
5546    professional
5547           other
Name: Degree, Length: 5548, dtype: object

In [38]:
for col in ["Bonus","Gratuity","Insurance","Mobile Bill","Provident Fund"]:
    if col in df.columns:
        s = pd.to_numeric(df[col], errors="coerce")
        df[col] = s.fillna(0).astype(int).astype(bool)

In [39]:
df

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,Company Name,Employment Status,Experience Cat,job_category_id,Location,Max Age,Max Experience,Max Salary,Min Age,Min Experience,...,Bonus,Gratuity,Insurance,Mobile Bill,Provident Fund,job title,vacancy,Degree,In Dhaka,avg_salary
0,Lal Teer Livestock Ltd.,full_time,senior,1,Dhaka,35.0,5.0,,35.0,5.0,...,True,False,False,True,True,Manager - Compliance & Inventory,2.0,masters,True,33750.0
1,SQ Group of Companies,full_time,mid,1,Dhaka,45.0,6.0,,28.0,4.0,...,True,False,False,True,False,Deputy Manager/ Manager – Accounts,2.0,professional,True,33750.0
2,A Reputed Apartment & Developers Company,full_time,senior,1,Chattogram,35.0,5.0,,25.0,5.0,...,False,False,False,False,False,Senior Accountant,1.0,other,False,33750.0
3,MUNIA OVERSEAS (RL-2452),full_time,mid,1,Uttara Sector 17,35.0,6.0,,23.0,2.0,...,True,False,False,True,False,ACCOUNTS,5.0,other,False,33750.0
4,Sino Bangladesh Trade International Ltd,full_time,mid,1,Banani,35.0,5.0,30000.0,25.0,2.0,...,True,False,False,True,False,Accountant & Finance Officer,2.0,bachelors,False,27500.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5543,Amin Physiotherapy & Fitness Center,full_time,unknown,92,Sylhet,33.0,5.0,25000.0,22.0,3.0,...,False,False,False,False,False,Medical Technologist / Physiotherapist,4.0,professional,False,20000.0
5544,E-Learning And Earning Ltd (Barishal Branch),full_time,entry,92,Barishal Sadar,35.0,5.0,7000.0,18.0,1.0,...,True,False,False,False,False,অফিস সহায়ক (পুরুষ),1.0,other,False,7000.0
5545,Techno Health Bangladesh,full_time,entry,92,Uttara,35.0,3.0,30000.0,25.0,1.0,...,False,False,False,False,False,Clinical Physiotherapist,10.0,other,False,25000.0
5546,ASPC ManipulationTherapy centre,full_time,unknown,92,Mohammadpur,35.0,5.0,20000.0,25.0,3.0,...,True,False,False,False,False,Medical Technologist (Physiotherapy),5.0,professional,False,18000.0


In [40]:
df.head(10)

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,Company Name,Employment Status,Experience Cat,job_category_id,Location,Max Age,Max Experience,Max Salary,Min Age,Min Experience,...,Bonus,Gratuity,Insurance,Mobile Bill,Provident Fund,job title,vacancy,Degree,In Dhaka,avg_salary
0,Lal Teer Livestock Ltd.,full_time,senior,1,Dhaka,35.0,5.0,,35.0,5.0,...,True,False,False,True,True,Manager - Compliance & Inventory,2.0,masters,True,33750.0
1,SQ Group of Companies,full_time,mid,1,Dhaka,45.0,6.0,,28.0,4.0,...,True,False,False,True,False,Deputy Manager/ Manager – Accounts,2.0,professional,True,33750.0
2,A Reputed Apartment & Developers Company,full_time,senior,1,Chattogram,35.0,5.0,,25.0,5.0,...,False,False,False,False,False,Senior Accountant,1.0,other,False,33750.0
3,MUNIA OVERSEAS (RL-2452),full_time,mid,1,Uttara Sector 17,35.0,6.0,,23.0,2.0,...,True,False,False,True,False,ACCOUNTS,5.0,other,False,33750.0
4,Sino Bangladesh Trade International Ltd,full_time,mid,1,Banani,35.0,5.0,30000.0,25.0,2.0,...,True,False,False,True,False,Accountant & Finance Officer,2.0,bachelors,False,27500.0
5,CavinKare (Bangladesh) Private Limited,full_time,mid,1,Demra,40.0,5.0,,25.0,3.0,...,True,True,True,True,False,Officer (Finance & Accounts),1.0,other,False,33750.0
6,Ocean Group,full_time,senior,1,Chattogram,35.0,5.0,,25.0,5.0,...,True,False,False,False,False,Accounts Officer,2.0,other,True,33750.0
7,Linde Bangladesh Limited,full_time,senior,1,Dhaka,35.0,5.0,,25.0,5.0,...,True,True,True,False,True,Senior Executive - Finance,1.0,other,True,33750.0
8,A Reputed Group of Companies,full_time,senior,1,Chattogram,50.0,5.0,,35.0,12.0,...,True,True,False,True,True,Head of Internal Audit (CTG Regional),1.0,professional,False,33750.0
9,Initiative for Right View (IRV),full_time,mid,1,Khulna Sadar,40.0,5.0,,25.0,2.0,...,False,False,False,False,False,Accountant,1.0,other,False,33750.0


In [41]:
df.columns.tolist()

['Company Name',
 'Employment Status',
 'Experience Cat',
 'job_category_id',
 'Location',
 'Max Age',
 'Max Experience',
 'Max Salary',
 'Min Age',
 'Min Experience',
 'Min Salary',
 'Bonus',
 'Gratuity',
 'Insurance',
 'Mobile Bill',
 'Provident Fund',
 'job title',
 'vacancy',
 'Degree',
 'In Dhaka',
 'avg_salary']

In [42]:
# Ensure numeric
for col in ["Min Age", "Max Age"]:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="coerce")

if "Min Age" in df.columns and "Max Age" in df.columns:
    # Compute average age
    df["avg_age"] = df[["Min Age", "Max Age"]].mean(axis=1, skipna=True)

    # Fill missing avg_age with dataset mean
    mean_age = df["avg_age"].mean(skipna=True)
    df["avg_age"] = df["avg_age"].fillna(mean_age)

    # Round to 2 decimals
    df["avg_age"] = df["avg_age"].round(2)
else:
    df["avg_age"] = np.nan

In [43]:
df

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,Company Name,Employment Status,Experience Cat,job_category_id,Location,Max Age,Max Experience,Max Salary,Min Age,Min Experience,...,Gratuity,Insurance,Mobile Bill,Provident Fund,job title,vacancy,Degree,In Dhaka,avg_salary,avg_age
0,Lal Teer Livestock Ltd.,full_time,senior,1,Dhaka,35.0,5.0,,35.0,5.0,...,False,False,True,True,Manager - Compliance & Inventory,2.0,masters,True,33750.0,35.0
1,SQ Group of Companies,full_time,mid,1,Dhaka,45.0,6.0,,28.0,4.0,...,False,False,True,False,Deputy Manager/ Manager – Accounts,2.0,professional,True,33750.0,36.5
2,A Reputed Apartment & Developers Company,full_time,senior,1,Chattogram,35.0,5.0,,25.0,5.0,...,False,False,False,False,Senior Accountant,1.0,other,False,33750.0,30.0
3,MUNIA OVERSEAS (RL-2452),full_time,mid,1,Uttara Sector 17,35.0,6.0,,23.0,2.0,...,False,False,True,False,ACCOUNTS,5.0,other,False,33750.0,29.0
4,Sino Bangladesh Trade International Ltd,full_time,mid,1,Banani,35.0,5.0,30000.0,25.0,2.0,...,False,False,True,False,Accountant & Finance Officer,2.0,bachelors,False,27500.0,30.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5543,Amin Physiotherapy & Fitness Center,full_time,unknown,92,Sylhet,33.0,5.0,25000.0,22.0,3.0,...,False,False,False,False,Medical Technologist / Physiotherapist,4.0,professional,False,20000.0,27.5
5544,E-Learning And Earning Ltd (Barishal Branch),full_time,entry,92,Barishal Sadar,35.0,5.0,7000.0,18.0,1.0,...,False,False,False,False,অফিস সহায়ক (পুরুষ),1.0,other,False,7000.0,26.5
5545,Techno Health Bangladesh,full_time,entry,92,Uttara,35.0,3.0,30000.0,25.0,1.0,...,False,False,False,False,Clinical Physiotherapist,10.0,other,False,25000.0,30.0
5546,ASPC ManipulationTherapy centre,full_time,unknown,92,Mohammadpur,35.0,5.0,20000.0,25.0,3.0,...,False,False,False,False,Medical Technologist (Physiotherapy),5.0,professional,False,18000.0,30.0


In [44]:
# Drop raw min/max columns that are now replaced by engineered ones
cols_to_drop = [
    "Min Salary", "Max Salary",
    "Min Age", "Max Age",
    "Max Experience"
]

df = df.drop(columns=[c for c in cols_to_drop if c in df.columns])

# Check remaining columns
print(df.columns.tolist())

['Company Name', 'Employment Status', 'Experience Cat', 'job_category_id', 'Location', 'Min Experience', 'Bonus', 'Gratuity', 'Insurance', 'Mobile Bill', 'Provident Fund', 'job title', 'vacancy', 'Degree', 'In Dhaka', 'avg_salary', 'avg_age']


In [45]:
df.isnull().sum()

Company Name          0
Employment Status     0
Experience Cat        0
job_category_id       0
Location              0
Min Experience        0
Bonus                 0
Gratuity              0
Insurance             0
Mobile Bill           0
Provident Fund        0
job title             0
vacancy               0
Degree                0
In Dhaka             78
avg_salary            0
avg_age               0
dtype: int64

In [46]:
df.isnull().sum()

Company Name          0
Employment Status     0
Experience Cat        0
job_category_id       0
Location              0
Min Experience        0
Bonus                 0
Gratuity              0
Insurance             0
Mobile Bill           0
Provident Fund        0
job title             0
vacancy               0
Degree                0
In Dhaka             78
avg_salary            0
avg_age               0
dtype: int64

In [47]:
# Fill NaN in "In Dhaka" with False
if "In Dhaka" in df.columns:
    df["In Dhaka"] = df["In Dhaka"].fillna(False)

# Ensure it's boolean type
df["In Dhaka"] = df["In Dhaka"].astype(bool)

# Check result
print(df["In Dhaka"].value_counts(dropna=False))

In Dhaka
False    4140
True     1408
Name: count, dtype: int64


  df["In Dhaka"] = df["In Dhaka"].fillna(False)


In [24]:
df.isnull().sum()

Company Name         0
Employment Status    0
Experience Cat       0
job_category_id      0
Location             0
Max Experience       0
Bonus                0
Gratuity             0
Insurance            0
Mobile Bill          0
Provident Fund       0
job title            0
vacancy              0
Degree               0
In Dhaka             0
avg_salary           0
avg_age              0
dtype: int64

In [48]:
# Save cleaned DataFrame to CSV
output_path = "bdjobsdotcom_cleaned.csv"   # this will save in your working directory (Kaggle/Colab/local)
df.to_csv(output_path, index=False)

print(f"Cleaned dataset saved as: {output_path}")

Cleaned dataset saved as: bdjobsdotcom_cleaned.csv
