In [19]:
!pip install catboost

Defaulting to user installation because normal site-packages is not writeable


In [20]:
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import  root_mean_squared_error, mean_absolute_error, r2_score

In [21]:
df=pd.read_csv('employess_cleandata.csv')
df

Unnamed: 0,Department,Job_Title,Experience_Years,Education_Level,Salary
0,Engineering,Engineer,1,Master,90000
1,Sales,Executive,33,Master,195000
2,Engineering,Intern,1,Bachelor,35000
3,Finance,Analyst,9,Bachelor,75000
4,HR,Analyst,2,Master,70000
...,...,...,...,...,...
9995,Sales,Executive,28,Master,185000
9996,Sales,Executive,9,PhD,165000
9997,Sales,Executive,30,PhD,200000
9998,Engineering,Manager,14,Master,135000


In [22]:
# Features and target
X = df[['Department','Job_Title','Education_Level','Experience_Years']]
y = df['Salary']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Specify categorical features (by column index)
cat_features = [0,1,2]  # Department, Job_Title, Education_Level

# Initialize CatBoost Regressor
model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    cat_features=cat_features,
    verbose=100
)

# Train
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

rmse = root_mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("===== Model Evaluation =====")
print("RMSE:", rmse)
print("MAE:", mae)
print("R2 Score:", r2)

0:	learn: 42414.3953001	total: 34.6ms	remaining: 34.5s
100:	learn: 4370.5579547	total: 3.29s	remaining: 29.3s
200:	learn: 4256.0053422	total: 6.27s	remaining: 24.9s
300:	learn: 4211.0943847	total: 9.25s	remaining: 21.5s
400:	learn: 4180.3644695	total: 13.6s	remaining: 20.3s
500:	learn: 4148.8247553	total: 17.6s	remaining: 17.5s
600:	learn: 4124.6756742	total: 21s	remaining: 14s
700:	learn: 4105.9970672	total: 24.5s	remaining: 10.5s
800:	learn: 4088.5360116	total: 28s	remaining: 6.94s
900:	learn: 4071.2411544	total: 31.2s	remaining: 3.43s
999:	learn: 4055.3824339	total: 34.2s	remaining: 0us
===== Model Evaluation =====
RMSE: 4220.078117809612
MAE: 3393.5865519440167
R2 Score: 0.9915503247801469


In [23]:
#Average salary in dataset
avg_salary = y.mean()
print("Average Salary:", avg_salary)


Average Salary: 115381.5


In [24]:
print(f"MAE as % of Avg Salary: {mae / avg_salary * 100:.2f}%")

MAE as % of Avg Salary: 2.94%


In [25]:
# ========== FINAL TRAINING ==========
# Retrain model on full dataset (train + test)
final_model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    cat_features=cat_features,
    verbose=100,
    random_seed=42
)

final_model.fit(X, y, cat_features=cat_features)



0:	learn: 42065.5255282	total: 33.5ms	remaining: 33.5s
100:	learn: 4385.4692222	total: 3.22s	remaining: 28.6s
200:	learn: 4242.4553042	total: 6.41s	remaining: 25.5s
300:	learn: 4203.0536388	total: 10.4s	remaining: 24.1s
400:	learn: 4181.5091560	total: 14s	remaining: 20.9s
500:	learn: 4161.6943897	total: 17s	remaining: 16.9s
600:	learn: 4144.3233149	total: 20s	remaining: 13.3s
700:	learn: 4130.6733861	total: 23.3s	remaining: 9.93s
800:	learn: 4115.9206263	total: 27.2s	remaining: 6.76s
900:	learn: 4103.8330643	total: 30.9s	remaining: 3.4s
999:	learn: 4091.2130084	total: 34.5s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x224ec823920>

In [26]:
# Save model
final_model.save_model("catboost_salary_model.cbm")
print("✅ Final model saved as 'catboost_salary_model.cbm'")

✅ Final model saved as 'catboost_salary_model.cbm'


In [28]:
# Predict on test data
y_pred = final_model.predict(X_test)

# Show first 5 predictions
print("Sample predictions:", y_pred[:5])


Sample predictions: [137586.85060659  75283.99750199  67818.45685331 126831.45619211
 142248.84401905]


In [29]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Reloaded Model Evaluation:")
print("RMSE:", rmse)
print("MAE:", mae)
print("R² Score:", r2)


Reloaded Model Evaluation:
RMSE: 4107.742297454522
MAE: 3291.307828151837
R² Score: 0.9919941874777202
