# **Loading**

In [4]:
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import RobustScaler, StandardScaler, MaxAbsScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.linear_model import LinearRegression 


# from dataset.load import load_df
# from utils import evaluate

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.model_selection import cross_validate

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [19]:
# Load the cleaned dataset, report its size, and display it transposed
# (one row per column of the original frame).
csv_path = '../dataset/clean_data.csv'
df = pd.read_csv(csv_path)
print(f"Dataframe shape: {df.shape}")
df.T

Dataframe shape: (39772, 48)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,39762,39763,39764,39765,39766,39767,39768,39769,39770,39771
Unnamed: 0,2,3,4,5,6,7,8,9,10,11,...,47686,47687,47688,47689,47690,47692,47693,47694,47697,47698
RemoteWork,Hybrid,Remote,Remote,Remote,Remote,Hybrid,Remote,Remote,Hybrid,Hybrid,...,Remote,Person,Hybrid,Remote,Remote,Hybrid,Hybrid,Remote,Hybrid,Person
EdLevel,Bachelor’s degree,Bachelor’s degree,Less than a Bachelors,Less than a Bachelors,Bachelor’s degree,Master’s degree,Bachelor’s degree,Bachelor’s degree,Master’s degree,Master’s degree,...,Bachelor’s degree,Bachelor’s degree,Bachelor’s degree,Less than a Bachelors,Bachelor’s degree,Bachelor’s degree,Post grad,Master’s degree,Master’s degree,Master’s degree
YearsCodePro,7.0,4.0,21.0,3.0,3.0,15.0,9.0,9.0,0.5,7.0,...,1.0,6.0,8.0,0.5,8.0,8.0,5.0,24.0,9.0,9.0
DevType,"Developer, front-end","Developer, full-stack","Developer, back-end","Developer, full-stack","Developer, full-stack",System administrator,"Developer, full-stack","Developer, full-stack","Developer, QA or test","Developer, full-stack",...,"Developer, desktop or enterprise applications","Developer, back-end","Developer, front-end","Developer, full-stack",Engineering manager,"Developer, front-end","Developer, mobile","Developer, back-end",Other,"Developer, full-stack"
Country,USA,Philippines,UK,USA,USA,Finland,Australia,USA,USA,Netherlands,...,Canada,Spain,USA,Canada,USA,Sweden,Mexico,Brazil,France,Greece
Age,25-34,25-34,35-44,35-44,25-34,Over 45,25-34,25-34,35-44,25-34,...,25-34,25-34,35-44,25-34,25-34,25-34,25-34,35-44,25-34,25-34
Salary,156000.0,23456.0,96828.0,135000.0,80000.0,64254.0,78003.0,75000.0,150000.0,187407.0,...,44611.0,32127.0,75000.0,63199.0,195000.0,52981.0,28625.0,50719.0,64254.0,61041.0
HTML/CSS,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
JavaScript,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,...,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0


In [21]:
# List every column name of the loaded frame.
df.columns

Index(['Unnamed: 0', 'RemoteWork', 'EdLevel', 'YearsCodePro', 'DevType',
       'Country', 'Age', 'Salary', 'HTML/CSS', 'JavaScript', 'Python',
       'Bash/Shell (all shells)', 'SQL', 'TypeScript', 'Java', 'C#',
       'Other language', 'PostgreSQL', 'Redis', 'Elasticsearch', 'MongoDB',
       'MariaDB', 'Microsoft SQL Server', 'MySQL', 'SQLite', 'Other database',
       'Amazon Web Services (AWS)', 'Google Cloud', 'Cloudflare', 'Firebase',
       'Digital Ocean', 'Microsoft Azure', 'Other platform', 'Docker',
       'Kubernetes', 'npm', 'Pip', 'Webpack', 'Yarn', 'Homebrew',
       'Other ToolsTech', 'Vim', 'Visual Studio Code', 'IntelliJ IDEA',
       'Android Studio', 'Notepad++', 'Visual Studio', 'Other CollabTool'],
      dtype='object')

In [7]:
# Show the distinct values found in the 'EdLevel' column.
df['EdLevel'].unique()

array(['Bachelor’s degree', 'Less than a Bachelors', 'Master’s degree',
       'Post grad'], dtype=object)

# **Get Train-Test split**

In [8]:
# Split the frame into a 95% training sample and a 5% hold-out made of the
# rows that were not sampled. The sample is seeded so the split is repeatable.
separator = "*" * 25

print(separator)
train = df.sample(frac=0.95, random_state=42)
test = df.drop(index=train.index)

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")
print(separator)

# "Salary" is the regression target; "Unnamed: 0" is removed from the
# features together with it.
dropped_columns = ["Salary", "Unnamed: 0"]
X_train, y_train = train.drop(columns=dropped_columns), train["Salary"].values
X_test, y_test = test.drop(columns=dropped_columns), test["Salary"].values

print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")
print(separator)

*************************
Train shape: (37783, 48)
Test shape: (1989, 48)
*************************
X_train shape: (37783, 46)
y_train shape: (37783,)
X_test shape: (1989, 46)
y_test shape: (1989,)
*************************


# **Model Selection**

In [9]:
# Scorers evaluated on every cross-validation fold. The two error metrics use
# scikit-learn's "neg_" scorers, so their values come back negated and are
# flipped back to positive when the results are tabulated.
scoring = [
    "neg_root_mean_squared_error",
    "neg_mean_absolute_error",
    "r2",
]

In [10]:
def print_metrics(metrics_dict):
    """Tabulate per-fold and mean cross-validation metrics.

    Despite its name, this function prints nothing: it returns a DataFrame,
    which a notebook displays when the call is the last expression of a cell.

    Parameters
    ----------
    metrics_dict : mapping
        Must provide the keys 'test_neg_root_mean_squared_error',
        'test_neg_mean_absolute_error' and 'test_r2', each holding one value
        per fold. Any other keys (for example 'fit_time' or 'score_time')
        are ignored and do not need to be present.

    Returns
    -------
    pandas.DataFrame
        Columns 'RMSE', 'MAE' and 'R-squared'; one row per fold followed by
        a final row labelled 'Mean'. All values are rounded to 4 decimals.

    Raises
    ------
    KeyError
        If one of the three required metric keys is missing.
    """
    # The two error metrics are stored negated, so flip the sign back.
    rmse = -np.asarray(metrics_dict['test_neg_root_mean_squared_error'], dtype=float)
    mae = -np.asarray(metrics_dict['test_neg_mean_absolute_error'], dtype=float)
    r2 = np.asarray(metrics_dict['test_r2'], dtype=float)

    per_fold = pd.DataFrame({'RMSE': rmse, 'MAE': mae, 'R-squared': r2})

    mean_row = pd.DataFrame(
        {'RMSE': np.mean(rmse), 'MAE': np.mean(mae), 'R-squared': np.mean(r2)},
        index=['Mean'],
    )

    # Round the values in the DataFrame.
    return pd.concat([per_fold, mean_row]).round(decimals=4)


## **Linear Regression**

In [11]:
# Baseline model: plain linear regression behind the shared preprocessing.
transform = ColumnTransformer(
    transformers=[
        # Integer codes for these categorical columns.
        ("label", OrdinalEncoder(), ["EdLevel", "Country", "Age"]),
        # Dense one-hot encoding; categories unseen at fit time are ignored.
        ("onehot", OneHotEncoder(sparse_output=False, handle_unknown="ignore"), ["RemoteWork", "DevType"]),
        # Scale years of professional coding by the maximum absolute value.
        ("scaler", MaxAbsScaler(), ["YearsCodePro"]),
    ],
    remainder="passthrough",  # every other column is forwarded unchanged
)
model = LinearRegression()
pipe = Pipeline(steps=[("preprocess", transform), ("model", model)])

# Seeded, shuffled 5-fold cross-validation.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

scores_lr = cross_validate(pipe, X_train, y_train, scoring=scoring, cv=kfold, n_jobs=2)
print_metrics(scores_lr)


Unnamed: 0,RMSE,MAE,R-squared
0,40922.2269,32546.8588,0.3052
1,41032.837,32365.5266,0.2877
2,41354.0506,32642.9131,0.2773
3,40634.0392,32045.3903,0.2954
4,41000.1524,32337.1015,0.2732
Mean,40988.6612,32387.5581,0.2878


## **Decision Tree**

In [12]:
# Single seeded regression tree, evaluated with the same preprocessing and folds.
transform = ColumnTransformer(
    transformers=[
        ("label", OrdinalEncoder(), ["EdLevel", "Country", "Age"]),
        ("onehot", OneHotEncoder(sparse_output=False, handle_unknown="ignore"), ["RemoteWork", "DevType"]),
        ("scaler", MaxAbsScaler(), ["YearsCodePro"]),
    ],
    remainder="passthrough",
)
model = DecisionTreeRegressor(random_state=42)
pipe = Pipeline(steps=[("preprocess", transform), ("model", model)])

kfold = KFold(n_splits=5, shuffle=True, random_state=42)

scores_dt = cross_validate(pipe, X_train, y_train, scoring=scoring, cv=kfold, n_jobs=2)
print_metrics(scores_dt)


Unnamed: 0,RMSE,MAE,R-squared
0,45501.5341,33360.8536,0.141
1,46301.29,33749.2044,0.0931
2,45940.1947,33689.2134,0.1081
3,46592.6597,33998.1573,0.0736
4,46452.9465,33978.9618,0.0671
Mean,46157.725,33755.2781,0.0966


## **AdaBoost**

In [13]:
# AdaBoost over 200 decision-tree base learners, seeded at the ensemble level.
transform = ColumnTransformer(
    transformers=[
        ("label", OrdinalEncoder(), ["EdLevel", "Country", "Age"]),
        ("onehot", OneHotEncoder(sparse_output=False, handle_unknown="ignore"), ["RemoteWork", "DevType"]),
        ("scaler", MaxAbsScaler(), ["YearsCodePro"]),
    ],
    remainder="passthrough",
)
model = AdaBoostRegressor(
    estimator=DecisionTreeRegressor(),
    n_estimators=200,
    random_state=42,
)
pipe = Pipeline(steps=[("preprocess", transform), ("model", model)])

kfold = KFold(n_splits=5, shuffle=True, random_state=42)

scores_ab = cross_validate(pipe, X_train, y_train, scoring=scoring, cv=kfold, n_jobs=2)
print_metrics(scores_ab)



Unnamed: 0,RMSE,MAE,R-squared
0,32288.7323,23435.8127,0.5674
1,32636.7915,23656.9167,0.5494
2,32458.8682,23557.6773,0.5547
3,32073.2045,23152.6887,0.561
4,32604.3448,23641.8573,0.5404
Mean,32412.3883,23488.9906,0.5546


## **Bagging**

In [14]:
# Bagging of 200 decision trees; the ensemble itself trains with 2 workers.
transform = ColumnTransformer(
    transformers=[
        ("label", OrdinalEncoder(), ["EdLevel", "Country", "Age"]),
        ("onehot", OneHotEncoder(sparse_output=False, handle_unknown="ignore"), ["RemoteWork", "DevType"]),
        ("scaler", MaxAbsScaler(), ["YearsCodePro"]),
    ],
    remainder="passthrough",
)
model = BaggingRegressor(
    estimator=DecisionTreeRegressor(),
    n_estimators=200,
    n_jobs=2,
    random_state=42,
)
pipe = Pipeline(steps=[("preprocess", transform), ("model", model)])

kfold = KFold(n_splits=5, shuffle=True, random_state=42)

scores_bg = cross_validate(pipe, X_train, y_train, scoring=scoring, cv=kfold, n_jobs=2)
print_metrics(scores_bg)

Unnamed: 0,RMSE,MAE,R-squared
0,32101.6934,23832.6049,0.5724
1,32406.816,24062.6329,0.5557
2,32257.0825,23931.1861,0.5603
3,31989.6694,23524.8385,0.5633
4,32493.4722,24123.6458,0.5435
Mean,32249.7467,23894.9816,0.5591


## **RandomForest**

In [15]:
# Random forest with 200 trees, seeded, trained with 2 workers.
transform = ColumnTransformer(
    transformers=[
        ("label", OrdinalEncoder(), ["EdLevel", "Country", "Age"]),
        ("onehot", OneHotEncoder(sparse_output=False, handle_unknown="ignore"), ["RemoteWork", "DevType"]),
        ("scaler", MaxAbsScaler(), ["YearsCodePro"]),
    ],
    remainder="passthrough",
)
model = RandomForestRegressor(n_estimators=200, n_jobs=2, random_state=42)
pipe = Pipeline(steps=[("preprocess", transform), ("model", model)])

kfold = KFold(n_splits=5, shuffle=True, random_state=42)

scores_rf = cross_validate(pipe, X_train, y_train, scoring=scoring, cv=kfold, n_jobs=2)
print_metrics(scores_rf)

Unnamed: 0,RMSE,MAE,R-squared
0,32089.2584,23805.0931,0.5728
1,32375.1992,24050.423,0.5566
2,32303.0805,23964.8389,0.559
3,31985.1738,23518.0202,0.5634
4,32458.979,24093.7778,0.5445
Mean,32242.3382,23886.4306,0.5593


## **Gradient Boost**

In [16]:
# Gradient boosting with 200 stages, evaluated with the same preprocessing
# and folds as the other candidates.
transform = ColumnTransformer([
    ("label", OrdinalEncoder(), ["EdLevel", "Country", "Age"]),
    ("onehot", OneHotEncoder(sparse_output=False, handle_unknown="ignore"), ["RemoteWork", "DevType"]),
    ("scaler", MaxAbsScaler(), ["YearsCodePro"])
], remainder="passthrough")

# Seed the model like every other candidate in this notebook. Without a seed
# the scores change slightly from run to run, which makes the comparison
# between models non-reproducible.
model = GradientBoostingRegressor(n_estimators=200, random_state=42)

pipe = Pipeline([
    ("preprocess", transform),
    ("model", model)
])

kfold = KFold(n_splits=5, shuffle=True, random_state=42)

scores_gb = cross_validate(pipe, X_train, y_train, cv=kfold, scoring=scoring, n_jobs=2)
print_metrics(scores_gb)

Unnamed: 0,RMSE,MAE,R-squared
0,32081.0318,23870.7548,0.573
1,31914.0406,23728.0761,0.5691
2,32273.3605,23888.5212,0.5598
3,31604.7046,23214.8163,0.5738
4,32037.2541,23661.8825,0.5563
Mean,31982.0783,23672.8102,0.5664


In [17]:
from xgboost import XGBRegressor

# XGBoost with 200 boosting rounds, evaluated with the same preprocessing
# and folds as the other candidates.
transform = ColumnTransformer(
    transformers=[
        ("label", OrdinalEncoder(), ["EdLevel", "Country", "Age"]),
        ("onehot", OneHotEncoder(sparse_output=False, handle_unknown="ignore"), ["RemoteWork", "DevType"]),
        ("scaler", MaxAbsScaler(), ["YearsCodePro"]),
    ],
    remainder="passthrough",
)

# Use XGBRegressor in place of GradientBoostingRegressor.
model = XGBRegressor(n_estimators=200)
pipe = Pipeline(steps=[("preprocess", transform), ("model", model)])

kfold = KFold(n_splits=5, shuffle=True, random_state=42)

scores_xgb = cross_validate(pipe, X_train, y_train, scoring=scoring, cv=kfold, n_jobs=2)
print_metrics(scores_xgb)


Unnamed: 0,RMSE,MAE,R-squared
0,32552.3932,23856.3487,0.5603
1,32386.9023,23817.3766,0.5563
2,32406.5597,23685.7386,0.5562
3,32043.8369,23370.5634,0.5618
4,32613.977,24041.1404,0.5401
Mean,32400.7338,23754.2335,0.555


## **Find out best model**

In [18]:
def summary(scores_lists, names=None):
    """Build a comparison table of mean cross-validation metrics per model.

    Parameters
    ----------
    scores_lists : sequence of mapping
        One result dict per model. Each must provide the keys
        'test_neg_root_mean_squared_error', 'test_neg_mean_absolute_error'
        and 'test_r2' holding per-fold values.
    names : sequence of str, optional
        Row label for each entry of ``scores_lists``, in the same order.
        When omitted, the six original labels are used (Linear Regression,
        Decision Tree, Ada Boosting, Bagging, Random Forest, Gradient
        Boosting), so ``scores_lists`` must then hold exactly six entries
        in that order.

    Returns
    -------
    pandas.DataFrame
        Columns 'RMSE', 'MAE' and 'R2-SCORE', one row per model, sorted so
        that the largest errors come first; the best model is the last row.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of score dicts.
    """
    default_names = ['Linear Regression', 'Decision Tree', 'Ada Boosting',
                     'Bagging', 'Random Forest', 'Gradient Boosting']

    scores_lists = list(scores_lists)
    labels = default_names if names is None else list(names)
    if len(labels) != len(scores_lists):
        raise ValueError(
            f"Got {len(scores_lists)} score sets but {len(labels)} model names; "
            "pass `names` with one label per score set."
        )

    # The error scorers are stored negated, so flip the sign of their means.
    table = pd.DataFrame(
        {
            'RMSE': [-1 * np.mean(s['test_neg_root_mean_squared_error']) for s in scores_lists],
            'MAE': [-1 * np.mean(s['test_neg_mean_absolute_error']) for s in scores_lists],
            'R2-SCORE': [np.mean(s['test_r2']) for s in scores_lists],
        },
        index=labels,
    )

    return table.sort_values(by=['RMSE', 'MAE', 'R2-SCORE'], ascending=[False, False, True])


In [None]:
# Order matters: summary() labels the rows by position.
# The XGBoost scores (scores_xgb) are not part of this comparison.
scores_lists = [
    scores_lr,  # Linear Regression
    scores_dt,  # Decision Tree
    scores_ab,  # Ada Boosting
    scores_bg,  # Bagging
    scores_rf,  # Random Forest
    scores_gb,  # Gradient Boosting
]
summary(scores_lists)

Unnamed: 0,RMSE,MAE,R2-SCORE
Decision Tree,46157.725023,33755.278116,0.096571
Linear Regression,40988.661247,32387.558059,0.28777
Ada Boosting,32412.388255,23488.990561,0.554604
Bagging,32249.746698,23894.981642,0.559053
Random Forest,32242.338191,23886.430587,0.55926
Gradient Boosting,31982.244523,23672.787386,0.566387


# **Hyperparameter Tuning**

In [None]:
# Grid search over GradientBoostingRegressor settings on the first 10,000
# training rows, using 3-fold CV and refitting on the best R2.
# NOTE: the grid search is the last pipeline step, so the preprocessing is
# fitted once on all 10,000 rows before the search splits them into folds.
transform = ColumnTransformer(
    transformers=[
        ("label", OrdinalEncoder(), ["EdLevel", "Country", "Age"]),
        ("onehot", OneHotEncoder(sparse_output=False, handle_unknown="ignore"), ["RemoteWork", "DevType"]),
        ("scaler", MaxAbsScaler(), ["YearsCodePro"]),
    ],
    remainder="passthrough",
)

model = GradientBoostingRegressor(random_state=42)

# 7 x 4 x 5 x 2 = 280 candidate combinations.
params = {
    "n_estimators": list(range(200, 510, 50)),
    "loss": ['squared_error', 'absolute_error', 'huber', 'quantile'],
    "learning_rate": [0.01, 0.1, 0.2, 0.3, 0.4],
    "criterion": ['friedman_mse', 'squared_error'],
}

grid = GridSearchCV(
    estimator=model,
    param_grid=params,
    scoring=scoring,
    refit="r2",  # with several scorers, names the one used to pick the winner
    cv=3,
    n_jobs=-1,
    verbose=1,
    error_score='raise',
)

pipe = Pipeline(steps=[("preprocess", transform), ("grid", grid)])
pipe.fit(X_train.head(10000), y_train[:10000])

fitted_search = pipe.named_steps["grid"]
print(f"The best params: {fitted_search.best_params_}")
print(f"The best score: {fitted_search.best_score_}")

Fitting 3 folds for each of 280 candidates, totalling 840 fits
The best params: {'criterion': 'friedman_mse', 'learning_rate': 0.1, 'loss': 'huber', 'n_estimators': 500}
The best score: 0.5664734570024211


# **Train & Save Best Model**

In [None]:
from sklearn import metrics 

def evaluate(y_true, y_pred):
    """Print RMSE, MAE and R2 for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.

    Returns
    -------
    None
        The three metrics are printed as a two-column table
        ("Metrics", "Values"); nothing is returned.
    """
    # Take the square root of the per-output MSE and average it, instead of
    # passing `squared=False`: that flag is deprecated in recent scikit-learn
    # releases and later removed. This gives the same value on every version.
    per_output_mse = metrics.mean_squared_error(y_true=y_true, y_pred=y_pred, multioutput="raw_values")
    rmse = np.mean(np.sqrt(per_output_mse))
    mae = metrics.mean_absolute_error(y_true, y_pred)
    r2 = metrics.r2_score(y_true, y_pred)

    metrics_dict = {
        "Metrics": ["Root Mean Square Error (RMSE)",
                    "Mean Absolute Error (MAE)",
                    "R2-score (R2)"],
        "Values": [rmse,
                    mae,
                    r2]
    }

    metrics_df = pd.DataFrame(metrics_dict)
    print(metrics_df)

In [None]:
# Train the final model on the full training set with the hyperparameters
# selected by the grid search, then score it on the held-out test set.
transform = ColumnTransformer([
    ("label", OrdinalEncoder(), ["EdLevel", "Country", "Age"]),
    ("onehot", OneHotEncoder(sparse_output=False, handle_unknown="ignore"), ["RemoteWork", "DevType"]),
    ("scaler", MaxAbsScaler(), ["YearsCodePro"])
], remainder="passthrough")

# Seeded like the estimator used during the grid search, so that the saved
# model and the reported test metrics can be reproduced on a re-run.
model = GradientBoostingRegressor(criterion='friedman_mse',
                                learning_rate=0.1,
                                loss='huber',
                                n_estimators=500,
                                random_state=42)

pipe = Pipeline([
    ("preprocess", transform),
    ("model", model)
])

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
evaluate(y_test, y_pred)

                         Metrics        Values
0  Root Mean Square Error (RMSE)  31252.959641
1      Mean Absolute Error (MAE)  22512.408547
2                  R2-score (R2)      0.585251


In [3]:
# Persist the pipeline to disk. `pipe` must already exist in the running
# kernel: executing this cell before the training cell raises NameError.
joblib.dump(pipe, "../dataset/best_model.joblib")

NameError: name 'pipe' is not defined