In [2]:
single_opt = ['EdLevel','DevType','Country','AISelect']
multi_opt = ['Employment','CodingActivities','LanguageHaveWorkedWith','DatabaseHaveWorkedWith',
             'PlatformHaveWorkedWith','WebframeHaveWorkedWith','MiscTechHaveWorkedWith']
number = ['YearsCode','YearsCodePro','WorkExp']

In [3]:
import joblib
import pandas as pd

def map_years(arr):
    dfy = pd.DataFrame(arr, columns=number)
    dfy = dfy.replace({'Less than 1 year': 0.5, 'More than 50 years': 50})
    return dfy.astype(float)

def split_semi(X):
    return X.iloc[:, 0].fillna('').str.split(';')

def tokenize_list(tokens):
    return tokens

def identity(x):
    return x

preprocessor = joblib.load("../models/preprocessor.pkl") 

In [4]:
df = pd.read_csv("../data/processed/salary_only.csv")

In [5]:
df['CompTotal']

0        2040000.0
1          28000.0
2          85000.0
3          50000.0
4         110000.0
           ...    
33735      36000.0
33736      40000.0
33737      61000.0
33738      58000.0
33739      55000.0
Name: CompTotal, Length: 33740, dtype: float64

In [6]:
from sklearn.model_selection import train_test_split

X = preprocessor.transform(df.drop(columns=['CompTotal']))
Y = df['CompTotal']

x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.2)

In [7]:
x_train

<26992x446 sparse matrix of type '<class 'numpy.float64'>'
	with 685129 stored elements in Compressed Sparse Row format>

In [8]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()

lr.fit(x_train,y_train)

In [9]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# your column lists
single_opt = ['EdLevel','DevType','Country','AISelect']
multi_opt  = [
    'Employment','CodingActivities','LanguageHaveWorkedWith',
    'DatabaseHaveWorkedWith','PlatformHaveWorkedWith',
    'WebframeHaveWorkedWith','MiscTechHaveWorkedWith'
]
number     = ['YearsCode','YearsCodePro','WorkExp']

# transformer functions
def map_years(X):
    return X.replace({'Less than 1 year':0.5, 'More than 50 years':50}).astype(float)

def split_semi(X):
    # X is a Series
    return X.fillna('').str.split(';')

# build ColumnTransformer
preprocessor = ColumnTransformer([
    # numeric years → map → impute → scale
    ('num', Pipeline([
        ('map',   FunctionTransformer(map_years, validate=False)),
        ('imp',   SimpleImputer(strategy='median')),
        ('scale', StandardScaler())
    ]), number),

    # single‐choice → impute → one‑hot
    ('cat', Pipeline([
        ('imp', SimpleImputer(strategy='constant', fill_value='Missing')),
        ('ohe', OneHotEncoder(handle_unknown='ignore'))
    ]), single_opt),

    # multi‐choice → split → vectorize
    *[
      (col, Pipeline([
           ('split', FunctionTransformer(lambda df, c=col: split_semi(df[c]), validate=False)),
           ('vec',   CountVectorizer(tokenizer=lambda x: x,
                                     preprocessor=lambda x: x,
                                     token_pattern=None,
                                     binary=True))
       ]), [col])
      for col in multi_opt
    ]
], remainder='drop')

# full end‑to‑end pipeline
pipeline = Pipeline([
    ('pre', preprocessor),
    ('rf',  RandomForestRegressor(n_estimators=100, random_state=42))
])

# load & prepare
df = pd.read_csv("../data/raw/survey_results_public.csv")
y  = pd.to_numeric(df['CompTotal'], errors='coerce').fillna(0)

# split, train, evaluate
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2, random_state=42)
pipeline.fit(X_train, y_train)
preds = pipeline.predict(X_test)

print("Test RMSE:", mean_squared_error(y_test, preds))


KeyboardInterrupt: 

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

# Custom Transformer for multi-select categorical features
class MultiSelectTransformer(BaseEstimator, TransformerMixin):
    """
    Custom transformer for multi-select categorical features.
    It splits the string by a delimiter and then applies a multi-hot encoding.
    """
    def __init__(self, delimiter=';'):
        self.delimiter = delimiter
        self.categories_ = []

    def fit(self, X, y=None):
        # Find all unique categories
        all_categories = set()
        for _, row in X.items():
            if pd.notna(row):
                all_categories.update(cat.strip() for cat in row.split(self.delimiter))
        self.categories_ = sorted(list(all_categories))
        return self

    def transform(self, X, y=None):
        # Create the multi-hot encoded matrix
        num_samples = len(X)
        num_categories = len(self.categories_)
        encoded_matrix = np.zeros((num_samples, num_categories))

        for i, row in enumerate(X):
            if pd.notna(row):
                for j, cat in enumerate(self.categories_):
                    if cat in [c.strip() for c in row.split(self.delimiter)]:
                        encoded_matrix[i, j] = 1
        return encoded_matrix

# Select features for the model
# These would be chosen based on the EDA and domain knowledge
features = [
    'Country', 'EdLevel', 'YearsCodePro', 'DevType',
    'LanguageHaveWorkedWith', 'WebframeHaveWorkedWith',
    'DatabaseHaveWorkedWith', 'PlatformHaveWorkedWith'
]
target = 'ConvertedCompYearly'

# Drop rows where target is missing
df_model = df.dropna(subset=[target] + features).copy()

# Split the data
X = df_model[features]
y = df_model[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Define which columns go to which transformer
numerical_features = ['YearsCodePro']
categorical_features = ['Country', 'EdLevel', 'DevType']
multi_select_features = [
    'LanguageHaveWorkedWith', 'WebframeHaveWorkedWith',
    'DatabaseHaveWorkedWith', 'PlatformHaveWorkedWith'
]

# Create the preprocessing pipelines for each feature type
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

multi_select_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='')),
    ('multi_hot', MultiSelectTransformer())
])

# Create the master preprocessor with ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
        ('multi', multi_select_transformer, multi_select_features[0]), # Example for one multi-select, expand as needed
        # You can add more multi-select features here
    ],
    remainder='passthrough'
)

In [5]:
df['OpSysPersonal use']

0                                            MacOS;Windows
1                                Other Linux-based;Windows
2                                                    MacOS
3                                                    MacOS
4                                            MacOS;Windows
                               ...                        
33735                                                  NaN
33736                                          iOS;Windows
33737                                                 Arch
33738                                                MacOS
33739    Cygwin;Debian;iOS;iPadOS;MacOS;Ubuntu;Windows;...
Name: OpSysPersonal use, Length: 33740, dtype: object

In [10]:
for i in df.columns:
    print(i,end='\t')

ResponseId	MainBranch	Age	Employment	RemoteWork	Check	CodingActivities	EdLevel	LearnCode	LearnCodeOnline	TechDoc	YearsCode	YearsCodePro	DevType	OrgSize	PurchaseInfluence	BuyNewTool	BuildvsBuy	TechEndorse	Country	Currency	CompTotal	LanguageHaveWorkedWith	LanguageWantToWorkWith	LanguageAdmired	DatabaseHaveWorkedWith	DatabaseWantToWorkWith	DatabaseAdmired	PlatformHaveWorkedWith	PlatformWantToWorkWith	PlatformAdmired	WebframeHaveWorkedWith	WebframeWantToWorkWith	WebframeAdmired	EmbeddedHaveWorkedWith	EmbeddedWantToWorkWith	EmbeddedAdmired	MiscTechHaveWorkedWith	MiscTechWantToWorkWith	MiscTechAdmired	ToolsTechHaveWorkedWith	ToolsTechWantToWorkWith	ToolsTechAdmired	NEWCollabToolsHaveWorkedWith	NEWCollabToolsWantToWorkWith	NEWCollabToolsAdmired	OpSysPersonal use	OpSysProfessional use	OfficeStackAsyncHaveWorkedWith	OfficeStackAsyncWantToWorkWith	OfficeStackAsyncAdmired	OfficeStackSyncHaveWorkedWith	OfficeStackSyncWantToWorkWith	OfficeStackSyncAdmired	AISearchDevHaveWorkedWith	AISearchDevWantTo

In [1]:




import os
import pandas as pd
import numpy as np
import mlflow
import dagshub
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer, MultiLabelBinarizer
from sklearn.compose import ColumnTransformer
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator, TransformerMixin
import seaborn as sns

# df = pd.read_csv("../data/processed/salary_only.csv")

print("Initializing DagsHub and MLflow...")

DAGSHUB_REPO_OWNER = os.getenv("DAGSHUB_REPO_OWNER", "malhar.c.prajapati")
DAGSHUB_REPO_NAME = os.getenv("DAGSHUB_REPO_NAME", "Stack-overflow-survey-2024-salary-prediction")

dagshub.init(repo_owner=DAGSHUB_REPO_OWNER, repo_name=DAGSHUB_REPO_NAME, mlflow=True)
mlflow.set_experiment("Salary Prediction Experiments")


print("Loading data...")


try:
    df = pd.read_csv('../data/raw/survey_results_public.csv')
    print("Data loaded successfully.")
    print(f"Dataset shape: {df.shape}")
except FileNotFoundError:
    print("Error: 'survey_results_public.csv' not found. Please place it in the same directory.")
    exit()


print("Defining features and preprocessing pipeline...")



FEATURE_COLUMNS = [
    'Country', 'EdLevel', 'YearsCodePro', 'MainBranch', 'RemoteWork', 'Age',
    'LanguageHaveWorkedWith', 'DatabaseHaveWorkedWith', 'WebframeHaveWorkedWith',
    'OpSysPersonal use' 
]
TARGET_COLUMN = 'ConvertedCompYearly'



df_model = df.dropna(subset=[TARGET_COLUMN]).copy()

df_model.dropna(subset=FEATURE_COLUMNS, how='all', inplace=True)


df_model['YearsCodePro'] = pd.to_numeric(df_model['YearsCodePro'], errors='coerce')


df_model[TARGET_COLUMN] = np.log1p(df_model[TARGET_COLUMN])

X = df_model[FEATURE_COLUMNS]
y = df_model[TARGET_COLUMN]


class MultiSelectBinarizer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.mlb = None
        self.classes_ = None

    def fit(self, X, y=None):
        series = X.fillna('').astype(str).apply(lambda x: x.split(';'))
        self.mlb = MultiLabelBinarizer(sparse_output=False)
        self.mlb.fit(series)
        self.classes_ = self.mlb.classes_
        return self

    def transform(self, X, y=None):
        series = X.fillna('').astype(str).apply(lambda x: x.split(';'))
        encoded_matrix = self.mlb.transform(series)
        df = pd.DataFrame(encoded_matrix, columns=self.classes_, index=X.index)
        if '' in df.columns:
            df = df.drop(columns=[''])
        return df



numerical_features = ['YearsCodePro']
categorical_features = ['Country', 'EdLevel', 'MainBranch', 'RemoteWork', 'OpSysPersonal use', 'Age']
multi_select_features = ['LanguageHaveWorkedWith', 'DatabaseHaveWorkedWith', 'WebframeHaveWorkedWith']


numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])


preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features),
        
        ('lang', MultiSelectBinarizer(), 'LanguageHaveWorkedWith'),
        ('db', MultiSelectBinarizer(), 'DatabaseHaveWorkedWith'),
        ('web', MultiSelectBinarizer(), 'WebframeHaveWorkedWith')
    ],
    remainder='drop', 
    n_jobs=-1 
)


print("Splitting data and starting model training...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


with mlflow.start_run(run_name="LGBM Regressor Run") as run:
    
    mlflow.log_param("model_class", "LGBMRegressor")

    
    lgbm_params = {
        'n_estimators': 250,
        'learning_rate': 0.05,
        'num_leaves': 31,
        'max_depth': -1,
        'random_state': 42,
        'n_jobs': -1,
        'colsample_bytree': 0.8
    }
    mlflow.log_params(lgbm_params)

    
    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', LGBMRegressor(**lgbm_params))
    ])

    
    print("Fitting the model pipeline...")
    model.fit(X_train, y_train)
    print("Model training complete.")

    
    y_pred = model.predict(X_test)

    
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"Evaluation Metrics (on log-transformed salary):")
    print(f"  RMSE: {rmse:.4f}")
    print(f"  MAE: {mae:.4f}")
    print(f"  R2 Score: {r2:.4f}")

    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("r2", r2)

    
    try:
        print("Generating and logging feature importance plot...")
        
        ohe_feature_names = list(model.named_steps['preprocessor'].named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(categorical_features))
        lang_feature_names = list(model.named_steps['preprocessor'].named_transformers_['lang'].classes_)
        db_feature_names = list(model.named_steps['preprocessor'].named_transformers_['db'].classes_)
        web_feature_names = list(model.named_steps['preprocessor'].named_transformers_['web'].classes_)

        
        feature_names = numerical_features + ohe_feature_names + lang_feature_names + db_feature_names + web_feature_names
        
        
        feature_names = [f for f in feature_names if f != '']

        importances = model.named_steps['regressor'].feature_importances_

        
        if len(feature_names) == len(importances):
            feature_importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances})
            feature_importance_df = feature_importance_df.sort_values('importance', ascending=False).head(25)

            plt.figure(figsize=(12, 8))
            sns.barplot(x='importance', y='feature', data=feature_importance_df)
            plt.title('Top 25 Feature Importances')
            plt.tight_layout()
            
            
            plot_path = "feature_importance.png"
            plt.savefig(plot_path)
            plt.close()

            
            mlflow.log_artifact(plot_path, "plots")
            print("Feature importance plot logged.")
        else:
            print(f"Warning: Mismatch in feature names ({len(feature_names)}) and importances ({len(importances)}). Skipping plot.")

    except Exception as e:
        print(f"Could not generate feature importance plot: {e}")


    
    print("Logging model to MLflow...")
    mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="salary-predictor-model",
        registered_model_name="stackoverflow-salary-predictor" 
    )
    print("Model logged successfully.")

print(f"\n✅ Training script finished. View the run in DagsHub/MLflow: {run.info.artifact_uri}")


Initializing DagsHub and MLflow...


Loading data...
Data loaded successfully.
Dataset shape: (65437, 114)
Defining features and preprocessing pipeline...
Splitting data and starting model training...
Fitting the model pipeline...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003598 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 655
[LightGBM] [Info] Number of data points in the train set: 18748, number of used features: 304
[LightGBM] [Info] Start training from score 10.786639
Model training complete.




Evaluation Metrics (on log-transformed salary):
  RMSE: 1.0486
  MAE: 0.5866
  R2 Score: 0.4479
Generating and logging feature importance plot...
Feature importance plot logged.
Logging model to MLflow...


Registered model 'stackoverflow-salary-predictor' already exists. Creating a new version of this model...
2025/07/27 14:04:59 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: stackoverflow-salary-predictor, version 2
Created version '2' of model 'stackoverflow-salary-predictor'.


Model logged successfully.
🏃 View run LGBM Regressor Run at: https://dagshub.com/malhar.c.prajapati/Stack-overflow-survey-2024-salary-prediction.mlflow/#/experiments/0/runs/3199f3dbfe614ad59f363c7eecd51379
🧪 View experiment at: https://dagshub.com/malhar.c.prajapati/Stack-overflow-survey-2024-salary-prediction.mlflow/#/experiments/0

✅ Training script finished. View the run in DagsHub/MLflow: mlflow-artifacts:/b66ac9a49b554a539f4bb427870fb357/3199f3dbfe614ad59f363c7eecd51379/artifacts


In [28]:
# Add this debug step before creating the pipeline
print("Actual columns in X_train:", X_train.columns.tolist())
def split_multiselect(X):
    """Splits semicolon-separated strings into a binary matrix."""
    # Handle different input types (Series, DataFrame, or array)
    if isinstance(X, pd.DataFrame):
        # If it's a DataFrame, extract the first column
        X = X.iloc[:, 0]
    elif not isinstance(X, pd.Series):
        # If it's not a Series, convert to Series
        X = pd.Series(X)
    
    # Split the strings and binarize
    lists = X.fillna('').str.split(';')
    mlb = MultiLabelBinarizer()
    return mlb.fit_transform(lists)

# Update the multi_pipeline to use this improved function
def build_preprocessor():
    # Ordinal pipeline for age
    ord_age_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
        ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
    ])

    # Categorical pipeline
    cat_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Multi-select pipeline
    multi_pipeline = Pipeline([
        ('split', FunctionTransformer(split_multiselect, validate=False))
    ])

    # Get EXACT column names from your DataFrame
    actual_columns = X_train.columns.tolist()
    
    # Find the OS column - it might be named differently
    os_columns = [col for col in actual_columns if 'OpSys' in col]
    os_column = os_columns[0] if os_columns else None
    
    if not os_column:
        raise ValueError("No OS column found in data")
    
    print(f"Using OS column: {os_column}")

    # Build transformer with verified columns
    transformers = [
        # Ordinal age
        ('ord', ord_age_pipeline, ['Age']),
        
        # Categorical features
        ('cat', cat_pipeline, [
            'Country',
            'EdLevel',
            'DevType',
            'RemoteWork',
            os_column  # Use the actual column name
        ]),
        
        # Multi-select features
        ('lang', multi_pipeline, ['LanguageHaveWorkedWith']),
        ('web',  multi_pipeline, ['WebframeHaveWorkedWith']),
        ('db',   multi_pipeline, ['DatabaseHaveWorkedWith']),
        ('plat', multi_pipeline, ['PlatformHaveWorkedWith']),
    ]
    
    return ColumnTransformer(transformers, remainder='drop')

# Build the preprocessor with dynamic column detection
preprocessor = build_preprocessor()

# Test the preprocessor on a small sample before full training
try:
    print("Testing preprocessor on small sample...")
    preprocessor.fit_transform(X_train.head())
    print("Preprocessor test successful!")
except Exception as e:
    print(f"Preprocessor error: {e}")
    # Print which columns are causing issues
    for name, _, cols in preprocessor.transformers:
        missing = [col for col in cols if col not in X_train.columns]
        if missing:
            print(f"Transformer '{name}' missing columns: {missing}")

Actual columns in X_train: ['Country', 'EdLevel', 'DevType', 'RemoteWork', 'Age', 'LanguageHaveWorkedWith', 'WebframeHaveWorkedWith', 'DatabaseHaveWorkedWith', 'PlatformHaveWorkedWith', 'OpSysPersonal use']
Using OS column: OpSysPersonal use
Testing preprocessor on small sample...
Preprocessor test successful!


In [29]:
import numpy as np
import pandas as pd
import mlflow
import dagshub
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import Ridge
# Correctly import the builder function from your preprocessing file
from preprocessing import build_preprocessor

# --- MLflow & DagsHub Initialization ---
dagshub.init(repo_owner='malhar.c.prajapati',
             repo_name='Stack-overflow-survey-2024-salary-prediction',
             mlflow=True)

print("Loading and preparing data...")
# It's good practice to specify the parent directory for clarity
df = pd.read_csv('../data/processed/features_labels.csv')

# --- Data Preparation ---
# Drop rows where the target is missing, as they cannot be used for training
df.dropna(subset=['ConvertedCompYearly'], inplace=True)
X = df.drop(columns=['ConvertedCompYearly'])
# Use log1p for a more stable target variable
y = np.log1p(df['ConvertedCompYearly'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Data preparation complete.")
print(f"Training on {len(X_train)} samples, testing on {len(X_test)} samples.")


# --- Model and Preprocessing Setup ---
# Build the preprocessor using your corrected function

# Define the models and their hyperparameter grids for GridSearchCV
models_and_params = {
    'gbr': (GradientBoostingRegressor(random_state=42), {
        'model__n_estimators': [100, 200],
        'model__max_depth': [3, 5]
    }),
    'rf': (RandomForestRegressor(random_state=42), {
        'model__n_estimators': [100, 200],
        'model__max_depth': [10, 20]
    }),
    'ridge': (Ridge(), {
        'model__alpha': [0.1, 1.0, 10.0]
    })
}

# --- Training and Experiment Tracking Loop ---
for model_name, (estimator, params) in models_and_params.items():
    # Create the full pipeline: Preprocessing -> Model
    # This is the object that will be saved and used in the backend
    full_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', estimator)
    ])

    # Set up GridSearchCV
    grid_search = GridSearchCV(full_pipeline, params, cv=3,
                               scoring='neg_mean_absolute_error', n_jobs=-1, verbose=1)

    print(f"\n--- Starting GridSearchCV for {model_name} ---")
    
    # Start an MLflow run
    with mlflow.start_run(run_name=f"GridSearch_{model_name}") as run:
        mlflow.log_param("model_name", model_name)
        
        # Train the grid search
        grid_search.fit(X_train, y_train)

        # Log the best parameters found by the search
        mlflow.log_params(grid_search.best_params_)
        
        # Evaluate the best estimator on the test set
        best_model = grid_search.best_estimator_
        y_pred = best_model.predict(X_test)

        # Calculate metrics
        metrics = {
            'mae': mean_absolute_error(y_test, y_pred),
            'rmse': np.sqrt(mean_squared_error(y_test, y_pred)),
            'r2': r2_score(y_test, y_pred)
        }
        
        # Log the performance metrics
        mlflow.log_metrics(metrics)
        
        # Log the best model pipeline
        mlflow.sklearn.log_model(
            sk_model=best_model,
            artifact_path="model",
            registered_model_name=f"{model_name}_salary_predictor"
        )
        
        print(f"--- Results for {model_name} ---")
        print(f"Best Params: {grid_search.best_params_}")
        print(f"Metrics: {metrics}")
        print(f"MLflow Run ID: {run.info.run_id}")

print("\n--- All models trained and logged to MLflow. ---")


Loading and preparing data...
Data preparation complete.
Training on 18748 samples, testing on 4687 samples.

--- Starting GridSearchCV for gbr ---
Fitting 3 folds for each of 4 candidates, totalling 12 fits


Successfully registered model 'gbr_salary_predictor'.
2025/07/27 19:40:41 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: gbr_salary_predictor, version 1
Created version '1' of model 'gbr_salary_predictor'.


--- Results for gbr ---
Best Params: {'model__max_depth': 5, 'model__n_estimators': 200}
Metrics: {'mae': 0.6119540927268493, 'rmse': 1.075558024253791, 'r2': 0.4190943392858837}
MLflow Run ID: a480b13f96bf4d4db5ebb319d2558e00
🏃 View run GridSearch_gbr at: https://dagshub.com/malhar.c.prajapati/Stack-overflow-survey-2024-salary-prediction.mlflow/#/experiments/0/runs/a480b13f96bf4d4db5ebb319d2558e00
🧪 View experiment at: https://dagshub.com/malhar.c.prajapati/Stack-overflow-survey-2024-salary-prediction.mlflow/#/experiments/0

--- Starting GridSearchCV for rf ---
Fitting 3 folds for each of 4 candidates, totalling 12 fits


Successfully registered model 'rf_salary_predictor'.
2025/07/27 19:46:06 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: rf_salary_predictor, version 1
Created version '1' of model 'rf_salary_predictor'.


--- Results for rf ---
Best Params: {'model__max_depth': 20, 'model__n_estimators': 200}
Metrics: {'mae': 0.6593767852301139, 'rmse': 1.1333069639679252, 'r2': 0.3550396239549919}
MLflow Run ID: fee8289484964d1db970c036bde5e494
🏃 View run GridSearch_rf at: https://dagshub.com/malhar.c.prajapati/Stack-overflow-survey-2024-salary-prediction.mlflow/#/experiments/0/runs/fee8289484964d1db970c036bde5e494
🧪 View experiment at: https://dagshub.com/malhar.c.prajapati/Stack-overflow-survey-2024-salary-prediction.mlflow/#/experiments/0

--- Starting GridSearchCV for ridge ---
Fitting 3 folds for each of 3 candidates, totalling 9 fits


Successfully registered model 'ridge_salary_predictor'.
2025/07/27 19:46:18 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: ridge_salary_predictor, version 1
Created version '1' of model 'ridge_salary_predictor'.


--- Results for ridge ---
Best Params: {'model__alpha': 10.0}
Metrics: {'mae': 0.6247712567342067, 'rmse': 1.0665746278674635, 'r2': 0.42875762550798957}
MLflow Run ID: 1ce59028dd0946f5a7005747e4a3bf00
🏃 View run GridSearch_ridge at: https://dagshub.com/malhar.c.prajapati/Stack-overflow-survey-2024-salary-prediction.mlflow/#/experiments/0/runs/1ce59028dd0946f5a7005747e4a3bf00
🧪 View experiment at: https://dagshub.com/malhar.c.prajapati/Stack-overflow-survey-2024-salary-prediction.mlflow/#/experiments/0

--- All models trained and logged to MLflow. ---


In [36]:
import numpy as np
import pandas as pd
import mlflow
import dagshub
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
    OneHotEncoder, StandardScaler, OrdinalEncoder,
    FunctionTransformer, MultiLabelBinarizer
)
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.feature_selection import SelectKBest, f_regression

# --- Feature Configuration ---
single_opt = ['EdLevel', 'DevType', 'Country', 'AISelect', 'RemoteWork']
multi_opt = ['Employment', 'CodingActivities', 'LanguageHaveWorkedWith',
             'DatabaseHaveWorkedWith', 'PlatformHaveWorkedWith',
             'WebframeHaveWorkedWith', 'MiscTechHaveWorkedWith']
number = ['YearsCode', 'YearsCodePro', 'WorkExp']  # Numeric fields (exclude Age)
ordinal = ['Age']  # Ordinal age feature

# --- Improved Preprocessing Functions ---
def split_multiselect(X):
    """Splits semicolon-separated strings into a binary matrix."""
    if isinstance(X, pd.DataFrame):
        X = X.iloc[:, 0]
    elif not isinstance(X, pd.Series):
        X = pd.Series(X)
    lists = X.fillna('').str.split(';')
    mlb = MultiLabelBinarizer()
    return mlb.fit_transform(lists)


def convert_numeric_years(X):
    """Converts special string labels to numeric for years columns, handles 1D or 2D inputs."""
    arr = np.array(X)
    # Single feature case
    if arr.ndim == 1:
        series = pd.Series(arr).replace({
            'Less than 1 year': 0.5,
            'More than 50 years': 51
        })
        return pd.to_numeric(series, errors='coerce').values.reshape(-1, 1)
    # Multiple features: apply to each column
    df = pd.DataFrame(arr)
    for col in df.columns:
        df[col] = pd.Series(df[col]).replace({
            'Less than 1 year': 0.5,
            'More than 50 years': 51
        })
        df[col] = pd.to_numeric(df[col], errors='coerce')
    return df.values


def build_preprocessor():
    """Builds a comprehensive preprocessor handling numeric, ordinal, categorical, and multi-select features."""
    # Numeric pipeline: convert special strings, impute, scale
    num_pipeline = Pipeline([
        ('convert', FunctionTransformer(convert_numeric_years, validate=False)),
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    # Ordinal pipeline for age
    ord_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
        ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
    ])

    # Single-choice categorical pipeline
    cat_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    # Multi-select pipeline
    multi_pipeline = Pipeline([
        ('split', FunctionTransformer(split_multiselect, validate=False))
    ])

    # Assemble ColumnTransformer
    transformer_list = []
    transformer_list.append(('num', num_pipeline, number))
    transformer_list.append(('ord', ord_pipeline, ordinal))
    transformer_list.append(('cat', cat_pipeline, single_opt))
    for col in multi_opt:
        transformer_list.append((f'multi_{col}', multi_pipeline, [col]))

    return ColumnTransformer(transformers=transformer_list, remainder='drop')

# --- MLflow & DagsHub Initialization ---
dagshub.init(
    repo_owner='malhar.c.prajapati',
    repo_name='Stack-overflow-survey-2024-salary-prediction',
    mlflow=True
)

# --- Data Loading & Preparation ---
print("Loading data...")
df = pd.read_csv('../data/raw/survey_results_public.csv', low_memory=False)
all_features = single_opt + multi_opt + number + ordinal + ['ConvertedCompYearly']
df = df.loc[:, df.columns.intersection(all_features + ['ConvertedCompYearly'])]
# Drop rows with missing target
(df.dropna(subset=['ConvertedCompYearly'], inplace=True))
X = df.drop(columns=['ConvertedCompYearly'])
y = np.log1p(df['ConvertedCompYearly'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(f"Training on {len(X_train)} samples, testing on {len(X_test)} samples.")

# --- Preprocessor Test ---
preprocessor = build_preprocessor()
try:
    preprocessor.fit_transform(X_train.head())
    print("Preprocessor test passed.")
except Exception as e:
    print(f"Preprocessor error: {e}")

# --- Model & Hyperparameter Configuration ---
models_and_params = {
    'gbr': (GradientBoostingRegressor(random_state=42), {
        'model__n_estimators': [100, 200, 300],
        'model__max_depth': [3, 5, 7],
        'model__learning_rate': [0.1, 0.05],
        'model__subsample': [0.8, 1.0]
    }),
    'rf': (RandomForestRegressor(random_state=42), {
        'model__n_estimators': [100, 200, 300],
        'model__max_depth': [10, 20, None],
        'model__min_samples_split': [2, 5],
        'model__min_samples_leaf': [1, 2]
    }),
    'xgb': (XGBRegressor(random_state=42, eval_metric='mae'), {
        'model__n_estimators': [100, 200],
        'model__max_depth': [3, 5, 7],
        'model__learning_rate': [0.1, 0.05],
        'model__subsample': [0.8, 1.0],
        'model__colsample_bytree': [0.8, 1.0]
    }),
    'lgbm': (LGBMRegressor(random_state=42), {
        'model__n_estimators': [100, 200],
        'model__max_depth': [3, 5, -1],
        'model__learning_rate': [0.1, 0.05],
        'model__num_leaves': [31, 63],
        'model__subsample': [0.8, 1.0]
    }),
    'elasticnet': (ElasticNet(random_state=42), {
        'model__alpha': [0.1, 1.0, 10.0],
        'model__l1_ratio': [0.2, 0.5, 0.8]
    })
}

# --- Training Loop ---
for model_name, (estimator, params) in models_and_params.items():
    full_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('selector', SelectKBest(score_func=f_regression, k='all')),
        ('model', estimator)
    ])
    grid = GridSearchCV(
        full_pipeline, params, cv=3,
        scoring='neg_mean_absolute_error', verbose=1, n_jobs=-1
    )
    print(f"\n--- Training {model_name} ---")
    with mlflow.start_run(run_name=f"GridSearch_{model_name}") as run:
        mlflow.log_param('model', model_name)
        grid.fit(X_train, y_train)
        best = grid.best_estimator_
        y_pred = best.predict(X_test)
        # Convert back to original scale
        y_true = np.expm1(y_test)
        y_pred_orig = np.expm1(y_pred)
        metrics = {
            'mae': mean_absolute_error(y_true, y_pred_orig),
            'rmse': np.sqrt(mean_squared_error(y_true, y_pred_orig)),
            'r2': r2_score(y_true, y_pred_orig)
        }
        mlflow.log_params(grid.best_params_)
        mlflow.log_metrics(metrics)
        mlflow.sklearn.log_model(
            sk_model=best,
            artifact_path='model',
            registered_model_name=f"{model_name}_salary_predictor"
        )
        print(f"Results for {model_name}: {metrics}")
print("\nAll models complete.")


Loading data...
Training on 18748 samples, testing on 4687 samples.
Preprocessor test passed.

--- Training gbr ---


  df[col] = pd.Series(df[col]).replace({


Fitting 3 folds for each of 36 candidates, totalling 108 fits


  df[col] = pd.Series(df[col]).replace({
  df[col] = pd.Series(df[col]).replace({
Registered model 'gbr_salary_predictor' already exists. Creating a new version of this model...
2025/07/27 20:30:28 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: gbr_salary_predictor, version 2
Created version '2' of model 'gbr_salary_predictor'.


Results for gbr: {'mae': 30708.47122212532, 'rmse': 81725.05402335647, 'r2': 0.28755861130940485}
🏃 View run GridSearch_gbr at: https://dagshub.com/malhar.c.prajapati/Stack-overflow-survey-2024-salary-prediction.mlflow/#/experiments/0/runs/9dd6478582dd4131aa3224d19f4e4fcb
🧪 View experiment at: https://dagshub.com/malhar.c.prajapati/Stack-overflow-survey-2024-salary-prediction.mlflow/#/experiments/0

--- Training rf ---
Fitting 3 folds for each of 36 candidates, totalling 108 fits


  df[col] = pd.Series(df[col]).replace({
  df[col] = pd.Series(df[col]).replace({
Registered model 'rf_salary_predictor' already exists. Creating a new version of this model...
2025/07/27 21:02:12 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: rf_salary_predictor, version 2
Created version '2' of model 'rf_salary_predictor'.


Results for rf: {'mae': 31953.23711696943, 'rmse': 83202.54398212278, 'r2': 0.261565600891849}
🏃 View run GridSearch_rf at: https://dagshub.com/malhar.c.prajapati/Stack-overflow-survey-2024-salary-prediction.mlflow/#/experiments/0/runs/e05c392c987e4013b1543ff9c896769c
🧪 View experiment at: https://dagshub.com/malhar.c.prajapati/Stack-overflow-survey-2024-salary-prediction.mlflow/#/experiments/0

--- Training xgb ---
Fitting 3 folds for each of 48 candidates, totalling 144 fits


 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan]
  df[col] = pd.Series(df[col]).replace({
  df[col] = pd.Series(df[col]).replace({


🏃 View run GridSearch_xgb at: https://dagshub.com/malhar.c.prajapati/Stack-overflow-survey-2024-salary-prediction.mlflow/#/experiments/0/runs/503a95c3a0514793880eef8990586e6d
🧪 View experiment at: https://dagshub.com/malhar.c.prajapati/Stack-overflow-survey-2024-salary-prediction.mlflow/#/experiments/0


AttributeError: 'super' object has no attribute '__sklearn_tags__'

### Exp-1

In [1]:
import numpy as np
import pandas as pd
import mlflow
import dagshub
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

dagshub.init(
    repo_owner='malhar.c.prajapati',
    repo_name='Stack-overflow-survey-2024-salary-prediction',
    mlflow=True
)

print("Loading pre-processed data...")
# Assuming 'final_dataset.csv' is in a 'data/processed' subdirectory relative to the script
df = pd.read_csv('../data/processed/final_dataset.csv', index_col=0)
X = df.drop(columns=['ConvertedCompYearly'])
y = df['ConvertedCompYearly'] 

if len(X) != len(y):
    raise ValueError(
        f"Mismatch in number of samples between processed features ({len(X)}) "
        f"and target variable ({len(y)}). Please regenerate processed.csv."
    )

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(f"Data loaded. Training on {len(X_train)} samples, testing on {len(X_test)} samples.")

models_and_params = {
    'xgb': (XGBRegressor(random_state=42, n_jobs=-1, eval_metric='mae'), {
        'n_estimators': [100, 200],
        'learning_rate': [0.05, 0.1],
        'max_depth': [3, 5]
    }),
    'rf': (RandomForestRegressor(random_state=42, n_jobs=-1), {
        'n_estimators': [100, 200],
        'max_depth': [10, 20],
        'min_samples_leaf': [2, 4]
    }),
    'lgbm': (LGBMRegressor(random_state=42, n_jobs=-1), {
        'n_estimators': [100, 200],
        'learning_rate': [0.05, 0.1],
        'num_leaves': [31, 60]
    }),
    'gbr': (GradientBoostingRegressor(random_state=42), {
        'n_estimators': [100, 200],
        'learning_rate': [0.05, 0.1],
        'max_depth': [3, 5]
    })
}

for model_name, (estimator, params) in models_and_params.items():
    
    full_pipeline = Pipeline([
        ('model', estimator)
    ])
    
    grid_search = GridSearchCV(
        estimator=estimator, 
        param_grid=params, 
        cv=3,
        scoring='neg_mean_squared_error',
        verbose=1, 
        n_jobs=-1
    )
    
    print(f"\n--- Starting GridSearchCV for {model_name} ---")
    
    with mlflow.start_run(run_name=f"GridSearch_{model_name}_on_processed_data") as run:
        mlflow.log_param('model_name', model_name)
        
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_
        
        # The model predicts on the same scale as the training data.
        # Since 'y_train' appears to be the original salary, 'y_pred' will be too.
        y_pred = best_model.predict(X_test)
        
        # FIX: The error was caused by applying np.expm1 to the original salary values,
        # which caused a numeric overflow. The target variable 'y' is already on the
        # original scale, so we can calculate metrics directly without transformation.
        
        metrics = {
            'mae': mean_absolute_error(y_test, y_pred),
            'rmse': np.sqrt(mean_squared_error(y_test, y_pred)),
            'r2': r2_score(y_test, y_pred)
        }
        
        print(f"--- Results for {model_name} ---")
        print(f"Best Params: {grid_search.best_params_}")
        print(f"Metrics: {metrics}")
        
        
        mlflow.log_params(grid_search.best_params_)
        mlflow.log_metrics(metrics)
        
        mlflow.sklearn.log_model(
            sk_model=best_model,
            artifact_path='model',
            registered_model_name=f"{model_name}_salary_predictor_processed"
        )

print("\n--- All model training experiments are complete. ---")


Loading pre-processed data...
Data loaded. Training on 18748 samples, testing on 4687 samples.

--- Starting GridSearchCV for xgb ---
Fitting 3 folds for each of 8 candidates, totalling 24 fits
--- Results for xgb ---
Best Params: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}
Metrics: {'mae': 24424.664173338646, 'rmse': 34636.882065153375, 'r2': 0.641098601225861}


Registered model 'xgb_salary_predictor_processed' already exists. Creating a new version of this model...
2025/07/31 19:32:05 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: xgb_salary_predictor_processed, version 2
Created version '2' of model 'xgb_salary_predictor_processed'.


🏃 View run GridSearch_xgb_on_processed_data at: https://dagshub.com/malhar.c.prajapati/Stack-overflow-survey-2024-salary-prediction.mlflow/#/experiments/0/runs/42889009b4d14b6e833bf27f9879839d
🧪 View experiment at: https://dagshub.com/malhar.c.prajapati/Stack-overflow-survey-2024-salary-prediction.mlflow/#/experiments/0

--- Starting GridSearchCV for rf ---
Fitting 3 folds for each of 8 candidates, totalling 24 fits
--- Results for rf ---
Best Params: {'max_depth': 20, 'min_samples_leaf': 4, 'n_estimators': 200}
Metrics: {'mae': 25945.204901618305, 'rmse': 36336.52970706808, 'r2': 0.6050114875211025}


Registered model 'rf_salary_predictor_processed' already exists. Creating a new version of this model...
2025/07/31 19:35:01 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: rf_salary_predictor_processed, version 3
Created version '3' of model 'rf_salary_predictor_processed'.


🏃 View run GridSearch_rf_on_processed_data at: https://dagshub.com/malhar.c.prajapati/Stack-overflow-survey-2024-salary-prediction.mlflow/#/experiments/0/runs/ed83afc2a4d84f958f371fd9783dc1fe
🧪 View experiment at: https://dagshub.com/malhar.c.prajapati/Stack-overflow-survey-2024-salary-prediction.mlflow/#/experiments/0

--- Starting GridSearchCV for lgbm ---
Fitting 3 folds for each of 8 candidates, totalling 24 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000976 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1661
[LightGBM] [Info] Number of data points in the train set: 18748, number of used features: 14
[LightGBM] [Info] Start training from score 77627.108065
--- Results for lgbm ---
Best Params: {'learning_rate': 0.05, 'n_estimators': 200, 'num_leaves': 60}
Metrics: {'mae': 24288.446540825695, 'rmse': 34620.81134140928, 'r2': 0.6414315680994465}


Registered model 'lgbm_salary_predictor_processed' already exists. Creating a new version of this model...
2025/07/31 19:35:26 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: lgbm_salary_predictor_processed, version 3
Created version '3' of model 'lgbm_salary_predictor_processed'.


🏃 View run GridSearch_lgbm_on_processed_data at: https://dagshub.com/malhar.c.prajapati/Stack-overflow-survey-2024-salary-prediction.mlflow/#/experiments/0/runs/8a8ed8e59aaf4a1f84b187d57cf4aab6
🧪 View experiment at: https://dagshub.com/malhar.c.prajapati/Stack-overflow-survey-2024-salary-prediction.mlflow/#/experiments/0

--- Starting GridSearchCV for gbr ---
Fitting 3 folds for each of 8 candidates, totalling 24 fits
--- Results for gbr ---
Best Params: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}
Metrics: {'mae': 24521.71021744618, 'rmse': 34775.58970738377, 'r2': 0.6382183161863455}


Successfully registered model 'gbr_salary_predictor_processed'.
2025/07/31 19:36:01 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: gbr_salary_predictor_processed, version 1
Created version '1' of model 'gbr_salary_predictor_processed'.


🏃 View run GridSearch_gbr_on_processed_data at: https://dagshub.com/malhar.c.prajapati/Stack-overflow-survey-2024-salary-prediction.mlflow/#/experiments/0/runs/d391a7bdba1b4efc97fe8abdc20eeec2
🧪 View experiment at: https://dagshub.com/malhar.c.prajapati/Stack-overflow-survey-2024-salary-prediction.mlflow/#/experiments/0

--- All model training experiments are complete. ---


1.1 Where we added 1 more column

In [2]:
import numpy as np
import pandas as pd
import mlflow
import dagshub
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

dagshub.init(
    repo_owner='malhar.c.prajapati',
    repo_name='Stack-overflow-survey-2024-salary-prediction',
    mlflow=True
)

print("Loading pre-processed data...")
# Assuming 'final_dataset.csv' is in a 'data/processed' subdirectory relative to the script
df = pd.read_csv("final_dataset.csv", index_col=0)
X = df.drop(columns=['ConvertedCompYearly'])
y = df['ConvertedCompYearly'] 

if len(X) != len(y):
    raise ValueError(
        f"Mismatch in number of samples between processed features ({len(X)}) "
        f"and target variable ({len(y)}). Please regenerate processed.csv."
    )

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(f"Data loaded. Training on {len(X_train)} samples, testing on {len(X_test)} samples.")

models_and_params = {
    'xgb': (XGBRegressor(random_state=42, n_jobs=-1, eval_metric='mae'), {
        'n_estimators': [100, 200],
        'learning_rate': [0.05, 0.1],
        'max_depth': [3, 5]
    }),
    'rf': (RandomForestRegressor(random_state=42, n_jobs=-1), {
        'n_estimators': [100, 200],
        'max_depth': [10, 20],
        'min_samples_leaf': [2, 4]
    }),
    'lgbm': (LGBMRegressor(random_state=42, n_jobs=-1), {
        'n_estimators': [100, 200],
        'learning_rate': [0.05, 0.1],
        'num_leaves': [31, 60]
    }),
    'gbr': (GradientBoostingRegressor(random_state=42), {
        'n_estimators': [100, 200],
        'learning_rate': [0.05, 0.1],
        'max_depth': [3, 5]
    })
}

for model_name, (estimator, params) in models_and_params.items():
    
    full_pipeline = Pipeline([
        ('model', estimator)
    ])
    
    grid_search = GridSearchCV(
        estimator=estimator, 
        param_grid=params, 
        cv=3,
        scoring='neg_mean_squared_error',
        verbose=1, 
        n_jobs=-1
    )
    
    print(f"\n--- Starting GridSearchCV for {model_name} ---")
    
    with mlflow.start_run(run_name=f"GridSearch_{model_name}_on_processed_data") as run:
        mlflow.log_param('model_name', model_name)
        
        # Log which database feature was used in the model training
        mlflow.log_param("database_feature_name", "DatabaseHaveWorkedWith")
        
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_
        
        # The model predicts on the same scale as the training data.
        # Since 'y_train' appears to be the original salary, 'y_pred' will be too.
        y_pred = best_model.predict(X_test)
        
        metrics = {
            'mae': mean_absolute_error(y_test, y_pred),
            'rmse': np.sqrt(mean_squared_error(y_test, y_pred)),
            'r2': r2_score(y_test, y_pred)
        }
        
        print(f"--- Results for {model_name} ---")
        print(f"Best Params: {grid_search.best_params_}")
        print(f"Metrics: {metrics}")
        
        
        mlflow.log_params(grid_search.best_params_)
        mlflow.log_metrics(metrics)
        
        # Add an input_example to log the model signature and resolve the warning.
        # This helps MLflow understand the model's input schema.
        mlflow.sklearn.log_model(
            sk_model=best_model,
            artifact_path='model',
            registered_model_name=f"{model_name}_salary_predictor_processed",
            input_example=X_train.head()
        )

print("\n--- All model training experiments are complete. ---")


Loading pre-processed data...
Data loaded. Training on 18748 samples, testing on 4687 samples.

--- Starting GridSearchCV for xgb ---
Fitting 3 folds for each of 8 candidates, totalling 24 fits
--- Results for xgb ---
Best Params: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}
Metrics: {'mae': 24153.799660886354, 'rmse': 34384.392642434046, 'r2': 0.6463120334776091}


Registered model 'xgb_salary_predictor_processed' already exists. Creating a new version of this model...
2025/07/31 21:09:03 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: xgb_salary_predictor_processed, version 3
Created version '3' of model 'xgb_salary_predictor_processed'.


🏃 View run GridSearch_xgb_on_processed_data at: https://dagshub.com/malhar.c.prajapati/Stack-overflow-survey-2024-salary-prediction.mlflow/#/experiments/0/runs/5dc4d0250b454cc383772eb2951d6df2
🧪 View experiment at: https://dagshub.com/malhar.c.prajapati/Stack-overflow-survey-2024-salary-prediction.mlflow/#/experiments/0

--- Starting GridSearchCV for rf ---
Fitting 3 folds for each of 8 candidates, totalling 24 fits
--- Results for rf ---
Best Params: {'max_depth': 20, 'min_samples_leaf': 4, 'n_estimators': 200}
Metrics: {'mae': 25825.046018947633, 'rmse': 36229.57622105682, 'r2': 0.6073332956747615}


Registered model 'rf_salary_predictor_processed' already exists. Creating a new version of this model...
2025/07/31 21:12:19 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: rf_salary_predictor_processed, version 4
Created version '4' of model 'rf_salary_predictor_processed'.


🏃 View run GridSearch_rf_on_processed_data at: https://dagshub.com/malhar.c.prajapati/Stack-overflow-survey-2024-salary-prediction.mlflow/#/experiments/0/runs/dc37c107fd504905acf4bfca1043a97b
🧪 View experiment at: https://dagshub.com/malhar.c.prajapati/Stack-overflow-survey-2024-salary-prediction.mlflow/#/experiments/0

--- Starting GridSearchCV for lgbm ---
Fitting 3 folds for each of 8 candidates, totalling 24 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002133 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1888
[LightGBM] [Info] Number of data points in the train set: 18748, number of used features: 15
[LightGBM] [Info] Start training from score 77627.108065
--- Results for lgbm ---
Best Params: {'learning_rate': 0.05, 'n_estimators': 200, 'num_leaves': 31}
Metrics: {'mae': 24192.365091005893, 'rmse': 34522.2372346251, 'r2': 0.6434705292156202}


Registered model 'lgbm_salary_predictor_processed' already exists. Creating a new version of this model...
2025/07/31 21:12:48 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: lgbm_salary_predictor_processed, version 4
Created version '4' of model 'lgbm_salary_predictor_processed'.


🏃 View run GridSearch_lgbm_on_processed_data at: https://dagshub.com/malhar.c.prajapati/Stack-overflow-survey-2024-salary-prediction.mlflow/#/experiments/0/runs/046e9bfc7eba4583a798ab2b1f83ed3f
🧪 View experiment at: https://dagshub.com/malhar.c.prajapati/Stack-overflow-survey-2024-salary-prediction.mlflow/#/experiments/0

--- Starting GridSearchCV for gbr ---
Fitting 3 folds for each of 8 candidates, totalling 24 fits
--- Results for gbr ---
Best Params: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}
Metrics: {'mae': 24120.810023335376, 'rmse': 34380.76423639723, 'r2': 0.6463866752405156}


Registered model 'gbr_salary_predictor_processed' already exists. Creating a new version of this model...
2025/07/31 21:13:35 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: gbr_salary_predictor_processed, version 2
Created version '2' of model 'gbr_salary_predictor_processed'.


🏃 View run GridSearch_gbr_on_processed_data at: https://dagshub.com/malhar.c.prajapati/Stack-overflow-survey-2024-salary-prediction.mlflow/#/experiments/0/runs/92b60943c4834c549dc31c26d5b90f7d
🧪 View experiment at: https://dagshub.com/malhar.c.prajapati/Stack-overflow-survey-2024-salary-prediction.mlflow/#/experiments/0

--- All model training experiments are complete. ---


### Exp-2

In [None]:
import numpy as np
import pandas as pd
import mlflow
import dagshub
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from typing import Dict, Any, Tuple, List


DATA_PATH = "../data/processed/final_dataset.csv"
TARGET_COLUMN = 'ConvertedCompYearly'
TEST_SIZE = 0.2
RANDOM_STATE = 42


MODELS_AND_PARAMS = {
    'xgb': (XGBRegressor(random_state=RANDOM_STATE, n_jobs=-1, eval_metric='mae'), {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 5, 7],
        'subsample': [0.7, 1.0],
        'colsample_bytree': [0.7, 1.0]
    }),
    'rf': (RandomForestRegressor(random_state=RANDOM_STATE, n_jobs=-1), {
        'n_estimators': [100, 200],
        'max_depth': [10, 20, 30],
        'min_samples_leaf': [2, 4, 6],
        'max_features': ['sqrt', 'log2']
    }),
    'lgbm': (LGBMRegressor(random_state=RANDOM_STATE, n_jobs=-1), {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.05, 0.1],
        'num_leaves': [31, 60, 90],
        'reg_alpha': [0.1, 0.5],
        'reg_lambda': [0.1, 0.5]
    }),
    'gbr': (GradientBoostingRegressor(random_state=RANDOM_STATE), {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 5, 7],
        'subsample': [0.7, 1.0]
    })
}

def load_data(path: str) -> Tuple[pd.DataFrame, pd.Series]:
    """Loads the preprocessed dataset from a CSV file."""
    print(f"Loading pre-processed data from {path}...")
    try:
        df = pd.read_csv(path, index_col=0)
    except FileNotFoundError:
        print(f"Error: The file {path} was not found.")
        raise
        
    X = df.drop(columns=[TARGET_COLUMN])
    y = df[TARGET_COLUMN]

    if len(X) != len(y):
        raise ValueError(
            f"Mismatch in number of samples between features ({len(X)}) "
            f"and target ({len(y)})."
        )
    return X, y

def train_and_log_model(
    model_name: str, 
    estimator: Any, 
    params: Dict[str, Any], 
    X_train: pd.DataFrame, 
    y_train: pd.Series, 
    X_test: pd.DataFrame, 
    y_test: pd.Series
) -> None:
    """Performs GridSearchCV, logs results and artifacts to MLflow."""
    
    grid_search = GridSearchCV(
        estimator=estimator, 
        param_grid=params, 
        cv=3,
        scoring='neg_mean_squared_error',
        verbose=1, 
        n_jobs=-1
    )
    
    print(f"\n--- Starting GridSearchCV for {model_name} ---")
    
    with mlflow.start_run(run_name=f"GridSearch_{model_name}_final") as run:
        
        mlflow.set_tag("model_name", model_name)
        mlflow.log_param("cv_folds", 3)
        
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_
        
        y_pred = best_model.predict(X_test)
        
        
        metrics = {
            'mae': mean_absolute_error(y_test, y_pred),
            'rmse': np.sqrt(mean_squared_error(y_test, y_pred)),
            'r2': r2_score(y_test, y_pred)
        }
        
        print(f"--- Results for {model_name} ---")
        print(f"Best Params: {grid_search.best_params_}")
        print(f"Metrics: {metrics}")
        
        
        mlflow.log_params(grid_search.best_params_)
        mlflow.log_metrics(metrics)
        
        if hasattr(best_model, 'feature_importances_'):
            feature_imp = pd.DataFrame(sorted(zip(best_model.feature_importances_, X_train.columns)), columns=['Value','Feature'])
            plt.figure(figsize=(10, 6))
            sns.barplot(x="Value", y="Feature", data=feature_imp.sort_values(by="Value", ascending=False).head(15))
            plt.title(f'Feature Importance for {model_name}')
            plt.tight_layout()
            mlflow.log_figure(plt.gcf(), "feature_importance.png")
            plt.close()

        
        plt.figure(figsize=(10, 10))
        sns.scatterplot(x=y_test, y=y_pred)
        plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], '--r', linewidth=2)
        plt.xlabel('Actual Salary')
        plt.ylabel('Predicted Salary')
        plt.title('Actual vs. Predicted Salary')
        mlflow.log_figure(plt.gcf(), "actual_vs_predicted.png")
        plt.close()

        
        residuals = y_test - y_pred
        plt.figure(figsize=(10, 6))
        sns.histplot(residuals, kde=True)
        plt.xlabel('Residuals')
        plt.title('Distribution of Residuals')
        mlflow.log_figure(plt.gcf(), "residuals_distribution.png")
        plt.close()

        
        mlflow.sklearn.log_model(
            sk_model=best_model,
            artifact_path='model',
            registered_model_name=f"{model_name}_salary_predictor_final",
            input_example=X_train.head()
        )

def main():
    """Main function to run the training pipeline."""
    dagshub.init(
        repo_owner='malhar.c.prajapati',
        repo_name='Stack-overflow-survey-2024-salary-prediction',
        mlflow=True
    )
    
    X, y = load_data(DATA_PATH)
    print("Printing size of X and y",X.shape,y.shape)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
    )
    print(f"Data split. Training on {len(X_train)} samples, testing on {len(X_test)} samples.")

    for model_name, (estimator, params) in MODELS_AND_PARAMS.items():
        train_and_log_model(model_name, estimator, params, X_train, y_train, X_test, y_test)

    print("\n--- All model training experiments are complete. ---")

if __name__ == "__main__":
    main()


Loading pre-processed data from ../data/processed/final_dataset.csv...
Printing size of X and y (23435, 24) (23435,)
Data split. Training on 18748 samples, testing on 4687 samples.

--- All model training experiments are complete. ---
