In [2]:
single_opt = ['EdLevel','DevType','Country','AISelect']
multi_opt = ['Employment','CodingActivities','LanguageHaveWorkedWith','DatabaseHaveWorkedWith',
             'PlatformHaveWorkedWith','WebframeHaveWorkedWith','MiscTechHaveWorkedWith']
number = ['YearsCode','YearsCodePro','WorkExp']

In [3]:
import joblib
import pandas as pd

def map_years(arr):
    dfy = pd.DataFrame(arr, columns=number)
    dfy = dfy.replace({'Less than 1 year': 0.5, 'More than 50 years': 50})
    return dfy.astype(float)

def split_semi(X):
    return X.iloc[:, 0].fillna('').str.split(';')

def tokenize_list(tokens):
    return tokens

def identity(x):
    return x

preprocessor = joblib.load("../models/preprocessor.pkl") 

In [4]:
df = pd.read_csv("../data/processed/salary_only.csv")

In [5]:
df['CompTotal']

0        2040000.0
1          28000.0
2          85000.0
3          50000.0
4         110000.0
           ...    
33735      36000.0
33736      40000.0
33737      61000.0
33738      58000.0
33739      55000.0
Name: CompTotal, Length: 33740, dtype: float64

In [6]:
from sklearn.model_selection import train_test_split

X = preprocessor.transform(df.drop(columns=['CompTotal']))
Y = df['CompTotal']

x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.2)

In [7]:
x_train

<26992x446 sparse matrix of type '<class 'numpy.float64'>'
	with 685129 stored elements in Compressed Sparse Row format>

In [8]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()

lr.fit(x_train,y_train)

In [9]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# your column lists
single_opt = ['EdLevel','DevType','Country','AISelect']
multi_opt  = [
    'Employment','CodingActivities','LanguageHaveWorkedWith',
    'DatabaseHaveWorkedWith','PlatformHaveWorkedWith',
    'WebframeHaveWorkedWith','MiscTechHaveWorkedWith'
]
number     = ['YearsCode','YearsCodePro','WorkExp']

# transformer functions
def map_years(X):
    return X.replace({'Less than 1 year':0.5, 'More than 50 years':50}).astype(float)

def split_semi(X):
    # X is a Series
    return X.fillna('').str.split(';')

# build ColumnTransformer
preprocessor = ColumnTransformer([
    # numeric years → map → impute → scale
    ('num', Pipeline([
        ('map',   FunctionTransformer(map_years, validate=False)),
        ('imp',   SimpleImputer(strategy='median')),
        ('scale', StandardScaler())
    ]), number),

    # single‐choice → impute → one‑hot
    ('cat', Pipeline([
        ('imp', SimpleImputer(strategy='constant', fill_value='Missing')),
        ('ohe', OneHotEncoder(handle_unknown='ignore'))
    ]), single_opt),

    # multi‐choice → split → vectorize
    *[
      (col, Pipeline([
           ('split', FunctionTransformer(lambda df, c=col: split_semi(df[c]), validate=False)),
           ('vec',   CountVectorizer(tokenizer=lambda x: x,
                                     preprocessor=lambda x: x,
                                     token_pattern=None,
                                     binary=True))
       ]), [col])
      for col in multi_opt
    ]
], remainder='drop')

# full end‑to‑end pipeline
pipeline = Pipeline([
    ('pre', preprocessor),
    ('rf',  RandomForestRegressor(n_estimators=100, random_state=42))
])

# load & prepare
df = pd.read_csv("../data/raw/survey_results_public.csv")
y  = pd.to_numeric(df['CompTotal'], errors='coerce').fillna(0)

# split, train, evaluate
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2, random_state=42)
pipeline.fit(X_train, y_train)
preds = pipeline.predict(X_test)

print("Test RMSE:", mean_squared_error(y_test, preds))


KeyboardInterrupt: 

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

# Custom Transformer for multi-select categorical features
class MultiSelectTransformer(BaseEstimator, TransformerMixin):
    """
    Custom transformer for multi-select categorical features.
    It splits the string by a delimiter and then applies a multi-hot encoding.
    """
    def __init__(self, delimiter=';'):
        self.delimiter = delimiter
        self.categories_ = []

    def fit(self, X, y=None):
        # Find all unique categories
        all_categories = set()
        for _, row in X.items():
            if pd.notna(row):
                all_categories.update(cat.strip() for cat in row.split(self.delimiter))
        self.categories_ = sorted(list(all_categories))
        return self

    def transform(self, X, y=None):
        # Create the multi-hot encoded matrix
        num_samples = len(X)
        num_categories = len(self.categories_)
        encoded_matrix = np.zeros((num_samples, num_categories))

        for i, row in enumerate(X):
            if pd.notna(row):
                for j, cat in enumerate(self.categories_):
                    if cat in [c.strip() for c in row.split(self.delimiter)]:
                        encoded_matrix[i, j] = 1
        return encoded_matrix

# Select features for the model
# These would be chosen based on the EDA and domain knowledge
features = [
    'Country', 'EdLevel', 'YearsCodePro', 'DevType',
    'LanguageHaveWorkedWith', 'WebframeHaveWorkedWith',
    'DatabaseHaveWorkedWith', 'PlatformHaveWorkedWith'
]
target = 'ConvertedCompYearly'

# Drop rows where target is missing
df_model = df.dropna(subset=[target] + features).copy()

# Split the data
X = df_model[features]
y = df_model[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Define which columns go to which transformer
numerical_features = ['YearsCodePro']
categorical_features = ['Country', 'EdLevel', 'DevType']
multi_select_features = [
    'LanguageHaveWorkedWith', 'WebframeHaveWorkedWith',
    'DatabaseHaveWorkedWith', 'PlatformHaveWorkedWith'
]

# Create the preprocessing pipelines for each feature type
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

multi_select_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='')),
    ('multi_hot', MultiSelectTransformer())
])

# Create the master preprocessor with ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
        ('multi', multi_select_transformer, multi_select_features[0]), # Example for one multi-select, expand as needed
        # You can add more multi-select features here
    ],
    remainder='passthrough'
)

In [5]:
df['OpSysPersonal use']

0                                            MacOS;Windows
1                                Other Linux-based;Windows
2                                                    MacOS
3                                                    MacOS
4                                            MacOS;Windows
                               ...                        
33735                                                  NaN
33736                                          iOS;Windows
33737                                                 Arch
33738                                                MacOS
33739    Cygwin;Debian;iOS;iPadOS;MacOS;Ubuntu;Windows;...
Name: OpSysPersonal use, Length: 33740, dtype: object

In [10]:
for i in df.columns:
    print(i,end='\t')

ResponseId	MainBranch	Age	Employment	RemoteWork	Check	CodingActivities	EdLevel	LearnCode	LearnCodeOnline	TechDoc	YearsCode	YearsCodePro	DevType	OrgSize	PurchaseInfluence	BuyNewTool	BuildvsBuy	TechEndorse	Country	Currency	CompTotal	LanguageHaveWorkedWith	LanguageWantToWorkWith	LanguageAdmired	DatabaseHaveWorkedWith	DatabaseWantToWorkWith	DatabaseAdmired	PlatformHaveWorkedWith	PlatformWantToWorkWith	PlatformAdmired	WebframeHaveWorkedWith	WebframeWantToWorkWith	WebframeAdmired	EmbeddedHaveWorkedWith	EmbeddedWantToWorkWith	EmbeddedAdmired	MiscTechHaveWorkedWith	MiscTechWantToWorkWith	MiscTechAdmired	ToolsTechHaveWorkedWith	ToolsTechWantToWorkWith	ToolsTechAdmired	NEWCollabToolsHaveWorkedWith	NEWCollabToolsWantToWorkWith	NEWCollabToolsAdmired	OpSysPersonal use	OpSysProfessional use	OfficeStackAsyncHaveWorkedWith	OfficeStackAsyncWantToWorkWith	OfficeStackAsyncAdmired	OfficeStackSyncHaveWorkedWith	OfficeStackSyncWantToWorkWith	OfficeStackSyncAdmired	AISearchDevHaveWorkedWith	AISearchDevWantTo

In [1]:




import os
import pandas as pd
import numpy as np
import mlflow
import dagshub
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer, MultiLabelBinarizer
from sklearn.compose import ColumnTransformer
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator, TransformerMixin
import seaborn as sns

# df = pd.read_csv("../data/processed/salary_only.csv")

print("Initializing DagsHub and MLflow...")

DAGSHUB_REPO_OWNER = os.getenv("DAGSHUB_REPO_OWNER", "malhar.c.prajapati")
DAGSHUB_REPO_NAME = os.getenv("DAGSHUB_REPO_NAME", "Stack-overflow-survey-2024-salary-prediction")

dagshub.init(repo_owner=DAGSHUB_REPO_OWNER, repo_name=DAGSHUB_REPO_NAME, mlflow=True)
mlflow.set_experiment("Salary Prediction Experiments")


print("Loading data...")


try:
    df = pd.read_csv('../data/raw/survey_results_public.csv')
    print("Data loaded successfully.")
    print(f"Dataset shape: {df.shape}")
except FileNotFoundError:
    print("Error: 'survey_results_public.csv' not found. Please place it in the same directory.")
    exit()


print("Defining features and preprocessing pipeline...")



FEATURE_COLUMNS = [
    'Country', 'EdLevel', 'YearsCodePro', 'MainBranch', 'RemoteWork', 'Age',
    'LanguageHaveWorkedWith', 'DatabaseHaveWorkedWith', 'WebframeHaveWorkedWith',
    'OpSysPersonal use' 
]
TARGET_COLUMN = 'ConvertedCompYearly'



df_model = df.dropna(subset=[TARGET_COLUMN]).copy()

df_model.dropna(subset=FEATURE_COLUMNS, how='all', inplace=True)


df_model['YearsCodePro'] = pd.to_numeric(df_model['YearsCodePro'], errors='coerce')


df_model[TARGET_COLUMN] = np.log1p(df_model[TARGET_COLUMN])

X = df_model[FEATURE_COLUMNS]
y = df_model[TARGET_COLUMN]


class MultiSelectBinarizer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.mlb = None
        self.classes_ = None

    def fit(self, X, y=None):
        series = X.fillna('').astype(str).apply(lambda x: x.split(';'))
        self.mlb = MultiLabelBinarizer(sparse_output=False)
        self.mlb.fit(series)
        self.classes_ = self.mlb.classes_
        return self

    def transform(self, X, y=None):
        series = X.fillna('').astype(str).apply(lambda x: x.split(';'))
        encoded_matrix = self.mlb.transform(series)
        df = pd.DataFrame(encoded_matrix, columns=self.classes_, index=X.index)
        if '' in df.columns:
            df = df.drop(columns=[''])
        return df



numerical_features = ['YearsCodePro']
categorical_features = ['Country', 'EdLevel', 'MainBranch', 'RemoteWork', 'OpSysPersonal use', 'Age']
multi_select_features = ['LanguageHaveWorkedWith', 'DatabaseHaveWorkedWith', 'WebframeHaveWorkedWith']


numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])


preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features),
        
        ('lang', MultiSelectBinarizer(), 'LanguageHaveWorkedWith'),
        ('db', MultiSelectBinarizer(), 'DatabaseHaveWorkedWith'),
        ('web', MultiSelectBinarizer(), 'WebframeHaveWorkedWith')
    ],
    remainder='drop', 
    n_jobs=-1 
)


print("Splitting data and starting model training...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


with mlflow.start_run(run_name="LGBM Regressor Run") as run:
    
    mlflow.log_param("model_class", "LGBMRegressor")

    
    lgbm_params = {
        'n_estimators': 250,
        'learning_rate': 0.05,
        'num_leaves': 31,
        'max_depth': -1,
        'random_state': 42,
        'n_jobs': -1,
        'colsample_bytree': 0.8
    }
    mlflow.log_params(lgbm_params)

    
    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', LGBMRegressor(**lgbm_params))
    ])

    
    print("Fitting the model pipeline...")
    model.fit(X_train, y_train)
    print("Model training complete.")

    
    y_pred = model.predict(X_test)

    
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"Evaluation Metrics (on log-transformed salary):")
    print(f"  RMSE: {rmse:.4f}")
    print(f"  MAE: {mae:.4f}")
    print(f"  R2 Score: {r2:.4f}")

    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("r2", r2)

    
    try:
        print("Generating and logging feature importance plot...")
        
        ohe_feature_names = list(model.named_steps['preprocessor'].named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(categorical_features))
        lang_feature_names = list(model.named_steps['preprocessor'].named_transformers_['lang'].classes_)
        db_feature_names = list(model.named_steps['preprocessor'].named_transformers_['db'].classes_)
        web_feature_names = list(model.named_steps['preprocessor'].named_transformers_['web'].classes_)

        
        feature_names = numerical_features + ohe_feature_names + lang_feature_names + db_feature_names + web_feature_names
        
        
        feature_names = [f for f in feature_names if f != '']

        importances = model.named_steps['regressor'].feature_importances_

        
        if len(feature_names) == len(importances):
            feature_importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances})
            feature_importance_df = feature_importance_df.sort_values('importance', ascending=False).head(25)

            plt.figure(figsize=(12, 8))
            sns.barplot(x='importance', y='feature', data=feature_importance_df)
            plt.title('Top 25 Feature Importances')
            plt.tight_layout()
            
            
            plot_path = "feature_importance.png"
            plt.savefig(plot_path)
            plt.close()

            
            mlflow.log_artifact(plot_path, "plots")
            print("Feature importance plot logged.")
        else:
            print(f"Warning: Mismatch in feature names ({len(feature_names)}) and importances ({len(importances)}). Skipping plot.")

    except Exception as e:
        print(f"Could not generate feature importance plot: {e}")


    
    print("Logging model to MLflow...")
    mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="salary-predictor-model",
        registered_model_name="stackoverflow-salary-predictor" 
    )
    print("Model logged successfully.")

print(f"\n✅ Training script finished. View the run in DagsHub/MLflow: {run.info.artifact_uri}")


Initializing DagsHub and MLflow...


Loading data...
Data loaded successfully.
Dataset shape: (65437, 114)
Defining features and preprocessing pipeline...
Splitting data and starting model training...
Fitting the model pipeline...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003598 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 655
[LightGBM] [Info] Number of data points in the train set: 18748, number of used features: 304
[LightGBM] [Info] Start training from score 10.786639
Model training complete.




Evaluation Metrics (on log-transformed salary):
  RMSE: 1.0486
  MAE: 0.5866
  R2 Score: 0.4479
Generating and logging feature importance plot...
Feature importance plot logged.
Logging model to MLflow...


Registered model 'stackoverflow-salary-predictor' already exists. Creating a new version of this model...
2025/07/27 14:04:59 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: stackoverflow-salary-predictor, version 2
Created version '2' of model 'stackoverflow-salary-predictor'.


Model logged successfully.
🏃 View run LGBM Regressor Run at: https://dagshub.com/malhar.c.prajapati/Stack-overflow-survey-2024-salary-prediction.mlflow/#/experiments/0/runs/3199f3dbfe614ad59f363c7eecd51379
🧪 View experiment at: https://dagshub.com/malhar.c.prajapati/Stack-overflow-survey-2024-salary-prediction.mlflow/#/experiments/0

✅ Training script finished. View the run in DagsHub/MLflow: mlflow-artifacts:/b66ac9a49b554a539f4bb427870fb357/3199f3dbfe614ad59f363c7eecd51379/artifacts
