In [86]:
import pandas as pd
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns

In [115]:
df.head()

Unnamed: 0.1,Unnamed: 0,work_year,experience_level,employment_type,salary,salary_currency,salary_in_usd,employee_residence,company_location,experience_years,company_size_num,seniority,remote
0,183,2021,SE,FT,45000,GBP,61896,GB,GB,0,0.0,0.0,1
1,92,2021,MI,FT,1450000,INR,19609,IN,IN,0,0.0,4.0,1
2,317,2022,SE,FT,120160,USD,120160,US,US,0,0.0,0.0,1
3,600,2022,EN,FT,67000,USD,67000,CA,CA,0,0.0,0.0,0
4,454,2022,EN,FT,125000,USD,125000,US,US,0,0.0,0.0,0


In [126]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import RandomizedSearchCV
import logging

logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s')

def load_and_clean(path):
    logging.info(f"Loading data from {path}...")
    df = pd.read_parquet(path)
    df.columns = df.columns.str.strip()
    df = df.dropna()
    logging.info(f"Loaded {len(df)} rows with {len(df.columns)} columns.")
    return df

def feature_engineering(df):
    df = df.copy()
    if 'experience' in df.columns:
        df['experience_years'] = df['experience'].str.extract(r'(\d+)').astype(float).fillna(0)
    else:
        df['experience_years'] = 0

    size_map = {
        '1-10': 5, '11-50': 30, '51-200': 125, '201-500': 350,
        '501-1000': 750, '1001-5000': 3000, '5001-10,000': 7500, '10,001+': 15000
    }
    if 'company_size' in df.columns:
        df['company_size_num'] = df['company_size'].map(size_map).fillna(0)
    else:
        df['company_size_num'] = 0

    seniority_map = {
        'intern': 0.5, 'internship': 0.5, 'junior': 1, 'senior': 3,
        'lead': 4, 'manager': 5, 'director': 6, 'principal': 7
    }
    if 'job_title' in df.columns:
        df['seniority'] = 0
        for key, val in seniority_map.items():
            mask = df['job_title'].str.lower().str.contains(key, na=False)
            df.loc[mask, 'seniority'] = val
    else:
        df['seniority'] = 0

    if 'remote_ratio' in df.columns:
        df['remote'] = (df['remote_ratio'] > 0).astype(int)
    else:
        df['remote'] = 0

    return df

def drop_original_columns(df):
    to_drop = ['experience', 'company_size', 'job_title', 'remote_ratio']
    return df.drop(columns=[c for c in to_drop if c in df.columns])

def main():
    df_train = load_and_clean('train2.parquet')
    df_val = load_and_clean('validation2.parquet')

    logging.info("Applying feature engineering...")
    df_train = feature_engineering(df_train)
    df_train = drop_original_columns(df_train)
    df_val = feature_engineering(df_val)
    df_val = drop_original_columns(df_val)

    X_train = df_train.drop('salary_in_usd', axis=1)
    y_train = np.log1p(df_train['salary_in_usd'])

    X_val = df_val.drop('salary_in_usd', axis=1)
    y_val = np.log1p(df_val['salary_in_usd'])

    categorical_cols = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
    numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()

    logging.info(f"Categorical columns: {categorical_cols}")
    logging.info(f"Numerical columns: {numerical_cols}")

    categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse=False)
    numerical_transformer = Pipeline([
        ('impute', FunctionTransformer(lambda x: np.nan_to_num(x, nan=0))),
        ('scale', StandardScaler())
    ])

    preprocessor = ColumnTransformer(transformers=[
        ('cat', categorical_transformer, categorical_cols),
        ('num', numerical_transformer, numerical_cols)
    ])

    model = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(random_state=42, n_jobs=-1))
    ])

    param_dist = {
        'regressor__n_estimators': [100, 200, 300, 400],
        'regressor__max_depth': [None, 10, 20, 30, 40],
        'regressor__min_samples_split': [2, 5, 10],
        'regressor__min_samples_leaf': [1, 2, 4],
        'regressor__max_features': ['auto', 'sqrt', 'log2']
    }

    logging.info("Starting hyperparameter tuning with RandomizedSearchCV...")
    search = RandomizedSearchCV(model, param_distributions=param_dist,
                                n_iter=20, cv=5, scoring='neg_mean_absolute_error',
                                verbose=2, random_state=42, n_jobs=-1)

    search.fit(X_train, y_train)

    logging.info(f"Best parameters found: {search.best_params_}")
    best_model = search.best_estimator_

    logging.info("Evaluating best model on validation data...")
    y_pred_log = best_model.predict(X_val)
    y_pred = np.expm1(y_pred_log)
    y_val_actual = np.expm1(y_val)

    mae = mean_absolute_error(y_val_actual, y_pred)
    percentage_mae = (mae / np.mean(y_val_actual)) * 100

    logging.info(f"Validation MAE: ${mae:,.2f}")
    logging.info(f"Validation MAE Percentage: {percentage_mae:.2f}%")

if __name__ == "__main__":
    from sklearn.preprocessing import FunctionTransformer
    main()


2025-06-17 06:11:23,257 INFO Loading data from train2.parquet...
2025-06-17 06:11:23,281 INFO Loaded 477 rows with 12 columns.
2025-06-17 06:11:23,282 INFO Loading data from validation2.parquet...
2025-06-17 06:11:23,302 INFO Loaded 69 rows with 12 columns.
2025-06-17 06:11:23,302 INFO Applying feature engineering...
2025-06-17 06:11:23,326 INFO Categorical columns: ['experience_level', 'employment_type', 'salary_currency', 'employee_residence', 'company_location']
2025-06-17 06:11:23,326 INFO Numerical columns: ['Unnamed: 0', 'work_year', 'salary', 'experience_years', 'company_size_num', 'seniority']
2025-06-17 06:11:23,330 INFO Starting hyperparameter tuning with RandomizedSearchCV...


Fitting 5 folds for each of 20 candidates, totalling 100 fits


2025-06-17 06:11:38,113 INFO Best parameters found: {'regressor__n_estimators': 300, 'regressor__min_samples_split': 5, 'regressor__min_samples_leaf': 1, 'regressor__max_features': 'auto', 'regressor__max_depth': None}
2025-06-17 06:11:38,113 INFO Evaluating best model on validation data...
2025-06-17 06:11:38,166 INFO Validation MAE: $5,595.34
2025-06-17 06:11:38,167 INFO Validation MAE Percentage: 4.72%
