# Salary Prediction from LinkedIn Job Postings - Evaluate Stats Model

In [1]:
import pandas as pd, numpy as np
import salary
from sklearn.base import BaseEstimator

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd, numpy as np


## Evaluate Mean & Median Salary Baselines Grouped by `norm_title`

In [2]:
# Load the train and test splits from the project's `salary` module;
# each call returns a pair that is unpacked into features (X) and target (y).
X_train, y_train = salary.get_train_dataset()
X_test, y_test = salary.get_test_dataset()

In [3]:
class MeanSalaryEstimator(BaseEstimator):
    """Baseline regressor that predicts the mean training salary of each ``norm_title``.

    Titles that have no entry in the fitted per-title table (unseen in training,
    or missing) fall back to the mean salary over all training rows.
    """

    def fit(self, X: pd.DataFrame, y):
        """Learn the per-title mean salary and the overall mean salary.

        Args:
            X: Training features; only the ``norm_title`` column is read.
            y: Salaries for the rows of ``X``. It is attached via DataFrame
                column assignment, so pandas' usual alignment rules apply.

        Returns:
            self, so that ``fit`` can be chained with ``predict``.
        """
        # Only `norm_title` is needed, so copy that single column instead of
        # duplicating the entire feature frame.
        Xy = X[['norm_title']].copy()
        Xy['salary'] = y
        self._salaries_by_title = Xy.groupby('norm_title')['salary'].mean()
        self._global_salary = Xy['salary'].mean()
        return self

    def predict(self, X):
        """Return one predicted salary per row of ``X`` as a NumPy array.

        Args:
            X: Features containing a ``norm_title`` column.

        Returns:
            Array with the fitted per-title mean for each row, or the overall
            training mean where the title is absent from the fitted table.
        """
        # Vectorised lookup: reindex fills only labels missing from the fitted
        # table, so a title that is present keeps its stored value untouched.
        return self._salaries_by_title.reindex(
            X['norm_title'], fill_value=self._global_salary
        ).to_numpy()

# Fit the mean-by-title baseline on the training split, predict on the test
# features, and hand the predictions to the project's evaluation helper.
model = MeanSalaryEstimator()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
salary.evaluate_test_predictions(y_pred)

Test R2: 0.1879
Test RMSE: 52536.1178
Test MAE: 36461.6068
On average, our predicted salaries are $36461.61 off the true salaries
This is 16.03% better than a naive global mean


In [None]:
class MedianSalaryEstimator(BaseEstimator):
    """Baseline regressor that predicts the median training salary of each ``norm_title``.

    Titles that have no entry in the fitted per-title table (unseen in training,
    or missing) fall back to the median salary over all training rows.
    """

    def fit(self, X: pd.DataFrame, y):
        """Learn the per-title median salary and the overall median salary.

        Args:
            X: Training features; only the ``norm_title`` column is read.
            y: Salaries for the rows of ``X``. It is attached via DataFrame
                column assignment, so pandas' usual alignment rules apply.

        Returns:
            self, so that ``fit`` can be chained with ``predict``.
        """
        # Only `norm_title` is needed, so copy that single column instead of
        # duplicating the entire feature frame.
        Xy = X[['norm_title']].copy()
        Xy['salary'] = y
        self._salaries_by_title = Xy.groupby('norm_title')['salary'].median()
        self._global_salary = Xy['salary'].median()
        return self

    def predict(self, X):
        """Return one predicted salary per row of ``X`` as a NumPy array.

        Args:
            X: Features containing a ``norm_title`` column.

        Returns:
            Array with the fitted per-title median for each row, or the overall
            training median where the title is absent from the fitted table.
        """
        # Vectorised lookup: reindex fills only labels missing from the fitted
        # table, so a title that is present keeps its stored value untouched.
        return self._salaries_by_title.reindex(
            X['norm_title'], fill_value=self._global_salary
        ).to_numpy()
    
# Fit the median-by-title baseline on the training split, predict on the test
# features, and hand the predictions to the project's evaluation helper.
model = MedianSalaryEstimator()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
salary.evaluate_test_predictions(y_pred)

Test R2: 0.1391
Test RMSE: 54090.8397
Test MAE: 34812.4539
On average, our predicted salaries are $34812.45 off the true salaries
This is 19.82% better than a naive global mean
