# Evaluate Stats Model

In [1]:
import pandas as pd, numpy as np
import salary
from sklearn.base import BaseEstimator

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd, numpy as np
  from .autonotebook import tqdm as notebook_tqdm


## Evaluate Mean & Median Models

In [2]:
# Load the train and test splits through the project's `salary` helper module.
X_train, y_train = salary.get_train_dataset()
X_test, y_test = salary.get_test_dataset()

In [3]:
class MeanSalaryByTitleEstimator(BaseEstimator):
    """Baseline regressor that predicts the mean training salary of each title.

    Titles that were not present in the data passed to ``fit`` fall back to
    the mean salary of the whole training set.
    """

    def fit(self, X: pd.DataFrame, y):
        """Learn the per-title mean salary and the overall mean salary.

        Parameters
        ----------
        X : pd.DataFrame
            Training features; only the ``'title'`` column is read.
        y : array-like
            Salaries, one per row of ``X``, in the same row order.

        Returns
        -------
        self
            The fitted estimator.
        """
        # Pair titles and salaries by position. Assigning a pandas Series into
        # a copy of X aligns on index labels instead, which silently produces
        # NaN salaries whenever y's index differs from X's index.
        training = pd.DataFrame({
            'title': X['title'].to_numpy(),
            'salary': np.asarray(y).ravel(),
        })
        self._salaries_by_title = training.groupby('title')['salary'].mean()
        self._global_salary = training['salary'].mean()
        return self

    def predict(self, X):
        """Predict a salary for every row of ``X``.

        Parameters
        ----------
        X : pd.DataFrame
            Features; only the ``'title'`` column is read.

        Returns
        -------
        np.ndarray
            One float per row: the fitted per-title mean when the title was
            seen during ``fit``, otherwise the overall training mean.
        """
        by_title = self._salaries_by_title
        # get_indexer returns -1 for every title missing from the fitted table,
        # so a single vectorised lookup replaces one `.loc` call per row.
        positions = by_title.index.get_indexer(X['title'].to_numpy())
        known = positions != -1

        preds = np.full(len(positions), self._global_salary, dtype=float)
        preds[known] = by_title.to_numpy(dtype=float)[positions[known]]
        return preds

# Fit the per-title mean baseline on the training split, then evaluate its
# predictions for the test split.
model = MeanSalaryByTitleEstimator()
model.fit(X_train, y_train)  # fit() returns self, so `model` is the fitted estimator
y_pred = model.predict(X_test)
mean_title_result = salary.evaluate_test_predictions(y_pred)

Test size: 10000
Test R2: 0.2231
Test RMSE: 52820.8142
Test MAE: 34806.1266


In [4]:
class MedianSalaryByTitleEstimator(BaseEstimator):
    """Baseline regressor that predicts the median training salary of each title.

    Titles that were not present in the data passed to ``fit`` fall back to
    the median salary of the whole training set.
    """

    def fit(self, X: pd.DataFrame, y):
        """Learn the per-title median salary and the overall median salary.

        Parameters
        ----------
        X : pd.DataFrame
            Training features; only the ``'title'`` column is read.
        y : array-like
            Salaries, one per row of ``X``, in the same row order.

        Returns
        -------
        self
            The fitted estimator.
        """
        # Pair titles and salaries by position. Assigning a pandas Series into
        # a copy of X aligns on index labels instead, which silently produces
        # NaN salaries whenever y's index differs from X's index.
        training = pd.DataFrame({
            'title': X['title'].to_numpy(),
            'salary': np.asarray(y).ravel(),
        })
        self._salaries_by_title = training.groupby('title')['salary'].median()
        self._global_salary = training['salary'].median()
        return self

    def predict(self, X):
        """Predict a salary for every row of ``X``.

        Parameters
        ----------
        X : pd.DataFrame
            Features; only the ``'title'`` column is read.

        Returns
        -------
        np.ndarray
            One float per row: the fitted per-title median when the title was
            seen during ``fit``, otherwise the overall training median.
        """
        by_title = self._salaries_by_title
        # get_indexer returns -1 for every title missing from the fitted table,
        # so a single vectorised lookup replaces one `.loc` call per row.
        positions = by_title.index.get_indexer(X['title'].to_numpy())
        known = positions != -1

        preds = np.full(len(positions), self._global_salary, dtype=float)
        preds[known] = by_title.to_numpy(dtype=float)[positions[known]]
        return preds
    
# Fit the per-title median baseline on the training split, then evaluate its
# predictions for the test split.
model = MedianSalaryByTitleEstimator()
model.fit(X_train, y_train)  # fit() returns self, so `model` is the fitted estimator
y_pred = model.predict(X_test)
median_title_result = salary.evaluate_test_predictions(y_pred)

Test size: 10000
Test R2: 0.1763
Test RMSE: 54388.2055
Test MAE: 34139.0367


In [5]:
class MeanSalaryByTitleLocationEstimator(BaseEstimator):
    """Baseline regressor keyed on the (title, location) pair.

    Predicts the mean training salary of each (title, location) combination.
    Combinations that were not present in the data passed to ``fit`` fall back
    to the prediction of a ``MeanSalaryByTitleEstimator`` fitted on the same
    data.
    """

    def __init__(self):
        # Fallback model for (title, location) pairs missing from the training data.
        self._title_estimator = MeanSalaryByTitleEstimator()

    def fit(self, X: pd.DataFrame, y):
        """Learn the mean salary per (title, location) and fit the title fallback.

        Parameters
        ----------
        X : pd.DataFrame
            Training features; the ``'title'`` and ``'location'`` columns are
            read here, and ``X`` is also passed to the fallback estimator.
        y : array-like
            Salaries, one per row of ``X``, in the same row order.

        Returns
        -------
        self
            The fitted estimator.
        """
        # Pair keys and salaries by position. Assigning a pandas Series into a
        # copy of X aligns on index labels instead, which silently produces
        # NaN salaries whenever y's index differs from X's index.
        training = pd.DataFrame({
            'title': X['title'].to_numpy(),
            'location': X['location'].to_numpy(),
            'salary': np.asarray(y).ravel(),
        })
        self._salaries_by_title_location = (
            training.groupby(['title', 'location'])['salary'].mean()
        )
        self._title_estimator = self._title_estimator.fit(X, y)
        return self

    def predict(self, X):
        """Predict a salary for every row of ``X``.

        Parameters
        ----------
        X : pd.DataFrame
            Features; the ``'title'`` and ``'location'`` columns are read here,
            and ``X`` is also passed to the fallback estimator.

        Returns
        -------
        np.ndarray
            One float per row: the fitted (title, location) mean when that
            pair was seen during ``fit``, otherwise the title-only prediction.
        """
        by_pair = self._salaries_by_title_location

        # Start from the title-only fallback and overwrite the rows whose
        # (title, location) pair exists in the fitted table.
        preds = np.array(self._title_estimator.predict(X), dtype=float)

        pairs = pd.MultiIndex.from_arrays(
            [X['title'].to_numpy(), X['location'].to_numpy()]
        )
        # get_indexer returns -1 for every pair missing from the fitted table,
        # so one vectorised lookup replaces the per-row iterrows()/.loc loop.
        positions = by_pair.index.get_indexer(pairs)
        known = positions != -1
        preds[known] = by_pair.to_numpy(dtype=float)[positions[known]]
        return preds

# Fit the (title, location) mean baseline on the training split, then evaluate
# its predictions for the test split.
model = MeanSalaryByTitleLocationEstimator()
model.fit(X_train, y_train)  # fit() returns self, so `model` is the fitted estimator
y_pred = model.predict(X_test)
mean_title_location_result = salary.evaluate_test_predictions(y_pred)

Test size: 10000
Test R2: 0.2217
Test RMSE: 52870.6352
Test MAE: 34788.8125


In [6]:
class MedianSalaryByTitleLocationEstimator(BaseEstimator):
    """Baseline regressor keyed on the (title, location) pair.

    Predicts the median training salary of each (title, location) combination.
    Combinations that were not present in the data passed to ``fit`` fall back
    to the prediction of a ``MedianSalaryByTitleEstimator`` fitted on the same
    data.
    """

    def __init__(self):
        # Fallback model for (title, location) pairs missing from the training data.
        self._title_estimator = MedianSalaryByTitleEstimator()

    def fit(self, X: pd.DataFrame, y):
        """Learn the median salary per (title, location) and fit the title fallback.

        Parameters
        ----------
        X : pd.DataFrame
            Training features; the ``'title'`` and ``'location'`` columns are
            read here, and ``X`` is also passed to the fallback estimator.
        y : array-like
            Salaries, one per row of ``X``, in the same row order.

        Returns
        -------
        self
            The fitted estimator.
        """
        # Pair keys and salaries by position. Assigning a pandas Series into a
        # copy of X aligns on index labels instead, which silently produces
        # NaN salaries whenever y's index differs from X's index.
        training = pd.DataFrame({
            'title': X['title'].to_numpy(),
            'location': X['location'].to_numpy(),
            'salary': np.asarray(y).ravel(),
        })
        self._salaries_by_title_location = (
            training.groupby(['title', 'location'])['salary'].median()
        )
        self._title_estimator = self._title_estimator.fit(X, y)
        return self

    def predict(self, X):
        """Predict a salary for every row of ``X``.

        Parameters
        ----------
        X : pd.DataFrame
            Features; the ``'title'`` and ``'location'`` columns are read here,
            and ``X`` is also passed to the fallback estimator.

        Returns
        -------
        np.ndarray
            One float per row: the fitted (title, location) median when that
            pair was seen during ``fit``, otherwise the title-only prediction.
        """
        by_pair = self._salaries_by_title_location

        # Start from the title-only fallback and overwrite the rows whose
        # (title, location) pair exists in the fitted table.
        preds = np.array(self._title_estimator.predict(X), dtype=float)

        pairs = pd.MultiIndex.from_arrays(
            [X['title'].to_numpy(), X['location'].to_numpy()]
        )
        # get_indexer returns -1 for every pair missing from the fitted table,
        # so one vectorised lookup replaces the per-row iterrows()/.loc loop.
        positions = by_pair.index.get_indexer(pairs)
        known = positions != -1
        preds[known] = by_pair.to_numpy(dtype=float)[positions[known]]
        return preds

# Fit the (title, location) median baseline on the training split, then
# evaluate its predictions for the test split.
model = MedianSalaryByTitleLocationEstimator()
model.fit(X_train, y_train)  # fit() returns self, so `model` is the fitted estimator
y_pred = model.predict(X_test)
median_title_location_result = salary.evaluate_test_predictions(y_pred)

Test size: 10000
Test R2: 0.1761
Test RMSE: 54395.1415
Test MAE: 34113.4747
