In [1]:
import numpy as np
import pandas as pd
import dirty_cat
from dirty_cat import datasets

# Load the employee-salaries dataset bundle; later cells read its `.X`
# (feature frame) and `.y` (target) attributes.
employee_salaries = datasets.fetch_employee_salaries()

In [2]:
# Feature table of the dataset; the target is pulled separately from `.y`.
data = employee_salaries.X
# Show which columns are available before choosing the modelling features.
data.columns

Index(['gender', 'department', 'department_name', 'division',
       'assignment_category', 'employee_position_title',
       'underfilled_job_title', 'date_first_hired', 'year_first_hired'],
      dtype='object')

In [3]:
# Regression target that the pipelines below are fitted against.
y = employee_salaries.y
# Print the target's name so the reader knows what is being predicted.
print(y.name)

current_annual_salary


In [4]:
# Restrict to the three modelling columns, then drop every row that has a
# missing value in any of them.
ml_df = (
    data
    .loc[:, ['year_first_hired', 'assignment_category', 'employee_position_title']]
    .dropna()
)

In [5]:
# Preview the first ten rows of the modelling frame.
ml_df.head(n=10)

Unnamed: 0,year_first_hired,assignment_category,employee_position_title
0,1986,Fulltime-Regular,Office Services Coordinator
1,1988,Fulltime-Regular,Master Police Officer
2,1989,Fulltime-Regular,Social Worker IV
3,2014,Fulltime-Regular,Resident Supervisor II
4,2007,Fulltime-Regular,Planning Specialist III
5,2007,Fulltime-Regular,Police Officer III
6,2016,Fulltime-Regular,Accountant/Auditor II
7,2014,Fulltime-Regular,Administrative Specialist II
8,2016,Fulltime-Regular,Firefighter/Rescuer III
9,2007,Fulltime-Regular,Police Aide


In [6]:
# Feature frame handed to the pipelines: the same three columns as ml_df,
# reordered so the job title comes first.
X = ml_df.loc[:, ['employee_position_title', 'year_first_hired', 'assignment_category']]

In [7]:
# Number of job-title entries available for modelling.
ml_df.loc[:, 'employee_position_title'].shape

(9228,)

In [8]:
# Occurrences of each distinct job title, most common first; the long tail of
# rare titles is what makes this column awkward to one-hot encode.
ml_df['employee_position_title'].value_counts(ascending=False)

Police Officer III                                    883
Firefighter/Rescuer III                               694
Bus Operator                                          638
Manager III                                           243
Correctional Officer III (Corporal)                   228
                                                     ... 
Survey Crew Leader                                      1
Animal Care Attendant Supervisor                        1
Director Office of Consumer Protection                  1
Chief Administrative Officer                            1
Supervisor Transportation Systems Technical Center      1
Name: employee_position_title, Length: 385, dtype: int64

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag of character trigrams over the job titles (analyzer='char', n-grams of
# exactly length 3).
cv = CountVectorizer(analyzer='char', ngram_range=(3, 3))
cv.fit(ml_df['employee_position_title'])

# One row per title, one column per trigram in the learned vocabulary.
cv.transform(ml_df['employee_position_title']).shape

(9228, 1264)

In [10]:
# Peek at ten of the character trigrams the vectorizer learned.
list(cv.vocabulary_.keys())[:10]

['off', 'ffi', 'fic', 'ice', 'ce ', 'e s', ' se', 'ser', 'erv', 'rvi']

In [11]:


# Similarity-encode the job titles against 200 prototype categories chosen by
# frequency; the result has one column per prototype.
mod = dirty_cat.SimilarityEncoder(categories='most_frequent', n_prototypes=200)
# Fit on the NaN-filtered ml_df (the same frame the CountVectorizer demo and
# the model features use) rather than on the raw `data` frame, so the shapes
# shown in these cells are directly comparable.
mod.fit_transform(ml_df[['employee_position_title']]).shape

  mod = dirty_cat.SimilarityEncoder(categories='most_frequent', n_prototypes=200)


(9228, 200)

In [12]:
# all in all
# Cross-validate the same Ridge pipeline once per job-title encoder and
# collect the CV summaries so the encoders can be compared side by side.

from sklearn import set_config

set_config(display="diagram")

from sklearn.pipeline import Pipeline, FeatureUnion
from sklego.preprocessing import ColumnSelector
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import Ridge

# X derives from ml_df, which went through dropna(); the target was never
# filtered. Restrict it to the surviving row labels so features and target
# stay row-aligned even when rows were dropped.
# NOTE(review): assumes y shares its index with the original feature frame
# (both come from the same dataset bundle) — confirm.
y_aligned = y.loc[X.index]

# Candidate encoders for the high-cardinality 'employee_position_title'
# column, keyed by the label that ends up in the results table.
method = {
    'sim_enc100': dirty_cat.SimilarityEncoder(categories='most_frequent', n_prototypes=100),
    'sim_enc300': dirty_cat.SimilarityEncoder(categories='most_frequent', n_prototypes=300),
    'sim_enc_all': dirty_cat.SimilarityEncoder(),
    'one-hot': OneHotEncoder(handle_unknown='ignore')
}

# One cv_results_ DataFrame per encoder; concatenated in a later cell.
results = []

for k, encoder in method.items():
    # Three parallel feature branches joined column-wise, followed by Ridge.
    pipe = Pipeline([
        ('split', FeatureUnion([
            # Job title: the encoder under evaluation.
            ('cat', Pipeline([
                ('grab', ColumnSelector(['employee_position_title'])),
                ('handle', encoder)
            ])),
            # Assignment category: always one-hot encoded.
            ('one-hot', Pipeline([
                ('grab', ColumnSelector('assignment_category')),
                ('handle', OneHotEncoder(handle_unknown='ignore'))
            ])),
            # Hiring year: standardised numeric feature.
            ('floats', Pipeline([
                ('grab', ColumnSelector('year_first_hired')),
                ('scale', StandardScaler())
            ])),
        ])),
        ('mod', Ridge())
    ])

    # The empty param_grid means a single candidate: GridSearchCV is used here
    # only as a 10-fold cross-validation harness that records both metrics.
    # With several scorers, `refit` must name the one used for the final refit.
    grid = GridSearchCV(pipe, cv=10, param_grid={}, scoring=['r2', 'neg_mean_absolute_error'], refit='r2', n_jobs=-1)
    res_df = pd.DataFrame(grid.fit(X, y_aligned).cv_results_)
    # Tag the rows with the encoder label so they stay identifiable after concat.
    res_df['key'] = k
    results.append(res_df)

  'sim_enc100': dirty_cat.SimilarityEncoder(categories='most_frequent', n_prototypes=100),
  'sim_enc300': dirty_cat.SimilarityEncoder(categories='most_frequent', n_prototypes=300),
  'sim_enc_all': dirty_cat.SimilarityEncoder(),


In [13]:
# Stack the per-encoder CV tables, keeping only the two mean test scores and
# the encoder label.
plt_df = pd.concat(results)[
    ['mean_test_neg_mean_absolute_error', 'mean_test_r2', 'key']
]
# Display the encoders ranked by mean R², best first.
plt_df.sort_values(by='mean_test_r2', ascending=False).reset_index()

Unnamed: 0,index,mean_test_neg_mean_absolute_error,mean_test_r2,key
0,0,-6314.401344,0.902066,sim_enc_all
1,0,-6564.292035,0.875512,sim_enc300
2,0,-6394.404808,0.861479,one-hot
3,0,-7765.907348,0.78957,sim_enc100
