In [1]:
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from category_encoders import *
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import accuracy_score, f1_score
import lightgbm as lgbm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

### encode 

In [4]:
# Encoder class names that work() benchmarks, in the order they are run.
# encode_categories() looks each name up to build the encoder.
#
# Currently disabled (add a name to the list below to include it):
#   BaseNEncoder, BinaryEncoder, HashingEncoder, HelmertEncoder,
#   JamesSteinEncoder, LeaveOneOutEncoder, MEstimateEncoder, OneHotEncoder,
#   OrdinalEncoder, SumEncoder, PolynomialEncoder, WOEEncoder
encoder_types = [
    'BackwardDifferenceEncoder',
    'CatBoostEncoder',
    'TargetEncoder',
]

def encode_categories(X,y='',encoder_type='',data_type='train',encoder_obj='NULL'):
    """Fit-and-apply (train mode) or only apply (any other mode) a categorical encoder.

    Parameters
    ----------
    X
        Features to encode; handed to the encoder unchanged.
    y
        Target passed to ``fit_transform``. Only used when ``data_type == 'train'``.
    encoder_type : str
        Name of an encoder class available in this module's global namespace
        (for example ``'TargetEncoder'``). Only used in train mode.
    data_type : str
        ``'train'`` creates a new encoder and fits it on ``X``/``y``; any other
        value reuses ``encoder_obj`` and only calls ``transform``.
    encoder_obj
        A previously fitted encoder. Ignored in train mode.

    Returns
    -------
    tuple
        ``(encoder_obj, X_encoded)`` - the encoder that was used and the
        encoded features.

    Raises
    ------
    ValueError
        If ``encoder_type`` does not name a class with ``fit_transform`` in
        train mode, or if ``encoder_obj`` has no ``transform`` method otherwise.
    """
    if data_type=='train':
        # Resolve the class by name rather than eval(): the lookup uses the
        # same namespace, but an arbitrary expression can no longer be executed
        # and a typo produces a readable error instead of a bare NameError.
        encoder_cls = globals().get(encoder_type) if isinstance(encoder_type, str) else None
        if not (isinstance(encoder_cls, type) and hasattr(encoder_cls, 'fit_transform')):
            raise ValueError(
                "encoder_type must be the name of an imported encoder class, got %r"
                % (encoder_type,)
            )
        encoder_obj = encoder_cls()
        X = encoder_obj.fit_transform(X, y)
    else:
        # The 'NULL' placeholder default (or any non-encoder) cannot transform.
        if not hasattr(encoder_obj, 'transform'):
            raise ValueError(
                "a fitted encoder must be passed as encoder_obj when data_type != 'train'"
            )
        X = encoder_obj.transform(X)
    return encoder_obj, X

# encoder,X_train=encode_categories(X_train,y_train,'TargetEncoder','train','')
# encoder,X_test=encode_categories(X_test,'','TargetEncoder','test',encoder)
    

### train 

In [3]:
def train_models(X,y,random_state=None):
    """Train the four benchmark classifiers on the same encoded features.

    Parameters
    ----------
    X
        Encoded feature matrix, passed unchanged to every model's ``fit``.
    y
        Class labels (ndarray, list or pandas Series). CatBoost receives them
        as a column via ``np.asarray(y).reshape(-1, 1)``; the other models
        receive ``y`` exactly as given.
    random_state : int, optional
        Seed forwarded to every model so a run can be repeated. When ``None``
        (the default) no seed argument is passed and each library keeps its
        own default behaviour.

    Returns
    -------
    tuple
        ``(model_cat, model_xgb, model_gbm, model_rf)`` - the fitted CatBoost,
        XGBoost, LightGBM and random-forest classifiers, in that order.
    """
    # Only forward a seed when one was requested, so the default call builds
    # the models with the same constructor arguments as before.
    seed_kwargs = {} if random_state is None else {'random_state': random_state}
    cat_seed_kwargs = {} if random_state is None else {'random_seed': random_state}

    #catboost (GPU options task_type="GPU", devices='0:1' are left disabled)
    print(" training cat")
    model_cat = CatBoostClassifier(iterations=50, **cat_seed_kwargs)
    # np.asarray lets y be a list or pandas Series, which have no ndarray.reshape.
    model_cat.fit(X, np.asarray(y).reshape(-1,1), verbose=0)

    #xgboost (GPU options tree_method='gpu_hist', gpu_id=0 are left disabled)
    print(" training xgb")
    model_xgb = XGBClassifier(**seed_kwargs)
    model_xgb.fit(X, y)

    #lightgbm
    print(" training lgbm")
    model_gbm = lgbm.LGBMClassifier(**seed_kwargs)
    model_gbm.fit(X, y)

    #RF
    print(' training RF')
    model_rf = RandomForestClassifier(**seed_kwargs)
    model_rf.fit(X, y)

    return model_cat,model_xgb,model_gbm,model_rf
    
# model_cat,model_xgb,model_gbm,model_rf=train_models(X_train,y_train)

### evaluate 

In [4]:
# evaluate predictions
def test_models(model_cat,model_xgb,model_gbm,model_rf,X,y):
    
    #catboost
    accuracy = accuracy_score(y, model_cat.predict(X))
    f1=f1_score(y, model_cat.predict(X),average='micro')
    print(" cat:" ,accuracy * 100.0, f1*100.0)

    #xgboost
    accuracy = accuracy_score(y, model_xgb.predict(X))
    f1=f1_score(y, model_xgb.predict(X),average='micro')
    print(" xgb: ",accuracy * 100.0, f1*100.0)

    #lightgbm
    accuracy = accuracy_score(y, model_gbm.predict(X))
    f1=f1_score(y, model_gbm.predict(X),average='micro')
    print(" lgbm: ",accuracy * 100.0, f1*100.0)
    
    #random forest
    accuracy = accuracy_score(y, model_rf.predict(X))
    f1=f1_score(y, model_rf.predict(X),average='micro')
    print(" rf: ",accuracy * 100.0, f1*100.0)

    
# test_models(model_cat,model_xgb,model_gbm,model_rf,X_test,y_test)
            

### Run every encoder against all four models and report performance

In [5]:
def work(X,y):
    """Benchmark every encoder in ``encoder_types`` on one train/test split.

    The data is split once (33% held out, ``random_state=42``). For each
    encoder name the training features are fit-encoded, the held-out features
    are encoded with that fitted encoder, the four models are trained, and
    their scores are printed. Nothing is returned.
    """
    split = train_test_split(X, y, test_size=0.33, random_state=42)
    X_train, X_test, y_train, y_test = split

    for encoding in encoder_types:
        print(encoding)

        # Fit on the training part only, then reuse the fitted encoder for the test part.
        fitted_encoder, train_features = encode_categories(
            X=X_train, y=y_train, encoder_type=encoding, data_type='train'
        )
        fitted_encoder, test_features = encode_categories(
            X=X_test, data_type='test', encoder_obj=fitted_encoder
        )

        cat_model, xgb_model, gbm_model, rf_model = train_models(train_features, y_train)

        test_models(cat_model, xgb_model, gbm_model, rf_model, test_features, y_test)


### get df 

In [6]:
df=pd.read_csv('income.csv')

# Keep complete rows with strictly positive income (np.log below needs income > 0).
# .copy() makes the filtered frame independent, so the column assignments that
# follow write to it directly instead of to a slice of the previous frame
# (this is what triggers pandas' SettingWithCopyWarning).
df=df.dropna()
df=df[df['Income in EUR']>0].copy()
# Shorten every profession to its first three characters
# (presumably to merge near-duplicate job titles - confirm intent).
df['Profession']=df.Profession.map(lambda x:x[:3])
# Turn income into a class target: 5 equal-width bins over log-income,
# stored as the interval labels' string form.
df['Income in EUR']=pd.cut(np.log(df['Income in EUR'].values.ravel()),bins=5)
df['Income in EUR']=df['Income in EUR'].astype(str)

X=df[['Year of Record','Gender','Age','Country','Profession','University Degree']]
y=df['Income in EUR']

# Map the interval-label strings to numeric class ids for the classifiers.
encoder_of_target = OrdinalEncoder()
y=encoder_of_target.fit_transform(np.array(y).reshape(-1,1)).ravel()

work(X,y)

BackwardDifferenceEncoder
 training cat
 training xgb
 training lgbm
 training RF
 cat: 83.05147305549964 83.05147305549964
 xgb:  83.74941279108785 83.74941279108785
 lgbm:  81.81330112073016 81.81330112073016
 rf:  80.83014562781021 80.83014562781021
CatBoostEncoder
 training cat
 training xgb
 training lgbm
 training RF
 cat: 81.07845111066372 81.07845111066372
 xgb:  81.45090933494397 81.45090933494397
 lgbm:  80.55835178847057 80.55835178847057
 rf:  80.65901617341117 80.65901617341117
TargetEncoder
 training cat
 training xgb
 training lgbm
 training RF
 cat: 83.35682168981947 83.35682168981947
 xgb:  84.00107375343937 84.00107375343937
 lgbm:  83.1286490839541 83.12864908395412
 rf:  81.55492919938258 81.55492919938257


In [7]:
# Abalone: the file has no header row; the last column is the target and
# every other column is a feature.
df = pd.read_csv('abalone.data', header=None)
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# Encode the target values as numeric class ids.
encoder_of_target = OrdinalEncoder()
target_column = np.array(y).reshape(-1, 1)
y = encoder_of_target.fit_transform(target_column).ravel()

work(X, y)

BackwardDifferenceEncoder
 training cat
 training xgb
 training lgbm
 training RF
 cat: 26.54097171863669 26.54097171863669
 xgb:  24.437998549673676 24.437998549673676
 lgbm:  25.308194343727337 25.308194343727337
 rf:  25.380710659898476 25.380710659898476
CatBoostEncoder
 training cat
 training xgb
 training lgbm
 training RF
 cat: 26.32342277012328 26.32342277012328
 xgb:  23.930384336475708 23.930384336475704
 lgbm:  24.147933284989122 24.147933284989122
 rf:  25.598259608411894 25.598259608411894
TargetEncoder
 training cat
 training xgb
 training lgbm
 training RF
 cat: 26.46845540246556 26.46845540246556
 xgb:  24.437998549673676 24.437998549673676
 lgbm:  25.308194343727337 25.308194343727337
 rf:  25.308194343727337 25.308194343727337


In [8]:
# Adult: the file has no header row; the last column is the target and
# every other column is a feature.
df = pd.read_csv('adult.data', header=None)
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# Encode the target values as numeric class ids.
encoder_of_target = OrdinalEncoder()
target_column = np.array(y).reshape(-1, 1)
y = encoder_of_target.fit_transform(target_column).ravel()

work(X, y)

BackwardDifferenceEncoder
 training cat
 training xgb
 training lgbm
 training RF
 cat: 86.91606179043364 86.91606179043364
 xgb:  86.95328494323469 86.95328494323469
 lgbm:  87.40926856504746 87.40926856504746
 rf:  85.38991252559092 85.38991252559092
CatBoostEncoder
 training cat
 training xgb
 training lgbm
 training RF
 cat: 87.19523543644146 87.19523543644146
 xgb:  86.86022706123208 86.86022706123208
 lgbm:  87.1859296482412 87.1859296482412
 rf:  86.0785408524102 86.0785408524102
TargetEncoder
 training cat
 training xgb
 training lgbm
 training RF
 cat: 87.51163223525033 87.51163223525033
 xgb:  86.80439233203052 86.80439233203052
 lgbm:  87.42788014144797 87.42788014144797
 rf:  85.47366461939326 85.47366461939326


In [2]:
# Reload the adult data with the raw (un-encoded) last column as y.
df = pd.read_csv('adult.data', header=None)
X, y = df.iloc[:, :-1], df.iloc[:, -1]


In [20]:
# Fit a HelmertEncoder on column 8 alone (with the full, unsplit y as target)
# so the columns it produces can be inspected in the cells below.
encoder,X_train=encode_categories(X[[8]],y,'HelmertEncoder','train','')

In [24]:
# Frequency of each category in column 8 (the column encoded above).
X[8].value_counts()

 White                 27816
 Black                  3124
 Asian-Pac-Islander     1039
 Amer-Indian-Eskimo      311
 Other                   271
Name: 8, dtype: int64

In [21]:
# Display the encoded frame returned by encode_categories.
X_train

Unnamed: 0,intercept,8_0,8_1,8_2,8_3
0,1,-1.0,-1.0,-1.0,-1.0
1,1,-1.0,-1.0,-1.0,-1.0
2,1,-1.0,-1.0,-1.0,-1.0
3,1,1.0,-1.0,-1.0,-1.0
4,1,1.0,-1.0,-1.0,-1.0
...,...,...,...,...,...
32556,1,-1.0,-1.0,-1.0,-1.0
32557,1,-1.0,-1.0,-1.0,-1.0
32558,1,-1.0,-1.0,-1.0,-1.0
32559,1,-1.0,-1.0,-1.0,-1.0


### To do: 
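F1 score for multiclass: `average='micro'` gives the same value as accuracy in the runs above, so report a macro- or weighted-averaged F1 instead <br>
Tune each model to find its best hyperparameters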