In [11]:
# !pip install catboost
# !pip install category_encoders
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import category_encoders as ce
from sklearn.preprocessing import OrdinalEncoder,LabelEncoder,OneHotEncoder
from sklearn.metrics import accuracy_score, f1_score,jaccard_score,multilabel_confusion_matrix,log_loss
import lightgbm as lgbm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.inspection import permutation_importance
import pandas as pd
import numpy as np
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

In [12]:
# Load the land-price table; the path is relative to the notebook's working directory.
df = pd.read_csv('../../land_prices.csv')

In [13]:
# Preview the first rows to check the column layout.
df.head()

Unnamed: 0,State,LandCategory,Region,Region or State,Year,Acre Value
0,Northeast,Farm Real Estate,Northeast,Region,1997,2240.0
1,Connecticut,Farm Real Estate,Northeast,State,1997,5950.0
2,Delaware,Farm Real Estate,Northeast,State,1997,2580.0
3,Maine,Farm Real Estate,Northeast,State,1997,1170.0
4,Maryland,Farm Real Estate,Northeast,State,1997,3150.0


In [14]:

# Discard incomplete rows, then replace the price (last column) by the label of
# the equal-width bin it falls into (two bins), stored as a string.
df.dropna(inplace=True)
df['Acre Value'] = pd.cut(df.iloc[:, -1], bins=2).astype(str)

In [15]:
# Features are every column except the last; the last column is the target.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
# Encode the target labels as integers. fit_transform returns a bare array, so
# the Series is given X's index explicitly: a fresh RangeIndex would stop
# matching X once dropna() has removed rows, and any step that aligns X and y
# by index label would then pair features with the wrong targets.
y = pd.Series(LabelEncoder().fit_transform(y), index=X.index)

In [16]:
#label encoder to encode unseen values too
class LabelEncoderExt(object):
    """
    LabelEncoder wrapper that also handles values not seen during ``fit``.

    An extra ``'Unknown'`` class is registered at fit time; at transform time every
    value that is not one of the fitted classes is mapped to that class.
    """

    def __init__(self):
        self.label_encoder = LabelEncoder()

    def fit(self, data_list):
        """
        Fit the wrapped encoder on all values of ``data_list`` plus ``'Unknown'``.

        :param data_list: iterable of labels (a list of strings)
        :return: self
        """
        self.label_encoder = self.label_encoder.fit(list(data_list) + ['Unknown'])
        # Mirror the fitted classes on the wrapper itself.
        self.classes_ = self.label_encoder.classes_

        return self

    def transform(self, data_list):
        """
        Convert ``data_list`` to class ids; values outside the fitted classes get
        the id of the ``'Unknown'`` class.

        :param data_list: iterable of labels
        :return: whatever the wrapped encoder's ``transform`` returns for the cleaned list
        """
        # A single pass with set membership replaces the earlier
        # "rebuild the whole list once per unseen unique value" loop and
        # no longer needs np.unique (which sorts and so fails on mixed types).
        known = set(self.label_encoder.classes_)
        new_data_list = [x if x in known else 'Unknown' for x in data_list]

        return self.label_encoder.transform(new_data_list)


In [17]:
#encodings and split

def onehot_all(X,y,ratio):
    """One-hot encode every object-dtype column of ``X`` after a train/test split.

    The split uses ``test_size=ratio`` with a fixed ``random_state=42``. The encoder
    is fitted on the training part only and then applied to both parts.

    Returns ``(X_train, X_test, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=ratio, random_state=42
    )

    categorical_columns = X.select_dtypes('object').columns
    encoder = ce.OneHotEncoder(cols=categorical_columns, handle_missing='return_nan')
    encoder = encoder.fit(X_train, y_train)

    X_train_encoded = encoder.transform(X_train)
    X_test_encoded = encoder.transform(X_test)
    return X_train_encoded, X_test_encoded, y_train, y_test

def target_all(X,y,ratio):
    """Target-encode ``X`` after a train/test split.

    The split uses ``test_size=ratio`` with a fixed ``random_state=42``. A
    ``ce.TargetEncoder`` (no explicit column list) is fitted on the training part
    against ``y_train`` and then applied to both parts.

    Returns ``(X_train, X_test, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=ratio, random_state=42
    )

    encoder = ce.TargetEncoder(handle_missing='return_nan')
    encoder = encoder.fit(X_train, y_train)

    X_train_encoded = encoder.transform(X_train)
    X_test_encoded = encoder.transform(X_test)
    return X_train_encoded, X_test_encoded, y_train, y_test
    
def onehot_target(X,y,ratio,thresh):
    """Mixed encoding: one-hot for low-cardinality columns, target encoding for the rest.

    After a train/test split (``test_size=ratio``, ``random_state=42``) every
    object-dtype column is classified by its number of distinct training values:
    at most ``thresh`` goes to the one-hot encoder, anything above goes to the
    target encoder. Both column lists are printed. Encoders are fitted on the
    training part only.

    Returns ``(X_train, X_test, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=ratio, random_state=42
    )

    # Decide once per column whether it counts as low-cardinality.
    object_columns = X.select_dtypes('object').columns
    is_low_card = {name: X_train[name].nunique() <= thresh for name in object_columns}
    low_card_cols = [name for name in object_columns if is_low_card[name]]
    high_card_cols = [name for name in object_columns if not is_low_card[name]]

    print(low_card_cols,high_card_cols)

    # Stage 1: one-hot encode the low-cardinality columns.
    onehot_encoder = ce.OneHotEncoder(cols=low_card_cols, handle_missing='return_nan')
    onehot_encoder = onehot_encoder.fit(X_train, y_train)
    X_train = onehot_encoder.transform(X_train)
    X_test = onehot_encoder.transform(X_test)

    # Stage 2: target-encode the high-cardinality columns of the stage-1 output.
    target_encoder = ce.TargetEncoder(cols=high_card_cols, handle_missing='return_nan')
    target_encoder = target_encoder.fit(X_train, y_train)
    X_train = target_encoder.transform(X_train)
    X_test = target_encoder.transform(X_test)

    return X_train, X_test, y_train, y_test

def target_encode_multiclass(X,y,ratio):
    """Target-encode the object columns of ``X`` once per class of ``y``.

    ``y`` is cast to ``str`` and split together with ``X`` (``test_size=ratio``,
    ``random_state=42``). The training labels are one-hot encoded, and for every
    resulting indicator column a ``ce.TargetEncoder`` is fitted on the object
    columns of the training features against that indicator. The encoded columns
    are named ``<column>_<indicator column>`` and appended to the non-object
    columns, which pass through unchanged.

    Returns ``(X_train, X_test, y_train, y_test)`` with the labels cast to ``int``.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y.astype(str), test_size=ratio, random_state=42)

    # Indicator columns are learned from the training labels only. The test
    # labels were previously one-hot encoded as well, but that result was never used.
    label_encoder = ce.OneHotEncoder().fit(y_train)
    y_train_onehot = label_encoder.transform(y_train)
    class_names = y_train_onehot.columns

    X_train_obj = X_train.select_dtypes('object')
    X_test_obj = X_test.select_dtypes('object')

    # Collect the pieces and concatenate once instead of growing the frames
    # inside the loop; the resulting column order is unchanged.
    train_parts = [X_train.select_dtypes(exclude='object')]
    test_parts = [X_test.select_dtypes(exclude='object')]

    for class_ in class_names:
        enc = ce.TargetEncoder(handle_missing='return_nan').fit(X_train_obj, y_train_onehot[class_])
        for parts, frame in ((train_parts, X_train_obj), (test_parts, X_test_obj)):
            encoded = enc.transform(frame)
            encoded.columns = [str(x) + '_' + str(class_) for x in encoded.columns]
            parts.append(encoded)

    X_train = pd.concat(train_parts, axis=1)
    X_test = pd.concat(test_parts, axis=1)

    return X_train, X_test, y_train.astype(int), y_test.astype(int)

def onehot_target_encode_multiclass(X,y,ratio,thresh):
    """One-hot encode low-cardinality columns, per-class target-encode the rest.

    ``y`` is cast to ``str`` and split together with ``X`` (``test_size=ratio``,
    ``random_state=42``). Object columns with at most ``thresh`` distinct training
    values are one-hot encoded. The training labels are then one-hot encoded, and
    for every resulting indicator column a ``ce.TargetEncoder`` is fitted on the
    object columns that remain after the one-hot step. The encoded columns are
    named ``<column>_<indicator column>`` and appended to the non-object columns.

    Returns ``(X_train, X_test, y_train, y_test)`` with the labels cast to ``int``.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y.astype(str), test_size=ratio, random_state=42)

    low_card_cols, high_card_cols = [], []
    for col in X.select_dtypes('object').columns:
        if X_train[col].nunique() <= thresh:
            low_card_cols.append(col)
        else:
            high_card_cols.append(col)

    enc = ce.OneHotEncoder(cols=low_card_cols, handle_missing='return_nan').fit(X_train, y_train)
    X_train = enc.transform(X_train)
    X_test = enc.transform(X_test)

    # Indicator columns are learned from the training labels only. The test
    # labels were previously one-hot encoded as well, but that result was never used.
    label_encoder = ce.OneHotEncoder().fit(y_train)
    y_train_onehot = label_encoder.transform(y_train)
    class_names = y_train_onehot.columns

    X_train_obj = X_train.select_dtypes('object')
    X_test_obj = X_test.select_dtypes('object')

    # Collect the pieces and concatenate once instead of growing the frames
    # inside the loop; the resulting column order is unchanged.
    train_parts = [X_train.select_dtypes(exclude='object')]
    test_parts = [X_test.select_dtypes(exclude='object')]

    for class_ in class_names:
        enc = ce.TargetEncoder(handle_missing='return_nan').fit(X_train_obj, y_train_onehot[class_])
        for parts, frame in ((train_parts, X_train_obj), (test_parts, X_test_obj)):
            encoded = enc.transform(frame)
            encoded.columns = [str(x) + '_' + str(class_) for x in encoded.columns]
            parts.append(encoded)

    X_train = pd.concat(train_parts, axis=1)
    X_test = pd.concat(test_parts, axis=1)

    return X_train, X_test, y_train.astype(int), y_test.astype(int)

In [18]:

def train_evaluate( X_train,X_test,y_train, y_test,encoding):
    """Grid-search four classifiers on one encoded split and record their test metrics.

    LightGBM, CatBoost, XGBoost and RandomForest are tuned in turn with
    ``GridSearchCV``; each best estimator is passed to ``evaluate`` together with
    ``encoding``. The search grids are read from the module-level ``parameters``,
    ``parameters_xgb`` and ``parameters_RF`` dictionaries, which must be defined
    before this function is called.

    A LightGBM failure propagates to the caller. The other three models are
    best-effort: a failure is printed and the remaining models still run.
    RandomForest is trained and scored on copies with missing values filled by 0.
    """
    def search_and_score(model, param_grid, train_X, test_X, **fit_params):
        # Shared stanza: tune, pick the best estimator, score it on the test split.
        grid = GridSearchCV(estimator=model, param_grid=param_grid)
        grid.fit(train_X, y_train, **fit_params)
        evaluate(grid.best_estimator_, test_X, y_test, encoding)

    print("lightgbm training with gridsearch")
    search_and_score(lgbm.LGBMClassifier(boosting_type='goss'), parameters,
                     X_train, X_test, verbose=0)

    # Catch Exception (not a bare except) so Ctrl-C still interrupts the run,
    # and report the failure instead of silently dropping the model.
    try:
        print("catboost training with gridsearch")
        search_and_score(CatBoostClassifier(task_type="GPU", devices='0:1'), parameters,
                         X_train, X_test, verbose=0)
    except Exception as exc:
        print("catboost training failed:", repr(exc))

    try:
        print("xgboost training with gridsearch")
        search_and_score(XGBClassifier(tree_method='gpu_hist', gpu_id=0), parameters_xgb,
                         X_train, X_test, verbose=0)
    except Exception as exc:
        print("xgboost training failed:", repr(exc))

    try:
        print("RF training with gridsearch")
        search_and_score(RandomForestClassifier(), parameters_RF,
                         X_train.fillna(value=0), X_test.fillna(value=0))
    except Exception as exc:
        print("RF training failed:", repr(exc))
    
def train_evaluate_with_cat_feat( X_train,X_test,y_train, y_test,cat_features=None,encoding='native'):
    """Grid-search LightGBM and CatBoost using their categorical-feature support.

    The columns named in ``cat_features`` are label-encoded with
    ``LabelEncoderExt`` (fitted on the training column) and declared categorical
    to both models. Each best estimator is passed to ``evaluate`` together with
    ``encoding``. The search grid is read from the module-level ``parameters``.

    :param cat_features: names of the categorical columns; ``None`` means no
        column is encoded and no categorical hint is passed to LightGBM
    :param encoding: label recorded with the results (new, defaults to ``'native'``;
        the body previously read an undefined ``encoding`` name)

    Both models are best-effort: a failure is printed and execution continues.
    """
    cat_cols = None if cat_features is None else list(cat_features)

    # Encode copies so the caller's frames are not overwritten.
    X_train = X_train.copy()
    X_test = X_test.copy()

    try:
        print("lightgbm training with gridsearch")
        for col in (cat_cols or []):
            le = LabelEncoderExt()
            le.fit(X_train[col])
            X_train[col] = le.transform(X_train[col])
            X_test[col] = le.transform(X_test[col])

        fit_params = {} if cat_cols is None else {'categorical_feature': cat_cols}
        model = lgbm.LGBMClassifier(boosting_type='goss')
        grid = GridSearchCV(estimator=model, param_grid=parameters)
        grid.fit(X_train, y_train, verbose=0, **fit_params)

        evaluate(grid.best_estimator_, X_test, y_test, encoding)
    except Exception as exc:
        # Report instead of silently swallowing; KeyboardInterrupt still propagates.
        print("lightgbm training failed:", repr(exc))

    try:
        print("catboost training with gridsearch")
        fit_params = {'cat_features': cat_cols}
        model = CatBoostClassifier(
            task_type="GPU", devices='0:1'
        )
        grid = GridSearchCV(estimator=model, param_grid=parameters)
        grid.fit(X_train, y_train, verbose=0, **fit_params)

        evaluate(grid.best_estimator_, X_test, y_test, encoding)
    except Exception as exc:
        print("catboost training failed:", repr(exc))

In [22]:
#evaluation function

# def evaluate(model,X_test,y_test):
    
#     pred=model.predict(X_test)
#     pred_proba=model.predict_proba(X_test)
#     # print(pd.Series(pred).value_counts())
#     print('accuracy:',accuracy_score(y_test,pred))
#     print('f1 macro:',f1_score(y_test,pred, average='macro'))
#     # print('f1_micro:',f1_score(y_test,pred, average='micro'))
# #     print(pd.Series(pred).unique())
#     print('log_loss:',log_loss(y_test,pred_proba,labels=[i for i in range(10)]))
def evaluate(model,X_test,y_test,encoding):
    """Score a fitted classifier on the test split and record the outcome.

    Computes accuracy, macro F1, log loss (labels hard-coded to ``[0, 1]``) and
    the mean permutation importance of every column of ``X_test``, prints the
    three metrics, and adds one row to the module-level ``results`` DataFrame,
    which must exist before the first call.

    :param model: fitted estimator exposing ``predict`` and ``predict_proba``
    :param X_test: test features (its ``columns`` are stored with the result)
    :param y_test: true test labels
    :param encoding: label stored in the ``encoding`` column of the result row
    """
    global results
    pred = model.predict(X_test)
    pred_proba = model.predict_proba(X_test)

    acc = accuracy_score(y_test, pred)
    f1 = f1_score(y_test, pred, average='macro')
    loss = log_loss(y_test, pred_proba, labels=[0, 1])
    cols = list(X_test.columns)
    imp = list(permutation_importance(model, X_test, y_test)['importances_mean'])

    print('accuracy:',acc)
    print('f1 macro:',f1)
    print('log_loss:',loss)

    result_this = pd.Series({'encoding':encoding,'model':model, 'accuracy':acc, 'f1':f1, 'loss':loss, 'cols':cols, 'importance':imp})
    # DataFrame.append was removed in pandas 2.0; concatenating a one-row frame
    # works on old and new versions alike.
    results = pd.concat([results, result_this.to_frame().T], ignore_index=True)


In [None]:
# X=df.iloc[:,:-1]
# y=df.iloc[:,-1:]
# X_train,X_test,y_train, y_test=onehot_all(df.iloc[:,:-1],y,.2)
# X_train,X_test,y_train, y_test=target_all(df.iloc[:,:-1],y,ratio=.2)
# X_train,X_test,y_train, y_test=onehot_target(df.iloc[:,:-1],y,ratio=.2,thresh=10)


# parameters = {'depth'         : [6,8,10],
#               'learning_rate' : [.01,.05,.1,.2],
#               'n_estimators': [500,1000],
#               # 'reg_lambda': [0,0.5,1.0]
#               # 'iterations'    : [500,1000,2000]
#               }
# Search grids read as globals by train_evaluate / train_evaluate_with_cat_feat.
# They are kept minimal here; the XGBoost and RandomForest grids are empty,
# so GridSearchCV fits those models with their default settings only.
parameters = {'iterations': [50]}
# Options tried earlier for XGBoost: n_estimators, max_features, max_depth, criterion.
parameters_xgb = {}
# Options tried earlier for RandomForest: max_depth [6,8,10] with n_estimators [500,1000],
# or max_depth [2] with n_estimators [10].
parameters_RF = {}

# One row per evaluated model; rows are added by evaluate().
results = pd.DataFrame(
    columns=['encoding', 'model', 'accuracy', 'f1', 'loss', 'cols', 'importance']
)

print("##############################")
print("now with target for all")
X_train, X_test, y_train, y_test = target_encode_multiclass(X, y, ratio=.3)
train_evaluate(X_train, X_test, y_train, y_test, encoding='target')

print("##############################")
print("now with mixed for all")
X_train, X_test, y_train, y_test = onehot_target_encode_multiclass(X, y, ratio=.2, thresh=8)
train_evaluate(X_train, X_test, y_train, y_test, encoding='mixed')

print("##############################")
print("now with one hot for all")
X_train, X_test, y_train, y_test = onehot_all(X, y, ratio=.2)
train_evaluate(X_train, X_test, y_train, y_test, encoding='onehot')

print("##############################")
print("now with native cat features support")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)
train_evaluate_with_cat_feat(
    X_train, X_test, y_train, y_test,
    cat_features=list(X_train.select_dtypes('object').columns),
    encoding='native',
)


##############################
now with target for all
lightgbm training with gridsearch
accuracy: 0.9922928709055877
f1 macro: 0.9453684210526316
log_loss: 0.03762231754588973
catboost training with gridsearch
accuracy: 0.9913294797687862
f1 macro: 0.9393073159956602
log_loss: 0.033443797770118154
xgboost training with gridsearch
accuracy: 0.9913294797687862
f1 macro: 0.9407845946134392
log_loss: 0.03427115390787866
RF training with gridsearch
accuracy: 0.9922928709055877
f1 macro: 0.9453684210526316
log_loss: 0.07878869399311096
##############################
now with mixed for all
lightgbm training with gridsearch
accuracy: 0.9884393063583815
f1 macro: 0.8970238095238094
log_loss: 0.04934726402698487
catboost training with gridsearch




In [None]:
# Stand-alone run: fit one default LightGBM classifier on the one-hot encoded split.
# Alternatives previously listed here as comments:
#   CatBoostClassifier(task_type="GPU", devices='0:1'), RandomForestClassifier(),
#   XGBClassifier(tree_method='gpu_hist', gpu_id=0), plus a LabelEncoderExt loop
#   over the object columns and an eval_set=(X_test, y_test) fit argument.
X_train, X_test, y_train, y_test = onehot_all(X, y, ratio=.2)

model = lgbm.LGBMClassifier()
model.fit(X_train, y_train, verbose=0)

In [None]:
# Replace every value of `res` by its string form, in place.
# NOTE(review): `res` is not defined in any cell visible here — presumably a
# dict-like result record left over from an earlier session; confirm before
# relying on this cell after a kernel restart.
for key in res:
    res[key]=str(res[key])

In [None]:
# DataFrame.append was removed in pandas 2.0; add `res` as a one-row frame instead.
results = pd.concat([results, pd.DataFrame([res])], ignore_index=True)

In [None]:
# Display the accumulated results table.
results