In [1]:
!pip install catboost
!pip install category_encoders
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import category_encoders as ce
from sklearn.preprocessing import OrdinalEncoder,LabelEncoder
from sklearn.metrics import accuracy_score, f1_score,jaccard_score,multilabel_confusion_matrix,log_loss
import lightgbm as lgbm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
import pandas as pd
import numpy as np
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)



  import pandas.util.testing as tm


In [2]:
import pandas as pd

# Read the training table; the relative path is resolved against the
# notebook's working directory.
df = pd.read_csv(filepath_or_buffer='training.csv')

In [3]:

# Positional split of the raw table: column 1 is the target and every column
# from position 2 onward is a feature. Column 0 is used by neither
# (presumably a row identifier -- TODO confirm against training.csv).
y = df.iloc[:, 1]
X = df.iloc[:, 2:]

In [4]:
#label encoder to encode unseen values too
class LabelEncoderExt(object):
    """Label encoder that tolerates values it did not see during ``fit``.

    Wraps a ``LabelEncoder`` and registers an extra ``'Unknown'`` class at fit
    time. ``transform`` maps every value that is not one of the fitted classes
    to ``'Unknown'`` before delegating to the wrapped encoder.
    """

    def __init__(self):
        """Create the wrapped, not-yet-fitted ``LabelEncoder``."""
        self.label_encoder = LabelEncoder()

    def fit(self, data_list):
        """
        Fit the wrapped encoder on the given values plus the ``'Unknown'`` class.

        :param data_list: iterable of category values
        :return: self
        """
        self.label_encoder = self.label_encoder.fit(list(data_list) + ['Unknown'])
        # Expose the fitted classes on the wrapper as well.
        self.classes_ = self.label_encoder.classes_

        return self

    def transform(self, data_list):
        """
        Encode values to class ids, sending unseen values to ``'Unknown'``.

        :param data_list: iterable of category values (must be hashable)
        :return: whatever the wrapped encoder's ``transform`` returns for the
            sanitised values
        """
        # One membership pass over the data. This avoids rebuilding the whole
        # list once per unseen value, and avoids sorting the inputs (which
        # fails on object data that mixes strings with NaN).
        known_classes = set(self.label_encoder.classes_)
        new_data_list = [
            value if value in known_classes else 'Unknown'
            for value in data_list
        ]

        return self.label_encoder.transform(new_data_list)


In [5]:
#encodings and split

def onehot_all(X,y,ratio):
    """
    Split the data, then one-hot encode every object-dtype column.

    The encoder is fitted on the training portion only and applied to both
    portions, using the encoder's ``'return_nan'`` missing-value policy.

    :param X: feature frame
    :param y: target aligned with ``X``
    :param ratio: forwarded to ``train_test_split`` as ``test_size``
    :return: ``(X_train, X_test, y_train, y_test)`` with encoded feature frames
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=ratio, random_state=42
    )

    # Column selection is taken from the full frame's dtypes.
    categorical_columns = X.select_dtypes('object').columns
    encoder = ce.OneHotEncoder(cols=categorical_columns, handle_missing='return_nan')
    encoder = encoder.fit(X_train, y_train)

    encoded_train = encoder.transform(X_train)
    encoded_test = encoder.transform(X_test)
    return encoded_train, encoded_test, y_train, y_test

def target_all(X,y,ratio):
    """
    Split the data, then target-encode it with the encoder's default columns.

    No explicit column list is passed to ``ce.TargetEncoder``. The encoder is
    fitted on the training portion only and applied to both portions.

    :param X: feature frame
    :param y: target aligned with ``X``
    :param ratio: forwarded to ``train_test_split`` as ``test_size``
    :return: ``(X_train, X_test, y_train, y_test)`` with encoded feature frames
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=ratio, random_state=42
    )

    encoder = ce.TargetEncoder(handle_missing='return_nan')
    encoder = encoder.fit(X_train, y_train)

    encoded_train = encoder.transform(X_train)
    encoded_test = encoder.transform(X_test)
    return encoded_train, encoded_test, y_train, y_test
    
def onehot_target(X,y,ratio,thresh):
    """
    Split the data, then mix one-hot and target encoding by cardinality.

    Object-dtype columns with at most ``thresh`` distinct training values are
    one-hot encoded; the remaining object-dtype columns are target encoded.
    Both encoders are fitted on the training portion only. The two column
    lists are printed before encoding.

    :param X: feature frame
    :param y: target aligned with ``X``
    :param ratio: forwarded to ``train_test_split`` as ``test_size``
    :param thresh: largest cardinality that still gets one-hot encoding
    :return: ``(X_train, X_test, y_train, y_test)`` with encoded feature frames
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=ratio, random_state=42
    )

    # Cardinality is measured on the training portion only.
    low_card_cols, high_card_cols = [], []
    for col in X.select_dtypes('object').columns:
        bucket = low_card_cols if X_train[col].nunique() <= thresh else high_card_cols
        bucket.append(col)

    print(low_card_cols,high_card_cols)

    # One-hot first, then target encoding on the already one-hot-encoded frames.
    stages = ((ce.OneHotEncoder, low_card_cols), (ce.TargetEncoder, high_card_cols))
    for encoder_cls, columns in stages:
        encoder = encoder_cls(cols=columns, handle_missing='return_nan')
        encoder = encoder.fit(X_train, y_train)
        X_train = encoder.transform(X_train)
        X_test = encoder.transform(X_test)

    return X_train, X_test, y_train, y_test
            

In [6]:

def _fit_grid_and_evaluate(model, param_grid, X_fit, y_fit, X_eval, y_eval, **fit_kwargs):
    """Grid-search ``model`` over ``param_grid`` and print the best estimator's test metrics."""
    grid = GridSearchCV(estimator=model, param_grid=param_grid)
    grid.fit(X_fit, y_fit, **fit_kwargs)
    evaluate(grid.best_estimator_, X_eval, y_eval)


def train_evaluate( X_train,X_test,y_train, y_test,cat_features=None):
    """
    Grid-search four classifiers on already-encoded features and print metrics.

    LightGBM, CatBoost, XGBoost and a random forest are tuned in turn with
    ``GridSearchCV`` and the best estimator of each is scored with
    ``evaluate``. Every stage is best-effort: if it raises, the error is
    printed and the next stage still runs. ``KeyboardInterrupt`` is not
    intercepted, so a long run can be stopped.

    Reads the module-level grids ``parameters`` (LightGBM and CatBoost),
    ``parameters_xgb`` and ``parameters_RF``; they must be defined before the
    call. CatBoost and XGBoost are configured for GPU training.

    :param X_train: encoded training features
    :param X_test: encoded test features
    :param y_train: training target
    :param y_test: test target
    :param cat_features: accepted but not used by this function
    :return: None; metrics and failures are printed
    """
    # NOTE(review): ``parameters`` uses CatBoost-style names (``depth``,
    # ``iterations``); confirm LightGBM honours them, otherwise only
    # ``learning_rate`` is effectively searched for this model.
    try:
        print("lightgbm training with gridsearch")
        _fit_grid_and_evaluate(
            lgbm.LGBMClassifier(boosting_type='goss'), parameters,
            X_train, y_train, X_test, y_test, verbose=0,
        )
    except Exception as exc:
        print("lightgbm failed:", repr(exc))

    try:
        print("catboost training with gridsearch")
        _fit_grid_and_evaluate(
            CatBoostClassifier(task_type="GPU", devices='0:1'), parameters,
            X_train, y_train, X_test, y_test, verbose=0,
        )
    except Exception as exc:
        print("catboost failed:", repr(exc))

    try:
        print("xgboost training with gridsearch")
        _fit_grid_and_evaluate(
            XGBClassifier(tree_method='gpu_hist', gpu_id=0), parameters_xgb,
            X_train, y_train, X_test, y_test, verbose=0,
        )
    except Exception as exc:
        print("xgboost failed:", repr(exc))

    try:
        print("RF training with gridsearch")
        # Missing values are zero-filled for the random forest only.
        _fit_grid_and_evaluate(
            RandomForestClassifier(), parameters_RF,
            X_train.fillna(value=0), y_train, X_test.fillna(value=0), y_test,
        )
    except Exception as exc:
        print("RF failed:", repr(exc))
    
def train_evaluate_with_cat_feat( X_train,X_test,y_train, y_test,cat_features=None):
    """
    Grid-search LightGBM and CatBoost using their native categorical support.

    The columns in ``cat_features`` are label-encoded with ``LabelEncoderExt``
    (fitted on the training column) on private copies of the frames, so the
    caller's data is left untouched. Both models are trained on those encoded
    copies. Every stage is best-effort: if it raises, the error is printed and
    the next stage still runs.

    Reads the module-level grid ``parameters``; it must be defined before the
    call. CatBoost is configured for GPU training.

    :param X_train: training features
    :param X_test: test features
    :param y_train: training target
    :param y_test: test target
    :param cat_features: names of the categorical columns; ``None`` skips the
        label encoding and leaves LightGBM's ``categorical_feature`` at its
        default
    :return: None; metrics and failures are printed
    """
    # Copy so the label encoding below does not overwrite the caller's frames.
    X_train = X_train.copy()
    X_test = X_test.copy()
    columns_to_encode = [] if cat_features is None else cat_features

    try:
        print("lightgbm training with gridsearch")
        for col in columns_to_encode:
            le = LabelEncoderExt()
            le.fit(X_train[col])
            X_train[col] = le.transform(X_train[col])
            X_test[col] = le.transform(X_test[col])

        fit_params = {} if cat_features is None else {'categorical_feature': cat_features}
        model = lgbm.LGBMClassifier(boosting_type='goss')
        grid = GridSearchCV(estimator=model, param_grid=parameters)
        grid.fit(X_train, y_train, verbose=0, **fit_params)

        evaluate(grid.best_estimator_, X_test, y_test)
    except Exception as exc:
        print("lightgbm failed:", repr(exc))

    try:
        print("catboost training with gridsearch")
        fit_params = {'cat_features': cat_features}
        model = CatBoostClassifier(task_type="GPU", devices='0:1')
        grid = GridSearchCV(estimator=model, param_grid=parameters)
        grid.fit(X_train, y_train, verbose=0, **fit_params)

        evaluate(grid.best_estimator_, X_test, y_test)
    except Exception as exc:
        print("catboost failed:", repr(exc))

In [7]:
#evaluation function

def evaluate(model,X_test,y_test):
    """
    Print accuracy, macro/micro F1 and log-loss of a fitted classifier.

    :param model: fitted classifier exposing ``predict`` and ``predict_proba``
    :param X_test: test features
    :param y_test: true test labels
    :return: None; the metrics are printed
    """
    pred = model.predict(X_test)
    pred_proba = model.predict_proba(X_test)

    # Pass the model's own class list to log_loss when it has one, so the
    # probability columns are matched against it even if the test split is
    # missing a class. Falls back to log_loss's default label inference.
    class_labels = getattr(model, 'classes_', None)

    print('accuracy:',accuracy_score(y_test,pred))
    print('f1 macro:',f1_score(y_test,pred, average='macro'))
    print('f1_micro:',f1_score(y_test,pred, average='micro'))
    print('log_loss:',log_loss(y_test,pred_proba,labels=class_labels))


In [None]:
# X=df.iloc[:,:-1]
# y=df.iloc[:,-1:]
# X_train,X_test,y_train, y_test=onehot_all(df.iloc[:,:-1],y,.2)
# X_train,X_test,y_train, y_test=target_all(df.iloc[:,:-1],y,ratio=.2)
# X_train,X_test,y_train, y_test=onehot_target(df.iloc[:,:-1],y,ratio=.2,thresh=10)


# Search grids. They are read as module-level globals by train_evaluate and
# train_evaluate_with_cat_feat, so they must be defined before those calls.
# `parameters` is used for both LightGBM and CatBoost.
parameters = {
    'depth': [6, 8],
    'learning_rate': [.01, .05, .1, .2],
    'iterations': [500, 1000],
}
parameters_xgb = {
    'n_estimators': [500, 1000],
    'max_depth': [6, 8],
}
parameters_RF = {
    'max_depth': [6, 8],
    'n_estimators': [500, 1000],
}

# Run 1: one-hot encode every object column.
print("first with one hot for all")
X_train, X_test, y_train, y_test = onehot_all(X, y, .2)
train_evaluate(X_train, X_test, y_train, y_test)

# Run 2: target encoding.
print("now with target encoding")
X_train, X_test, y_train, y_test = target_all(X, y, ratio=.2)
train_evaluate(X_train, X_test, y_train, y_test)

# Run 3: one-hot up to 5 distinct values, target encoding above that.
print("now with mix encodings")
X_train, X_test, y_train, y_test = onehot_target(X, y, ratio=.2, thresh=5)
train_evaluate(X_train, X_test, y_train, y_test)

# Run 4: no pre-encoding; the object columns are handed over as categorical
# features.
print("now with native cat features support")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)
train_evaluate_with_cat_feat(
    X_train, X_test, y_train, y_test,
    cat_features=list(X_train.select_dtypes('object').columns),
)


first with one hot for all
lightgbm training with gridsearch
accuracy: 0.9045694320750839
f1 macro: 0.6658191851532467
f1_micro: 0.9045694320750839
log_loss: 0.29183143346439044
catboost training with gridsearch
accuracy: 0.9035418236623964
f1 macro: 0.6626310911995932
f1_micro: 0.9035418236623964
log_loss: 0.29006753211122477
xgboost training with gridsearch
