In [1]:
!pip install catboost
!pip install category_encoders
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import category_encoders as ce
from sklearn.preprocessing import OrdinalEncoder,LabelEncoder
from sklearn.metrics import accuracy_score, f1_score,jaccard_score,multilabel_confusion_matrix,log_loss
import lightgbm as lgbm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
import pandas as pd
import numpy as np
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)



  import pandas.util.testing as tm


In [2]:
import pandas as pd
# The CMC data file has no header row, so it is read with header=None and
# pandas labels the columns with integers.
CMC_DATA_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/cmc/cmc.data'
df = pd.read_csv(CMC_DATA_URL, header=None)

Attribute information (the numbers are the 0-based column positions in `df`, which is read with `header=None`):

0. Wife's age (numerical)
1. Wife's education (categorical) 1=low, 2, 3, 4=high
2. Husband's education (categorical) 1=low, 2, 3, 4=high
3. Number of children ever born (numerical)
4. Wife's religion (binary) 0=Non-Islam, 1=Islam
5. Wife's now working? (binary) 0=Yes, 1=No
6. Husband's occupation (categorical) 1, 2, 3, 4
7. Standard-of-living index (categorical) 1=low, 2, 3, 4=high
8. Media exposure (binary) 0=Good, 1=Not good
9. Contraceptive method used (class attribute) 1=No-use, 2=Long-term, 3=Short-term



In [3]:
# Cast the categorical attributes to strings so they carry the `object` dtype
# that the encoding helpers select on via select_dtypes('object').
# Columns 0 and 3 stay numeric; the class column 9 is left as it is.
categorical_columns = [1, 2, 4, 5, 6, 7, 8]
for column in categorical_columns:
    df[column] = df[column].astype(str)
# df[9]=df[9].astype(str)

# Features are every column except the last; the last column is the target.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

In [4]:
#label encoder to encode unseen values too
class LabelEncoderExt(object):
    """Label encoder that maps values unseen during ``fit`` to an ``'Unknown'`` class.

    It wraps a ``LabelEncoder``. ``fit`` always registers the extra class
    ``'Unknown'`` next to the observed values, and ``transform`` substitutes
    ``'Unknown'`` for every value that is not one of the fitted classes before
    delegating to the wrapped encoder.
    """

    def __init__(self):
        self.label_encoder = LabelEncoder()
        # `classes_` is only set by `fit`.

    def fit(self, data_list):
        """
        Fit the encoder on all values of ``data_list`` plus the extra ``'Unknown'`` class.

        :param data_list: iterable of values (e.g. a list of strings)
        :return: self
        """
        self.label_encoder = self.label_encoder.fit(list(data_list) + ['Unknown'])
        self.classes_ = self.label_encoder.classes_

        return self

    def transform(self, data_list):
        """
        Encode ``data_list``; values that are not fitted classes are encoded as ``'Unknown'``.

        :param data_list: iterable of values to encode (it is consumed exactly once)
        :return: whatever the wrapped encoder's ``transform`` returns for the cleaned values
        """
        # One pass with a set lookup instead of rebuilding the whole list once
        # per unseen value.
        known_classes = set(self.label_encoder.classes_)
        new_data_list = [x if x in known_classes else 'Unknown' for x in data_list]

        return self.label_encoder.transform(new_data_list)


In [5]:
#encodings and split

def onehot_all(X,y,ratio):
    """Split the data, then one-hot encode every object-dtype column of ``X``.

    The encoder is fitted on the training part only and then applied to both parts.

    :param X: feature DataFrame
    :param y: target aligned with ``X``
    :param ratio: passed to ``train_test_split`` as ``test_size``
    :return: ``(X_train, X_test, y_train, y_test)`` with the features encoded
    """
    train_X, test_X, train_y, test_y = train_test_split(
        X, y, test_size=ratio, random_state=42
    )

    categorical = X.select_dtypes('object').columns
    encoder = ce.OneHotEncoder(cols=categorical, handle_missing='return_nan')
    encoder = encoder.fit(train_X, train_y)

    encoded_train = encoder.transform(train_X)
    encoded_test = encoder.transform(test_X)
    return encoded_train, encoded_test, train_y, test_y

def target_all(X,y,ratio):
    """Split the data, then target-encode the features.

    No ``cols`` argument is given to the encoder, so the choice of columns is
    left to ``ce.TargetEncoder``'s default. The encoder is fitted on the
    training part only and then applied to both parts.

    :param X: feature DataFrame
    :param y: target aligned with ``X``
    :param ratio: passed to ``train_test_split`` as ``test_size``
    :return: ``(X_train, X_test, y_train, y_test)`` with the features encoded
    """
    train_X, test_X, train_y, test_y = train_test_split(
        X, y, test_size=ratio, random_state=42
    )

    encoder = ce.TargetEncoder(handle_missing='return_nan')
    encoder = encoder.fit(train_X, train_y)

    encoded_train = encoder.transform(train_X)
    encoded_test = encoder.transform(test_X)
    return encoded_train, encoded_test, train_y, test_y
    
def onehot_target(X,y,ratio,thresh):
    """Split the data, one-hot encode low-cardinality object columns and target-encode the rest.

    An object column counts as low-cardinality when it has at most ``thresh``
    distinct values in the training part. Both encoders are fitted on the
    training part only. The two column lists are printed before encoding.

    :param X: feature DataFrame
    :param y: target aligned with ``X``
    :param ratio: passed to ``train_test_split`` as ``test_size``
    :param thresh: largest number of distinct values that is still one-hot encoded
    :return: ``(X_train, X_test, y_train, y_test)`` with both encodings applied
    """
    train_X, test_X, train_y, test_y = train_test_split(
        X, y, test_size=ratio, random_state=42
    )

    low_card_cols, high_card_cols = [], []
    for name in X.select_dtypes('object').columns:
        bucket = low_card_cols if train_X[name].nunique() <= thresh else high_card_cols
        bucket.append(name)

    print(low_card_cols,high_card_cols)

    # First pass: one-hot for the low-cardinality columns.
    one_hot = ce.OneHotEncoder(cols=low_card_cols, handle_missing='return_nan')
    one_hot = one_hot.fit(train_X, train_y)
    train_X = one_hot.transform(train_X)
    test_X = one_hot.transform(test_X)

    # Second pass: target encoding for the remaining high-cardinality columns.
    target = ce.TargetEncoder(cols=high_card_cols, handle_missing='return_nan')
    target = target.fit(train_X, train_y)
    train_X = target.transform(train_X)
    test_X = target.transform(test_X)

    return train_X, test_X, train_y, test_y

def target_encode_multiclass(X,y,ratio):
    """Split the data and target-encode the object columns once per class (one-vs-rest).

    For every distinct value of ``y`` a separate ``ce.TargetEncoder`` is fitted
    on the training part against the 0/1 indicator "label == class", and each
    object column ``c`` yields an encoded column named ``"<c>_<class>"``.
    Non-object columns are passed through unchanged and come first.

    :param X: feature DataFrame
    :param y: pandas Series of class labels aligned with ``X`` (any label values)
    :param ratio: passed to ``train_test_split`` as ``test_size``
    :return: ``(X_train, X_test, y_train, y_test)``; the ``y`` parts hold the original labels
    """
    class_names = y.unique()

    # Split the labels themselves, not a one-hot copy of them: the original
    # label values are kept, so nothing has to be decoded afterwards and no
    # particular set of class values (such as 1/2/3) is assumed.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=ratio, random_state=42)

    X_train_obj = X_train.select_dtypes('object')
    X_test_obj = X_test.select_dtypes('object')
    train_parts = [X_train.select_dtypes(exclude='object')]
    test_parts = [X_test.select_dtypes(exclude='object')]

    # Without object columns there is nothing to encode.
    if X_train_obj.shape[1] > 0:
        for class_ in class_names:
            indicator = (y_train == class_).astype(int)
            enc = ce.TargetEncoder(handle_missing='return_nan').fit(X_train_obj, indicator)
            suffix = '_' + str(class_)
            for parts, frame in ((train_parts, X_train_obj), (test_parts, X_test_obj)):
                encoded = enc.transform(frame)
                encoded.columns = [str(col) + suffix for col in encoded.columns]
                parts.append(encoded)

    # A single concat per split instead of growing the frame inside the loop.
    X_train = pd.concat(train_parts, axis=1)
    X_test = pd.concat(test_parts, axis=1)

    return X_train, X_test, y_train, y_test
            
def onehot_target_encode_multiclass(X,y,ratio,thresh):
    """Split the data, one-hot encode low-cardinality object columns and
    target-encode the remaining object columns once per class (one-vs-rest).

    An object column counts as low-cardinality when it has at most ``thresh``
    distinct values in the training part. For every distinct value of ``y`` a
    separate ``ce.TargetEncoder`` is fitted on the training part against the
    0/1 indicator "label == class"; each remaining object column ``c`` yields
    an encoded column named ``"<c>_<class>"``.

    :param X: feature DataFrame
    :param y: pandas Series of class labels aligned with ``X``
    :param ratio: passed to ``train_test_split`` as ``test_size``
    :param thresh: largest number of distinct values that is still one-hot encoded
    :return: ``(X_train, X_test, y_train, y_test)``; the ``y`` parts hold the original labels
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=ratio, random_state=42)

    low_card_cols,high_card_cols=[],[]
    for col in X.select_dtypes('object').columns:
        if X_train[col].nunique()<=thresh:
            low_card_cols.append(col)
        else:
            high_card_cols.append(col)

    enc=ce.OneHotEncoder(cols=low_card_cols,handle_missing='return_nan').fit(X_train,y_train)
    X_train=enc.transform(X_train)
    X_test=enc.transform(X_test)

    # Whatever is still object-typed after the one-hot step is target-encoded.
    X_train_obj=X_train.select_dtypes('object')
    X_test_obj=X_test.select_dtypes('object')
    train_parts=[X_train.select_dtypes(exclude='object')]
    test_parts=[X_test.select_dtypes(exclude='object')]

    # Every object column may already have been one-hot encoded; then there is
    # nothing left to target-encode.
    if X_train_obj.shape[1] > 0:
        for class_ in y.unique():
            # Build the per-class 0/1 target explicitly from the label values,
            # so the result does not depend on how an encoder treats the dtype of `y`.
            indicator = (y_train == class_).astype(int)
            enc=ce.TargetEncoder(handle_missing='return_nan').fit(X_train_obj,indicator)
            suffix = '_' + str(class_)
            for parts, frame in ((train_parts, X_train_obj), (test_parts, X_test_obj)):
                encoded = enc.transform(frame)
                encoded.columns = [str(col) + suffix for col in encoded.columns]
                parts.append(encoded)

    X_train=pd.concat(train_parts,axis=1)
    X_test=pd.concat(test_parts,axis=1)

    return X_train, X_test, y_train, y_test

In [6]:

def train_evaluate( X_train,X_test,y_train, y_test,cat_features=None):
    """Grid-search LightGBM, CatBoost, XGBoost and a random forest and print test metrics for each.

    Every model is tuned with ``GridSearchCV`` on the training data and the best
    estimator is handed to ``evaluate`` together with the test data. The grids
    come from the module-level ``parameters``, ``parameters_xgb`` and
    ``parameters_RF``.

    LightGBM is not guarded: an error there is raised to the caller. CatBoost,
    XGBoost and the random forest are best-effort: an ordinary error is printed
    and the next model is tried. ``KeyboardInterrupt`` is never swallowed, so
    interrupting the kernel stops the whole run.

    :param X_train: training features
    :param X_test: test features
    :param y_train: training labels
    :param y_test: test labels
    :param cat_features: accepted for signature parity with
        ``train_evaluate_with_cat_feat``; not used here
    :return: None (results are printed)
    """
    print("lightgbm training with gridsearch")
    # NOTE(review): `parameters` uses CatBoost-style keys (`depth`, `iterations`);
    # confirm LightGBM honours them rather than ignoring them.
    model = lgbm.LGBMClassifier(boosting_type='goss')
    grid = GridSearchCV(estimator=model, param_grid = parameters)
    grid.fit(X_train, y_train,verbose=0)

    model=grid.best_estimator_
    evaluate(model,X_test,y_test)

    try:
        print("catboost training with gridsearch")

        model = CatBoostClassifier(
            task_type="GPU", devices='0:1'
        )
        grid = GridSearchCV(estimator=model, param_grid = parameters)
        grid.fit(X_train, y_train,verbose=0)

        model=grid.best_estimator_
        evaluate(model,X_test,y_test)
    except Exception as exc:
        print("catboost training failed:", repr(exc))

    try:
        print("xgboost training with gridsearch")

        model = XGBClassifier(tree_method='gpu_hist', gpu_id=0)
        grid = GridSearchCV(estimator=model, param_grid = parameters_xgb)
        grid.fit(X_train, y_train,verbose=0)

        model=grid.best_estimator_
        evaluate(model,X_test,y_test)
    except Exception as exc:
        print("xgboost training failed:", repr(exc))

    try:
        print("RF training with gridsearch")

        # The random forest is given the features with missing values replaced by 0.
        model=RandomForestClassifier()
        grid = GridSearchCV(estimator=model, param_grid = parameters_RF)
        grid.fit(X_train.fillna(value=0), y_train)

        model=grid.best_estimator_
        evaluate(model,X_test.fillna(value=0),y_test)
    except Exception as exc:
        print("RF training failed:", repr(exc))
    
def train_evaluate_with_cat_feat( X_train,X_test,y_train, y_test,cat_features=None):
    """Grid-search LightGBM and CatBoost using their native categorical-feature support.

    The columns named in ``cat_features`` are label-encoded with
    ``LabelEncoderExt`` (fitted on the training column) and declared as
    categorical to each model's ``fit``. Encoding is done on private copies, so
    the caller's frames are left untouched. CatBoost is trained on the same
    encoded copies.

    Both models are best-effort: an ordinary error is printed and the function
    carries on. ``KeyboardInterrupt`` is not swallowed.

    :param X_train: training features (DataFrame)
    :param X_test: test features (DataFrame)
    :param y_train: training labels
    :param y_test: test labels
    :param cat_features: column labels to treat as categorical; ``None`` means none
    :return: None (results are printed)
    """
    # Copies avoid mutating the caller's data and the pandas
    # "set on a copy of a slice" warning when the inputs are slices.
    X_train = X_train.copy()
    X_test = X_test.copy()
    cat_features = [] if cat_features is None else list(cat_features)

    try:
        print("lightgbm training with gridsearch")
        for col in cat_features:
            le = LabelEncoderExt()
            le.fit(X_train[col])
            X_train[col]=le.transform(X_train[col])
            X_test[col]=le.transform(X_test[col])

        fit_params={'categorical_feature':cat_features} if cat_features else {}
        model = lgbm.LGBMClassifier(boosting_type='goss')
        grid = GridSearchCV(estimator=model, param_grid = parameters)
        grid.fit(X_train, y_train,verbose=0,**fit_params)

        model=grid.best_estimator_
        evaluate(model,X_test,y_test)
    except Exception as exc:
        print("lightgbm training failed:", repr(exc))

    try:
        print("catboost training with gridsearch")
        fit_params={'cat_features':cat_features} if cat_features else {}
        model = CatBoostClassifier(
            task_type="GPU", devices='0:1'
        )
        grid = GridSearchCV(estimator=model, param_grid = parameters)
        grid.fit(X_train, y_train,verbose=0,**fit_params)

        model=grid.best_estimator_
        evaluate(model,X_test,y_test)
    except Exception as exc:
        print("catboost training failed:", repr(exc))

In [7]:
#evaluation function

def evaluate(model,X_test,y_test,labels=None):
    """Print accuracy, macro F1 and log-loss of ``model`` on the test data.

    :param model: fitted classifier exposing ``predict`` and ``predict_proba``
    :param X_test: test features
    :param y_test: true test labels
    :param labels: class labels passed to ``log_loss``. Defaults to
        ``model.classes_`` when the model has that attribute, otherwise to
        ``[1, 2, 3]`` (the classes of this notebook's dataset).
    :return: None (the metrics are printed)
    """
    pred=model.predict(X_test)
    pred_proba=model.predict_proba(X_test)

    if labels is None:
        # Take the labels from the model, so a different set of class values
        # does not require editing this function.
        labels = getattr(model, 'classes_', [1,2,3])

    print('accuracy:',accuracy_score(y_test,pred))
    print('f1 macro:',f1_score(y_test,pred, average='macro'))
    print('log_loss:',log_loss(y_test,pred_proba,labels=labels))


In [8]:
# Earlier single-encoder variants, kept for reference:
# X=df.iloc[:,:-1]
# y=df.iloc[:,-1:]
# X_train,X_test,y_train, y_test=onehot_all(df.iloc[:,:-1],y,.2)
# X_train,X_test,y_train, y_test=target_all(df.iloc[:,:-1],y,ratio=.2)
# X_train,X_test,y_train, y_test=onehot_target(df.iloc[:,:-1],y,ratio=.2,thresh=10)


# Grid used for both the LightGBM and the CatBoost searches.
# NOTE(review): `depth` / `iterations` are CatBoost-style names; confirm that
# LightGBM honours them (its scikit-learn wrapper presumably expects
# `max_depth` / `n_estimators`), otherwise only `learning_rate` is tuned there.
parameters = {'depth'         : [6,8,10],
              'learning_rate' : [.01,.05,.1,.2],
              'iterations'    : [500,1000,2000]
              }
# Small grid for a quick smoke run:
# parameters = {'depth'         : [4],
#               'learning_rate' : [.2],
#               'iterations'    : [50]
#               }

# Grid for XGBoost. The former 'criterion': ['gini', 'entropy'] entry was
# removed: it is a scikit-learn tree option, not an XGBoost parameter, so it
# only made the search fit every candidate twice.
parameters_xgb = {
    'n_estimators': [500,1000,2000],
    'max_depth' : [6,8,10],
        }
# Small grid for a quick smoke run:
# parameters_xgb = {
# 'n_estimators': [50],
#     'max_depth' : [4],
#         }

# Grid for the random forest.
parameters_RF={'max_depth':[6,8,10], 'n_estimators':[500,1000,2000]}
# parameters_RF={'max_depth':[2], 'n_estimators':[10]}

print("first with one hot for all")
X_train,X_test,y_train, y_test=onehot_all(X,y,.2)
train_evaluate( X_train,X_test,y_train, y_test)

print("now with target for all")
X_train,X_test,y_train, y_test=target_encode_multiclass(X,y,.2)
train_evaluate( X_train,X_test,y_train, y_test)

print("now with mixed for all")
X_train,X_test,y_train, y_test=onehot_target_encode_multiclass(X,y,.2,4)
train_evaluate( X_train,X_test,y_train, y_test)

print("now with native cat features support")
X_train,X_test,y_train, y_test=train_test_split(X, y, test_size=.2, random_state=42)
train_evaluate_with_cat_feat(X_train,X_test,y_train, y_test,cat_features=list(X_train.select_dtypes('object').columns))


first with one hot for all
lightgbm training with gridsearch
accuracy: 0.5796610169491525
f1 macro: 0.5466243668948628
log_loss: 0.8830686769768217
catboost training with gridsearch
accuracy: 0.6101694915254238
f1 macro: 0.5809340588056185
log_loss: 0.8646339709229665
xgboost training with gridsearch
accuracy: 0.5423728813559322
f1 macro: 0.5212178756951967
log_loss: 1.1676180954529536
RF training with gridsearch
accuracy: 0.576271186440678
f1 macro: 0.5349156608559477
log_loss: 0.9132367951935777
now with target for all
lightgbm training with gridsearch
accuracy: 0.5898305084745763
f1 macro: 0.567771061965702
log_loss: 0.8615487327171879
catboost training with gridsearch
accuracy: 0.6067796610169491
f1 macro: 0.5705695346320346
log_loss: 0.8525584085735698
xgboost training with gridsearch
accuracy: 0.5254237288135594
f1 macro: 0.4990042069459362
log_loss: 1.1751445666814724
RF training with gridsearch
accuracy: 0.576271186440678
f1 macro: 0.5398320930295207
log_loss: 0.903382689403020

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
New categorical_feature is [1, 2, 4, 5, 6, 7, 8]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


accuracy: 0.5796610169491525
f1 macro: 0.5487048998499381
log_loss: 0.8913872368199421
catboost training with gridsearch
accuracy: 0.5932203389830508
f1 macro: 0.5645626946793437
log_loss: 0.9148168200017099
