<a href="https://colab.research.google.com/github/mohannishant6/thesis/blob/master/notebooks/poker.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# !pip install catboost
# !pip install category_encoders
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import category_encoders as ce
from sklearn.preprocessing import OrdinalEncoder,LabelEncoder
from sklearn.metrics import accuracy_score, f1_score,jaccard_score,multilabel_confusion_matrix,log_loss
import lightgbm as lgbm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
import pandas as pd
import numpy as np
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

In [2]:
df=pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/poker/poker-hand-training-true.data',header=None)
df.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1,10,1,11,1,13,1,12,1,1,9
1,2,11,2,13,2,10,2,12,2,1,9
2,3,12,3,11,3,13,3,10,3,1,9


In [3]:
df[0]=df[0].astype(str)
df[2]=df[2].astype(str)
df[4]=df[4].astype(str)
df[6]=df[6].astype(str)
df[8]=df[8].astype(str)
# df[10]=df[10].astype(str)

X=df.iloc[:,:-1]
y=df.iloc[:,-1]

In [4]:
#label encoder to encode unseen values too
class LabelEncoderExt(object):
    def __init__(self):
        """
        It differs from LabelEncoder by handling new classes and providing a value for it [Unknown]
        Unknown will be added in fit and transform will take care of new item. It gives unknown class id
        """
        self.label_encoder = LabelEncoder()
        # self.classes_ = self.label_encoder.classes_

    def fit(self, data_list):
        """
        This will fit the encoder for all the unique values and introduce unknown value
        :param data_list: A list of string
        :return: self
        """
        self.label_encoder = self.label_encoder.fit(list(data_list) + ['Unknown'])
        self.classes_ = self.label_encoder.classes_

        return self

    def transform(self, data_list):
        """
        This will transform the data_list to id list where the new values get assigned to Unknown class
        :param data_list:
        :return:
        """
        new_data_list = list(data_list)
        for unique_item in np.unique(data_list):
            if unique_item not in self.label_encoder.classes_:
                new_data_list = ['Unknown' if x==unique_item else x for x in new_data_list]

        return self.label_encoder.transform(new_data_list)


In [5]:
#encodings and split

def onehot_all(X,y,ratio):
    
    #split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=ratio, random_state=42)
    
    obj_cols=X.select_dtypes('object').columns
    enc=ce.OneHotEncoder(cols=obj_cols,handle_missing='return_nan').fit(X_train,y_train)
    X_train=enc.transform(X_train)
    X_test=enc.transform(X_test)
    
    return X_train, X_test,y_train, y_test

def target_all(X,y,ratio):
    
    #split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=ratio, random_state=42)
    
#     obj_cols=X.select_dtypes('object').columns
    enc=ce.TargetEncoder(handle_missing='return_nan').fit(X_train,y_train)
    X_train=enc.transform(X_train)
    X_test=enc.transform(X_test)
    
    return X_train,X_test,y_train, y_test
    
def onehot_target(X,y,ratio,thresh):
    
    #split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=ratio, random_state=42)
    
    low_card_cols,high_card_cols=[],[]
    obj_cols=X.select_dtypes('object').columns
    for col in obj_cols:
        if X_train[col].nunique()<=thresh:
            low_card_cols.append(col)
        else:
            high_card_cols.append(col)
    
    print(low_card_cols,high_card_cols)
    
    enc=ce.OneHotEncoder(cols=low_card_cols,handle_missing='return_nan').fit(X_train,y_train)
    X_train=enc.transform(X_train)
    X_test=enc.transform(X_test)
    
    enc=ce.TargetEncoder(cols=high_card_cols,handle_missing='return_nan').fit(X_train,y_train)
    X_train=enc.transform(X_train)
    X_test=enc.transform(X_test)
    
    return X_train,X_test,y_train, y_test
            

In [6]:

def train_evaluate( X_train,X_test,y_train, y_test,cat_features=None):

  try:
    print("lightgbm training with gridsearch")

    model = lgbm.LGBMClassifier(boosting_type='goss')
    grid = GridSearchCV(estimator=model, param_grid = parameters)
    grid.fit(X_train, y_train,verbose=0)    

    model=grid.best_estimator_
    evaluate(model,X_test,y_test)
  except: pass

  try:
    print("catboost training with gridsearch")

    model = CatBoostClassifier(
        task_type="GPU", devices='0:1'
    )
    grid = GridSearchCV(estimator=model, param_grid = parameters)
    grid.fit(X_train, y_train,verbose=0)    

    model=grid.best_estimator_
    evaluate(model,X_test,y_test)
  except:pass

  try:
    print("xgboost training with gridsearch")

    model = XGBClassifier(tree_method='gpu_hist', gpu_id=0)
    grid = GridSearchCV(estimator=model, param_grid = parameters_xgb)
    grid.fit(X_train, y_train,verbose=0)    

    model=grid.best_estimator_
    evaluate(model,X_test,y_test)
  except:pass

  try:  
    print("RF training with gridsearch")
    
    model=RandomForestClassifier()
    grid = GridSearchCV(estimator=model, param_grid = parameters_RF)
    grid.fit(X_train, y_train)    

    model=grid.best_estimator_
    evaluate(model,X_test,y_test)
  except:pass
    
def train_evaluate_with_cat_feat( X_train,X_test,y_train, y_test,cat_features=None):
  try:
    print("lightgbm training with gridsearch")
    for col in cat_features:
        le = LabelEncoderExt()
        le.fit(X_train[col])
        X_train[col]=le.transform(X_train[col])
        X_test[col]=le.transform(X_test[col])

    fit_params={'categorical_feature':cat_features}
    model = lgbm.LGBMClassifier(boosting_type='goss')
    grid = GridSearchCV(estimator=model, param_grid = parameters)
    grid.fit(X_train, y_train,verbose=0,**fit_params)    

    model=grid.best_estimator_
    evaluate(model,X_test,y_test)
  except:pass

  try:
    print("catboost training with gridsearch")
    fit_params={'cat_features':cat_features}
    model = CatBoostClassifier(
        task_type="GPU", devices='0:1'
    )
    grid = GridSearchCV(estimator=model, param_grid = parameters)
    grid.fit(X_train, y_train,verbose=0,**fit_params)    

    model=grid.best_estimator_
    evaluate(model,X_test,y_test)
  except:pass

In [7]:
#evaluation function

def evaluate(model,X_test,y_test):
    
    pred=model.predict(X_test)
    pred_proba=model.predict_proba(X_test)

    print('accuracy:',accuracy_score(y_test,pred))
    print('f1 macro:',f1_score(y_test,pred, average='macro'))
    print('f1_micro:',f1_score(y_test,pred, average='micro'))
#     print(pd.Series(pred).unique())
    print('log_loss:',log_loss(y_test,pred_proba,labels=np.array([0,1,2,3,4,5,6,7,8,9])))


In [9]:
X=df.iloc[:,:-1]
y=df.iloc[:,-1:]
# X_train,X_test,y_train, y_test=onehot_all(df.iloc[:,:-1],y,.2)
# X_train,X_test,y_train, y_test=target_all(df.iloc[:,:-1],y,ratio=.2)
# X_train,X_test,y_train, y_test=onehot_target(df.iloc[:,:-1],y,ratio=.2,thresh=10)


parameters = {'depth'         : [4,6,8],
              'learning_rate' : [.01,.05,.1,.2],
              'iterations'    : [100,500]
              }
# parameters = {'depth'         : [4],
#               'learning_rate' : [.2],
#               'iterations'    : [50]
#               }
parameters_xgb = {
'n_estimators': [100, 500],
    # 'max_features': ['auto', 'sqrt', 'log2'],
    'learning_rate':[.01,.05,.1,.2],
    'max_depth' : [4,6,8],
    # 'criterion' :['gini', 'entropy']
        }

# parameters_xgb = {
# 'n_estimators': [50],
# #     'max_features': ['auto', 'sqrt', 'log2'],
#     'max_depth' : [4],
# #     'criterion' :['gini', 'entropy']
#         }
parameters_RF={'max_depth':[4,6,8], 'n_estimators':[100,500]}

print("first with one hot for all")
X_train,X_test,y_train, y_test=onehot_all(X,y,.2)
train_evaluate( X_train,X_test,y_train, y_test)

print("now with target encoding")
X_train,X_test,y_train, y_test=target_all(X,y,ratio=.2)
train_evaluate( X_train,X_test,y_train, y_test)

print("now with mix encodings")
X_train,X_test,y_train, y_test=onehot_target(X,y,ratio=.2,thresh=5)
train_evaluate( X_train,X_test,y_train, y_test)

print("now with native cat features support")

X_train,X_test,y_train, y_test=train_test_split(X, y, test_size=.2, random_state=42)
train_evaluate_with_cat_feat(X_train,X_test,y_train, y_test,cat_features=list(X_train.select_dtypes('object').columns))


first with one hot for all
lightgbm training with gridsearch
accuracy: 0.6151539384246302
f1 macro: 0.15983266840796576
f1_micro: 0.6151539384246302
log_loss: 0.8971268172475173
catboost training with gridsearch
accuracy: 0.8534586165533786
f1 macro: 0.26796949723972613
f1_micro: 0.8534586165533786
log_loss: 0.4879293372535875
xgboost training with gridsearch
accuracy: 0.826469412235106
f1 macro: 0.19804508633688395
f1_micro: 0.826469412235106
log_loss: 0.6434038983073692
RF training with gridsearch
accuracy: 0.5615753698520591
f1 macro: 0.12174584182551024
f1_micro: 0.5615753698520591
log_loss: 0.9486434287651889
now with target encoding
lightgbm training with gridsearch
accuracy: 0.6153538584566174
f1 macro: 0.15940669341010638
f1_micro: 0.6153538584566174
log_loss: 0.8943339722119548
catboost training with gridsearch
accuracy: 0.8818472610955618
f1 macro: 0.2840398868394021
f1_micro: 0.8818472610955618
log_loss: 0.4474361696112773
xgboost training with gridsearch
accuracy: 0.8256697

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
New categorical_feature is [0, 2, 4, 6, 8]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


accuracy: 0.6193522590963615
f1 macro: 0.16028297241303824
f1_micro: 0.6193522590963615
log_loss: 0.8980715226571092
catboost training with gridsearch
accuracy: 0.8456617353058776
f1 macro: 0.2726260352976164
f1_micro: 0.8456617353058776
log_loss: 0.5209796047979528
