In [66]:
!pip install xgboost lightgbm catboost ipywidgets

Collecting lightgbm
  Downloading lightgbm-3.3.3-py3-none-manylinux1_x86_64.whl (2.0 MB)
[K     |████████████████████████████████| 2.0 MB 1.3 MB/s eta 0:00:01
[?25hCollecting catboost
  Downloading catboost-1.1.1-cp38-none-manylinux1_x86_64.whl (76.6 MB)
[K     |████████████████████████████████| 76.6 MB 565 kB/s eta 0:00:01
Collecting plotly
  Downloading plotly-5.11.0-py2.py3-none-any.whl (15.3 MB)
[K     |████████████████████████████████| 15.3 MB 421 kB/s eta 0:00:01
Collecting graphviz
  Downloading graphviz-0.20.1-py3-none-any.whl (47 kB)
[K     |████████████████████████████████| 47 kB 572 kB/s eta 0:00:01
Collecting tenacity>=6.2.0
  Downloading tenacity-8.1.0-py3-none-any.whl (23 kB)


Installing collected packages: lightgbm, tenacity, plotly, graphviz, catboost
Successfully installed catboost-1.1.1 graphviz-0.20.1 lightgbm-3.3.3 plotly-5.11.0 tenacity-8.1.0


In [1]:
!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [2]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, OneHotEncoder

In [3]:

data = pd.read_csv('adult.data', header = 0)

s = (data.dtypes == 'object')
object_cols = list(s[s].index)

print("Categorical variables:")
print(object_cols)

Categorical variables:
['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country', 'income-pred']


In [4]:
data.describe()
data_copy = data.copy()

## Change pipeline:
Ordinal encoding is usable only for 'education', 'income-pred', 'workclass ???'

One-hot is better for 'relationship', 'race', 'native-country'?

In [5]:
ordinal_encoder = OrdinalEncoder()
onehot_encoder = OneHotEncoder()
label_encoder = LabelEncoder()


ordinal_encoding_labels = ["education", "workclass"]
label_encoding_labels = ["income-pred"]
one_hot_encoding_labels = ["relationship", "race", "native-country", "marital-status"]


In [6]:
one_hot_encoded_data = pd.get_dummies(data_copy, columns=one_hot_encoding_labels)
print(one_hot_encoded_data.keys())

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'occupation', 'sex', 'capitl-gain', 'capital-loss', 'hours-per-week',
       'income-pred', 'relationship_ Husband', 'relationship_ Not-in-family',
       'relationship_ Other-relative', 'relationship_ Own-child',
       'relationship_ Unmarried', 'relationship_ Wife',
       'race_ Amer-Indian-Eskimo', 'race_ Asian-Pac-Islander', 'race_ Black',
       'race_ Other', 'race_ White', 'native-country_ ?',
       'native-country_ Cambodia', 'native-country_ Canada',
       'native-country_ China', 'native-country_ Columbia',
       'native-country_ Cuba', 'native-country_ Dominican-Republic',
       'native-country_ Ecuador', 'native-country_ El-Salvador',
       'native-country_ England', 'native-country_ France',
       'native-country_ Germany', 'native-country_ Greece',
       'native-country_ Guatemala', 'native-country_ Haiti',
       'native-country_ Holand-Netherlands', 'native-country_ Honduras',
       'nat

In [7]:
final_data = one_hot_encoded_data.apply(label_encoder.fit_transform)


In [8]:
y_data = final_data.pop('income-pred')

In [9]:
y_data

0        0
1        0
2        0
3        0
4        0
        ..
32556    0
32557    1
32558    0
32559    0
32560    1
Name: income-pred, Length: 32561, dtype: int64

In [42]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# read data
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(final_data, y_data, test_size=.2)

# XGB

In [43]:
bst = XGBClassifier(n_estimators=10, learning_rate=5, objective='binary:logistic')
# fit model
bst.fit(X_train, y_train)
# make predictions
preds = bst.predict(X_test)
accuracy = accuracy_score(y_test, preds)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 74.76%


# CAT

In [52]:
from catboost import CatBoostClassifier, Pool, cv


# cat_features = [0]

# cv_dataset = Pool(data=X_train,
#                   label=y_train,
#                   cat_features=cat_features)

# params = {"iterations": 100,
#           "depth": 5,
#           "loss_function": "Logloss",
#           "verbose": False,
#           "roc_file": "roc-file"}

# scores = cv(cv_dataset,
#             params,
#             fold_count=5,
#             plot=False,
#             verbose=False)

ctbst = CatBoostClassifier(learning_rate=0.5,
                           n_estimators=10,
                          )

ctbst.fit(X_train,
       y_train,
       verbose=False,
       plot=True)

preds = ctbst.predict(X_test)
accuracy = accuracy_score(y_test, preds)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Accuracy: 85.83%


# LightGBM

In [53]:
from lightgbm import LGBMClassifier
lgbst = LGBMClassifier(learning_rate=0.5, max_depth=-5,random_state=42)
lgbst.fit(X_train,y_train,eval_set=[(X_test,y_test),(X_train,y_train)],eval_metric='logloss')

preds = lgbst.predict(X_test)

accuracy = accuracy_score(y_test, preds)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

[1]	training's binary_logloss: 0.385912	valid_0's binary_logloss: 0.387976
[2]	training's binary_logloss: 0.339816	valid_0's binary_logloss: 0.346993
[3]	training's binary_logloss: 0.314791	valid_0's binary_logloss: 0.324145
[4]	training's binary_logloss: 0.2994	valid_0's binary_logloss: 0.311762
[5]	training's binary_logloss: 0.289539	valid_0's binary_logloss: 0.304911
[6]	training's binary_logloss: 0.28299	valid_0's binary_logloss: 0.301909
[7]	training's binary_logloss: 0.276971	valid_0's binary_logloss: 0.299447
[8]	training's binary_logloss: 0.271565	valid_0's binary_logloss: 0.296037
[9]	training's binary_logloss: 0.268412	valid_0's binary_logloss: 0.294695
[10]	training's binary_logloss: 0.264431	valid_0's binary_logloss: 0.292138
[11]	training's binary_logloss: 0.261495	valid_0's binary_logloss: 0.291284
[12]	training's binary_logloss: 0.258972	valid_0's binary_logloss: 0.292025
[13]	training's binary_logloss: 0.256927	valid_0's binary_logloss: 0.292528
[14]	training's binary_l