# Inżynieria cech

In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import cross_val_score, cross_validate

## Wczytanie danych

In [2]:
df = pd.read_csv("census_income_dataset.csv")

Z ramki danych zostają usunięte wszystkie wiersze zawierające wartość `NaN`.

In [3]:
df = df.replace({"?": np.nan}).dropna().reset_index(drop=True)

## Kolumny do usunięcia
- `fnlwgt` -> cecha nie wpływa w żaden sposób na podział danych, więc z niej rezygnujemy.
- `education` oraz `education_num` -> po przetestowaniu różnych kombinacji grupowania zdecydowaliśmy, że najlepszą opcją jest zrezygowanie z kolumny `education` i zostawienie `education_num` bez żadnych modyfikacji.
- `race` -> duża dysproporcja danych, więc pomijamy

In [4]:
features_to_drop = ["fnlwgt", "education", "race"]
df.drop(features_to_drop, axis=1, inplace=True)

### `hours_per_week` 
pogrupowanie rekordów w trzy kategorie względem wartości `40.0`

In [4]:
def hpw_encode(x):
    if x < 40:
        return "more_than_40"
    elif x > 40:
        return "less_than_40"
    else:
        return "exactly_40"

df.hours_per_week = df.hours_per_week.apply(hpw_encode)
df.hours_per_week.value_counts()

exactly_40      21358
less_than_40    13777
more_than_40    10087
Name: hours_per_week, dtype: int64

### `age`
Normalizacja kulumny  przy użyciu `MinMaxScaler()`

In [5]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df["age"] = scaler.fit_transform(df[["age"]])

### `native_country`
W tej kolumnie mamy 41 kategorii i oprócz USA, dla każdego państwa mamy bardzo mało obserwacji.
Poniżej państwa zostały zgrupowane według kontynetów. USA zostały jako oddzielna kategoria.

Wśród państw zostało wyodrębnione 5 kategorii.
- United States - Stany Zjednoczone
- LA - Ameryka Łacińska - zawiera Outlying-US
- ASIA - Azja
- EU - Europa
- NA - Ameryka Północna

In [6]:
countries_dict = {'United-States': 'United States', 
 'Cuba': 'LA',
 'Jamaica': 'LA',
 'South': 'ASIA',
 'Mexico': 'LA',
 'Puerto-Rico': 'LA',
 'Honduras': 'LA',
 'England': 'EU',
 'Canada': 'NA',
 'Germany': 'EU',
 'Iran': 'ASIA',
 'Philippines': 'ASIA',
 'Poland': 'EU',
 'Columbia': 'LA',
 'Cambodia': 'ASIA',
 'Thailand': 'ASIA',
 'Ecuador': 'LA',
 'Laos': 'ASIA',
 'Taiwan': 'ASIA',
 'Haiti': 'LA',
 'Portugal': 'EU',
 'Dominican-Republic': 'LA',
 'El-Salvador': 'LA',
 'France': 'EU',
 'Guatemala': 'LA',
 'Italy': 'EU',
 'China': 'ASIA',
 'India': 'ASIA',
 'Japan': 'ASIA',
 'Yugoslavia': 'EU',
 'Peru': 'LA',
 'Hong': 'LA',
 'Ireland': 'EU',
 'Trinadad&Tobago': 'LA',
 'Greece': 'EU',
 'Nicaragua': 'LA',
 'Vietnam': 'ASIA',
 'Outlying-US(Guam-USVI-etc)': 'LA',
 'Scotland': 'EU',
 'Hungary': 'EU',
 'Holand-Netherlands': 'EU'}

df["native_country"] = df["native_country"].replace(countries_dict)

### `Income level`
Zamiana wartości w kolumnie na 0 i 1.

In [8]:
X = df.loc[:, df.columns != "income_level"]
y = df.income_level

y = y.apply(lambda x: 1 if x == ">50K" else 0)

## Encoding
Kolumny przekształcone za pomocą `OneHotEncoder()`:
- `marital_status`
- `occupation`
-`relationship`
- `workclass`
-`hours per week` (3 kategorie)

In [9]:
from category_encoders import OneHotEncoder

ohe = OneHotEncoder(use_cat_names=True)
X = ohe.fit_transform(X, y)

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 45)

## Modeling

In [11]:
indicators = ["recall", "accuracy", "roc auc", "f1"]
t = pd.DataFrame(columns = indicators) 

def print_results(name, res_):
    print(name, 'accuracy score: {0:0.4f}'. format(np.mean(res_["test_accuracy"])))
    print(name, 'recall score: {0:0.4f}'. format(np.mean(res_["test_recall"])))
    print(name, 'roc_auc score: {0:0.4f}'. format(np.mean(res_["test_roc_auc"])))
    print(name, 'f1 score: {0:0.4f}'. format(np.mean(res_["test_f1"])))

#### Logistic Regression

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score

lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_hat = lr.predict(X_test)
res_lr = cross_validate(LogisticRegression(max_iter=1000), X, y, cv=5, scoring=['recall', 'accuracy', 'roc_auc', 'f1'])
t.loc["Logistic Regression"] = [np.mean(res_lr["test_recall"]), np.mean(res_lr["test_accuracy"]), np.mean(res_lr["test_roc_auc"]), np.mean(res_lr["test_f1"])]
sorted(res_lr.keys())

print_results("Logistic Regression", res_lr)

Logistic Regression accuracy score: 0.7897
Logistic Regression recall score: 0.2637
Logistic Regression roc_auc score: 0.6172
Logistic Regression f1 score: 0.3834


#### Random forest

In [13]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=750, max_depth=4).fit(X_train, y_train)

y_hat = rf.predict(X_test)
res = cross_validate(RandomForestClassifier(n_estimators=1000, max_depth=6), X, y, cv=5, scoring=['recall', 'accuracy', 'roc_auc', 'f1'])
t.loc["Random Forest"] = [np.mean(res["test_recall"]), np.mean(res["test_accuracy"]), np.mean(res["test_roc_auc"]), np.mean(res["test_f1"])]

print_results("Random Forest", res)

Random Forest accuracy score: 0.8449
Random Forest recall score: 0.5027
Random Forest roc_auc score: 0.9022
Random Forest f1 score: 0.6164


#### Voting Classifier
Logistic Regression & Random Forest

In [14]:
from sklearn.ensemble import VotingClassifier

model_soft = VotingClassifier(estimators=[('RandomForest', rf), ('LR', lr)], voting='soft', weights=[0.4, 0.6])

res_ms = cross_validate(model_soft, X_train, y_train, cv=5, scoring=['recall', 'accuracy', 'roc_auc', 'f1'])
t.loc["Voting Classifier"] = [np.mean(res_ms["test_recall"]), np.mean(res_ms["test_accuracy"]), np.mean(res_ms["test_roc_auc"]), np.mean(res_ms["test_f1"])]

print_results("Voting Classifier",res_ms)

Voting Classifier accuracy score: 0.8106
Voting Classifier recall score: 0.3047
Voting Classifier roc_auc score: 0.8341
Voting Classifier f1 score: 0.4350


#### AdaBoost

In [15]:
from sklearn.ensemble import AdaBoostClassifier

ada = AdaBoostClassifier(n_estimators=1000)

res = cross_validate(AdaBoostClassifier(n_estimators=1000), X, y, cv=5, scoring=['recall', 'accuracy', 'roc_auc', 'f1'])
t.loc["AdaBoost"] = [np.mean(res["test_recall"]), np.mean(res["test_accuracy"]), np.mean(res["test_roc_auc"]), np.mean(res["test_f1"])]

print_results("AdaBoost", res)

AdaBoost accuracy score: 0.8688
AdaBoost recall score: 0.6500
AdaBoost roc_auc score: 0.9263
AdaBoost f1 score: 0.7106


#### CatBoost

In [19]:
from catboost import CatBoostClassifier

catb = CatBoostClassifier(verbose=False, learning_rate=0.04, depth=6)

res = cross_validate(CatBoostClassifier(verbose=False, learning_rate=0.04, depth=6), X, y, cv=5, scoring=['recall', 'accuracy', 'roc_auc', 'f1'])
t.loc["CatBoost"] = [np.mean(res["test_recall"]), np.mean(res["test_accuracy"]), np.mean(res["test_roc_auc"]), np.mean(res["test_f1"])]

print_results("CatBoost", res)

CatBoost accuracy score: 0.8709
CatBoost recall score: 0.6571
CatBoost roc_auc score: 0.9286
CatBoost f1 score: 0.7162


#### Gradient Boosting Classifier

In [20]:
from sklearn.ensemble import GradientBoostingClassifier

model2 = GradientBoostingClassifier(random_state=1,
                                  learning_rate=0.3)

res_m = cross_validate(model2, X_train, y_train, cv=5, scoring=['recall','accuracy', 'roc_auc', 'f1'])
t.loc["Gradient Boosting"] = [np.mean(res_m["test_recall"]), np.mean(res_m["test_accuracy"]), np.mean(res_m["test_roc_auc"]), np.mean(res_m["test_f1"])]

print_results("Gradient Boosting" ,res_m)

Gradient Boosting accuracy score: 0.8698
Gradient Boosting recall score: 0.6528
Gradient Boosting roc_auc score: 0.9266
Gradient Boosting f1 score: 0.7128


#### XGBoost

In [21]:
from xgboost import XGBClassifier 

xg_boost=XGBClassifier(random_state=1,
                    learning_rate=0.4, # Szybkość "uczenia" się
                    booster='gbtree', # Jaki model wykorzystujemy (drzewo - gbtree, liniowe - gblinear)
                    max_depth=4, # Maksymalna głębokość drzewa 
                    eval_metric="logloss",
                    use_label_encoder=False)

xg_boost.fit(X_train, y_train)
y_hat = xg_boost.predict(X_test)
print("XGBoost accuracy score: {0:0.4f}". format(accuracy_score(y_test, y_hat)))

XGBoost accuracy score: 0.8659


In [25]:
res = cross_validate(xg_boost, X_train, y_train, cv=5, scoring=['recall', 'accuracy', 'roc_auc', 'f1'])
t.loc["XGBoost"] = [np.mean(res["test_recall"]), np.mean(res["test_accuracy"]), np.mean(res["test_roc_auc"]), np.mean(res["test_f1"])]

print_results("XGBoost", res)

XGBoost accuracy score: 0.8711
XGBoost recall score: 0.6650
XGBoost roc_auc score: 0.9279
XGBoost f1 score: 0.7185


## Summary
Podsumowanie wyników względem miary `roc auc`

In [26]:
t.sort_values(by = 'roc auc', ascending = False)

Unnamed: 0,recall,accuracy,roc auc,f1
CatBoost,0.65712,0.870926,0.928584,0.716182
XGBoost,0.665039,0.871064,0.927872,0.718521
AdaBoost,0.646949,0.86856,0.926875,0.709269
Gradient Boosting,0.65277,0.869825,0.926582,0.712818
Voting Classifier,0.560572,0.850543,0.906167,0.649907
Logistic Regression,0.606086,0.848326,0.903777,0.66451
Random Forest,0.507762,0.849299,0.902889,0.625517


Najwyższą jakość klasyfikacji uzyskały modele **CatBoost** oraz **XGBoost**