In [589]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder,StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix , classification_report, roc_auc_score, recall_score
from sklearn.preprocessing import PolynomialFeatures
import category_encoders as ce
from sklearn.feature_selection import SelectKBest, chi2
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore')

from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
#import ssl # probably not needed on your machine, delete before release
#ssl._create_default_https_context = ssl._create_unverified_context # probably not needed on your machine, delete before release

In [546]:
# Raw census income table, fetched over HTTPS every time this cell runs.
df = pd.read_csv(
    "https://lovespreadsheet-tutorials.s3.amazonaws.com/APIDatasets/census_income_dataset.csv"
)

In [547]:
def pre_process(df):
    """Drop incomplete rows and separate the features from the target.

    Cells equal to the string "?" are treated as missing values. The work is
    done on a new frame, so the frame passed in by the caller is never
    modified and the function can be re-run on the same input.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "fnlwgt" and "income_level".

    Returns
    -------
    data : pandas.DataFrame
        The complete rows, without the "fnlwgt" and "income_level" columns.
    target : numpy.ndarray
        The "income_level" values of those same rows.
    """
    # replace() and dropna() both return new objects, leaving `df` untouched.
    cleaned = df.replace("?", np.nan).dropna()
    data = cleaned.drop(["fnlwgt", "income_level"], axis=1)
    target = np.array(cleaned["income_level"])
    return data, target

In [548]:
def get_split(df,test_size_fraction):
    """Pre-process `df` and cut it into stratified train and test parts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame accepted by `pre_process`.
    test_size_fraction : float
        Forwarded to `train_test_split` as `test_size`.

    Returns
    -------
    tuple
        (X_train, X_test, y_train, y_test); the split is seeded with 42 and
        stratified on the target.
    """
    features, labels = pre_process(df)
    parts = train_test_split(
        features,
        labels,
        test_size=test_size_fraction,
        random_state=42,
        stratify=labels,
    )
    return tuple(parts)

In [549]:
def grouping_marital(df):
    """Return a copy of `df` with "marital_status" reduced to two groups.

    The five statuses listed under 'Living-Alone' and the two listed under
    'Married' are relabelled; any other value is kept as it is.
    """
    groups = {
        'Living-Alone': ['Widowed', 'Divorced', 'Separated', 'Never-married', 'Married-spouse-absent'],
        'Married': ['Married-civ-spouse', 'Married-AF-spouse'],
    }
    lookup = {status: label for label, statuses in groups.items() for status in statuses}
    res = df.copy()
    res['marital_status'] = res['marital_status'].replace(lookup)
    return res

def grouping_ethnic(df):
    """Return a copy of `df` with the "race" column reduced to two labels.

    'Asian-Pac-Islander' and 'White' become '1stGroup'; 'Other', 'Black' and
    'Amer-Indian-Eskimo' become '2ndGroup'. Other values are left unchanged.
    """
    lookup = dict.fromkeys(['Asian-Pac-Islander', 'White'], '1stGroup')
    lookup.update(dict.fromkeys(['Other', 'Black', 'Amer-Indian-Eskimo'], '2ndGroup'))
    res = df.copy()
    res['race'] = res['race'].replace(lookup)
    return res

def grouping_education(df):
    """Return a copy of `df` with the "education" levels merged into coarser groups.

    Levels that appear in none of the groups below keep their original label.
    """
    groups = {
        'Obligatory': ['Preschool', '1st-4th', '5th-6th', '7th-8th', '9th', '10th', '11th', '12th'],
        'HS-college': ['HS-grad', 'Some-college'],
        'Assoc': ['Assoc-voc', 'Assoc-acdm'],
        'Academic': ['Prof-school', 'Doctorate'],
    }
    lookup = {level: label for label, levels in groups.items() for level in levels}
    res = df.copy()
    res['education'] = res['education'].replace(lookup)
    return res

def grouping_countries(df):
    """Return a copy of `df` with "native_country" replaced by an income band.

    The ordering of the countries comes from `grouping_countries_helper`;
    consecutive slices of that ordered list are mapped to the band labels
    below. Values that are not in the list are set to "Other" first.
    """
    ordered = grouping_countries_helper(df)
    bands = [
        (ordered[:11], 'Low-income'),
        (ordered[11:17], 'Lower-middle-income'),
        (ordered[17:23], 'Middle-income'),
        (ordered[23:26], 'Upper-middle-income'),
        # NOTE(review): position 32 is left out of every band, so that country
        # keeps its own name. Presumably intentional - confirm.
        (ordered[26:32] + ordered[33:], 'High-income'),
    ]

    res = df.copy()
    not_listed = ~res['native_country'].isin(ordered)
    res.loc[not_listed, "native_country"] = "Other"
    for members, band in bands:
        res['native_country'] = res['native_country'].replace(members, band)
    return res

def grouping_countries_helper(df):
    """List the countries found in `df`, ordered by the "GDP95" column of gdp.csv.

    Reads "gdp.csv" (semicolon separated) from the working directory and
    left-joins it on its "Country" column. The per-country mean of the binary
    income flag is computed and carried through the join, but only the
    country names, sorted by "GDP95", are returned.
    """
    gdp = pd.read_csv("gdp.csv", sep=";")
    flagged = df.copy()
    flagged["income_level"] = flagged["income_level"].map({'<=50K': 0, '>50K': 1})
    per_country = flagged.groupby("native_country")["income_level"].mean().reset_index()
    joined = pd.merge(per_country, gdp, left_on="native_country", right_on="Country", how="left")
    ranked = joined.sort_values(by="GDP95")
    return ranked["native_country"].tolist()

In [601]:
def test_model(model,data,label=None):
    if label is None:
        label = ""
    X_train,X_test,y_train,y_test = get_split(data,0.2)
    kfolds = 8
    split = KFold(n_splits=kfolds, shuffle=True, random_state=42)
    test_model = get_pipeline(model)
    cv_results = cross_val_score(test_model, 
                     X_train.drop("education_num", axis=1), y_train, 
                     cv=split,
                     scoring="accuracy",
                     n_jobs=-1)
    
    print(f" {label} cross validation accuarcy score: {round(np.mean(cv_results), 4)}\
        +/- {round(np.std(cv_results), 4)} (std) \t min: {round(min(cv_results), 4)},\
        max: {round(max(cv_results), 4)}")
    
    test_model.fit(X_train, y_train)
    return test_model.predict(X_test), test_model.predict_proba(X_test)[:, 1]

def get_pipeline(model):
    """Wrap `model` in a pipeline that encodes and scales the input columns.

    The categorical columns are one-hot encoded (categories unseen during
    fitting are ignored), the four numeric columns are standardised, and every
    other column is passed through unchanged.

    Parameters
    ----------
    model : estimator
        Final step of the pipeline, registered under the name 'model'.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Steps 'transformer' and 'model', in that order.
    """
    categorical_columns = [
        "workclass", "education", "marital_status", "occupation",
        "relationship", "race", "sex", "native_country",
    ]
    numeric_columns = ["age", "capital_gain", "capital_loss", "hours_per_week"]

    encoding_steps = [
        ("onehot", OneHotEncoder(handle_unknown='ignore'), categorical_columns),
        ("std_scaler", StandardScaler(), numeric_columns),
    ]
    transformer = ColumnTransformer(encoding_steps, remainder="passthrough")

    return Pipeline([('transformer', transformer), ('model', model)])

In [602]:
# Score each grouping on its own with the same baseline logistic regression.
transformations = [
    grouping_marital,
    grouping_ethnic,
    grouping_education,
    grouping_countries,
]
# The labels are simply the function names.
transformations_names = [grouping.__name__ for grouping in transformations]
model = LogisticRegression(max_iter=500, n_jobs=-1, random_state=42)
for transformation, name in zip(transformations, transformations_names):
    transformed_data = transformation(df)
    test_model(model, transformed_data, name)

 grouping_marital cross validation accuarcy score: 0.8485        +/- 0.0046 (std) 	 min: 0.8421,        max: 0.8565
 grouping_ethnic cross validation accuarcy score: 0.8484        +/- 0.0041 (std) 	 min: 0.8434,        max: 0.8549
 grouping_education cross validation accuarcy score: 0.8482        +/- 0.0047 (std) 	 min: 0.8417,        max: 0.8554
 grouping_countries cross validation accuarcy score: 0.8463        +/- 0.0057 (std) 	 min: 0.8341,        max: 0.8521


Bazując na tych wynikach, wybieramy `grouping_marital` jako docelowy sposób grupowania zmiennych kategorycznych. Doświadczenia z poprzednich etapów prac wskazały, że wielokrotne składanie grupowań nie pozwala osiągnąć wyższej dokładności (accuracy).

In [646]:
def result_tuned_model(df, model, *args):
    """Evaluate a classifier on the marital-grouped data, tuning it first if asked.

    This name used to be defined twice, so the later three-argument version
    silently replaced the five-argument one and the tuning calls failed on a
    fresh kernel. Both call shapes are now served by this single definition:

    * ``result_tuned_model(df, model, name)`` - `model` is a ready estimator
      and is evaluated as given.
    * ``result_tuned_model(df, model_type, param_test, param_grid, name)`` -
      `model` is an estimator class. `get_best_params` runs the search,
      `parse_params` cleans the result, and the class is instantiated with it.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data; `grouping_marital` is applied before evaluation.
    model : estimator or estimator class
        See the two call shapes above.
    *args
        Either ``(name,)`` or ``(param_test, param_grid, name)``.

    Returns
    -------
    tuple
        (test predictions, test probabilities, evaluated model), the first two
        exactly as returned by `test_model`.

    Raises
    ------
    TypeError
        If the number of positional arguments matches neither call shape.
    """
    if len(args) == 1:
        (name,) = args
        designated_model = model
    elif len(args) == 3:
        param_test, param_grid, name = args
        best_params = parse_params(get_best_params(model, param_test, param_grid, df))
        designated_model = model(**best_params)
    else:
        raise TypeError(
            "result_tuned_model expects (df, model, name) or "
            "(df, model_type, param_test, param_grid, name); "
            f"got {len(args) + 2} positional arguments"
        )

    transformed_data = grouping_marital(df)
    y_hat, y_hat_proba = test_model(designated_model, transformed_data, name)
    return y_hat, y_hat_proba, designated_model
    
def get_best_params(model_type, param_test, param_grid, df, n_iter=5, cv=3, random_state=42):
    """Run a seeded randomized hyper-parameter search and return the best setting.

    The estimator is built from `param_test`, wrapped by `get_pipeline`, and
    searched on the training part of the marital-grouped data (80/20 split).

    Parameters
    ----------
    model_type : callable
        Estimator class; called as ``model_type(**param_test)``.
    param_test : dict
        Fixed constructor arguments used during the search.
    param_grid : dict
        Search space, keyed by pipeline parameter names ("model__...").
    df : pandas.DataFrame
        Raw data; `grouping_marital` and `get_split` are applied to it.
    n_iter : int, optional
        Number of sampled settings (default 5, the former hard-coded value).
    cv : int, optional
        Number of cross-validation folds (default 3, as before).
    random_state : int, optional
        Seed for the sampling. The search used to be unseeded, so repeated
        runs could return different parameters.

    Returns
    -------
    dict
        `best_params_` of the fitted search, still carrying the pipeline
        prefixes.
    """
    # Renamed from `test_model` so the local no longer shadows that function.
    base_model = model_type(**param_test)
    pipeline = get_pipeline(base_model)
    randomizer = RandomizedSearchCV(
        pipeline, param_grid, cv=cv, n_iter=n_iter, random_state=random_state
    )
    # Only the training part is searched; the test part is not needed here.
    X_train, _, y_train, _ = get_split(grouping_marital(df), 0.2)
    randomizer.fit(X_train, y_train)
    return randomizer.best_params_


def parse_params(best_params):
    """Turn pipeline-prefixed search results into plain constructor arguments.

    Every occurrence of "model__" is removed from each key. "random_state" is
    then set to 42 and "n_jobs" to -1, overriding any values that were
    already present under those names.
    """
    cleaned = {key.replace("model__", ""): value for key, value in best_params.items()}
    cleaned.update(random_state=42, n_jobs=-1)
    return cleaned

In [651]:
# Each model has two dictionaries: `param_test_*` holds the fixed constructor
# arguments used while searching, `param_grid_*` holds the search space. Grid
# keys carry the "model__" prefix because the estimator is the pipeline step
# named 'model'.

# --- Logistic regression ---
param_test_lr = {
    "random_state": 42,
    "max_iter": 1000,
    "n_jobs": -1,
}
param_grid_lr = {
    'model__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'model__max_iter': [1000],
    'model__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
}

# --- Random forest ---
param_test_rf = {
    "random_state": 42,
    "n_jobs": -1,
}
param_grid_rf = {
    "model__max_depth": [3, None],
    "model__max_features": [1, 3, 10],
    "model__min_samples_leaf": [1, 3, 10],
    "model__bootstrap": [True, False],
    "model__criterion": ["gini", "entropy"],
}

# --- XGBoost ---
param_test_xgb = {
    "random_state": 42,
}
# NOTE(review): the recorded XGBoost output warns that `rate_drop` "might not
# be used" - presumably it only applies to the "dart" booster; confirm.
param_grid_xgb = {
    "model__max_depth": [3, 6, 9],
    "model__eta": [0.2, 0.25, 0.3, 0.35],
    "model__reg_lambda": [0.01, 0.1, 1, 10],
    "model__scale_pos_weight": [1/2, 1, 2],
    "model__booster": ["gbtree", "dart"],
    "model__rate_drop": [0.0, 0.1, 0.2],
    "model__eval_metric": ["logloss", "rmsle"],
}


In [626]:
# Random forest: hyper-parameter search followed by evaluation; keeps the
# test-set predictions, the predicted probabilities and the model.
y_hat_Rf, y_hat_prob_Rf, model_Rf = result_tuned_model(
    df, RandomForestClassifier, param_test_rf, param_grid_rf, "RandomForestClassifier"
)

 RandomForestClassifier cross validation accuarcy score: 0.8628        +/- 0.0036 (std) 	 min: 0.8578,        max: 0.8686


In [627]:
# Logistic regression: hyper-parameter search followed by evaluation; keeps
# the test-set predictions, the predicted probabilities and the model.
y_hat_Lr, y_hat_prob_Lr, model_Lr = result_tuned_model(
    df, LogisticRegression, param_test_lr, param_grid_lr, "LogisticRegression"
)

 LogisticRegression cross validation accuarcy score: 0.8485        +/- 0.0047 (std) 	 min: 0.8419,        max: 0.8565


In [628]:
# XGBoost: hyper-parameter search followed by evaluation; keeps the test-set
# predictions, the predicted probabilities and the model.
y_hat_Gb, y_hat_prob_Gb, model_Gb = result_tuned_model(
    df, XGBClassifier, param_test_xgb, param_grid_xgb, "XGBClassifier"
)

 XGBClassifier cross validation accuarcy score: 0.8684        +/- 0.0051 (std) 	 min: 0.86,        max: 0.8766


In [None]:
# Split once more so the test-set features and labels are available to the
# evaluation cells below. get_split is seeded, so repeating the call on the
# same grouped data yields the same partition.
X_train, X_test, y_train, y_test = get_split(grouping_marital(df), 0.2)

# RandomForestClassifier

In [373]:
# Test-set metrics for the random forest: per-class report, ROC AUC and
# confusion matrix, emitted as three newline-separated sections.
print(
    f"Bulit in:\n {classification_report(y_test, y_hat_Rf)}",
    f"Roc_Auc_Score: {roc_auc_score(y_test, y_hat_prob_Rf)}\n",
    f"Confusion matrix:\n {confusion_matrix(y_test, y_hat_Rf)}",
    sep="\n",
)

Bulit in:
               precision    recall  f1-score   support

       <=50K       0.88      0.94      0.91      6803
        >50K       0.77      0.60      0.68      2242

    accuracy                           0.86      9045
   macro avg       0.82      0.77      0.79      9045
weighted avg       0.85      0.86      0.85      9045

Roc_Auc_Score: 0.9155707136078785

Confusion matrix:
 [[6399  404]
 [ 886 1356]]


# LogisticRegression

In [577]:
# Test-set metrics for the logistic regression: per-class report, ROC AUC and
# confusion matrix, emitted as three newline-separated sections.
print(
    f"Bulit in:\n {classification_report(y_test, y_hat_Lr)}",
    f"Roc_Auc_Score: {roc_auc_score(y_test, y_hat_prob_Lr)}\n",
    f"Confusion matrix:\n {confusion_matrix(y_test, y_hat_Lr)}",
    sep="\n",
)

Bulit in:
               precision    recall  f1-score   support

       <=50K       0.87      0.93      0.90      6803
        >50K       0.73      0.59      0.65      2242

    accuracy                           0.84      9045
   macro avg       0.80      0.76      0.77      9045
weighted avg       0.84      0.84      0.84      9045

Roc_Auc_Score: 0.9013086266317676

Confusion matrix:
 [[6310  493]
 [ 930 1312]]


# XGBClassifier

In [619]:
# Test-set metrics for the XGBoost model: per-class report, ROC AUC and
# confusion matrix, emitted as three newline-separated sections.
print(
    f"Bulit in:\n {classification_report(y_test, y_hat_Gb)}",
    f"Roc_Auc_Score: {roc_auc_score(y_test, y_hat_prob_Gb)}\n",
    f"Confusion matrix:\n {confusion_matrix(y_test, y_hat_Gb)}",
    sep="\n",
)

Bulit in:
               precision    recall  f1-score   support

       <=50K       0.89      0.94      0.91      6803
        >50K       0.78      0.65      0.71      2242

    accuracy                           0.87      9045
   macro avg       0.84      0.79      0.81      9045
weighted avg       0.86      0.87      0.86      9045

Roc_Auc_Score: 0.9265574640877726

Confusion matrix:
 [[6402  401]
 [ 789 1453]]


# Voting

In [620]:
# Members of the voting ensemble, as (name, estimator) pairs.
estimators = [
    ('RFC', model_Rf),
    ('LR', model_Lr),
    ('XGB', model_Gb),
]

In [644]:
# Soft-voting ensemble; the weights follow the order of `estimators`.
model_soft = VotingClassifier(
    estimators=estimators,
    voting='soft',
    weights=[0.1, 0.1, 0.8],
)

In [649]:
# Evaluate the voting ensemble as configured; no hyper-parameter search here.
y_hat_v, y_hat_prob_v, model_v = result_tuned_model(
    df, model_soft, "model_soft"
)

 model_soft cross validation accuarcy score: 0.8713        +/- 0.0051 (std) 	 min: 0.8644,        max: 0.8793
Parameters: { rate_drop } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [650]:
# Test-set metrics for the voting ensemble: per-class report, ROC AUC and
# confusion matrix, emitted as three newline-separated sections.
print(
    f"Bulit in:\n {classification_report(y_test, y_hat_v)}",
    f"Roc_Auc_Score: {roc_auc_score(y_test, y_hat_prob_v)}\n",
    f"Confusion matrix:\n {confusion_matrix(y_test, y_hat_v)}",
    sep="\n",
)

Bulit in:
               precision    recall  f1-score   support

       <=50K       0.89      0.94      0.91      6803
        >50K       0.79      0.64      0.70      2242

    accuracy                           0.87      9045
   macro avg       0.84      0.79      0.81      9045
weighted avg       0.86      0.87      0.86      9045

Roc_Auc_Score: 0.9256260323835197

Confusion matrix:
 [[6413  390]
 [ 811 1431]]


In [512]:
#df1 = df.copy()
#target_encoder = ce.TargetEncoder()
#target_encoder1 = ce.TargetEncoder()
#target_encoder2 = ce.TargetEncoder()
#target_encoder3 = ce.TargetEncoder()
#target_encoder4 = ce.TargetEncoder()
#target_encoder5 = ce.TargetEncoder()
#target_encoder6 = ce.TargetEncoder()
#target_encoder7 = ce.TargetEncoder()
#cat_features = ["workclass", "education", "marital_status",
#                    "occupation", "relationship", "race", 
#                    "sex", "native_country"]
#df1["income_level"] = df1.loc[:,"income_level"].map({'<=50K': 0, '>50K': 1})
#df1["workclass"] = target_encoder.fit_transform(df1["workclass"], df1["income_level"])
#df1["education"] = target_encoder1.fit_transform(df1["education"], df1["income_level"])
#df1["marital_status"] = target_encoder2.fit_transform(df1["marital_status"], df1["income_level"])
#df1["occupation"] = target_encoder3.fit_transform(df1["occupation"], df1["income_level"])
#df1["relationship"] = target_encoder4.fit_transform(df1["relationship"], df1["income_level"])
#df1["race"] = target_encoder5.fit_transform(df1["race"], df1["income_level"])
#df1["sex"] = target_encoder6.fit_transform(df1["sex"], df1["income_level"])
#df1["native_country"] = target_encoder7.fit_transform(df1["native_country"], df1["income_level"])
#X_c, y_c = pre_process(df1)

In [513]:
#pf = PolynomialFeatures(degree=1)
#X_features = pf.fit_transform(X_c)
#listFeat = pf.get_feature_names(X_c.columns)

In [514]:
#def feature_names(selector):
#    return np.array(pf.get_feature_names(X_c.columns))[selector.get_support()]

In [515]:
#def indexBest(k, bestFeat, listFeat):
#    chi2_selector = SelectKBest(chi2, k=k)
#    chi2_selector.fit_transform(X_features, y_c)
#    bestFeat = feature_names(chi2_selector)
#    indexs = []
#    for el in bestFeat:
#        idx = listFeat.index(el)
#        indexs.append(idx)
#    return indexs

In [516]:
#idx = indexBest(10, bestFeat, listFeat)

In [517]:
#X_c = pd.DataFrame(np.matrix(X_features)[:,idx])

In [518]:
#X_tr, X_te, y_tr, y_te = train_test_split(X_c, y_c,
#                        test_size = 0.2,
#                        random_state = 42,
#                        stratify= y_c)

In [538]:
#lr = LogisticRegression()
#lr.fit(X_tr, y_tr)
#y_hat = lr.predict(X_te)
#confusion_matrix(y_te, y_hat)


array([[6922,  509],
       [1023, 1315]])

In [539]:
#print(classification_report(y_te, y_hat))

              precision    recall  f1-score   support

           0       0.87      0.93      0.90      7431
           1       0.72      0.56      0.63      2338

    accuracy                           0.84      9769
   macro avg       0.80      0.75      0.77      9769
weighted avg       0.84      0.84      0.84      9769

