In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder,StandardScaler
from sklearn.metrics import accuracy_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
import ssl # probably not needed on your machine, delete before release
ssl._create_default_https_context = ssl._create_unverified_context # probably not needed on your machine, delete before release



In [2]:
df = pd.read_csv("https://lovespreadsheet-tutorials.s3.amazonaws.com/APIDatasets/census_income_dataset.csv")

In [3]:
def pre_process(df):
    df[df == "?"] = np.nan
    df.dropna(inplace = True)
    data = df.drop(["fnlwgt", "income_level"], axis=1)
    target = np.array(df["income_level"])
    return data,target

In [4]:
def get_split(df,test_size_fraction):
    data, target = pre_process(df)
    X_train, X_test, y_train, y_test = train_test_split(data, target,
                                                        test_size = test_size_fraction,
                                                        random_state = 42,
                                                       stratify=target)
    return X_train, X_test, y_train, y_test

In [5]:
def grouping_marital(df):
    res = df.copy()
    res['marital_status'] = res['marital_status'].replace(
        ['Widowed', 'Divorced', 'Separated', 'Never-married', 'Married-spouse-absent'], 'Living-Alone')
    res['marital_status'] = res['marital_status'].replace(
        ['Married-civ-spouse', 'Married-AF-spouse'], 'Married')
    return res

def grouping_ethnic(df):
    res = df.copy()
    res['race'] = res['race'].replace(['Asian-Pac-Islander', 'White'], '1stGroup')
    res['race'] = res['race'].replace(['Other', 'Black', 'Amer-Indian-Eskimo'], '2ndGroup')
    return res

def grouping_education(df):
    res = df.copy()
    res['education'] = res['education'].replace(
            ['Preschool', '1st-4th', '5th-6th', '7th-8th', '9th', '10th', '11th', '12th'], 'Obligatory')
    res['education'] = res['education'].replace(['HS-grad', 'Some-college'], 'HS-college')
    res['education'] = res['education'].replace(['Assoc-voc', 'Assoc-acdm'], 'Assoc')
    res['education'] = res['education'].replace(['Prof-school', 'Doctorate'], 'Academic')
    return res

def grouping_countries(df):
    countries_list = grouping_countries_helper(df)
    res = df.copy()
    res.loc[~res['native_country'].isin(countries_list), "native_country"] = "Other"
    res['native_country'] = res['native_country'].replace(countries_list[:11], 'Low-income')
    res['native_country'] = res['native_country'].replace(countries_list[11:17], 'Lower-middle-income')
    res['native_country'] = res['native_country'].replace(countries_list[17:23], 'Middle-income')
    res['native_country'] = res['native_country'].replace(countries_list[23:26], 'Upper-middle-income')
    res['native_country'] = res['native_country'].replace(countries_list[26:32]+countries_list[33:], 'High-income')
    return res

def grouping_countries_helper(df):
    gdp = pd.read_csv("gdp.csv", sep=";")
    df2 = df.copy()
    df2["income_level"] = df2.loc[:,"income_level"].map({'<=50K': 0, '>50K': 1})
    df2 = df2.groupby("native_country")["income_level"].mean().reset_index()
    countries = pd.merge(df2, gdp, left_on = "native_country", right_on = "Country", how = "left").sort_values(by = "GDP95")
    countries_list = list(countries["native_country"])
    return countries_list

In [6]:
def test_model(model,data,label=None):
    if label is None:
        label = ""
    X_train,X_test,y_train,y_test = get_split(data,0.2)
    kfolds = 8
    split = KFold(n_splits=kfolds, shuffle=True, random_state=42)
    cv_results = cross_val_score(get_pipeline(model), 
                     X_train.drop("education_num", axis=1), y_train, 
                     cv=split,
                     scoring="accuracy",
                     n_jobs=-1)
    
    print(f" {label} cross validation accuarcy score: {round(np.mean(cv_results), 4)} +/- {round(np.std(cv_results), 4)} (std) \t min: {round(min(cv_results), 4)}, max: {round(max(cv_results), 4)}")
    
def get_pipeline(model):
    cat_features = ["workclass", "education", "marital_status",
                    "occupation", "relationship", "race", 
                    "sex", "native_country"]     

    transformer = ColumnTransformer(
        [
        ("onehot", OneHotEncoder(handle_unknown = 'ignore'), cat_features), 
        ("std_scaler", StandardScaler(), ["age", "capital_gain", "capital_loss", "hours_per_week"])
        ],
        remainder = "passthrough"
    )

    model_pipeline = Pipeline(
        [
            ('transformer', transformer),
            ('model', model)
        ]
    )
    return model_pipeline

In [7]:
transformations = [grouping_marital,grouping_ethnic,grouping_education,grouping_countries]
transformations_names = ["grouping_marital","grouping_ethnic","grouping_education","grouping_countries"]
model = LogisticRegression(random_state=42, n_jobs=-1,max_iter=500)
for transformation, name in zip(transformations,transformations_names):
    transformed_data = transformation(df)
    test_model(model, transformed_data,name)

 grouping_marital cross validation accuarcy score: 0.8484 +/- 0.0046 (std) 	 min: 0.8421, max: 0.8563
 grouping_ethnic cross validation accuarcy score: 0.8483 +/- 0.0041 (std) 	 min: 0.8432, max: 0.8549
 grouping_education cross validation accuarcy score: 0.8482 +/- 0.0047 (std) 	 min: 0.8417, max: 0.8554
 grouping_countries cross validation accuarcy score: 0.8462 +/- 0.0057 (std) 	 min: 0.8341, max: 0.8518


Bazujących na tych wynikach, wybieramy `grouping_marital` jako docelowy sposób grupowania kategorycznego. Doświadczenia z poprzednich etapów prac, wskazały, że wielokrotne składanie grupowań nie pozwala osiągnać wyższej precyzji.

In [39]:
def result_tuned_model(df,model_type, param_test, param_grid, name):
    best_params = parse_params(get_best_params(model_type, param_test, param_grid, df))
    designated_model = model_type(**best_params)
    transformed_data = grouping_marital(df)
    test_model(designated_model,transformed_data,name)
    
    
def get_best_params(model_type, param_test, param_grid, df):
    test_model = model_type(**param_test)
    pipeline = get_pipeline(test_model)
    randomizer = RandomizedSearchCV(pipeline, param_grid, cv=3, n_iter=5)
    X_train, X_test, y_train, y_test = get_split(grouping_marital(df),0.2)
    randomizer.fit(X_train,y_train)
    return randomizer.best_params_


def parse_params(best_params):
    parsed_dicitionary_params = dict()
    for k in best_params.keys():
        parsed_dicitionary_params[k.replace("model__","")] = best_params[k]
    parsed_dicitionary_params["random_state"] = 42
    parsed_dicitionary_params["n_jobs"] = -1
    return parsed_dicitionary_params

In [43]:
param_grid_lr = {
    'model__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'model__max_iter': [1000],
    'model__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
                }

param_test_lr = {
                "random_state": 42,
                "max_iter": 1000,
                 "n_jobs": -1
                }
param_test_rf = {
                "random_state": 42,
                 "n_jobs": -1
                }
param_grid_rf = {
            "model__max_depth": [3, None],
            "model__max_features": [1, 3, 10],
            "model__min_samples_leaf": [1, 3, 10],
            "model__bootstrap": [True, False],
            "model__criterion": ["gini", "entropy"]
                }

In [45]:
result_tuned_model(df,RandomForestClassifier,param_test_rf,param_grid_rf,"RandomForestClassifier")

 RandomForestClassifier cross validation accuarcy score: 0.862 +/- 0.0036 (std) 	 min: 0.856, max: 0.8675


In [None]:
result_tuned_model(df,LogisticRegression,param_test_lr, param_grid_lr,"LogisticRegression")