Introduction:

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, fbeta_score, make_scorer

# Load the data
data = pd.read_csv("census.csv")

# Check for missing values
print("Missing values per column:\n", data.isnull().sum())

# Target variable
income_raw = data['income']
features_raw = data.drop('income', axis=1)

# Define numerical and categorical columns
numerical_features = ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
categorical_features = ['workclass', 'education_level', 'marital-status', 'occupation', 
                        'relationship', 'race', 'sex', 'native-country']

# Log-transform skewed features
features_raw['capital-gain'] = np.log1p(features_raw['capital-gain'])
features_raw['capital-loss'] = np.log1p(features_raw['capital-loss'])

# Define preprocessing for numerical features
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Define preprocessing for categorical features
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Apply preprocessing
features_processed = preprocessor.fit_transform(features_raw)

# Convert target variable to numerical
income = income_raw.apply(lambda x: 1 if x == '>50K' else 0)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features_processed, 
                                                    income, 
                                                    test_size=0.2, 
                                                    random_state=42)

# Show results of the split
print(f"Training set has {X_train.shape[0]} samples.")
print(f"Testing set has {X_test.shape[0]} samples.")


Missing values per column:
 age                0
workclass          0
education_level    0
education-num      0
marital-status     0
occupation         0
relationship       0
race               0
sex                0
capital-gain       0
capital-loss       0
hours-per-week     0
native-country     0
income             0
dtype: int64
Training set has 36177 samples.
Testing set has 9045 samples.


Now try a model:

In [5]:
# Define an fbeta score with beta=0.5
scorer = make_scorer(fbeta_score, beta=0.5)

# Random Forest with GridSearchCV
rf_clf = RandomForestClassifier(random_state=42)

rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

rf_grid_search = GridSearchCV(estimator=rf_clf, param_grid=rf_param_grid, scoring=scorer, cv=5, n_jobs=-1)
rf_grid_search.fit(X_train, y_train)

# Best parameters and scores
print("Random Forest Best Parameters:", rf_grid_search.best_params_)
rf_best = rf_grid_search.best_estimator_

rf_predictions = rf_best.predict(X_test)

# Evaluate Random Forest
rf_accuracy = accuracy_score(y_test, rf_predictions)
rf_f_score = fbeta_score(y_test, rf_predictions, beta=0.5)

print("\nOptimized Random Forest Model\n------")
print("Accuracy score on testing data: {:.4f}".format(rf_accuracy))
print("F-score on testing data: {:.4f}".format(rf_f_score))

# Logistic Regression with GridSearchCV
log_reg = LogisticRegression(random_state=42, max_iter=1000)

log_reg_param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['lbfgs', 'liblinear']
}

log_reg_grid_search = GridSearchCV(estimator=log_reg, param_grid=log_reg_param_grid, scoring=scorer, cv=5, n_jobs=-1)
log_reg_grid_search.fit(X_train, y_train)

print()
# Best parameters and scores
print("Logistic Regression Best Parameters:", log_reg_grid_search.best_params_)
log_reg_best = log_reg_grid_search.best_estimator_

log_reg_predictions = log_reg_best.predict(X_test)

# Evaluate Logistic Regression
log_reg_accuracy = accuracy_score(y_test, log_reg_predictions)
log_reg_f_score = fbeta_score(y_test, log_reg_predictions, beta=0.5)

print("\nOptimized Logistic Regression Model\n------")
print("Accuracy score on testing data: {:.4f}".format(log_reg_accuracy))
print("F-score on testing data: {:.4f}".format(log_reg_f_score))

Random Forest Best Parameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}

Optimized Random Forest Model
------
Accuracy score on testing data: 0.8636
F-score on testing data: 0.7573

Logistic Regression Best Parameters: {'C': 1, 'solver': 'liblinear'}

Optimized Logistic Regression Model
------
Accuracy score on testing data: 0.8451
F-score on testing data: 0.7094
