In [1]:
import pandas as pd
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier

# Load the data
df_model = pd.read_csv('einkommen.train.clean', header=0, sep=',')
df_model.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4580 entries, 0 to 4579
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   age               4580 non-null   int64 
 1   employment_type   4580 non-null   object
 2   weighting_factor  4580 non-null   int64 
 3   education         4580 non-null   object
 4   schooling         4580 non-null   int64 
 5   marital_status    4580 non-null   object
 6   employment_area   4580 non-null   object
 7   partnership       4580 non-null   object
 8   ethnicity         4580 non-null   object
 9   gender            4580 non-null   object
 10  gains             4580 non-null   int64 
 11  losses            4580 non-null   int64 
 12  worktime          4580 non-null   int64 
 13  country           4580 non-null   object
 14  income            4580 non-null   object
dtypes: int64(6), object(9)
memory usage: 536.8+ KB


# Transform Data & Training Preperation 

In [2]:
cat_col_keep = ["employment_type", "education", "marital_status", "employment_area", "partnership", "ethnicity"]

# Drop country and target column from the features 
X = df_model.drop(columns=["income", "country"])

# Convert the target column to binary
y = df_model["income"].apply(lambda x: -1 if x == "<=50K" else 1)

# Convert categorical features to binary
X["gender"] = X["gender"].apply(lambda x: 1 if x == "Male" else 0)
X = pd.get_dummies(X, columns=cat_col_keep)

X.info()
y.info()

# Define metrics to evaluate the model
scores = ["accuracy", "roc_auc"]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4580 entries, 0 to 4579
Data columns (total 62 columns):
 #   Column                                Non-Null Count  Dtype
---  ------                                --------------  -----
 0   age                                   4580 non-null   int64
 1   weighting_factor                      4580 non-null   int64
 2   schooling                             4580 non-null   int64
 3   gender                                4580 non-null   int64
 4   gains                                 4580 non-null   int64
 5   losses                                4580 non-null   int64
 6   worktime                              4580 non-null   int64
 7   employment_type_Federal-gov           4580 non-null   bool 
 8   employment_type_Local-gov             4580 non-null   bool 
 9   employment_type_Private               4580 non-null   bool 
 10  employment_type_Self-emp-inc          4580 non-null   bool 
 11  employment_type_Self-emp-not-inc      4580 

# Dummy Classifier

In [3]:
X_dummy = X.copy()
y_dummy = y.copy()

# most_frequent strategy always predicts the most frequent label in the training set
dummy = DummyClassifier(strategy='most_frequent')
dummy.fit(X_dummy, y_dummy)

print("Dummy Baseline score: ", dummy.score(X_dummy, y_dummy))

Dummy Baseline score:  0.7471615720524017


# SVC Training

In [4]:
X_svc = X.copy()
y_svc = y.copy()
# Normalize the data
X_svc = (X_svc - X_svc.mean()) / X_svc.std()

# Define the hyperparameters to search
param_grid = {
    "C": [0.01, 0.1, 0.5, 1.0, 2.0, 4.0, 5.0, 10.0, 50.0, 100.0],
    "kernel": ["linear"]
}

# Define and run the model with nested cross-validation
clf_svc = GridSearchCV(
    SVC(), 
    param_grid, 
    cv=5, 
    scoring=scores, 
    refit="roc_auc")
nested_scores_svc = cross_validate(
    clf_svc, 
    X=X_svc, 
    y=y_svc, 
    cv=4, 
    scoring=scores, 
    n_jobs=-1, 
    verbose=10, 
    return_train_score=True, 
    return_estimator=True)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


[CV] START .....................................................................
[CV] START .....................................................................
[CV] START .....................................................................
[CV] START .....................................................................
[CV] END  accuracy: (train=0.845, test=0.844) roc_auc: (train=0.906, test=0.897) total time= 9.5min


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  9.6min


[CV] END  accuracy: (train=0.845, test=0.847) roc_auc: (train=0.905, test=0.891) total time= 9.6min


[Parallel(n_jobs=-1)]: Done   2 out of   4 | elapsed:  9.7min remaining:  9.7min


[CV] END  accuracy: (train=0.848, test=0.832) roc_auc: (train=0.904, test=0.892) total time= 9.7min
[CV] END  accuracy: (train=0.847, test=0.838) roc_auc: (train=0.905, test=0.897) total time=10.5min


[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed: 10.6min finished


# Random Forest Training

In [5]:
X_rf = X.copy()
y_rf = y.copy()

# Define the hyperparameters to search
param_grid = {
    "n_estimators": [10, 100, 1000],
    "max_depth": [5, 10, 20, None],
    "max_features": ["sqrt", "log2", None],
    "min_samples_split": [0.1, 0.5, 1.0, 2],
    "min_samples_leaf": [0.1, 0.5, 1],
}

# Define and run the model with nested cross-validation
clf_rf = GridSearchCV(
    RandomForestClassifier(), 
    param_grid, 
    cv=5, 
    scoring=scores, 
    refit="roc_auc")
nested_scores_rf = cross_validate(
    clf_rf, 
    X=X_rf, 
    y=y_rf, 
    cv=4, 
    scoring=scores, 
    n_jobs=-1, 
    verbose=10, 
    return_train_score=True, 
    return_estimator=True)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


[CV] START .....................................................................
[CV] START .....................................................................
[CV] START .....................................................................
[CV] START .....................................................................
[CV] END  accuracy: (train=0.890, test=0.838) roc_auc: (train=0.959, test=0.896) total time=122.8min


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed: 122.8min


[CV] END  accuracy: (train=0.882, test=0.845) roc_auc: (train=0.956, test=0.913) total time=123.1min


[Parallel(n_jobs=-1)]: Done   2 out of   4 | elapsed: 123.1min remaining: 123.1min


[CV] END  accuracy: (train=0.883, test=0.863) roc_auc: (train=0.957, test=0.909) total time=124.1min
[CV] END  accuracy: (train=0.882, test=0.842) roc_auc: (train=0.959, test=0.904) total time=124.6min


[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed: 124.6min finished


# SVC Evaluation

In [6]:
# Print the cross_validation results (generalized performance)
results_svc = pd.DataFrame(nested_scores_svc)
print("Mean roc_auc: ", results_svc["test_roc_auc"].mean())
print("Mean accuracy: ", results_svc["test_accuracy"].mean())

# Print the best model parameters and their performance during tuning
for i in range(4):
    print("Best params: ", nested_scores_svc["estimator"][i].best_params_)
    print("Best roc_auc: ", nested_scores_svc["estimator"][i].best_score_)

Mean roc_auc:  0.8941301340607835
Mean accuracy:  0.840174672489083
Best params:  {'C': 0.1, 'kernel': 'linear'}
Best roc_auc:  0.8928872877692342
Best params:  {'C': 0.5, 'kernel': 'linear'}
Best roc_auc:  0.8924593717495203
Best params:  {'C': 1.0, 'kernel': 'linear'}
Best roc_auc:  0.8924219625648611
Best params:  {'C': 50.0, 'kernel': 'linear'}
Best roc_auc:  0.8930458160838416


# Random Forest Evaluation

In [7]:
# Print the cross_validation results (generalized performance)
results_rf = pd.DataFrame(nested_scores_svc)
print("Mean roc_auc: ", results_rf["test_roc_auc"].mean())
print("Mean accuracy: ", results_rf["test_accuracy"].mean())

# Print the best model parameters and their performance during tuning
for i in range(4):
    print("Best params: ", nested_scores_rf["estimator"][i].best_params_)
    print("Best roc_auc: ", nested_scores_rf["estimator"][i].best_score_)

Mean roc_auc:  0.8941301340607835
Mean accuracy:  0.840174672489083
Best params:  {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best roc_auc:  0.9027895880462286
Best params:  {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}
Best roc_auc:  0.9069672614706619
Best params:  {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}
Best roc_auc:  0.9070810418339977
Best params:  {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}
Best roc_auc:  0.9030795657071058


# Apply Model

In [8]:
# Load the prediction data
df_pred = pd.read_csv('einkommen.pred.clean', header=0, sep=',')
X_pred = df_pred.drop(columns=["income", "country"])

# Convert categorical features to binary
X_pred["gender"] = X_pred["gender"].apply(lambda x: 1 if x == "Male" else 0)
X_pred = pd.get_dummies(X_pred, columns=cat_col_keep)
# X_pred.info()

# Train the final model with the best hyperparameters on the whole training set
final_clf = SVC(C=1.0, kernel="linear")
final_clf.fit(X_svc, y_svc)

# Predict the target column on the prediction set and save the results
y_pred = final_clf.predict(X_pred)
df_pred["income"] = y_pred
df_pred["income"] = df_pred["income"].apply(lambda x: "<=50K" if x == -1 else ">50K")
df_pred.to_csv('einkommen.pred', sep=',', index=False)