In [132]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_validate, GridSearchCV, KFold
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier


cols = ['age', 'employment_type', 'weighting_factor', 'education', 'schooling', 'marital_status', 'employment_area', 'partnership', 'ethnicity', 'gender', 'gains', 'losses', 'worktime', 'country', 'income']

df_model = pd.read_csv('einkommen.train.clean', header=0, sep=',')
df_model.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4580 entries, 0 to 4579
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   age               4580 non-null   int64 
 1   employment_type   4580 non-null   object
 2   weighting_factor  4580 non-null   int64 
 3   education         4580 non-null   object
 4   schooling         4580 non-null   int64 
 5   marital_status    4580 non-null   object
 6   employment_area   4580 non-null   object
 7   partnership       4580 non-null   object
 8   ethnicity         4580 non-null   object
 9   gender            4580 non-null   object
 10  gains             4580 non-null   int64 
 11  losses            4580 non-null   int64 
 12  worktime          4580 non-null   int64 
 13  country           4580 non-null   object
 14  income            4580 non-null   int64 
dtypes: int64(7), object(8)
memory usage: 536.8+ KB


# Transform Data & Training Preperation 

In [138]:
cat_col_keep = ["employment_type", "education", "marital_status", "employment_area", "partnership", "ethnicity"]

X = df_model.drop(columns=["income", "country"])
y = df_model["income"]

y["income"] = y["income"].apply(lambda x: -1 if x == "<=50K" else 1)
X["gender"] = X["gender"].apply(lambda x: 1 if x == "Male" else 0)
X = pd.get_dummies(X, columns=cat_col_keep)

X.info()
y.info()

inner_cv = KFold(n_splits=5, shuffle=True, random_state=42)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)
scores = ["accuracy", "roc_auc"]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4580 entries, 0 to 4579
Data columns (total 62 columns):
 #   Column                                Non-Null Count  Dtype
---  ------                                --------------  -----
 0   age                                   4580 non-null   int64
 1   weighting_factor                      4580 non-null   int64
 2   schooling                             4580 non-null   int64
 3   gender                                4580 non-null   int64
 4   gains                                 4580 non-null   int64
 5   losses                                4580 non-null   int64
 6   worktime                              4580 non-null   int64
 7   employment_type_Federal-gov           4580 non-null   bool 
 8   employment_type_Local-gov             4580 non-null   bool 
 9   employment_type_Private               4580 non-null   bool 
 10  employment_type_Self-emp-inc          4580 non-null   bool 
 11  employment_type_Self-emp-not-inc      4580 

# Dummy Classifier

In [134]:
X_dummy = X.copy()
y_dummy = y.copy()

dummy = DummyClassifier(strategy='most_frequent')
dummy.fit(X_dummy, y_dummy)

print("Dummy score: ", dummy.score(X_dummy, y_dummy))

Dummy score:  0.7471615720524017


# SVC Training

In [147]:
X_svc = X.copy()
y_svc = y.copy()
X_svc = (X_svc - X_svc.mean()) / X_svc.std()

param_grid = {
    "C": [0.01, 0.1, 1.0, 2.0, 5.0, 10.0, 50.0, 100.0],
    "kernel": ["linear"]
}

clf_svc = GridSearchCV(
    SVC(), 
    param_grid, 
    cv=inner_cv, 
    scoring=scores, 
    refit="roc_auc")
nested_scores_svc = cross_validate(
    clf_svc, 
    X=X_svc, 
    y=y_svc, 
    cv=outer_cv, 
    scoring=scores, 
    n_jobs=-1, 
    verbose=10, 
    return_train_score=True, 
    return_estimator=True)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


[CV] START .....................................................................
[CV] START .....................................................................
[CV] START .....................................................................
[CV] START .....................................................................
[CV] END  accuracy: (train=0.853, test=0.821) roc_auc: (train=0.906, test=0.888) total time=103.2min
[CV] START .....................................................................
[CV] END  accuracy: (train=0.846, test=0.852) roc_auc: (train=0.904, test=0.898) total time=117.2min
[CV] END  accuracy: (train=0.843, test=0.835) roc_auc: (train=0.904, test=0.899) total time=118.6min
[CV] END  accuracy: (train=0.849, test=0.829) roc_auc: (train=0.904, test=0.895) total time=126.1min
[CV] END  accuracy: (train=0.846, test=0.852) roc_auc: (train=0.903, test=0.899) total time=45.1min


# Random Forest Training

In [None]:
X_rf = X.copy()
y_rf = y.copy()

param_grid = {
    "n_estimators": [10, 100, 1000],
    "max_depth": [5, 10, 20, None],
    "max_features": ["sqrt", "log2", None]
}

clf_rf = GridSearchCV(
    RandomForestClassifier(), param_grid, cv=inner_cv, scoring=scores, refit="roc_auc")
nested_scores_rf = cross_validate(
    clf_rf, X=X_rf, y=y_rf, cv=outer_cv, scoring=scores, n_jobs=-1, verbose=10, return_train_score=True, return_estimator=True)

# SVC Evaluation

In [148]:
results_svc = pd.DataFrame(nested_scores_svc)
print("Mean roc_auc: ", results_svc["test_roc_auc"].mean())
print("Mean accuracy: ", results_svc["test_accuracy"].mean())

for i in range(5):
    print("Best params: ", 
          nested_scores_svc["estimator"][i].best_params_)
    print("Best roc_auc: ", 
          nested_scores_svc["estimator"][i].best_score_)

Mean roc_auc:  0.8957119661306472
Mean accuracy:  0.8375545851528384
Best params:  {'C': 5.0, 'gamma': 'scale', 'kernel': 'linear'}
Best roc_auc:  0.8930409541936417
Best params:  {'C': 50.0, 'gamma': 'scale', 'kernel': 'linear'}
Best roc_auc:  0.8943720468924046
Best params:  {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
Best roc_auc:  0.893675127089389
Best params:  {'C': 1.0, 'gamma': 'scale', 'kernel': 'linear'}
Best roc_auc:  0.898217209499401
Best params:  {'C': 100.0, 'gamma': 'scale', 'kernel': 'linear'}
Best roc_auc:  0.893617425301563


# Random Forest Evaluation

In [152]:
results_rf = pd.DataFrame(nested_scores_svc)
print("Mean roc_auc: ", results_rf["test_roc_auc"].mean())
print("Mean accuracy: ", results_rf["test_accuracy"].mean())
for i in range(5):
    print("Best params: ", nested_scores_rf["estimator"][i].best_params_)
    print("Best roc_auc: ", nested_scores_rf["estimator"][i].best_score_)

Mean roc_auc:  0.8957119661306472
Mean accuracy:  0.8375545851528384
Best params:  {'max_depth': 10, 'max_features': 'sqrt', 'n_estimators': 1000}
Best roc_auc:  0.9068683022875407
Best params:  {'max_depth': 10, 'max_features': 'sqrt', 'n_estimators': 1000}
Best roc_auc:  0.9051844479782121
Best params:  {'max_depth': 10, 'max_features': 'sqrt', 'n_estimators': 1000}
Best roc_auc:  0.9056513966073068
Best params:  {'max_depth': 10, 'max_features': 'sqrt', 'n_estimators': 1000}
Best roc_auc:  0.9083430425703034
Best params:  {'max_depth': 10, 'max_features': 'sqrt', 'n_estimators': 100}
Best roc_auc:  0.9046513778541904


# Apply Model

In [157]:
df_pred = pd.read_csv('einkommen.pred.clean', header=0, sep=',')
X_pred = df_pred.drop(columns=["income", "country"])

X_pred["gender"] = X_pred["gender"].apply(lambda x: 1 if x == "Male" else 0)
X_pred = pd.get_dummies(X_pred, columns=cat_col_keep)
# X_pred.info()

final_clf = SVC(C=1.0, kernel="linear")
final_clf.fit(X_svc, y_svc)

y_pred = final_clf.predict(X_pred)
df_pred["income"] = y_pred
df_pred["income"] = df_pred["income"].apply(lambda x: "<=50K" if x == -1 else ">50K")
df_pred.to_csv('einkommen.pred', sep=',', index=False)