In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier, BaggingClassifier, VotingClassifier, StackingClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, log_loss, classification_report, confusion_matrix, roc_auc_score, roc_curve, auc
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier, PassiveAggressiveClassifier, Lasso, ElasticNet
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder, LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

In [26]:
# Load the labelled training rows and the unlabelled rows to be predicted.
# Both CSV files are read from the current working directory.
train = pd.read_csv('haneen_train.csv')
test = pd.read_csv('haneen_test.csv')

In [27]:
# Show the first five training rows as a quick sanity check of the columns.
train.head(n=5)

Unnamed: 0,age_years,workclass,final_weight,education,educational-num,marital-status,job_title,relationship,race,gender,capital-gain,capital-loss,hours-per-week,country_of_origin,income,age_years_groups
0,32,Private,408328,seventh_eighth_grade,4,Married-civ-spouse,skilled_artisan,Husband,White,Male,0.0,0.0,40.0,United-States,0,20<age<40
1,68,Self-emp-not-inc,140892,masters_degree,14,Married-civ-spouse,logistics_transport,Husband,White,Male,0.0,0.0,15.0,United-States,0,60<age<80
2,50,Private,146015,high_school_graduate,9,Married-civ-spouse,skilled_artisan,Husband,White,Male,0.0,0.0,40.0,United-States,0,40<age<60
3,28,Private,44216,high_school_graduate,9,Never-married,security_personnel,Unmarried,Amer-Indian-Eskimo,Female,0.0,0.0,40.0,United-States,0,20<age<40
4,43,Private,171888,some_college,10,Divorced,administrative_assistant,Not-in-family,White,Female,0.0,0.0,40.0,United-States,0,40<age<60


In [28]:
# Per-column count of missing values in the training frame.
train.isnull().sum()

age_years            0
workclass            0
final_weight         0
education            0
educational-num      0
marital-status       0
job_title            0
relationship         0
race                 0
gender               0
capital-gain         0
capital-loss         0
hours-per-week       0
country_of_origin    0
income               0
age_years_groups     0
dtype: int64

In [29]:
# Per-column count of missing values in the test frame.
test.isnull().sum()

age_years            0
workclass            0
final_weight         0
education            0
educational-num      0
marital-status       0
job_title            0
relationship         0
race                 0
gender               0
capital-gain         0
capital-loss         0
hours-per-week       0
country_of_origin    0
dtype: int64

In [30]:
# (rows, columns) of the training frame followed by the test frame.
(train.shape, test.shape)

((25947, 16), (6489, 14))

In [32]:
# Remove 'age_years_groups' and 'educational-num' before the feature matrix is
# built. Reassigning the result (instead of dropping in place) together with
# errors='ignore' keeps this cell idempotent: running it a second time, when
# the columns are already gone, no longer raises a KeyError.
train = train.drop(columns=['age_years_groups', 'educational-num'], errors='ignore')

In [33]:
# Separate the 'income' target from the remaining predictor columns.
y = train['income']
x = train.drop(columns='income')

# Hold out 20% of the rows for validation; the fixed seed makes the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [34]:
# Partition the feature columns by dtype so each group gets its own transform.
num_cols = list(x.select_dtypes(include='number').columns)
cat_cols = list(x.select_dtypes(exclude='number').columns)

# Numeric columns are standardised.
sc = StandardScaler()
num_transformer = Pipeline([('scaler', sc)])

# Non-numeric columns are one-hot encoded; handle_unknown='ignore' stops the
# encoder from raising on categories that were not present during fit.
ohe = OneHotEncoder(handle_unknown='ignore')
cat_transformer = Pipeline([('ohe', ohe)])

# Route each column group through its transformer. Columns not listed in
# either group are not passed on (ColumnTransformer's default remainder).
preprocessor = ColumnTransformer([
    ('num', num_transformer, num_cols),
    ('cat', cat_transformer, cat_cols),
])

In [35]:
# Display which columns ended up in the numeric and categorical groups.
(num_cols, cat_cols)

(['age_years',
  'final_weight',
  'capital-gain',
  'capital-loss',
  'hours-per-week'],
 ['workclass',
  'education',
  'marital-status',
  'job_title',
  'relationship',
  'race',
  'gender',
  'country_of_origin'])

In [36]:
# Baseline model: a 200-tree random forest behind the shared preprocessing step.
rf = RandomForestClassifier(random_state=42, n_estimators=200)
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', rf)
])

# Fit on the training split only, then score the held-out validation split.
pipe.fit(x_train, y_train)
y_pred = pipe.predict(x_test)
y_pred_proba = pipe.predict_proba(x_test)

print('Accuracy:', accuracy_score(y_test, y_pred))
print('Log Loss:', log_loss(y_test, y_pred_proba))
# The report and the matrix render as multi-line text. Starting them on their
# own line (sep='\n') keeps their columns aligned; printing them right after
# the label shifted the first row out of line with the rest.
print('Classification Report:', classification_report(y_test, y_pred), sep='\n')
print('Confusion Matrix:', confusion_matrix(y_test, y_pred), sep='\n')

Accuracy: 0.8323699421965318
Log Loss: 0.37511634981443537
Classification Report:               precision    recall  f1-score   support

           0       0.87      0.87      0.87      3343
           1       0.77      0.76      0.76      1847

    accuracy                           0.83      5190
   macro avg       0.82      0.82      0.82      5190
weighted avg       0.83      0.83      0.83      5190

Confusion Matrix: [[2921  422]
 [ 448 1399]]


In [38]:
# CatBoost with 1000 boosting iterations and a fixed seed, evaluated on the
# validation split behind the shared preprocessing step.
cat = CatBoostClassifier(iterations=1000, random_state=42, verbose=0)
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', cat),
])
pipe.fit(x_train, y_train)

# Class probabilities feed the log loss; hard labels feed the accuracy.
y_pred_proba = pipe.predict_proba(x_test)
y_pred = pipe.predict(x_test)

print('Accuracy:', accuracy_score(y_test, y_pred))
print('Log Loss:', log_loss(y_test, y_pred_proba))

Accuracy: 0.8608863198458574
Log Loss: 0.31033728395907084


In [40]:
# Base learners for the stacked ensemble, all seeded with 42.
rf = RandomForestClassifier(n_estimators=200, random_state=42)
gb = GradientBoostingClassifier(n_estimators=750, random_state=42)
cat = CatBoostClassifier(iterations=1000, random_state=42, verbose=0)
lgb = LGBMClassifier(n_estimators=150, random_state=42, verbose=-1)
xg = XGBClassifier(n_estimators=50, random_state=42, verbosity=0)

# A logistic regression combines the base learners' outputs.
base_estimators = [
    ('rf', rf),
    ('gb', gb),
    ('cat', cat),
    ('lgb', lgb),
    ('xg', xg),
]
stack = StackingClassifier(
    estimators=base_estimators,
    final_estimator=LogisticRegression(),
)

pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', stack),
])
pipe.fit(x_train, y_train)

# Class probabilities feed the log loss; hard labels feed the accuracy.
y_pred_proba = pipe.predict_proba(x_test)
y_pred = pipe.predict(x_test)

print('Accuracy:', accuracy_score(y_test, y_pred))
print('Log Loss:', log_loss(y_test, y_pred_proba))

Accuracy: 0.8620423892100193
Log Loss: 0.3172642859639938


In [41]:
# Final model: refit CatBoost on every labelled row (x, y) rather than only
# the training split, then predict the unlabelled test frame.
cat = CatBoostClassifier(iterations=1000, random_state=42, verbose=0)
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', cat),
])
pipe.fit(x, y)

# The preprocessor selects its input columns by name, so the whole test frame
# can be passed as-is.
y_pred = pipe.predict(test)

# One row per test record, keyed by the test frame's index; written without
# the DataFrame's own index column.
submission = pd.DataFrame({"ID": test.index, "income": y_pred})
submission.to_csv('submission_haneen_data.csv', index=False)

In [42]:
# Compare the predictions currently held in `y_pred` with the 'income' column
# of another saved CSV, row by row.
# NOTE(review): the file name suggests this is an earlier submission rather
# than ground-truth labels, in which case the printed value is an agreement
# rate between two sets of predictions, not a true accuracy. Confirm.
testing = pd.read_csv('submission_5_ultra.csv')
print(accuracy_score(y_true=testing['income'], y_pred=y_pred))

0.9798119895207273
