# 2nd Stage IEEE Competition: Modeling

In this notebook, I will show you the steps that were taken to get to the final model.

## Importing

In [2]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, log_loss, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [3]:
# Read the cleaned competition files. Each CSV carries an 'Unnamed: 0' column
# (presumably a saved index -- confirm against the cleaning notebook), which is
# removed as soon as the file is read.
test = pd.read_csv('test_cleaned.csv').drop(columns='Unnamed: 0')
train = pd.read_csv('train_cleaned.csv').drop(columns='Unnamed: 0')

In [4]:
# After experimenting with dropping different columns and checking the effect
# on the model, these are the columns that were finally removed from both frames.
dropped_cols = ['educational_num', 'cluster_gain_normalized', 'cluster_loss_normalized']

train = train.drop(columns=dropped_cols)
test = test.drop(columns=dropped_cols)

In [5]:
# Per-class weights to try in the models: total labelled rows divided by the
# number of rows in each class. Despite their names, `samples_0` / `samples_1`
# end up holding these weights, not raw counts.
class_counts = train.income.value_counts()

total_samples = class_counts.sum()
samples_0 = total_samples / class_counts[0]
samples_1 = total_samples / class_counts[1]

In [6]:
# Sanity check: (rows, columns) of both frames after the column drops.
(train.shape, test.shape)

((25948, 17), (6489, 16))

In [7]:
# Separate the target column from the features.
y = train['income']
x = train.drop(columns='income')

# Hold out 20% of the labelled rows for validation; the seed is fixed so the
# split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=42, test_size=0.2
)

In [8]:
# Preprocessing shared by every model pipeline below: scale the numeric
# columns and one-hot encode the non-numeric ones.
sc = StandardScaler()

# handle_unknown='ignore': a category that was not present in the rows the
# encoder was fitted on (possible on the hold-out split or the competition test
# set) is encoded as all zeros instead of raising an error at predict time.
ohe = OneHotEncoder(handle_unknown='ignore')

# Column lists are derived from the full feature frame by dtype.
num_cols = x.select_dtypes(include='number').columns.tolist()
cat_cols = x.select_dtypes(exclude='number').columns.tolist()

num_transformer = Pipeline(steps=[
    ('scaler', sc)
])

cat_transformer = Pipeline(steps=[
    ('ohe', ohe)
])

# NOTE: this single `preprocessor` object is passed to every Pipeline below.
# Pipeline does not copy its steps, so each `.fit(...)` call refits this same
# instance; read fitted state from it right after the fit you care about.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_cols),
        ('cat', cat_transformer, cat_cols)
    ])

In [9]:
# Show which columns go to the scaler and which go to the one-hot encoder.
print(num_cols, cat_cols, sep='\n')

['age_years', 'final_weight', 'capital_gain', 'capital_loss', 'hours_per_week', 'married', 'cluster_gain', 'cluster_loss']
['workclass', 'education', 'marital_status', 'job_title', 'relationship', 'race', 'gender', 'country_of_origin']


In [25]:
# CatBoost with 2000 trees, fitted on the training split and scored on the
# hold-out split.
cat = CatBoostClassifier(n_estimators=2000, random_state=42, verbose=0)
pipe_2_cat = Pipeline([
    ('preprocessor', preprocessor),
    ('model', cat),
]).fit(x_train, y_train)

# Hard labels feed accuracy / confusion matrix; probabilities feed log loss.
y_pred = pipe_2_cat.predict(x_test)
y_pred_proba = pipe_2_cat.predict_proba(x_test)

print('Accuracy:', accuracy_score(y_test, y_pred))
print('Log Loss:', log_loss(y_test, y_pred_proba))
print('Confusion Matrix:', confusion_matrix(y_test, y_pred))

Accuracy: 0.7666666666666667
Log Loss: 0.4106101291826084
Confusion Matrix: [[2225 1048]
 [ 163 1754]]


In [197]:
# Refit the current CatBoost pipeline on all labelled rows, then predict the
# competition test set.
pipe_2_cat.fit(x, y)
y_pred_proba = pipe_2_cat.predict_proba(test)
y_pred = pipe_2_cat.predict(test)

# Submission file: the test frame's index as ID plus the predicted label.
submission = pd.DataFrame({'ID': test.index, 'income': y_pred})
submission.to_csv('submission_cat_7_last.csv', index=False)


In [199]:
# CatBoost with an explicit learning rate (0.05) and tree depth (5), scored on
# the hold-out split.
cat = CatBoostClassifier(learning_rate=0.05, max_depth=5, random_state=42, verbose=0)
pipe_2_cat = Pipeline([
    ('preprocessor', preprocessor),
    ('model', cat),
])
pipe_2_cat.fit(x_train, y_train)

y_pred_proba = pipe_2_cat.predict_proba(x_test)
y_pred = pipe_2_cat.predict(x_test)

print('Accuracy:', accuracy_score(y_test, y_pred))
print('Log Loss:', log_loss(y_test, y_pred_proba))
print('Confusion Matrix:', confusion_matrix(y_test, y_pred))

Accuracy: 0.841233140655106
Log Loss: 0.32514773452601425
Confusion Matrix: [[2903  370]
 [ 454 1463]]


In [200]:
# Refit the tuned CatBoost pipeline on all labelled rows and predict the
# competition test set.
pipe_2_cat.fit(x, y)
y_pred_proba = pipe_2_cat.predict_proba(test)
y_pred = pipe_2_cat.predict(test)

# NOTE(review): 'submission_cat_7_last.csv' is also the path written by the
# earlier CatBoost submission cell, so running both leaves only this result.
submission = pd.DataFrame({'ID': test.index, 'income': y_pred})
submission.to_csv('submission_cat_7_last.csv', index=False)


In [72]:
# Random forest baseline: 100 trees, depth capped at 10.
rf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', rf),
]).fit(x_train, y_train)

y_pred = pipe.predict(x_test)
y_pred_proba = pipe.predict_proba(x_test)

print('Accuracy:', accuracy_score(y_test, y_pred))
print('Log Loss:', log_loss(y_test, y_pred_proba))

Accuracy: 0.8208092485549133
Log Loss: 0.38224739556148135


In [73]:
# scikit-learn gradient boosting with 500 boosting stages.
gb = GradientBoostingClassifier(n_estimators=500, random_state=42)
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', gb),
]).fit(x_train, y_train)

y_pred = pipe.predict(x_test)
y_pred_proba = pipe.predict_proba(x_test)

print('Accuracy:', accuracy_score(y_test, y_pred))
print('Log Loss:', log_loss(y_test, y_pred_proba))

Accuracy: 0.8402697495183045
Log Loss: 0.3297380260477602


In [74]:
# CatBoost with library defaults (only the seed and verbosity are set).
# This leaves `pipe` bound to a fitted CatBoost pipeline, which the
# feature-importance cell reads next.
cat = CatBoostClassifier(verbose=0, random_state=42)
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', cat),
]).fit(x_train, y_train)

y_pred = pipe.predict(x_test)
y_pred_proba = pipe.predict_proba(x_test)

print('Accuracy:', accuracy_score(y_test, y_pred))
print('Log Loss:', log_loss(y_test, y_pred_proba))

Accuracy: 0.8425818882466282
Log Loss: 0.32586646144110143


In [75]:
# Look up the fitted one-hot encoder by transformer name ('cat') so the
# expanded categorical column names can be recovered.
fitted_preprocessor = pipe.named_steps['preprocessor']
fitted_ohe = fitted_preprocessor.named_transformers_['cat'].named_steps['ohe']

# Feature names in the order the ColumnTransformer emits them: the numeric
# columns first, then the one-hot encoded categorical columns.
features = num_cols + list(fitted_ohe.get_feature_names_out(cat_cols))

# Importances reported by the model at the end of `pipe`.
feature_importances = pipe.named_steps['model'].feature_importances_

# Pair every feature name with its importance.
feature_importances_df = pd.DataFrame({'feature': features, 'importance': feature_importances})

# Display the 30 most important features, highest first.
feature_importances_df.sort_values('importance', ascending=False).head(30)


Unnamed: 0,feature,importance
2,capital_gain,20.552251
5,married,16.372957
0,age_years,11.553463
3,capital_loss,8.663606
4,hours_per_week,6.308758
1,final_weight,3.55163
48,relationship_husband,2.810633
45,job_title_specialized_professional,2.74514
36,job_title_corporate_executive,2.089119
19,education_bachelors_degree,1.912684


In [77]:
# LightGBM with library defaults (only the seed and verbosity are set).
lgb = LGBMClassifier(verbose=-1, random_state=42)
pipe_lgb = Pipeline([
    ('preprocessor', preprocessor),
    ('model', lgb),
]).fit(x_train, y_train)

y_pred = pipe_lgb.predict(x_test)
y_pred_proba = pipe_lgb.predict_proba(x_test)

print('Accuracy:', accuracy_score(y_test, y_pred))
print('Log Loss:', log_loss(y_test, y_pred_proba))
print('Confusion Matrix:', confusion_matrix(y_test, y_pred))

Accuracy: 0.841233140655106
Log Loss: 0.3282160774067055
Confusion Matrix: [[2897  376]
 [ 448 1469]]


In [91]:
# XGBoost limited to 50 boosting rounds.
xg = XGBClassifier(n_estimators=50, random_state=42, verbosity=0)
pipe_xg = Pipeline([
    ('preprocessor', preprocessor),
    ('model', xg),
]).fit(x_train, y_train)

y_pred = pipe_xg.predict(x_test)
y_pred_proba = pipe_xg.predict_proba(x_test)

print('Accuracy:', accuracy_score(y_test, y_pred))
print('Log Loss:', log_loss(y_test, y_pred_proba))
print('Confusion Matrix:', confusion_matrix(y_test, y_pred))

Accuracy: 0.8391136801541426
Log Loss: 0.32728489023675206
Confusion Matrix: [[2902  371]
 [ 464 1453]]


### CatBoost With class weights

In [107]:
# CatBoost using the per-class weights computed earlier
# (`samples_0` for class 0, `samples_1` for class 1).
class_weight_pair = [samples_0, samples_1]
cat = CatBoostClassifier(class_weights=class_weight_pair, random_state=42, verbose=0)
pipe_1_cat = Pipeline([
    ('preprocessor', preprocessor),
    ('model', cat),
]).fit(x_train, y_train)

y_pred = pipe_1_cat.predict(x_test)
y_pred_proba = pipe_1_cat.predict_proba(x_test)

print('Accuracy:', accuracy_score(y_test, y_pred))
print('Log Loss:', log_loss(y_test, y_pred_proba))
print('Confusion Matrix:', confusion_matrix(y_test, y_pred))

Accuracy: 0.8400770712909441
Log Loss: 0.3382780202722463
Confusion Matrix: [[2696  577]
 [ 253 1664]]


### CatBoost Without class weights

In [108]:
# Same CatBoost setup without class weights, for comparison with the
# weighted run.
cat = CatBoostClassifier(verbose=0, random_state=42)
pipe_2_cat = Pipeline([
    ('preprocessor', preprocessor),
    ('model', cat),
]).fit(x_train, y_train)

y_pred = pipe_2_cat.predict(x_test)
y_pred_proba = pipe_2_cat.predict_proba(x_test)

print('Accuracy:', accuracy_score(y_test, y_pred))
print('Log Loss:', log_loss(y_test, y_pred_proba))
print('Confusion Matrix:', confusion_matrix(y_test, y_pred))

Accuracy: 0.8425818882466282
Log Loss: 0.32586646144110143
Confusion Matrix: [[2903  370]
 [ 447 1470]]


In [140]:
# Unweighted CatBoost with learning_rate=0.05 and max_depth=5; this is the
# pipeline the next cell refits on all rows for a submission.
cat = CatBoostClassifier(max_depth=5, learning_rate=0.05, verbose=0, random_state=42)
pipe_2_cat = Pipeline([
    ('preprocessor', preprocessor),
    ('model', cat),
])
pipe_2_cat.fit(x_train, y_train)

y_pred_proba = pipe_2_cat.predict_proba(x_test)
y_pred = pipe_2_cat.predict(x_test)

print('Accuracy:', accuracy_score(y_test, y_pred))
print('Log Loss:', log_loss(y_test, y_pred_proba))
print('Confusion Matrix:', confusion_matrix(y_test, y_pred))

Accuracy: 0.841233140655106
Log Loss: 0.32538864847979976
Confusion Matrix: [[2895  378]
 [ 446 1471]]


In [141]:
# Refit on every labelled row before predicting the competition test set.
pipe_2_cat.fit(x, y)
y_pred_proba = pipe_2_cat.predict_proba(test)
y_pred = pipe_2_cat.predict(test)

# Write the submission (test index as ID, predicted label) and preview it.
submission = pd.DataFrame({'ID': test.index, 'income': y_pred})
submission.to_csv('submission_cat_3.csv', index=False)
submission.head()

Unnamed: 0,ID,income
0,0,0
1,1,0
2,2,0
3,3,0
4,4,1


## Improving Random Forest

In [130]:
# Random forest with 200 trees, depth still capped at 10.
rf = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
pipe_rf = Pipeline([
    ('preprocessor', preprocessor),
    ('model', rf),
]).fit(x_train, y_train)

y_pred = pipe_rf.predict(x_test)
y_pred_proba = pipe_rf.predict_proba(x_test)

print('Accuracy:', accuracy_score(y_test, y_pred))
print('Log Loss:', log_loss(y_test, y_pred_proba))
print('Confusion Matrix:', confusion_matrix(y_test, y_pred))
print('Classification Report:', classification_report(y_test, y_pred))

Accuracy: 0.820616570327553
Log Loss: 0.382453105017859
Confusion Matrix: [[2922  351]
 [ 580 1337]]
Classification Report:               precision    recall  f1-score   support

           0       0.83      0.89      0.86      3273
           1       0.79      0.70      0.74      1917

    accuracy                           0.82      5190
   macro avg       0.81      0.80      0.80      5190
weighted avg       0.82      0.82      0.82      5190



In [131]:
# Random forest with deeper trees (max_depth=40) plus minimum split / leaf
# sizes and sqrt feature sampling.
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=40,
    max_features='sqrt',
    min_samples_split=5,
    min_samples_leaf=2,
    bootstrap=True,
    random_state=42,
)
pipe_rf = Pipeline([
    ('preprocessor', preprocessor),
    ('model', rf),
]).fit(x_train, y_train)

y_pred = pipe_rf.predict(x_test)
y_pred_proba = pipe_rf.predict_proba(x_test)

print('Accuracy:', accuracy_score(y_test, y_pred))
print('Log Loss:', log_loss(y_test, y_pred_proba))
print('Confusion Matrix:', confusion_matrix(y_test, y_pred))
print('Classification Report:', classification_report(y_test, y_pred))

Accuracy: 0.8329479768786127
Log Loss: 0.3519581231077142
Confusion Matrix: [[2897  376]
 [ 491 1426]]
Classification Report:               precision    recall  f1-score   support

           0       0.86      0.89      0.87      3273
           1       0.79      0.74      0.77      1917

    accuracy                           0.83      5190
   macro avg       0.82      0.81      0.82      5190
weighted avg       0.83      0.83      0.83      5190



In [134]:
# Unfitted estimators used as building blocks by the stacking cells below.
# All of them share the same seed (42).

# Gradient-boosting libraries.
lgbm = LGBMClassifier(verbose=-1, random_state=42)
xgb = XGBClassifier(n_estimators=50, verbosity=0, random_state=42)
cat = CatBoostClassifier(learning_rate=0.05, max_depth=5, verbose=0, random_state=42)

# scikit-learn ensembles.
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=40,
    max_features='sqrt',
    min_samples_split=5,
    min_samples_leaf=2,
    bootstrap=True,
    random_state=42,
)
gb = GradientBoostingClassifier(n_estimators=500, random_state=42)

# Linear model.
lr = LogisticRegression(random_state=42)

In [137]:
# Stacking ensemble: five base learners, with a default CatBoost model as the
# final estimator.
cat_ = CatBoostClassifier(verbose=0, random_state=42)
stack_base_learners = [
    ('lgb', lgbm),
    ('xgb', xgb),
    ('cat', cat),
    ('gb', gb),
    ('rf', rf),
]
stack = StackingClassifier(estimators=stack_base_learners, final_estimator=cat_)

stack_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', stack),
]).fit(x_train, y_train)

y_pred = stack_pipe.predict(x_test)
y_pred_proba = stack_pipe.predict_proba(x_test)

print('Accuracy:', accuracy_score(y_test, y_pred))
print('Log Loss:', log_loss(y_test, y_pred_proba))

Accuracy: 0.8418111753371869
Log Loss: 0.3303211796329823


In [149]:
# Refit the stacking pipeline on every labelled row, then predict the
# competition test set.
stack_pipe.fit(x, y)
y_pred_proba = stack_pipe.predict_proba(test)
y_pred = stack_pipe.predict(test)

# Submission file: the test frame's index as ID plus the predicted label.
submission = pd.DataFrame({'ID': test.index, 'income': y_pred})
submission.to_csv('submission_stack_last.csv', index=False)

In [155]:
# Second stacking ensemble: the four boosting models, each listed twice (the
# second copy under a '2'-suffixed name), with logistic regression on top.
# NOTE(review): both entries of a pair point at the same configured estimator
# and seed, so they presumably yield identical meta-features while doubling
# the fit time -- confirm whether the duplicates are intentional.
base_learners = [('cat', cat), ('lgb', lgbm), ('xgb', xgb), ('gb', gb)]
duplicate_learners = [(name + '2', estimator) for name, estimator in base_learners]

stack_2 = StackingClassifier(
    estimators=base_learners + duplicate_learners,
    final_estimator=LogisticRegression(random_state=42),
)

stack_pipe_2 = Pipeline([
    ('preprocessor', preprocessor),
    ('model', stack_2),
]).fit(x_train, y_train)

y_pred = stack_pipe_2.predict(x_test)
y_pred_proba = stack_pipe_2.predict_proba(x_test)

print('Accuracy:', accuracy_score(y_test, y_pred))
print('Log Loss:', log_loss(y_test, y_pred_proba))

Accuracy: 0.8410404624277457
Log Loss: 0.339850809090164


In [193]:
# Refit the second stacking pipeline on every labelled row, then predict the
# competition test set.
stack_pipe_2.fit(x, y)
y_pred_proba = stack_pipe_2.predict_proba(test)
y_pred = stack_pipe_2.predict(test)

# Submission file: the test frame's index as ID plus the predicted label.
submission = pd.DataFrame({'ID': test.index, 'income': y_pred})
submission.to_csv('submission_6_ultra.csv', index=False)