In [None]:
# Install CatBoost into the environment of the running kernel (%pip targets the kernel's interpreter).
%pip install catboost

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score,accuracy_score,classification_report,confusion_matrix
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import lightgbm as lgb
from sklearn.impute import KNNImputer
import catboost as cb
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read adult.csv from the working directory and preview the first rows.
df = pd.read_csv("adult.csv")
df.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [None]:
# Inspect the dtype pandas inferred for each column.
df.dtypes

Unnamed: 0,0
age,int64
workclass,object
fnlwgt,int64
education,object
education.num,int64
marital.status,object
occupation,object
relationship,object
race,object
sex,object


In [None]:
# Cells holding a literal '?' are turned into NaN (in place) so that pandas'
# missing-value tooling recognises them.
df.replace(to_replace='?', value=np.nan, inplace=True)

In [None]:
# Encode the target as integers: '<=50K' -> 0, '>50K' -> 1.
# The identity entries (0 -> 0, 1 -> 1) make the cell safe to re-run: a plain
# two-entry map would turn the already-encoded values into NaN on a second run.
income_codes = {'<=50K': 0, '>50K': 1, 0: 0, 1: 1}
income_encoded = df['income'].map(income_codes)

# Fail loudly on labels the mapping does not know instead of silently
# producing NaN targets that only blow up later (e.g. in a stratified split).
unmapped_labels = df.loc[income_encoded.isna(), 'income'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(f"Unexpected income labels: {list(unmapped_labels)}")

df['income'] = income_encoded.astype(int)

In [None]:
# Preview the frame after the '?' -> NaN replacement and the 0/1 income encoding.
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,,77053,HS-grad,9,Widowed,,Not-in-family,White,Female,0,4356,40,United-States,0
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,0
2,66,,186061,Some-college,10,Widowed,,Unmarried,Black,Female,0,4356,40,United-States,0
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,0
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,0


In [None]:
# Categorical columns that are passed to the imputation step.
categorical_cols = [
    'workclass',
    'occupation',
    'native.country',
]

In [None]:
# Impute missing values in `categorical_cols` with a KNN imputer that runs on
# label-encoded integer codes.
#
# Missing entries must remain NaN in the encoded matrix: KNNImputer only fills
# NaN cells, so encoding the gaps as an extra placeholder category would leave
# the imputer with nothing to do.
df_knn_imputed = df.copy()

# Explicit copy so nothing is written through a slice of df_knn_imputed.
df_categorical = df_knn_imputed[categorical_cols].copy()

label_encoders = {}
encoded_columns = {}

for col in categorical_cols:
    observed = df_categorical[col].notna()

    # Fit the encoder on observed labels only; rows with a missing value keep
    # a NaN code for the imputer to fill.
    le = LabelEncoder()
    codes = pd.Series(np.nan, index=df_categorical.index, dtype=float)
    codes.loc[observed] = le.fit_transform(df_categorical.loc[observed, col])

    encoded_columns[col] = codes
    label_encoders[col] = le

df_encoded_for_imputation = pd.DataFrame(encoded_columns)

imputer = KNNImputer(n_neighbors=5, weights='distance')
df_imputed_array = imputer.fit_transform(df_encoded_for_imputation)

df_imputed_encoded = pd.DataFrame(
    df_imputed_array,
    columns=categorical_cols,
    index=df_knn_imputed.index,
)

for col in categorical_cols:
    le = label_encoders[col]
    max_label = len(le.classes_) - 1

    # The imputer returns a weighted average of neighbour codes, so round to the
    # nearest integer and clamp into the encoder's valid range before decoding.
    # NOTE(review): averaging label codes treats the (alphabetical) code order
    # as meaningful; this is a rough heuristic for categorical data.
    imputed_codes = df_imputed_encoded[col].round().astype(int).clip(0, max_label)
    df_imputed_encoded[col] = le.inverse_transform(imputed_codes.to_numpy())

df_knn_imputed[categorical_cols] = df_imputed_encoded

print("Null values after KNN imputation on categorical columns:")
print(df_knn_imputed[categorical_cols].isnull().sum())

df_knn_imputed.head()

Null values after KNN imputation on categorical columns:
workclass         0
occupation        0
native.country    0
dtype: int64


Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,Private,77053,HS-grad,9,Widowed,Prof-specialty,Not-in-family,White,Female,0,4356,40,United-States,0
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,0
2,66,Private,186061,Some-college,10,Widowed,Prof-specialty,Unmarried,Black,Female,0,4356,40,United-States,0
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,0
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,0


In [None]:
# Class balance of the encoded target (0 = '<=50K', 1 = '>50K').
df['income'].value_counts()

Unnamed: 0_level_0,count
income,Unnamed: 1_level_1
0,24720
1,7841


#### The dataset is imbalanced: 24,720 rows (~76%) are class 0 (`<=50K`) versus 7,841 rows (~24%) in class 1 (`>50K`)

In [None]:
# Continue with the imputed frame under the name `df`, and keep an independent
# copy in `df2`.
df2 = df_knn_imputed.copy()
df = df_knn_imputed

In [None]:
# Categorical feature columns handed to the models.
cat_cols = [
    'workclass',
    'education',
    'marital.status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native.country',
]

In [None]:
# Separate the binary income target from the feature matrix.
y = df['income']
X = df.drop(columns=['income'])

In [None]:
# 80/20 hold-out split with a fixed seed; stratifying on y keeps the class
# ratio the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Baseline CatBoost classifier; the string columns in `cat_cols` are passed
# through as categorical features.
catboost_params = dict(
    iterations=500,
    learning_rate=0.05,
    depth=6,
    l2_leaf_reg=5,
    cat_features=cat_cols,
    verbose=False,
)
model_cb = CatBoostClassifier(**catboost_params)
model_cb.fit(X_train, y_train)

# Score on the held-out split.
y_pred = model_cb.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
print("Accuracy: ", test_accuracy)
print("\nClassification Report:\n", test_report)

Accuracy:  0.8713342545677875

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.94      0.92      4945
           1       0.78      0.65      0.71      1568

    accuracy                           0.87      6513
   macro avg       0.84      0.80      0.81      6513
weighted avg       0.87      0.87      0.87      6513



In [None]:
# Randomized hyper-parameter search for the CatBoost model.
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [4, 6, 8, 10],
    'l2_leaf_reg': [1, 3, 5, 7]
}

# RandomizedSearchCV samples a subset of the grid (n_iter defaults to 10), so
# random_state is fixed to make the sampled candidates - and therefore the
# reported best parameters - reproducible across runs.
grid = RandomizedSearchCV(
    model_cb,
    param_grid,
    cv=5,
    scoring='accuracy',
    random_state=42,
)
grid.fit(X_train, y_train)

# grid.predict uses the best estimator refit on the full training split.
y_pred = grid.predict(X_test)
print("Best Parameter:", grid.best_params_)
# The fitted attribute is `best_score_` (with trailing underscore).
print("Best Cross Validation Score:", grid.best_score_)
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

In [None]:
# Baseline XGBoost classifier on label-encoded categorical columns.
#
# Encode into a copy rather than overwriting `X`: mutating `X` in place makes
# the cell non-idempotent (a second run would re-encode the stringified codes
# in a different order) and destroys the raw features for other cells.
X_xgb = X.copy()
for col in cat_cols:
    le = LabelEncoder()
    X_xgb[col] = le.fit_transform(X_xgb[col].astype(str))

X_train, X_test, y_train, y_test = train_test_split(X_xgb, y, test_size=0.2, random_state=42, stratify=y)

# NOTE(review): the categorical columns are integer codes at this point, so
# they are presumably treated as ordinary numeric features; `enable_categorical`
# is documented for pandas 'category' dtype columns - confirm the intent.
model_xgb = XGBClassifier(
    tree_method='hist',
    enable_categorical=True,
    use_label_encoder=False,
    eval_metric='logloss'
)

model_xgb.fit(X_train, y_train)

# Score on the held-out split.
y_pred = model_xgb.predict(X_test)
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy:  0.865806847842776

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.94      0.91      4945
           1       0.76      0.64      0.70      1568

    accuracy                           0.87      6513
   macro avg       0.83      0.79      0.81      6513
weighted avg       0.86      0.87      0.86      6513



In [None]:
# Randomized hyper-parameter search for the XGBoost model.
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7, 10],
    'n_estimators': [100, 300, 500],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [1, 1.5, 2.0]
}

# Only a random subset of the grid is evaluated (n_iter defaults to 10);
# fixing random_state makes the sampled candidates reproducible across runs.
grid = RandomizedSearchCV(
    model_xgb,
    param_grid,
    cv=5,
    n_jobs=-1,
    scoring='accuracy',
    random_state=42,
)
grid.fit(X_train, y_train)

# grid.predict uses the best estimator refit on the full training split.
y_pred = grid.predict(X_test)
print("Best Parameter:", grid.best_params_)
# The fitted attribute is `best_score_` (with trailing underscore).
print("Best Cross Validation Score:", grid.best_score_)
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Best Parameter: {'subsample': 1.0, 'reg_lambda': 2.0, 'reg_alpha': 0.1, 'n_estimators': 500, 'max_depth': 5, 'learning_rate': 0.05, 'colsample_bytree': 0.8}
Accuracy:  0.8704130201136189

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.94      0.92      4945
           1       0.78      0.64      0.71      1568

    accuracy                           0.87      6513
   macro avg       0.84      0.79      0.81      6513
weighted avg       0.87      0.87      0.87      6513



In [None]:
# Cast the categorical columns of df2 to pandas 'category' dtype, then rebuild
# the features/target and the same seeded, stratified 80/20 split.
df2 = df2.astype({col: 'category' for col in cat_cols})

y = df2['income']
X = df2.drop(columns=['income'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Baseline LightGBM classifier trained on the frame whose categorical columns
# carry pandas 'category' dtype.
lgbm_params = dict(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    categorical_feature=cat_cols,
    verbose=-1,
)
model_lgb = LGBMClassifier(**lgbm_params)
model_lgb.fit(X_train, y_train)

# Score on the held-out split.
y_pred = model_lgb.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
print("Accuracy: ", test_accuracy)
print("\nClassification Report:\n", test_report)

Accuracy:  0.8687240902809765

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.94      0.92      4945
           1       0.77      0.65      0.70      1568

    accuracy                           0.87      6513
   macro avg       0.83      0.79      0.81      6513
weighted avg       0.86      0.87      0.86      6513



In [None]:
# Randomized hyper-parameter search for the LightGBM model.
# NOTE(review): LightGBM documents that bagging (`subsample`) only takes effect
# when the bagging frequency (`subsample_freq`) is non-zero; it is not set
# here, so the `subsample` candidates may all behave identically - confirm.
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'num_leaves': [15, 31, 63],
    'max_depth': [-1, 5, 10],
    'min_child_samples': [10, 20, 30],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'n_estimators': [100, 300, 500]
}

# Only a random subset of the grid is evaluated (n_iter defaults to 10);
# fixing random_state makes the sampled candidates reproducible across runs.
grid = RandomizedSearchCV(
    model_lgb,
    param_grid,
    cv=5,
    n_jobs=-1,
    scoring='accuracy',
    random_state=42,
)
grid.fit(X_train, y_train)

# grid.predict uses the best estimator refit on the full training split.
y_pred = grid.predict(X_test)
print("Best Parameter:", grid.best_params_)
# The fitted attribute is `best_score_` (with trailing underscore).
print("Best Cross Validation Score:", grid.best_score_)
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Best Parameter: {'subsample': 1.0, 'num_leaves': 15, 'n_estimators': 500, 'min_child_samples': 10, 'max_depth': 5, 'learning_rate': 0.05, 'colsample_bytree': 0.8}
Accuracy:  0.8696453247351451

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.94      0.92      4945
           1       0.77      0.65      0.71      1568

    accuracy                           0.87      6513
   macro avg       0.83      0.79      0.81      6513
weighted avg       0.86      0.87      0.87      6513

