Models: LR, DT, and RF

Source: https://www.kaggle.com/code/ayaelsaied/adult-income-88-using-plotly/notebook#Preprocessing

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE

### Data preprocessing

In [2]:
df = pd.read_csv("../../data/adult_income/adult.csv")

In [3]:
# Handle missing values
columns_with_question_marks = []
for column in df.columns:
    if df[column].isin(['?']).any():
        columns_with_question_marks.append(column)

print("Columns with question marks:", columns_with_question_marks)
df[columns_with_question_marks] = df[columns_with_question_marks].replace('?', np.nan)

Columns with question marks: ['workclass', 'occupation', 'native-country']


In [4]:
# Remove duplicates
print('shape of data before drop duplicate', df.shape)
duplicates = df[df.duplicated()]
df = df.drop_duplicates()
print('shape of data after drop duplicate', df.shape)

shape of data before drop duplicate (48842, 15)
shape of data after drop duplicate (48790, 15)


In [5]:
# Remove outliers
def outliers_handler(df,col):
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1
    lower_limit = q1 - 1.5 * iqr
    upper_limit = q3 + 1.5 * iqr
    df[col] = df[col].clip(lower=lower_limit, upper=upper_limit)  
    return df

numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
for i in numeric_cols:
    df = outliers_handler(df, i)

In [6]:
# Encoding catagorical columns

workclass_mapping = {
    'Private': 'Private',
    'Self-emp-not-inc': 'Self-Employed',
    'Local-gov': 'Government',
    'State-gov': 'Government',
    'Self-emp-inc': 'Self-Employed',
    'Federal-gov': 'Government',
    'Without-pay': 'Other',
    'Never-worked': 'Other'
}
df['workclass'] = df['workclass'].map(workclass_mapping)

le = LabelEncoder()
df['workclass'] = le.fit_transform(df['workclass'])

In [7]:
education_mapping = {
    'HS-grad': 'learning',
    'Some-college': 'learning',
    'Bachelors ': 'graduate',
    'Masters': 'graduate',
    'Assoc-voc': 'Assoc',
    '11th': 'child',
    'Assoc-acdm': 'Assoc',
    '10th': 'child',
    '7th-8th': 'child',
    'Prof-school': 'graduate',
    '9th': 'child',
    '12th': 'child',
    'Doctorate':'gruduate',
    '5th-6th ': 'child',
    '1st-4th  ': 'child',
    'Preschool': 'child'
}
df['education'] = df['education'].map(education_mapping)
df['education'] = le.fit_transform(df['education'])

In [8]:
marital_status_mapping = {
    'Married-civ-spouse': 'Married',
    'Never-married': 'single',
    'Divorced': 'single',
    'Separated': 'single',
    'Widowed': 'single',
    'Married-spouse-absent': 'Married',
    'Married-AF-spouse': 'Married',
}
df['marital-status'] = df['marital-status'].map(marital_status_mapping)
df['marital-status'] = le.fit_transform(df['marital-status'])

In [9]:
occupation_mapping = {
    'Prof-specialty': 'Professional and Executive',
    'Craft-repair': 'Labor and Manufacturing',
    'Exec-managerial ': 'Professional and Executive',
    'Adm-clerical': 'Professional and Executive',
    'Sales': 'Sales and Services',
    'Other-service': 'Sales and Services',
    'Machine-op-inspct': 'Labor and Manufacturing',
    'Transport-moving': 'Labor and Manufacturing',
    'Handlers-cleaners': 'Labor and Manufacturing',
    'Farming-fishing ': 'Labor and Manufacturing',
    'Tech-suppor': 'Sales and Services',
    'Protective-serv': 'Sales and Services',
    'Priv-house-serv':'Sales and Services',
    'Armed-Forces ': 'Sales and Services'
}
df['occupation'] = df['occupation'].map(occupation_mapping)
df['occupation']=le.fit_transform(df['occupation'])

In [10]:
relationship_mapping = {
    'Husband': 'spouse',
    'Not-in-family': 'Others:',
    'Own-child': 'Immediate Family',
    'Unmarried': 'Others:',
    'Wife': 'spouse',
    'Other-relative': 'Immediate Family',
}
df['relationship'] = df['relationship'].map(relationship_mapping)
df['relationship'] = le.fit_transform(df['relationship'])

In [11]:
df['race'] = df['race'].map(lambda x: 'White' if x == 'White' else 'Other')
df['race'] = le.fit_transform(df['race'])

In [12]:
df['gender'] = le.fit_transform(df['gender'])

In [13]:
# Combine all categories except 'United-States'
df['native-country'] = df['native-country'].map(lambda x: 'United-States' if x == 'United-States' else 'Other')
df['native-country'] = le.fit_transform(df['native-country'])

In [14]:
# income #0<=50 #1>50
df['income'] = le.fit_transform(df['income'])

In [15]:
# Feature selection
df.drop(['capital-gain','capital-loss'], axis=1, inplace=True)

In [16]:
x = df.drop(columns=['income'])  
y = df['income']

# Balance data
resampler = SMOTE()
x, y = resampler.fit_resample(x, y)

# Split data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Scaling
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

### Models training

In [None]:
from sklearn.metrics import precision_score, accuracy_score, recall_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [24]:
# Logistic Regression
model = LogisticRegression(
    C=0.1,
    solver="liblinear",
    max_iter=100,
    random_state=42
)
model.fit(x_train, y_train)

y_pred = model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8374427378065211


In [31]:
# Decision Tree
model = DecisionTreeClassifier(
    max_depth=10,
    min_samples_split=10,
    min_samples_leaf=4,
    criterion="entropy",
    ccp_alpha=0.0,
    random_state=42
)
model.fit(x_train, y_train)

y_pred = model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.8706548100242523


In [33]:
# Random Forest
model = RandomForestClassifier(
    n_estimators=150,
    max_depth=20,
    min_samples_split=5,
    min_samples_leaf=3,
    criterion="entropy",
    max_features="sqrt",
    random_state=42
)
model.fit(x_train, y_train)

y_pred = model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8823093505793587


### Models evaluation

In [34]:
# Evaluation
y_pred = model.predict(x_test)

acc = accuracy_score(y_test, y_pred)
print("Accuracy:", acc)

precision = precision_score(y_test, y_pred)
print("Precision:", precision)

recall = recall_score(y_test, y_pred)
print("Recall:", recall)

roc_auc = roc_auc_score(y_test, y_pred)
print("ROC AUC:", roc_auc)

Accuracy: 0.8823093505793587
Precision: 0.8938955491249828
Recall: 0.8691050375133976
ROC AUC: 0.882384497076482
