#IMPORT LIBRARY

In [209]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


import time

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, roc_auc_score
# from imblearn.combine import SMOTETomek

#Data Import

In [None]:
# Remote locations of the two Adult data files used as train and test splits.
data_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
test_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test'

# Column names are supplied explicitly via `names=`, so pandas treats every
# line it reads as a data row rather than looking for a header.
columns_name = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education_num',
    'marital_status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital_gain',
    'capital_loss',
    'hours_per_week',
    'native_country',
    'income',
]

train_set = pd.read_csv(data_URL, names=columns_name)

# The first line of the test file is skipped (presumably a non-data banner
# line -- confirm against the raw file).
test_set = pd.read_csv(test_URL, names=columns_name, skiprows=1)

#Processing and General info

In [None]:
# Preview the first rows of the raw training data (head() defaults to 5 rows).
train_set.head()

In [None]:
# Print the training frame's column dtypes, non-null counts and memory usage.
train_set.info()

In [None]:
# Print the test frame's column dtypes, non-null counts and memory usage.
test_set.info()

In [None]:
# Count NaN values per column in the training data. This only detects real
# NaNs; the ' ?' placeholder strings are counted separately further down.
train_set.isnull().sum()

In [None]:
# Count NaN values per column in the test data. This only detects real NaNs;
# the ' ?' placeholder strings are counted separately further down.
test_set.isnull().sum()

#Visualization

In [None]:
# Bar chart with one bar per occupation, showing the number of training rows.
occupation_ax = sns.countplot(data=train_set, x='occupation')
occupation_ax.set_title("Occupation Count")
# Tilt the category labels so the occupation names do not overlap.
plt.xticks(rotation=65)
plt.show()

In [None]:
# Histogram of the income label in the training data (class balance check).
income_ax = sns.histplot(train_set['income'])
income_ax.set_title("Income Distribution")
plt.show()

In [None]:
# Box plot of age for each income class in the training data.
age_ax = sns.boxplot(data=train_set, x='income', y='age')
age_ax.set_title('Income Distribution by Age')
plt.show()

In [None]:
# Strip surrounding whitespace from the income labels so that '>50K' and
# '<=50K' can be used as column keys below.
# NOTE: this rewrites train_set['income'] in place and later cells see the
# stripped values.
train_set['income'] = train_set['income'].str.strip()

# Rows = occupation, columns = income class, cells = number of rows.
# fill_value=0 makes an occupation with no rows in one class count as 0
# instead of NaN, so 'total' and the proportions stay well defined.
occupation_stat = (
    train_set.groupby('occupation')['income']
    .value_counts()
    .unstack(fill_value=0)
)
occupation_stat['total'] = occupation_stat.sum(axis=1)

# Share of each income class within an occupation (the two columns sum to 1).
occupation_stat['>50K_proportion'] = occupation_stat['>50K'] / occupation_stat['total']
occupation_stat['<=50K_proportion'] = occupation_stat['<=50K'] / occupation_stat['total']

# Stacked horizontal bars: one bar per occupation, split by income class.
occupation_stat[['>50K_proportion', '<=50K_proportion']].plot(kind='barh', stacked=True)
plt.title('Occupation Income Distribution')
plt.xlabel('Proportion')
plt.ylabel('Occupation')
plt.show()

#Preprocessing

In [None]:
# Remove 'fnlwgt' and 'education_num' from both splits; neither column is used
# by the rest of the notebook. The frames are modified in place.
for frame in (train_set, test_set):
    frame.drop(columns=['fnlwgt', 'education_num'], inplace=True)

In [None]:
# Missing Value Check
# Unknown entries appear as the string ' ?' (with a leading space), so they
# are counted per column with isin() rather than isnull().
unknown_marker = [' ?']

missing_values = train_set.isin(unknown_marker).sum()
print('Train Set Missing Value Check', missing_values, '\n', sep='\n')

missing_values = test_set.isin(unknown_marker).sum()
print('Test Set Missing Value Check', missing_values, sep='\n')

In [None]:
# Replace the ' ?' placeholder in native_country with the explicit category
# ' Not Clear' in both splits.
for frame in (train_set, test_set):
    unknown_country = frame['native_country'] == ' ?'
    frame.loc[unknown_country, 'native_country'] = ' Not Clear'

In [None]:
# Inspect how the '?' placeholders in occupation and workclass co-occur.
# regex=False matches the literal '?' character; the previous pattern '\?'
# was written in a non-raw string, which is an invalid escape sequence and
# triggers a warning on current Python versions.
mask = (
    train_set['occupation'].str.contains('?', regex=False)
    | train_set['workclass'].str.contains('?', regex=False)
)
missing_data = train_set.loc[mask, ['occupation', 'workclass']]

# Value counts of both columns restricted to rows where at least one is unknown.
print(missing_data['occupation'].value_counts())
print(missing_data['workclass'].value_counts())

In [None]:
# Rows whose workclass is ' Never-worked' get the occupation ' Not Clear'.
# The spelling must match the ' Not Clear' label used for the other unknown
# values; the previous ' Not clear' (lower-case c) created a second, separate
# category for the same meaning.
for frame in (train_set, test_set):
    never_worked = frame['workclass'] == ' Never-worked'
    frame.loc[never_worked, 'occupation'] = ' Not Clear'

In [None]:
# Age distribution of the training rows whose occupation is still ' ?'.
unknown_occupation = train_set['occupation'] == ' ?'
age_count = train_set.loc[unknown_occupation, 'age'].value_counts()

# to_string() prints every age instead of the truncated Series preview.
print(age_count.to_string())

In [None]:
# Suppose age <= 22 is student & age >= 60 is retired person.
# Only rows where BOTH occupation and workclass are ' ?' are relabelled, and
# both columns receive the same label.
# The same rule is applied to both splits; previously the test-set Student
# line had no assignment, so it selected the rows and then discarded them.
for frame in (train_set, test_set):
    unknown_job = (frame['occupation'] == ' ?') & (frame['workclass'] == ' ?')
    is_retired = unknown_job & (frame['age'] >= 60)
    is_student = unknown_job & (frame['age'] <= 22)
    frame.loc[is_retired, ['occupation', 'workclass']] = ' Retired'
    frame.loc[is_student, ['occupation', 'workclass']] = ' Student'

In [None]:
# Any row that still has ' ?' in both occupation and workclass is labelled
# ' Not Clear' in both columns.
for frame in (train_set, test_set):
    still_unknown = (frame['occupation'] == ' ?') & (frame['workclass'] == ' ?')
    frame.loc[still_unknown, ['occupation', 'workclass']] = ' Not Clear'

In [None]:
# Collapse fine-grained categories into coarser groups, identically for both
# splits. Values not listed here are left unchanged.

# Education levels merged into ' Primary' / ' Secondary'.
primary_levels = [' Preschool', ' 1st-4th', ' 5th-6th']
secondary_levels = [' 11th', ' 9th', ' 7th-8th', ' 10th', ' 12th', ' HS-grad']

# Marital statuses merged into ' Married' / ' Non-Married'.
married_statuses = [' Married-civ-spouse', ' Married-spouse-absent', ' Married-AF-spouse']
non_married_statuses = [' Never-married', ' Divorced', ' Separated', ' Widowed']

for frame in (train_set, test_set):
    frame.loc[frame['education'].isin(primary_levels), 'education'] = ' Primary'
    frame.loc[frame['education'].isin(secondary_levels), 'education'] = ' Secondary'

    frame.loc[frame['marital_status'].isin(married_statuses), 'marital_status'] = ' Married'
    frame.loc[frame['marital_status'].isin(non_married_statuses), 'marital_status'] = ' Non-Married'

In [None]:
# Convert continuous variables to discrete variables.
#
# pd.cut uses right-inclusive intervals by default, i.e. (lo, hi]. The edges
# below are chosen so the intervals match the labels and the "<= 22 is a
# student, >= 60 is retired" rule used earlier:
#   (0, 22] -> '<=22', (22, 30] -> '23-30', ..., (50, 59] -> '50-59',
#   (59, inf) -> '60+'
# The previous edges 23 and 60 put age 23 into '<=22' and age 60 into '50-59'.
# NOTE: the label '50-59' is kept unchanged but effectively covers ages 51-59,
# because age 50 belongs to '41-50'.
#
# np.inf is used as the top edge so both splits share exactly the same bins
# regardless of each frame's own maximum value.

# Age
bins = [0, 22, 30, 40, 50, 59, np.inf]
labels = ['<=22', '23-30', '31-40', '41-50', '50-59', '60+']
train_set['age_category'] = pd.cut(train_set['age'], bins=bins, labels=labels)
test_set['age_category'] = pd.cut(test_set['age'], bins=bins, labels=labels)

train_set = train_set.drop('age', axis=1)
test_set = test_set.drop('age', axis=1)

# hours_per_week: (0, 39] -> '0-39', (39, 40] -> '40', (40, inf) -> '40+'
bins = [0, 39, 40, np.inf]
labels = ['0-39', '40', '40+']
train_set['hours_per_week_category'] = pd.cut(train_set['hours_per_week'], bins=bins, labels=labels)
test_set['hours_per_week_category'] = pd.cut(test_set['hours_per_week'], bins=bins, labels=labels)

train_set = train_set.drop('hours_per_week', axis=1)
test_set = test_set.drop('hours_per_week', axis=1)

In [None]:
# Preview the training data after cleaning and binning.
train_set.head()

In [None]:
# Preview the test data after cleaning and binning.
test_set.head()

In [None]:
# OneHotEncode the discrete data.
#
# A single encoder is fitted on the training split and then applied to the
# test split, so both frames get the same indicator columns in the same order.
# Fitting a second encoder on the test split (as before) produces a different
# number/order of columns whenever a category is present in only one split,
# which breaks or silently misaligns the feature matrices.
onehot_cols = ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'age_category', 'hours_per_week_category']

# handle_unknown='ignore': a category seen only in the test split becomes an
# all-zero row for that feature instead of raising.
# The dense array is obtained with .toarray() rather than the `sparse=` /
# `sparse_output=` keyword, whose name differs between scikit-learn versions.
One_Hot_Encoder = OneHotEncoder(handle_unknown='ignore')
train_encoded = One_Hot_Encoder.fit_transform(train_set[onehot_cols]).toarray()
test_encoded = One_Hot_Encoder.transform(test_set[onehot_cols]).toarray()
onehot_names = One_Hot_Encoder.get_feature_names_out(onehot_cols)

# index=... keeps the encoded rows aligned with the rows they came from when
# the two pieces are concatenated column-wise.
train_set = pd.concat(
    [
        train_set.drop(onehot_cols, axis=1),
        pd.DataFrame(train_encoded, columns=onehot_names, index=train_set.index),
    ],
    axis=1,
)
test_set = pd.concat(
    [
        test_set.drop(onehot_cols, axis=1),
        pd.DataFrame(test_encoded, columns=onehot_names, index=test_set.index),
    ],
    axis=1,
)

In [None]:
# LabelEncode native_country and income.
#
# The same fitted encoder is applied to both splits so that an integer code
# means the same thing in train and test. Refitting on the test split (as
# before) assigns codes from the test split's own sorted categories, so the
# native_country codes shift whenever the two splits contain different
# countries.

# Normalise the income labels first. NOTE(review): the published adult.test
# labels are understood to carry a trailing '.' (e.g. ' >50K.') -- confirm
# against the raw file. Stripping is a no-op where the '.' is absent.
for frame in (train_set, test_set):
    frame['income'] = frame['income'].str.strip().str.rstrip('.')

# The country vocabulary is taken from both splits so that a country present
# only in the test split does not raise at transform time; only the category
# names are shared, not any label information.
country_encoder = LabelEncoder().fit(
    pd.concat([train_set['native_country'], test_set['native_country']])
)
# Label_Encoder is kept as the income encoder so that
# Label_Encoder.inverse_transform maps 0/1 back to the income labels.
Label_Encoder = LabelEncoder().fit(train_set['income'])

for frame in (train_set, test_set):
    frame['native_country'] = country_encoder.transform(frame['native_country'])
    frame['income'] = Label_Encoder.transform(frame['income'])

In [None]:
# Preview the fully encoded training data.
train_set.head()

#Model

In [None]:
# Separate the features from the 'income' target and convert both to NumPy
# arrays for scikit-learn.
train_set_without_income = train_set.drop(columns=['income'])
test_set_without_income = test_set.drop(columns=['income'])

x_train = train_set_without_income.values
y_train = train_set['income'].values

x_test = test_set_without_income.values
y_test = test_set['income'].values

# Consider doing a mixed sampling however it dramatically decreases the accuracy
# ros = SMOTETomek(random_state=42)
# X_resampled, y_resampled = ros.fit_resample(x_train, y_train)

#Decision Tree

In [None]:
# Sweep max_depth from 1 to 49 and record the accuracy obtained for each depth.
# NOTE(review): accuracy is measured on the test split, so choosing max_depth
# from this plot tunes the model on the test data.
depth_range = range(1, 50)
depths = []

for depth in depth_range:
    classifier = DecisionTreeClassifier(
        criterion="gini",
        max_depth=depth,
        random_state=42,
        splitter="best",
    )
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)
    depths.append(accuracy_score(y_test, y_pred))

plt.plot(depth_range, depths)
plt.xlabel('Depth')
plt.ylabel('Accuracy')
plt.show()

In [None]:
# Fit the final decision tree (max_depth=12) and time training and prediction
# separately.
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring short intervals; time.time() is wall-clock time and can jump if
# the system clock is adjusted.
start = time.perf_counter()
classifier = DecisionTreeClassifier(
    criterion="gini", max_depth=12, random_state=42, splitter="best"
).fit(x_train, y_train)
end = time.perf_counter()

print('DT Training time: %s Seconds' % (end - start))

start = time.perf_counter()
y_pred = classifier.predict(x_test)
end = time.perf_counter()

print('DT Predicting time: %s Seconds' % (end - start))

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix of the most recent predictions in y_pred, drawn as a heat
# map (rows = true class, columns = predicted class).
cm = confusion_matrix(y_test, y_pred)

class_positions = [0, 1]
class_names = ['0', '1']

plt.imshow(cm, cmap=plt.cm.Blues)
plt.title('Confusion matrix')
plt.colorbar()
plt.xticks(class_positions, class_names)
plt.yticks(class_positions, class_names)
plt.xlabel('Predicted')
plt.ylabel('True')

# Display the plot
plt.show()

In [None]:
# ROC curve and AUC for the fitted decision tree.
# roc_curve / roc_auc_score need a continuous score per sample; passing the
# hard 0/1 predictions collapses the curve to a single operating point and
# understates the AUC. Column 1 of predict_proba is the estimated probability
# of the class encoded as 1.
y_score = classifier.predict_proba(x_test)[:, 1]

fpr, tpr, thresholds = roc_curve(y_test, y_score)
auc_score = roc_auc_score(y_test, y_score)

plt.plot(fpr, tpr, label=f'AUC = {auc_score:.2f}')
# Diagonal reference line = performance of a random classifier.
plt.plot([0, 1], [0, 1], linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()

plt.show()

#KNN

In [None]:
# Sweep n_neighbors from 1 to 29 and record the accuracy for each value.
# NOTE(review): accuracy is measured on the test split, so choosing k from
# this plot tunes the model on the test data.
# NOTE(review): the features are not scaled here, so wide-range columns may
# dominate the distance computation -- confirm this is intended.
k_values = range(1, 30)
neighbors = []

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k).fit(x_train, y_train)
    y_pred = knn.predict(x_test)
    neighbors.append(accuracy_score(y_test, y_pred))

plt.plot(k_values, neighbors)
# The x axis is the number of neighbours, not a tree depth.
plt.xlabel('n_neighbors')
plt.ylabel('Accuracy')
plt.show()

In [None]:
# With n_neighbors fixed at 28, compare the Minkowski power parameter p
# (1 = Manhattan, 2 = Euclidean, 3 = Minkowski with p=3).
# NOTE(review): accuracy is measured on the test split, so choosing p from
# this plot tunes the model on the test data.
p_values = range(1, 4)
neighbors = []

for p in p_values:
    knn = KNeighborsClassifier(n_neighbors=28, p=p).fit(x_train, y_train)
    y_pred = knn.predict(x_test)
    neighbors.append(accuracy_score(y_test, y_pred))

plt.plot(p_values, neighbors)
# The x axis is the Minkowski power p, not a tree depth.
plt.xlabel('p (Minkowski power)')
plt.ylabel('Accuracy')
plt.show()

In [None]:
# Final KNN model: 28 neighbours with Euclidean distance (p=2).
knn = KNeighborsClassifier(n_neighbors=28, p=2)
knn.fit(x_train, y_train)

y_pred = knn.predict(x_test)

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix of the most recent predictions in y_pred, drawn as a heat
# map (rows = true class, columns = predicted class).
cm = confusion_matrix(y_test, y_pred)

class_positions = [0, 1]
class_names = ['0', '1']

plt.imshow(cm, cmap=plt.cm.Blues)
plt.title('Confusion matrix')
plt.colorbar()
plt.xticks(class_positions, class_names)
plt.yticks(class_positions, class_names)
plt.xlabel('Predicted')
plt.ylabel('True')

# Display the plot
plt.show()

In [None]:
# ROC curve and AUC for the fitted KNN model.
# roc_curve / roc_auc_score need a continuous score per sample; passing the
# hard 0/1 predictions collapses the curve to a single operating point and
# understates the AUC. Column 1 of predict_proba is the estimated probability
# of the class encoded as 1.
y_score = knn.predict_proba(x_test)[:, 1]

fpr, tpr, thresholds = roc_curve(y_test, y_score)
auc_score = roc_auc_score(y_test, y_score)

plt.plot(fpr, tpr, label=f'AUC = {auc_score:.2f}')
# Diagonal reference line = performance of a random classifier.
plt.plot([0, 1], [0, 1], linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()

plt.show()