# <u>Adult Dataset - UCI Machine Learning Repository Assignment</u>

# Importing Important Libraries and Dataset

In [1]:
import numpy as np
import pandas as pd
from ucimlrepo import fetch_ucirepo
from sklearn.model_selection import train_test_split
  
# Fetching dataset 
adult = fetch_ucirepo(id=2)
  
# Metadata 
print(adult.metadata) 
  
# Variable information 
print(adult.variables)

{'uci_id': 2, 'name': 'Adult', 'repository_url': 'https://archive.ics.uci.edu/dataset/2/adult', 'data_url': 'https://archive.ics.uci.edu/static/public/2/data.csv', 'abstract': 'Predict whether income exceeds $50K/yr based on census data. Also known as "Census Income" dataset. ', 'area': 'Social Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 48842, 'num_features': 14, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Income', 'Education Level', 'Other', 'Race', 'Sex'], 'target_col': ['income'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1996, 'last_updated': 'Mon Aug 07 2023', 'dataset_doi': '10.24432/C5XW20', 'creators': ['Barry Becker', 'Ronny Kohavi'], 'intro_paper': None, 'additional_info': {'summary': 'Extraction was done by Barry Becker from the 1994 Census database.  A set of reasonably clean records was extracted using the following conditions: ((AAG

# Getting the features and target (as pandas dataframes) 

In [2]:
features = adult.data.features 
target = adult.data.targets
data = pd.concat([features,target], axis = 1)

In [3]:
data

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,<=50K.
48838,64,,321403,HS-grad,9,Widowed,,Other-relative,Black,Male,0,0,40,United-States,<=50K.
48839,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,<=50K.
48840,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,<=50K.


# Pre-processing

## Check for missing values

In [4]:
print(data['workclass'].value_counts().to_string())

workclass
Private             33906
Self-emp-not-inc     3862
Local-gov            3136
State-gov            1981
?                    1836
Self-emp-inc         1695
Federal-gov          1432
Without-pay            21
Never-worked           10


In [5]:
print(data['occupation'].value_counts().to_string())

occupation
Prof-specialty       6172
Craft-repair         6112
Exec-managerial      6086
Adm-clerical         5611
Sales                5504
Other-service        4923
Machine-op-inspct    3022
Transport-moving     2355
Handlers-cleaners    2072
?                    1843
Farming-fishing      1490
Tech-support         1446
Protective-serv       983
Priv-house-serv       242
Armed-Forces           15


In [6]:
print(data['native-country'].value_counts().to_string())

native-country
United-States                 43832
Mexico                          951
?                               583
Philippines                     295
Germany                         206
Puerto-Rico                     184
Canada                          182
El-Salvador                     155
India                           151
Cuba                            138
England                         127
China                           122
South                           115
Jamaica                         106
Italy                           105
Dominican-Republic              103
Japan                            92
Guatemala                        88
Poland                           87
Vietnam                          86
Columbia                         85
Haiti                            75
Portugal                         67
Taiwan                           65
Iran                             59
Greece                           49
Nicaragua                        49
Peru         

## Null Values

In [7]:
print(data.isnull().sum())

age                 0
workclass         963
fnlwgt              0
education           0
education-num       0
marital-status      0
occupation        966
relationship        0
race                0
sex                 0
capital-gain        0
capital-loss        0
hours-per-week      0
native-country    274
income              0
dtype: int64


In [8]:
data.dropna(inplace=True)

## Null Values after deletion

In [9]:
print(data.isnull().sum())

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0
dtype: int64


## "?" values 

In [10]:
data.isin(['?']).any()

age               False
workclass          True
fnlwgt            False
education         False
education-num     False
marital-status    False
occupation         True
relationship      False
race              False
sex               False
capital-gain      False
capital-loss      False
hours-per-week    False
native-country     True
income            False
dtype: bool

In [11]:
data = data[~data.isin(['?']).any(axis=1)]

## After Deletion

In [12]:
data.isin(['?']).any()

age               False
workclass         False
fnlwgt            False
education         False
education-num     False
marital-status    False
occupation        False
relationship      False
race              False
sex               False
capital-gain      False
capital-loss      False
hours-per-week    False
native-country    False
income            False
dtype: bool

### Duplicate Values

In [13]:
print("Duplicate Values Before Deletion:")
print(data.duplicated().sum())

data = data.drop_duplicates()

print("Duplicate Values After Deletion:")
print(data.duplicated().sum())

Duplicate Values Before Deletion:
28
Duplicate Values After Deletion:
0


## Fix incorrect income values

In [14]:
print(data['income'].value_counts().to_string())

income
<=50K     22633
<=50K.    11355
>50K       7506
>50K.      3700


In [15]:
data['income'].replace('<=50K.', '<=50K', inplace=True)
data['income'].replace('>50K.', '>50K', inplace=True)

In [16]:
print(data['income'].value_counts().to_string())

income
<=50K    33988
>50K     11206


In [17]:
data

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48836,33,Private,245211,Bachelors,13,Never-married,Prof-specialty,Own-child,White,Male,0,0,40,United-States,<=50K
48837,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,<=50K
48839,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,<=50K
48840,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,<=50K


# Encoding Categorical Values

In [18]:
target_column = 'income'

# List of categorical columns excluding the target column
categorical_columns = [col for col in data.columns if col != target_column and data[col].dtype == 'object']

# Apply get_dummies excluding the target column
data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

In [19]:
data

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,income,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,39,77516,13,2174,0,40,<=50K,False,False,False,...,False,False,False,False,False,False,False,True,False,False
1,50,83311,13,0,0,13,<=50K,False,False,False,...,False,False,False,False,False,False,False,True,False,False
2,38,215646,9,0,0,40,<=50K,False,True,False,...,False,False,False,False,False,False,False,True,False,False
3,53,234721,7,0,0,40,<=50K,False,True,False,...,False,False,False,False,False,False,False,True,False,False
4,28,338409,13,0,0,40,<=50K,False,True,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48836,33,245211,13,0,0,40,<=50K,False,True,False,...,False,False,False,False,False,False,False,True,False,False
48837,39,215419,13,0,0,36,<=50K,False,True,False,...,False,False,False,False,False,False,False,True,False,False
48839,38,374983,13,0,0,50,<=50K,False,True,False,...,False,False,False,False,False,False,False,True,False,False
48840,44,83891,13,5455,0,40,<=50K,False,True,False,...,False,False,False,False,False,False,False,True,False,False


# Divide Dataset into Features and Target

In [20]:
X=data.drop(columns=[target_column])
y=data[target_column]

## Split Dataset using train_test_split

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [22]:
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)

X_train shape: (31635, 96)
y_train shape: (31635,)


# Using Gaussian Naive-Bayes Classifier to train dataset and predict target

In [23]:
from sklearn.naive_bayes import GaussianNB

# Classifier
classifier = GaussianNB()
classifier.fit(X_train, y_train)

# Prediction
y_pred = classifier.predict(X_test)
y_test_array = y_test.values

In [24]:
print(len(y_test_array))
print(len(y_pred))

13559
13559


# Calculating Evaluation Metrics from Confusion Matrix

In [25]:
def calculate_metrics(y_true, y_pred):
    TP = np.sum((y_true == '>50K') & (y_pred == '>50K'))
    TN = np.sum((y_true == '<=50K') & (y_pred == '<=50K'))
    FP = np.sum((y_true == '<=50K') & (y_pred == '>50K'))
    FN = np.sum((y_true == '>50K') & (y_pred == '<=50K'))

    accuracy = (TP + TN) / (TP + TN + FP + FN) 
    sensitivity = TP / (TP + FN)
    specificity = TN / (TN + FP)
    
    return accuracy, sensitivity, specificity

# Accuracy, Sensitivity and Specificity

In [26]:
print(f"Accuracy: {calculate_metrics(y_test, y_pred)[0]}\nSensitivity: {calculate_metrics(y_test, y_pred)[1]}\nSpecificity: {calculate_metrics(y_test, y_pred)[2]}")

Accuracy: 0.7868574378641493
Sensitivity: 0.307989307989308
Specificity: 0.945054945054945


# Calculating the posterior probabilities of making over 50k a year

In [27]:
probabilities = classifier.predict_proba(X_test)

In [28]:
prob_over_50k = probabilities[:, 1]
prob_over_50k

array([0.0143046 , 0.03193122, 0.00278249, ..., 0.02371454, 0.01353673,
       0.01025875])

## Statistics of Probabilities Array

In [29]:
df_describe = pd.DataFrame(prob_over_50k)
df_describe.describe()

Unnamed: 0,0
count,13559.0
mean,0.126355
std,0.313419
min,2.9e-05
25%,0.004236
50%,0.01028
75%,0.019997
max,1.0


### Calculating MAP (Maximum A Posteriori)

In [30]:
# Find Prob for both classes in the dataset
predicted_classes = classifier.predict(X_test)

# Calculate the maximum posterior probability (MAP) for each instance
max_posterior_probs = probabilities.max(axis=1)

map_estimates = [prob if pred_class == 1 else 1 - prob for prob, pred_class in zip(max_posterior_probs, predicted_classes)]

print(f"Maximum A Posteriori (MAP) estimates:", map_estimates)

Maximum A Posteriori (MAP) estimates: [0.014304598210601038, 0.03193121839745905, 0.0027824853265639993, 0.008048190957849921, 2.2737367544323206e-13, 0.00747449267305722, 0.008315234479099654, 0.0, 0.0033652891023130405, 0.0024827772687644467, 0.012663167069375425, 0.014323203349148828, 0.03629173133796715, 0.0029275801175145943, 0.0005623658372135854, 0.01616092535001412, 0.0005702934610837262, 0.01906047337876915, 1.7053025658242404e-13, 0.0019068637820682799, 0.0020051855051174128, 0.018579393327482152, 0.014851421275694832, 0.009244691378330727, 0.01779946579270264, 7.584580141717367e-08, 0.0008157461955358158, 0.008797245247889385, 0.003119794312856139, 0.009790785118442225, 0.01668846621341924, 0.1984044913915317, 0.009365459788997277, 0.0072610742526253436, 0.002285169320526581, 0.013626451917499338, 0.02696138358759681, 0.012238233407886434, 0.0019405876497728958, 0.030744185664165702, 0.0016662469496157017, 0.016021758899686045, 0.0014879744413383023, 0.009977800504598178, 0.