In [1]:
import numpy as np
import pandas as pd
import regex as re
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from matplotlib.colors import ListedColormap
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix, classification_report

In [3]:
# Load data
file_path = "adult.csv"
adult_income_df = pd.read_csv(file_path)
adult_income_df.head(10)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K
5,34,Private,216864,HS-grad,9,Divorced,Other-service,Unmarried,White,Female,0,3770,45,United-States,<=50K
6,38,Private,150601,10th,6,Separated,Adm-clerical,Unmarried,White,Male,0,3770,40,United-States,<=50K
7,74,State-gov,88638,Doctorate,16,Never-married,Prof-specialty,Other-relative,White,Female,0,3683,20,United-States,>50K
8,68,Federal-gov,422013,HS-grad,9,Divorced,Prof-specialty,Not-in-family,White,Female,0,3683,40,United-States,<=50K
9,41,Private,70037,Some-college,10,Never-married,Craft-repair,Unmarried,White,Male,0,3004,60,?,>50K


In [5]:
adult_income_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education.num   32561 non-null  int64 
 5   marital.status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital.gain    32561 non-null  int64 
 11  capital.loss    32561 non-null  int64 
 12  hours.per.week  32561 non-null  int64 
 13  native.country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [7]:
cols = list(adult_income_df.columns.values)
str_cols = []
num_cols = []
for i in range (0,15):
    if adult_income_df.iloc[:,i].dtype == 'object':
        str_cols.append(cols[i])
    else:
        num_cols.append(cols[i])

In [9]:
for i in str_cols:
    print(adult_income_df[i].str.contains('/?').value_counts(), '/n')
    print('-'*40)

True    32561
Name: workclass, dtype: int64 /n
----------------------------------------
True    32561
Name: education, dtype: int64 /n
----------------------------------------
True    32561
Name: marital.status, dtype: int64 /n
----------------------------------------
True    32561
Name: occupation, dtype: int64 /n
----------------------------------------
True    32561
Name: relationship, dtype: int64 /n
----------------------------------------
True    32561
Name: race, dtype: int64 /n
----------------------------------------
True    32561
Name: sex, dtype: int64 /n
----------------------------------------
True    32561
Name: native.country, dtype: int64 /n
----------------------------------------
True    32561
Name: income, dtype: int64 /n
----------------------------------------


In [15]:
for i in ('workclass', 'occupation', 'native.country'):
    print(adult_income_df[i].value_counts()[adult_income_df[i].value_counts() == adult_income_df[i].value_counts().max()],'\n')
    print('-'*40)

Private    22696
Name: workclass, dtype: int64 

----------------------------------------
Prof-specialty    4140
Name: occupation, dtype: int64 

----------------------------------------
United-States    29170
Name: native.country, dtype: int64 

----------------------------------------


In [17]:
adult_income_df['workclass'] = adult_income_df['workclass'].str.replace('\?', 'Private', regex = True)
adult_income_df['occupation'] = adult_income_df['occupation'].str.replace('\?', 'Prof-specialty', regex = True)
adult_income_df['native.country'] = adult_income_df['native.country'].str.replace('\?', 'United-States', regex = True)
adult_income_df

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,Private,77053,HS-grad,9,Widowed,Prof-specialty,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,Private,186061,Some-college,10,Widowed,Prof-specialty,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,22,Private,310152,Some-college,10,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40,United-States,<=50K
32557,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32558,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32559,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K


In [18]:
adult_income_df['education'].unique()

array(['HS-grad', 'Some-college', '7th-8th', '10th', 'Doctorate',
       'Prof-school', 'Bachelors', 'Masters', '11th', 'Assoc-acdm',
       'Assoc-voc', '1st-4th', '5th-6th', '12th', '9th', 'Preschool'],
      dtype=object)

In [21]:
income = adult_income_df['education'].unique()
for i in df:
    print(i, ':/n', adult_income_df.loc[adult_income_df['education'] == i].value_counts(adult_income_df['income']), '\n')
    print('-'*20)

HS-grad :/n income
<=50K    8826
>50K     1675
dtype: int64 

--------------------
Some-college :/n income
<=50K    5904
>50K     1387
dtype: int64 

--------------------
7th-8th :/n income
<=50K    606
>50K      40
dtype: int64 

--------------------
10th :/n income
<=50K    871
>50K      62
dtype: int64 

--------------------
Doctorate :/n income
>50K     306
<=50K    107
dtype: int64 

--------------------
Prof-school :/n income
>50K     423
<=50K    153
dtype: int64 

--------------------
Bachelors :/n income
<=50K    3134
>50K     2221
dtype: int64 

--------------------
Masters :/n income
>50K     959
<=50K    764
dtype: int64 

--------------------
11th :/n income
<=50K    1115
>50K       60
dtype: int64 

--------------------
Assoc-acdm :/n income
<=50K    802
>50K     265
dtype: int64 

--------------------
Assoc-voc :/n income
<=50K    1021
>50K      361
dtype: int64 

--------------------
1st-4th :/n income
<=50K    162
>50K       6
dtype: int64 

--------------------
5th-6t

In [26]:
status = adult_income_df['marital.status'].unique()
for i in status:
    print(i, '\n', adult_income_df.loc[adult_income_df['marital.status'] == i].value_counts(adult_income_df['income']), '\n')
    print('-'*20)

Widowed 
 income
<=50K    908
>50K      85
dtype: int64 

--------------------
Divorced 
 income
<=50K    3980
>50K      463
dtype: int64 

--------------------
Separated 
 income
<=50K    959
>50K      66
dtype: int64 

--------------------
Never-married 
 income
<=50K    10192
>50K       491
dtype: int64 

--------------------
Married-civ-spouse 
 income
<=50K    8284
>50K     6692
dtype: int64 

--------------------
Married-spouse-absent 
 income
<=50K    384
>50K      34
dtype: int64 

--------------------
Married-AF-spouse 
 income
<=50K    13
>50K     10
dtype: int64 

--------------------
