In [1]:
import pandas as pd
import numpy as np

# Load Dataset Description

In [2]:
with open("data/adult/adult.names") as f:
    print(f.read())

| This data was extracted from the census bureau database found at
| http://www.census.gov/ftp/pub/DES/www/welcome.html
| Donor: Ronny Kohavi and Barry Becker,
|        Data Mining and Visualization
|        Silicon Graphics.
|        e-mail: ronnyk@sgi.com for questions.
| Split into train-test using MLC++ GenCVFiles (2/3, 1/3 random).
| 48842 instances, mix of continuous and discrete    (train=32561, test=16281)
| 45222 if instances with unknown values are removed (train=30162, test=15060)
| Duplicate or conflicting instances : 6
| Class probabilities for adult.all file
| Probability for the label '>50K'  : 23.93% / 24.78% (without unknowns)
| Probability for the label '<=50K' : 76.07% / 75.22% (without unknowns)
|
| Extraction was done by Barry Becker from the 1994 Census database.  A set of
|   reasonably clean records was extracted using the following conditions:
|   ((AAGE>16) && (AGI>100) && (AFNLWGT>1)&& (HRSWK>0))
|
| Prediction task is to determine whether a person makes over

# Load Datasets

In [3]:
adult_columns = ['age','workclass','fnlwgt','education','education_num','marital_status','occupation',
            'relationship','race','sex','capital_gain','capital_loss','hours_per_week','native_country',
            'income']

# reading csv files
adult_data_train =  pd.read_csv('data/adult/adult.data', sep=",",names=adult_columns)
adult_data_test =  pd.read_csv('data/adult/adult.test', sep=",",names=adult_columns)
adult_data_test = adult_data_test.drop(0)

adult_data = pd.concat([adult_data_train, adult_data_test], ignore_index=True)

print("Rows in Dataset: " + str(len(adult_data)))
adult_data

Rows in Dataset: 48842


Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
1,50,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K
2,38,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
3,53,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K
4,28,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,39,Private,215419.0,Bachelors,13.0,Divorced,Prof-specialty,Not-in-family,White,Female,0.0,0.0,36.0,United-States,<=50K.
48838,64,?,321403.0,HS-grad,9.0,Widowed,?,Other-relative,Black,Male,0.0,0.0,40.0,United-States,<=50K.
48839,38,Private,374983.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K.
48840,44,Private,83891.0,Bachelors,13.0,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455.0,0.0,40.0,United-States,<=50K.


# Data Preprocessing

In [4]:
print(adult_data["native_country"].value_counts())
print(adult_data["native_country"].nunique())

 United-States                 43832
 Mexico                          951
 ?                               857
 Philippines                     295
 Germany                         206
 Puerto-Rico                     184
 Canada                          182
 El-Salvador                     155
 India                           151
 Cuba                            138
 England                         127
 China                           122
 South                           115
 Jamaica                         106
 Italy                           105
 Dominican-Republic              103
 Japan                            92
 Guatemala                        88
 Poland                           87
 Vietnam                          86
 Columbia                         85
 Haiti                            75
 Portugal                         67
 Taiwan                           65
 Iran                             59
 Greece                           49
 Nicaragua                        49
 

In [5]:
pd.crosstab(adult_data["workclass"], adult_data["occupation"])

occupation,?,Adm-clerical,Armed-Forces,Craft-repair,Exec-managerial,Farming-fishing,Handlers-cleaners,Machine-op-inspct,Other-service,Priv-house-serv,Prof-specialty,Protective-serv,Sales,Tech-support,Transport-moving
workclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
?,2799,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Federal-gov,0,487,15,93,268,9,36,19,55,0,253,47,17,96,37
Local-gov,0,421,0,211,331,43,65,24,300,0,1061,450,16,58,156
Never-worked,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Private,0,4208,0,4748,3995,670,1923,2882,4057,242,3409,299,4439,1154,1880
Self-emp-inc,0,47,0,167,617,82,6,17,42,0,245,5,420,9,38
Self-emp-not-inc,0,70,0,798,587,653,21,59,276,0,575,7,591,42,183
State-gov,0,375,0,94,287,25,19,19,191,0,629,175,20,87,60
Without-pay,0,3,0,1,1,8,2,2,2,0,0,0,1,0,1


In [6]:
# drop certain columns
adult_data = adult_data.drop(columns=['native_country', 'fnlwgt', 'relationship', 'education'])

### Missings
Deal with missing values:
- Represented by **" ?"** in adult_data

In [7]:
adult_data.workclass[48838]

' ?'

In [8]:
# replace representation for missing values in data with NaN

adult_data = adult_data.replace({" ?": np.nan})
print("-------------\nMissing Values in Adult Data:\n", adult_data.isna().sum())

-------------
Missing Values in Adult Data:
 age                  0
workclass         2799
education_num        0
marital_status       0
occupation        2809
race                 0
sex                  0
capital_gain         0
capital_loss         0
hours_per_week       0
income               0
dtype: int64


In [9]:
adult_data.loc[adult_data['workclass'] == " Never-worked", "occupation"] = "None"

In [10]:
print("-------------\nMissing Values in Adult Data:\n", adult_data.isna().sum())
pd.crosstab(adult_data["workclass"], adult_data["occupation"])

-------------
Missing Values in Adult Data:
 age                  0
workclass         2799
education_num        0
marital_status       0
occupation        2799
race                 0
sex                  0
capital_gain         0
capital_loss         0
hours_per_week       0
income               0
dtype: int64


occupation,Adm-clerical,Armed-Forces,Craft-repair,Exec-managerial,Farming-fishing,Handlers-cleaners,Machine-op-inspct,Other-service,Priv-house-serv,Prof-specialty,Protective-serv,Sales,Tech-support,Transport-moving,None
workclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Federal-gov,487,15,93,268,9,36,19,55,0,253,47,17,96,37,0
Local-gov,421,0,211,331,43,65,24,300,0,1061,450,16,58,156,0
Never-worked,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10
Private,4208,0,4748,3995,670,1923,2882,4057,242,3409,299,4439,1154,1880,0
Self-emp-inc,47,0,167,617,82,6,17,42,0,245,5,420,9,38,0
Self-emp-not-inc,70,0,798,587,653,21,59,276,0,575,7,591,42,183,0
State-gov,375,0,94,287,25,19,19,191,0,629,175,20,87,60,0
Without-pay,3,0,1,1,8,2,2,2,0,0,0,1,0,1,0


In [11]:
# drop missings
adult_data = adult_data.dropna(subset=['workclass', 'occupation'])

print("-------------\nMissing Values in Adult Data:\n", adult_data.isna().sum())
print("Number of rows in Dataset:", len(adult_data))

-------------
Missing Values in Adult Data:
 age               0
workclass         0
education_num     0
marital_status    0
occupation        0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
income            0
dtype: int64
Number of rows in Dataset: 46043


### - Encoding
Create sensible (numeric) encoding of all columns.

In [12]:
print("-------------\nDatatypes in Adult Data:\n", adult_data.dtypes)

-------------
Datatypes in Adult Data:
 age                object
workclass          object
education_num     float64
marital_status     object
occupation         object
race               object
sex                object
capital_gain      float64
capital_loss      float64
hours_per_week    float64
income             object
dtype: object


In [13]:
adult_data["age"] = pd.to_numeric(adult_data["age"])
print("-------------\nDatatypes in Adult Data:\n", adult_data.dtypes)

-------------
Datatypes in Adult Data:
 age                 int64
workclass          object
education_num     float64
marital_status     object
occupation         object
race               object
sex                object
capital_gain      float64
capital_loss      float64
hours_per_week    float64
income             object
dtype: object


In [14]:
for column in adult_data.columns:
    if adult_data[column].dtype=="O":
        print(adult_data[column].value_counts())

 Private             33906
 Self-emp-not-inc     3862
 Local-gov            3136
 State-gov            1981
 Self-emp-inc         1695
 Federal-gov          1432
 Without-pay            21
 Never-worked           10
Name: workclass, dtype: int64
 Married-civ-spouse       21452
 Never-married            14882
 Divorced                  6364
 Separated                 1433
 Widowed                   1296
 Married-spouse-absent      584
 Married-AF-spouse           32
Name: marital_status, dtype: int64
 Prof-specialty       6172
 Craft-repair         6112
 Exec-managerial      6086
 Adm-clerical         5611
 Sales                5504
 Other-service        4923
 Machine-op-inspct    3022
 Transport-moving     2355
 Handlers-cleaners    2072
 Farming-fishing      1490
 Tech-support         1446
 Protective-serv       983
 Priv-house-serv       242
 Armed-Forces           15
None                    10
Name: occupation, dtype: int64
 White                 39451
 Black                  4359
 

In [15]:
adult_data.income[41838]

' <=50K.'

In [16]:
# encode binary columns
adult_data = adult_data.replace({'sex':{" Male":0, " Female":1}}).rename(columns={"sex": "sex_female"}) # encode binary sex column numerically
adult_data = adult_data.replace({'income':{" <=50K":0, " <=50K.":0, " >50K":1, " >50K.":1}}).rename(columns={"income": "income_>50k"}) # encode binary income column numerically

# encode workclass
adult_data = adult_data.replace({'workclass':{" Local-gov":" gov", " State-gov":" gov", " Federal-gov":" gov",
                                             " Self-emp-not-inc":" Self-employed", " Self-emp-inc":" Self-employed"}})

# encode marital_status
adult_data = adult_data.replace({'marital_status':{" Married-civ-spouse":" married", " Married-spouse-absent":" married", " Married-AF-spouse":" married",
                                                  " Separated":" separated", " Divorced":" separated", " Widowed":" separated"}})

In [17]:
print(adult_data.workclass.value_counts())
print(adult_data.marital_status.value_counts())

 Private          33906
 gov               6549
 Self-employed     5557
 Without-pay         21
 Never-worked        10
Name: workclass, dtype: int64
 married          22068
 Never-married    14882
 separated         9093
Name: marital_status, dtype: int64


In [18]:
adult_data

Unnamed: 0,age,workclass,education_num,marital_status,occupation,race,sex_female,capital_gain,capital_loss,hours_per_week,income_>50k
0,39,gov,13.0,Never-married,Adm-clerical,White,0,2174.0,0.0,40.0,0
1,50,Self-employed,13.0,married,Exec-managerial,White,0,0.0,0.0,13.0,0
2,38,Private,9.0,separated,Handlers-cleaners,White,0,0.0,0.0,40.0,0
3,53,Private,7.0,married,Handlers-cleaners,Black,0,0.0,0.0,40.0,0
4,28,Private,13.0,married,Prof-specialty,Black,1,0.0,0.0,40.0,0
...,...,...,...,...,...,...,...,...,...,...,...
48836,33,Private,13.0,Never-married,Prof-specialty,White,0,0.0,0.0,40.0,0
48837,39,Private,13.0,separated,Prof-specialty,White,1,0.0,0.0,36.0,0
48839,38,Private,13.0,married,Prof-specialty,White,0,0.0,0.0,50.0,0
48840,44,Private,13.0,separated,Adm-clerical,Asian-Pac-Islander,0,5455.0,0.0,40.0,0


In [18]:
# save preprocessed data set
adult_data.to_csv('data/adult/adult_preprocessed.csv', index=False)

# Statistics, One Hot Encoding, Scaling
These steps will also be implemented in the web app, the data will be saved and loaded without one hot encoding and scaling.

In [23]:
adult_data["income_>50k"].value_counts()

0    34621
1    11422
Name: income_>50k, dtype: int64

In [38]:
# will also be implemented in web app

type_dict = {'sex_female': object, # change type of binary columns to object for statistics
                'income_>50k': object
               }
  
adult_data = adult_data.astype(type_dict)

print("Categorical and Binary Columns:")
for column in adult_data.columns:
    if adult_data[column].dtype=="O":
        print(adult_data[column].value_counts(), "\n")

print("Numerical Columns:")
round(adult_data.describe(),1)

Categorical and Binary Columns:
 Private          33906
 gov               6549
 Self-employed     5557
 Without-pay         21
 Never-worked        10
Name: workclass, dtype: int64 

 married          22068
 Never-married    14882
 separated         9093
Name: marital_status, dtype: int64 

 Prof-specialty       6172
 Craft-repair         6112
 Exec-managerial      6086
 Adm-clerical         5611
 Sales                5504
 Other-service        4923
 Machine-op-inspct    3022
 Transport-moving     2355
 Handlers-cleaners    2072
 Farming-fishing      1490
 Tech-support         1446
 Protective-serv       983
 Priv-house-serv       242
 Armed-Forces           15
None                    10
Name: occupation, dtype: int64 

 White                 39451
 Black                  4359
 Asian-Pac-Islander     1423
 Amer-Indian-Eskimo      435
 Other                   375
Name: race, dtype: int64 

0    31121
1    14922
Name: sex_female, dtype: int64 

0    34621
1    11422
Name: income_>50k, d

Unnamed: 0,age,education_num,capital_gain,capital_loss,hours_per_week
count,46043.0,46043.0,46043.0,46043.0,46043.0
mean,38.6,10.1,1114.1,88.7,40.9
std,13.2,2.6,7588.8,405.3,12.0
min,17.0,1.0,0.0,0.0,1.0
25%,28.0,9.0,0.0,0.0,40.0
50%,37.0,10.0,0.0,0.0,40.0
75%,47.0,13.0,0.0,0.0,45.0
max,90.0,16.0,99999.0,4356.0,99.0


In [19]:
adult_data

Unnamed: 0,age,workclass,education_num,marital_status,occupation,race,sex_female,capital_gain,capital_loss,hours_per_week,income_>50k
0,39,gov,13.0,Never-married,Adm-clerical,White,0,2174.0,0.0,40.0,0
1,50,Self-employed,13.0,married,Exec-managerial,White,0,0.0,0.0,13.0,0
2,38,Private,9.0,separated,Handlers-cleaners,White,0,0.0,0.0,40.0,0
3,53,Private,7.0,married,Handlers-cleaners,Black,0,0.0,0.0,40.0,0
4,28,Private,13.0,married,Prof-specialty,Black,1,0.0,0.0,40.0,0
...,...,...,...,...,...,...,...,...,...,...,...
48836,33,Private,13.0,Never-married,Prof-specialty,White,0,0.0,0.0,40.0,0
48837,39,Private,13.0,separated,Prof-specialty,White,1,0.0,0.0,36.0,0
48839,38,Private,13.0,married,Prof-specialty,White,0,0.0,0.0,50.0,0
48840,44,Private,13.0,separated,Adm-clerical,Asian-Pac-Islander,0,5455.0,0.0,40.0,0


In [20]:
# create one hot encoding for categorical variables
adult_data_onehot = adult_data.copy()
adult_data_onehot=pd.get_dummies(adult_data_onehot)
adult_data_onehot

Unnamed: 0,age,education_num,sex_female,capital_gain,capital_loss,hours_per_week,income_>50k,workclass_ Never-worked,workclass_ Private,workclass_ Self-employed,...,occupation_ Protective-serv,occupation_ Sales,occupation_ Tech-support,occupation_ Transport-moving,occupation_None,race_ Amer-Indian-Eskimo,race_ Asian-Pac-Islander,race_ Black,race_ Other,race_ White
0,39,13.0,0,2174.0,0.0,40.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,50,13.0,0,0.0,0.0,13.0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
2,38,9.0,0,0.0,0.0,40.0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
3,53,7.0,0,0.0,0.0,40.0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,28,13.0,1,0.0,0.0,40.0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48836,33,13.0,0,0.0,0.0,40.0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
48837,39,13.0,1,0.0,0.0,36.0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
48839,38,13.0,0,0.0,0.0,50.0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
48840,44,13.0,0,5455.0,0.0,40.0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0


In [20]:
# scaling

columns_to_normalize = ['age', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']

adult_data[columns_to_normalize] = adult_data[columns_to_normalize].apply(lambda x:(x-x.min()) / (x.max()-x.min()))

adult_data

Unnamed: 0,age,education_num,sex_female,capital_gain,capital_loss,hours_per_week,income_>50k,workclass_ Never-worked,workclass_ Private,workclass_ Self-employed,...,occupation_ Protective-serv,occupation_ Sales,occupation_ Tech-support,occupation_ Transport-moving,occupation_None,race_ Amer-Indian-Eskimo,race_ Asian-Pac-Islander,race_ Black,race_ Other,race_ White
0,0.301370,0.800000,0,0.021740,0.0,0.397959,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0.452055,0.800000,0,0.000000,0.0,0.122449,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
2,0.287671,0.533333,0,0.000000,0.0,0.397959,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
3,0.493151,0.400000,0,0.000000,0.0,0.397959,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,0.150685,0.800000,1,0.000000,0.0,0.397959,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48836,0.219178,0.800000,0,0.000000,0.0,0.397959,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
48837,0.301370,0.800000,1,0.000000,0.0,0.357143,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
48839,0.287671,0.800000,0,0.000000,0.0,0.500000,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
48840,0.369863,0.800000,0,0.054551,0.0,0.397959,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
