In [85]:
import pandas as pd
from io import StringIO
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import StandardScaler

In [79]:
# Number of top-ranked feature columns kept by the feature-selection slice further down.
column_count = 50

In [None]:
# Read the raw Adult data file as text. The context manager closes the handle
# even if reading fails part-way through.
with open('data/adult/adult.data', mode='r') as data_file:
    whole_csv = data_file.read()
# The raw file pads its fields with spaces; strip every space so that values
# such as ' ?' and ' Private' compare equal to '?' and 'Private' later on.
whole_csv = whole_csv.replace(' ', '')

In [29]:
# Labels assigned to the CSV columns, in order (passed to read_csv via `names`).
column_names = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "salary",
]
# Parse the in-memory, space-stripped text rather than re-reading the file.
dataframe = pd.read_csv(StringIO(whole_csv), names=column_names)
# Preview the first five rows.
dataframe.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


## workclass analysis

In [31]:
# Number of rows for each distinct workclass value.
(
    dataframe
    .groupby(['workclass'])
    .size()
    .reset_index(name='counts')
)

max class: 
workclass    Without-pay
counts             22696
dtype: object


Unnamed: 0,workclass,counts
0,?,1836
1,Federal-gov,960
2,Local-gov,2093
3,Never-worked,7
4,Private,22696
5,Self-emp-inc,1116
6,Self-emp-not-inc,2541
7,State-gov,1298
8,Without-pay,14


## workclass processing

In [33]:
# Relabel the '?' placeholder as an explicit 'UNK' category.
dataframe['workclass'] = dataframe['workclass'].replace({'?': 'UNK'})
# Re-count the categories after the relabelling.
(
    dataframe
    .groupby(['workclass'])
    .size()
    .reset_index(name='counts')
)

Unnamed: 0,workclass,counts
0,Federal-gov,960
1,Local-gov,2093
2,Never-worked,7
3,Private,22696
4,Self-emp-inc,1116
5,Self-emp-not-inc,2541
6,State-gov,1298
7,UNK,1836
8,Without-pay,14


## fnlwgt analysis

In [36]:
# Summary statistics of the fnlwgt column.
dataframe.loc[:, 'fnlwgt'].describe()

2

In [37]:
# Report every fnlwgt entry that cannot be parsed as a float.
for row_idx, value in enumerate(dataframe['fnlwgt']):
    try:
        float(value)
    except ValueError:
        print(f"found missing value {value} in fnlwgt column, {row_idx} row")

## education analysis

In [39]:
# Number of rows for each distinct education value.
(
    dataframe
    .groupby(['education'])
    .size()
    .reset_index(name='counts')
)

Unnamed: 0,education,counts
0,10th,933
1,11th,1175
2,12th,433
3,1st-4th,168
4,5th-6th,333
5,7th-8th,646
6,9th,514
7,Assoc-acdm,1067
8,Assoc-voc,1382
9,Bachelors,5355


## education-num analysis

In [40]:
# Summary statistics of the education-num column.
dataframe.loc[:, 'education-num'].describe()

count    32561.000000
mean        10.080679
std          2.572720
min          1.000000
25%          9.000000
50%         10.000000
75%         12.000000
max         16.000000
Name: education-num, dtype: float64

In [41]:
# Report every education-num entry that cannot be parsed as a float.
for row_idx, value in enumerate(dataframe['education-num']):
    try:
        float(value)
    except ValueError:
        print(f"found missing value {value} in education-num column, {row_idx} row")

## marital-status analysis

In [42]:
# Number of rows for each distinct marital-status value.
(
    dataframe
    .groupby(['marital-status'])
    .size()
    .reset_index(name='counts')
)

Unnamed: 0,marital-status,counts
0,Divorced,4443
1,Married-AF-spouse,23
2,Married-civ-spouse,14976
3,Married-spouse-absent,418
4,Never-married,10683
5,Separated,1025
6,Widowed,993


## occupation analysis

In [43]:
# Number of rows for each distinct occupation value.
(
    dataframe
    .groupby(['occupation'])
    .size()
    .reset_index(name='counts')
)

Unnamed: 0,occupation,counts
0,?,1843
1,Adm-clerical,3770
2,Armed-Forces,9
3,Craft-repair,4099
4,Exec-managerial,4066
5,Farming-fishing,994
6,Handlers-cleaners,1370
7,Machine-op-inspct,2002
8,Other-service,3295
9,Priv-house-serv,149


## occupation processing

In [44]:
# Relabel the '?' placeholder as an explicit 'UNK' category.
dataframe['occupation'] = dataframe['occupation'].replace({'?': 'UNK'})
# Re-count the categories after the relabelling.
(
    dataframe
    .groupby(['occupation'])
    .size()
    .reset_index(name='counts')
)

Unnamed: 0,occupation,counts
0,Adm-clerical,3770
1,Armed-Forces,9
2,Craft-repair,4099
3,Exec-managerial,4066
4,Farming-fishing,994
5,Handlers-cleaners,1370
6,Machine-op-inspct,2002
7,Other-service,3295
8,Priv-house-serv,149
9,Prof-specialty,4140


## relationship analysis

In [45]:
# Number of rows for each distinct relationship value.
(
    dataframe
    .groupby(['relationship'])
    .size()
    .reset_index(name='counts')
)

Unnamed: 0,relationship,counts
0,Husband,13193
1,Not-in-family,8305
2,Other-relative,981
3,Own-child,5068
4,Unmarried,3446
5,Wife,1568


In [46]:
# race analysis: number of rows for each distinct race value.
(
    dataframe
    .groupby(['race'])
    .size()
    .reset_index(name='counts')
)

Unnamed: 0,race,counts
0,Amer-Indian-Eskimo,311
1,Asian-Pac-Islander,1039
2,Black,3124
3,Other,271
4,White,27816


## sex analysis

In [48]:
# Number of rows for each distinct sex value.
(
    dataframe
    .groupby(['sex'])
    .size()
    .reset_index(name='counts')
)

Unnamed: 0,sex,counts
0,Female,10771
1,Male,21790


## capital-gain analysis

In [49]:
# Summary statistics of the capital-gain column.
dataframe.loc[:, 'capital-gain'].describe()

count    32561.000000
mean      1077.648844
std       7385.292085
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max      99999.000000
Name: capital-gain, dtype: float64

In [52]:
# Report every capital-gain entry that cannot be parsed as a float.
col = 'capital-gain'
for row_idx, value in enumerate(dataframe[col]):
    try:
        float(value)
    except ValueError:
        print(f"found missing value {value} in {col} column, {row_idx} row")

## capital-loss analysis

In [53]:
# Summary statistics of the capital-loss column.
dataframe.loc[:, 'capital-loss'].describe()

count    32561.000000
mean        87.303830
std        402.960219
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max       4356.000000
Name: capital-loss, dtype: float64

In [54]:
# Report every capital-loss entry that cannot be parsed as a float.
col = 'capital-loss'
for row_idx, value in enumerate(dataframe[col]):
    try:
        float(value)
    except ValueError:
        print(f"found missing value {value} in {col} column, {row_idx} row")

## hours-per-week analysis

In [55]:
# Summary statistics of the hours-per-week column.
dataframe.loc[:, 'hours-per-week'].describe()

count    32561.000000
mean        40.437456
std         12.347429
min          1.000000
25%         40.000000
50%         40.000000
75%         45.000000
max         99.000000
Name: hours-per-week, dtype: float64

In [56]:
# Report every hours-per-week entry that cannot be parsed as a float.
col = 'hours-per-week'
for row_idx, value in enumerate(dataframe[col]):
    try:
        float(value)
    except ValueError:
        print(f"found missing value {value} in {col} column, {row_idx} row")


## native-country analysis

In [57]:
# Number of rows for each distinct native-country value.
(
    dataframe
    .groupby(['native-country'])
    .size()
    .reset_index(name='counts')
)

Unnamed: 0,native-country,counts
0,?,583
1,Cambodia,19
2,Canada,121
3,China,75
4,Columbia,59
5,Cuba,95
6,Dominican-Republic,70
7,Ecuador,28
8,El-Salvador,106
9,England,90


In [58]:
# Relabel the '?' placeholder as an explicit 'UNK' category.
dataframe['native-country'] = dataframe['native-country'].replace({'?': 'UNK'})
# Re-count the categories after the relabelling.
(
    dataframe
    .groupby(['native-country'])
    .size()
    .reset_index(name='counts')
)

Unnamed: 0,native-country,counts
0,Cambodia,19
1,Canada,121
2,China,75
3,Columbia,59
4,Cuba,95
5,Dominican-Republic,70
6,Ecuador,28
7,El-Salvador,106
8,England,90
9,France,29


## salary analysis

In [59]:
# Number of rows for each distinct salary value.
(
    dataframe
    .groupby(['salary'])
    .size()
    .reset_index(name='counts')
)

Unnamed: 0,salary,counts
0,<=50K,24720
1,>50K,7841


# Statistics

In [64]:
# Overall shape of the frame, its column labels, and the dtype of each column.
row_total, column_total = dataframe.shape
print(f"Number of rows: {row_total}")
print(f"Dataframe original column count: {column_total}")
print(f"Dataframe column list: {dataframe.columns}")
print("Column data types: ")
for col, col_dtype in dataframe.dtypes.items():
    print(f"{col} column dtype: {col_dtype}")

Number of rows: 32561
Dataframe original column count: 15
Dataframe column list: Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'salary'],
      dtype='object')
Column data types: 
age column dtype: int64
workclass column dtype: object
fnlwgt column dtype: int64
education column dtype: object
education-num column dtype: int64
marital-status column dtype: object
occupation column dtype: object
relationship column dtype: object
race column dtype: object
sex column dtype: object
capital-gain column dtype: int64
capital-loss column dtype: int64
hours-per-week column dtype: int64
native-country column dtype: object
salary column dtype: object


In [70]:
# One-hot encode every non-numeric column in a single pass.
#
# Columns are selected as "not a number" rather than by comparing against the
# literal 'int64': with the literal check, a float64 or int32 column would be
# treated as categorical and expanded into one dummy per distinct value.
categorical_columns = dataframe.select_dtypes(exclude='number').columns.tolist()
if categorical_columns:
    # get_dummies drops the encoded source columns and appends their dummies
    # (named "<column>_<value>") after the untouched columns, in the order
    # given. drop_first=True omits the first level of every encoded column.
    dataframe = pd.get_dummies(
        dataframe,
        columns=categorical_columns,
        prefix_sep='_',
        drop_first=True,
        dtype=int,
    )

In [72]:
# Report how many columns the frame has at this point and what they are called.
current_columns = dataframe.columns
print(f"current column count: {len(current_columns)}")
print(f"current column names: {current_columns}")

current column count: 101
current column names: Index(['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss',
       'hours-per-week', 'workclass_Local-gov', 'workclass_Never-worked',
       'workclass_Private', 'workclass_Self-emp-inc',
       ...
       'native-country_Scotland', 'native-country_South',
       'native-country_Taiwan', 'native-country_Thailand',
       'native-country_Trinadad&Tobago', 'native-country_UNK',
       'native-country_United-States', 'native-country_Vietnam',
       'native-country_Yugoslavia', 'salary_>50K'],
      dtype='object', length=101)


In [74]:
# Separate the label column from the feature columns; `dataframe` itself is
# left unmodified (drop returns a new frame).
target_column = "salary_>50K"
Y_dataframe = dataframe[target_column]
X_dataframe = dataframe.drop(columns=[target_column])

In [75]:
# checking whether all data is numeric
for col in X_dataframe:
    for i in range(len(X_dataframe[col])):
        try:
            float(X_dataframe[col][i])
        except ValueError as e:
            print(X_dataframe[col][i] == ' ')
            print(f"found missing value {X_dataframe[col][i]} in {col} column, {i} row")
            exit(1)

In [80]:
# Score each feature column on its own against the target with mutual
# information, using a fixed random_state so the scores are repeatable.
# NOTE(review): discrete_features is left at its default here, so the 0/1
# dummy columns are scored the same way as the numeric columns. Confirm this
# is intended.
information_gains = [
    {
        "column": name,
        "information": mutual_info_classif(
            # mutual_info_classif expects a 2-D feature matrix, hence (n_rows, 1).
            X_dataframe[name].to_numpy().reshape(-1, 1),
            Y_dataframe,
            random_state=42,
        )[0],
    }
    for name in X_dataframe.columns
]

# Rank from most to least informative and keep the first `column_count` names.
sorted_attributes = sorted(information_gains, key=lambda d: d['information'], reverse=True)
keeping_attributes = [info["column"] for info in sorted_attributes[:column_count]]

['marital-status_Married-civ-spouse',
 'capital-gain',
 'age',
 'marital-status_Never-married',
 'education-num',
 'hours-per-week',
 'relationship_Own-child',
 'capital-loss',
 'fnlwgt',
 'sex_Male',
 'occupation_Exec-managerial',
 'relationship_Not-in-family',
 'occupation_Other-service',
 'occupation_Prof-specialty',
 'education_Bachelors',
 'relationship_Unmarried',
 'education_Masters',
 'workclass_Self-emp-inc',
 'education_HS-grad',
 'education_Prof-school',
 'education_Doctorate',
 'race_White',
 'relationship_Other-relative',
 'relationship_Wife',
 'occupation_UNK',
 'race_Black',
 'workclass_UNK',
 'education_11th',
 'occupation_Handlers-cleaners',
 'native-country_United-States',
 'marital-status_Separated',
 'native-country_Mexico',
 'marital-status_Widowed',
 'education_9th',
 'occupation_Machine-op-inspct',
 'workclass_Private',
 'education_7th-8th',
 'education_5th-6th',
 'education_Some-college',
 'workclass_Local-gov',
 'occupation_Priv-house-serv',
 'marital-status_Ma

In [82]:
# Show how many attributes were selected, then display the list itself.
kept_total = len(keeping_attributes)
print(f"length of keeping attributes: {kept_total}")
keeping_attributes

length of keeping attributes: 50


['marital-status_Married-civ-spouse',
 'capital-gain',
 'age',
 'marital-status_Never-married',
 'education-num',
 'hours-per-week',
 'relationship_Own-child',
 'capital-loss',
 'fnlwgt',
 'sex_Male',
 'occupation_Exec-managerial',
 'relationship_Not-in-family',
 'occupation_Other-service',
 'occupation_Prof-specialty',
 'education_Bachelors',
 'relationship_Unmarried',
 'education_Masters',
 'workclass_Self-emp-inc',
 'education_HS-grad',
 'education_Prof-school',
 'education_Doctorate',
 'race_White',
 'relationship_Other-relative',
 'relationship_Wife',
 'occupation_UNK',
 'race_Black',
 'workclass_UNK',
 'education_11th',
 'occupation_Handlers-cleaners',
 'native-country_United-States',
 'marital-status_Separated',
 'native-country_Mexico',
 'marital-status_Widowed',
 'education_9th',
 'occupation_Machine-op-inspct',
 'workclass_Private',
 'education_7th-8th',
 'education_5th-6th',
 'education_Some-college',
 'workclass_Local-gov',
 'occupation_Priv-house-serv',
 'marital-status_Ma

In [83]:
# Keep only the selected feature columns, in ranked order. The explicit copy
# makes the result an independent frame, so adding columns to it afterwards
# does not write to a slice of the wider frame (SettingWithCopyWarning).
X_dataframe = X_dataframe[keeping_attributes].copy()

In [84]:
# Number of feature columns currently in X_dataframe.
X_dataframe.shape[1]

50

In [None]:
# add bias column
# assign() returns a new frame with a constant 'ones' column appended as the
# last column, instead of item-assigning into a frame that was produced by
# column selection (which can raise SettingWithCopyWarning).
# NOTE: a constant column is mapped to all zeros by any mean-centring scaler,
# so this column must be kept out of standardization.
X_dataframe = X_dataframe.assign(ones=1)

In [88]:
# conversion to numpy array and scaling
# Work on an explicit float copy so the scaled values can be written back in
# place without touching X_dataframe.
X_numpy = X_dataframe.to_numpy(dtype=float, copy=True)
Y_numpy = Y_dataframe.to_numpy().reshape((Y_dataframe.shape[0],1))

# Standardize only the real features. If the constant 'ones' bias column is
# present it is left untouched: mean-centring a constant column would turn it
# into all zeros and silently remove the bias term.
feature_mask = X_dataframe.columns != 'ones'
scaler = StandardScaler()
X_numpy[:, feature_mask] = scaler.fit_transform(X_numpy[:, feature_mask])

In [89]:
# Print the shapes of the feature matrix and the target array.
x_shape, y_shape = X_numpy.shape, Y_numpy.shape
print(f"X_numpy shape: {x_shape}")
print(f"Y_numpy shape: {y_shape}")

X_numpy shape: (32561, 50)
Y_numpy shape: (32561, 1)
