# Census 1994 USA
### Data Exploration

In [1]:
# importing necessary modules
import pandas as pd 
import numpy as np 
import os

In [2]:
# Column schema used when reading the CSV files: column name -> dtype,
# listed in the order the columns appear in the files.
dtype = {
    'age': int,
    'workclass': object,
    'fnlwgt': int,
    'education': object,
    'education-num': int,
    'marital-status': object,
    'occupation': object,
    'relationship': object,
    'race': object,
    'gender': object,
    'capital-gain': int,
    'capital-loss': int,
    'hours-per-week': int,
    'native-country': object,
    'income': str,
}
# Dicts preserve insertion order, so the key order doubles as the column order.
names = list(dtype)

In [3]:
# load datasets
# The data directory can be overridden with the CENSUS_DATA_DIR environment
# variable so the notebook is runnable on machines other than the author's;
# when the variable is unset, the original absolute path is used.
file1 = "adult.data"
file2 = "adult.test"
folder = os.environ.get(
    "CENSUS_DATA_DIR",
    r'/home/huzyk/Documents/Python/Representing Data/Cencus_1994_USA/data',
)
path1 = os.path.join(folder, file1)
path2 = os.path.join(folder, file2)

# Column names and dtypes are supplied explicitly; because `names` is passed
# without `header`, pandas reads the first line of each file as data.
df_train = pd.read_csv(path1, names=names, dtype=dtype)
df_test = pd.read_csv(path2, names=names, dtype=dtype)

In [4]:
# Report (rows, columns) for each of the two loaded frames.
train_shape, test_shape = df_train.shape, df_test.shape
print("Train set shape: {}".format(train_shape))
print("Test set shape: {}".format(test_shape))

Train set shape: (32561, 15)
Test set shape: (16281, 15)


In [5]:
# Stack the train and test rows into a single frame.
# pd.concat appends the rows unchanged. An outer merge() with no `on=` would
# instead join on every shared column, so a row occurring in both frames would
# be matched against its twin rather than kept as two separate observations.
# ignore_index=True gives the combined frame a fresh 0..n-1 index.
dataset = pd.concat([df_train, df_test], ignore_index=True)
print("All dataset shape: {}".format(dataset.shape))
dataset.head(10)

All dataset shape: (48842, 15)


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


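The dataset consists of:<br>
- 7 polynomial (multi-valued categorical) attributes<br>
- 1 binomial attribute<br>
- 6 continuous attributes

These counts cover 14 of the 15 columns; the remaining column, `income`, is not included in them.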

In [6]:
# Continuous attributes: one row per attribute, one column per summary statistic.
cont_attr = ["age", "fnlwgt", "education-num", "capital-gain", "capital-loss", "hours-per-week"]
cont_values = dataset[cont_attr]
df_cont = pd.DataFrame({
    "Mean": cont_values.mean(axis=0),
    "Median": cont_values.median(axis=0),
    "Std Dev": cont_values.std(axis=0),
    "Min Val": cont_values.min(axis=0),
    "Max Val": cont_values.max(axis=0),
})
df_cont

Unnamed: 0,Mean,Median,Std Dev,Min Val,Max Val
age,38.643585,37.0,13.71051,17,90
fnlwgt,189664.134597,178144.5,105604.025423,12285,1490400
education-num,10.078089,10.0,2.570973,1,16
capital-gain,1079.067626,0.0,7452.019058,0,99999
capital-loss,87.502314,0.0,403.004552,0,4356
hours-per-week,40.422382,40.0,12.391444,1,99


In [34]:
# Polynomial (multi-valued categorical) attributes:
# count and percentage of every distinct value, indexed by (Attribute, Value).
# NOTE(review): the notebook's summary text mentions 7 polynomial attributes but
# only 6 are listed here ('native-country' is absent) - confirm this is intended.
arr_index = np.array(['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race'])

# Frequency table of each attribute, in the order value_counts() returns it.
value_counts_per_attr = [dataset[attr].value_counts() for attr in arr_index]

# Flatten: every count in one list, and the value labels grouped per attribute.
arr_val = [count for counts in value_counts_per_attr for count in counts]
arr_subindex = [list(counts.index) for counts in value_counts_per_attr]

# One [attribute, value] pair per entry of arr_val; these become the two index levels.
arr_index_subindex = [
    [attr, value]
    for attr, values in zip(arr_index, arr_subindex)
    for value in values
]

df_index = pd.DataFrame(arr_index_subindex, columns=['Attribute', 'Value'])
row_labels = pd.MultiIndex.from_frame(df_index)
df_pol = pd.DataFrame(data=arr_val, index=row_labels, columns=['Count'])

np.set_printoptions(suppress=True, precision=1)
# Share of all rows, expressed as a percentage rounded to two decimals.
share_of_rows = np.array(arr_val) / dataset.shape[0]
df_pol['Percentage'] = np.around(share_of_rows * 100, decimals=2)

df_pol

Unnamed: 0_level_0,Unnamed: 1_level_0,Count,Percentage
Attribute,Value,Unnamed: 2_level_1,Unnamed: 3_level_1
workclass,Private,33906,69.42
workclass,Self-emp-not-inc,3862,7.91
workclass,Local-gov,3136,6.42
workclass,?,2799,5.73
workclass,State-gov,1981,4.06
workclass,Self-emp-inc,1695,3.47
workclass,Federal-gov,1432,2.93
workclass,Without-pay,21,0.04
workclass,Never-worked,10,0.02
education,HS-grad,15784,32.32


In [8]:
# Binomial attributes: count and share of rows for each gender value.
df_bin = pd.DataFrame()
df_bin["Count"] = dataset.gender.value_counts()
# Vectorized division of each count by the total number of rows. This avoids
# Series.iteritems(), which was removed in pandas 2.0, and repeated np.append
# calls, each of which copies the whole array.
# NOTE: the value is a fraction in [0, 1], whereas df_pol's "Percentage"
# column is scaled to 0-100.
per_arr = df_bin["Count"].to_numpy() / dataset.shape[0]
df_bin["Percentage"] = per_arr
df_bin

Unnamed: 0,Count,Percentage
Male,32650,0.668482
Female,16192,0.331518
