In [21]:
import arff
import pandas as pd

# Load the ARFF file. The context manager closes the file handle as soon as
# parsing finishes instead of leaving it open for the kernel's lifetime.
with open('data/ACS Income by State.arff', 'r') as arff_file:
    data = arff.load(arff_file)

# Convert the data to a pandas DataFrame.
# `data['data']` holds the rows; `data['attributes']` holds one entry per
# column whose first element is the column name. The raw entries are assigned
# here and reduced to plain names by the rename cell further down.
df = pd.DataFrame(data['data'])
df.columns = data['attributes']

# Feature explanations can be found in the paper appendix starting on p. 18
# Features:
#     AGEP = Age --> ratio
#     COW = Class of worker --> nominal
#     SCHL = Educational attainment --> ordinal
#     MAR = Marital status --> Nominal
#     OCCP = Occupation --> Nominal
#     POBP = Place of birth --> Nominal
#     RELP = Relationship --> Nominal
#     WKHP = Usual hours worked per week past 12 months --> ratio, top-coded at 99 (99 = 99 or more hours)
#     SEX = Sex (1 = male, 2 = female) --> nominal
#     RAC1P = Recoded detailed race code --> nominal
#     ST = State
# Targets:
#     PINCP = Total person's income
# 

In [25]:
# Rename columns: keep only the attribute name from each column entry.
# Labels that are already plain strings are left untouched, so re-running this
# cell is a no-op instead of truncating every name to its first character.
df.columns = [col if isinstance(col, str) else col[0] for col in df.columns]

In [27]:
# Preview the first five rows to sanity-check the load and the column names.
df.head(n=5)

Unnamed: 0,AGEP,COW,SCHL,MAR,OCCP,POBP,RELP,WKHP,SEX,RAC1P,ST,PINCP
0,18.0,1.0,18.0,5.0,4720.0,13.0,17.0,21.0,2.0,2.0,1.0,1600.0
1,53.0,5.0,17.0,5.0,3605.0,18.0,16.0,40.0,1.0,1.0,1.0,10000.0
2,41.0,1.0,16.0,5.0,7330.0,1.0,17.0,40.0,1.0,1.0,1.0,24000.0
3,18.0,6.0,18.0,5.0,2722.0,1.0,17.0,2.0,2.0,1.0,1.0,180.0
4,21.0,5.0,19.0,5.0,3870.0,12.0,17.0,50.0,1.0,1.0,1.0,29000.0


In [28]:
# Max: Explore the dataset, understand the features
# Find out which classes are underrepresented

# Frequency of each code per demographic column, most frequent first.
race_counts = df["RAC1P"].value_counts()
sex_counts = df["SEX"].value_counts()
# Marital status gets its own name so `sex_counts` is not silently
# overwritten with MAR counts.
mar_counts = df["MAR"].value_counts()

# One print stanza shared by all three columns.
for label, counts in (
    ("race", race_counts),
    ("sex", sex_counts),
    ("marital status", mar_counts),
):
    print(f"Unique values and counts in {label} column:")
    print(counts)
    print("#############")


Unique values and counts in race column:
RAC1P
1.0    1298126
2.0     147573
6.0      96399
8.0      62877
9.0      41815
3.0      11887
7.0       2813
5.0       2104
4.0        906
Name: count, dtype: int64
#############
Unique values and counts in sex column:
SEX
1.0    866735
2.0    797765
Name: count, dtype: int64
#############
Unique values and counts in martial status column:
MAR
1.0    908704
5.0    513951
3.0    178811
2.0     34582
4.0     28452
Name: count, dtype: int64
#############
