In [1]:
import pandas as pd

url = "https://github.com/nickeubank/MIDS_Data/raw/refs/heads/master/US_AmericanCommunitySurvey/US_ACS_2017_10pct_sample.dta"

# Load the data
acs_data = pd.read_stata(url)

# Display the first few rows
print(acs_data.head())


   year  datanum   serial      cbserial          numprec subsamp  hhwt  \
0  2017        1   177686  2.017001e+12                9      64    55   
1  2017        1  1200045  2.017001e+12                6      79    25   
2  2017        1    70831  2.017000e+12  1 person record      36    57   
3  2017        1   557128  2.017001e+12                2      10    98   
4  2017        1   614890  2.017001e+12                4      96    54   

                                   hhtype       cluster    adjust  ...  \
0  female householder, no husband present  2.017002e+12  1.011189  ...   
1       male householder, no wife present  2.017012e+12  1.011189  ...   
2          male householder, living alone  2.017001e+12  1.011189  ...   
3         married-couple family household  2.017006e+12  1.011189  ...   
4         married-couple family household  2.017006e+12  1.011189  ...   

   migcounty1                 migmet131 vetdisab                   diffrem  \
0           0  not in identifiab

In [2]:
# Load the data
data = pd.read_stata(url)
# Calculate mean US income
mean_income = data['inctot'].mean()
results = {"ex2_avg_income": mean_income}

print(results)

{'ex2_avg_income': np.float64(1723646.2703978634)}


In [4]:
# Calculate the proportion reporting income of 9,999,999
results["ex3_share_making_9999999"] = data['inctot'].value_counts(normalize=True).get(9999999, 0)

# Calculate the proportion reporting income of 0
results["ex3_share_making_zero"] = data['inctot'].value_counts(normalize=True).get(0, 0)

print(results)


{'ex2_avg_income': np.float64(1723646.2703978634), 'ex3_share_making_9999999': np.float64(0.1689665333350052), 'ex3_share_making_zero': np.float64(0.10557547867738336)}


In [5]:
import numpy as np

# Replace all instances of 9999999 with np.nan in 'inctot'
data['inctot'] = data['inctot'].replace(9999999, np.nan)


In [6]:
# Replace all instances of 9999999 with np.nan in 'inctot'
data['inctot'] = data['inctot'].replace(9999999, np.nan)

# Calculate the new average US income, ignoring np.nan
results["ex5_avg_income"] = data['inctot'].mean()

print(results)


{'ex2_avg_income': np.float64(1723646.2703978634), 'ex3_share_making_9999999': np.float64(0.1689665333350052), 'ex3_share_making_zero': np.float64(0.10557547867738336), 'ex5_avg_income': np.float64(40890.177564946454)}


In [7]:
# Age distribution for people with missing income
missing_income_ages = data.loc[data['inctot'].isna(), 'age'].value_counts()

# Age distribution for people with non-missing income
nonmissing_income_ages = data.loc[data['inctot'].notna(), 'age'].value_counts()

# Print the top results for inspection
print("Ages with missing income:")
print(missing_income_ages.head(15))

print("\nAges with non-missing income:")
print(nonmissing_income_ages.head(15))

# (No need to update results dictionary for this inspection step)


Ages with missing income:
age
10                      3997
9                       3977
14                      3847
12                      3845
13                      3800
11                      3791
8                       3648
7                       3527
6                       3524
5                       3512
2                       3405
1                       3340
4                       3318
3                       3220
less than 1 year old    3150
Name: count, dtype: int64

Ages with non-missing income:
age
60    4950
54    4821
59    4776
56    4776
58    4734
57    4720
55    4693
61    4644
62    4614
53    4600
18    4496
63    4488
52    4418
65    4362
19    4342
Name: count, dtype: int64


In [8]:
# Subset to only employed respondents
employed_data = data[data['empstat'] == "employed"]

# You can check the size of this group if you want:
print(employed_data.shape)

# print(employed_data['inctot'].value_counts())


(148758, 104)


In [9]:
# Check the unique values in the 'race' column for employed respondents
print(employed_data['race'].value_counts())

# Now proceed if "black" and "white" are present;
# If the values are spelled differently, adjust them accordingly

# Calculate average income for employed Black Americans
results["ex8_avg_income_black"] = employed_data.loc[employed_data['race'] == "black", "inctot"].mean()

# Calculate average income for employed White Americans
results["ex8_avg_income_white"] = employed_data.loc[employed_data['race'] == "white", "inctot"].mean()

# Calculate the percentage difference
if results["ex8_avg_income_black"] > 0:
    results["ex8_racial_difference"] = (
        (results["ex8_avg_income_white"] - results["ex8_avg_income_black"])
        / results["ex8_avg_income_black"]
    ) * 100
else:
    results["ex8_racial_difference"] = None

print(results)


race
white                               116017
black/african american/negro         13175
other asian or pacific islander       6424
other race, nec                       5755
two major races                       3135
chinese                               2149
american indian or alaska native      1290
three or more major races              426
japanese                               387
Name: count, dtype: int64
{'ex2_avg_income': np.float64(1723646.2703978634), 'ex3_share_making_9999999': np.float64(0.1689665333350052), 'ex3_share_making_zero': np.float64(0.10557547867738336), 'ex5_avg_income': np.float64(40890.177564946454), 'ex8_avg_income_black': nan, 'ex8_avg_income_white': np.float64(60473.15372747098), 'ex8_racial_difference': None}


In [10]:
# Weighted average income for employed White Americans
white = employed_data[employed_data['race'] == "white"]
results["ex9_avg_income_white"] = (
    (white["inctot"] * white["perwt"]).sum() / white["perwt"].sum()
)

# Weighted average income for employed Black Americans
black = employed_data[employed_data['race'] == "black"]
results["ex9_avg_income_black"] = (
    (black["inctot"] * black["perwt"]).sum() / black["perwt"].sum()
)

print(results)


{'ex2_avg_income': np.float64(1723646.2703978634), 'ex3_share_making_9999999': np.float64(0.1689665333350052), 'ex3_share_making_zero': np.float64(0.10557547867738336), 'ex5_avg_income': np.float64(40890.177564946454), 'ex8_avg_income_black': nan, 'ex8_avg_income_white': np.float64(60473.15372747098), 'ex8_racial_difference': None, 'ex9_avg_income_white': np.float64(58361.48196061399), 'ex9_avg_income_black': np.float64(nan)}


  (black["inctot"] * black["perwt"]).sum() / black["perwt"].sum()


In [11]:
# Check the available race values for employed respondents
print(employed_data['race'].value_counts())

# Adjust as needed if the value for Black is different, e.g., 'Black/African American'
# For example, if you find 'black/african american' instead of 'black', use that value below
black_race_label = "black"  # update if a different label appears above

# Weighted average income for employed Black Americans
black = employed_data[employed_data['race'].str.lower() == black_race_label]
results["ex9_avg_income_black"] = (
    (black["inctot"] * black["perwt"]).sum() / black["perwt"].sum()
)

print(results)


race
white                               116017
black/african american/negro         13175
other asian or pacific islander       6424
other race, nec                       5755
two major races                       3135
chinese                               2149
american indian or alaska native      1290
three or more major races              426
japanese                               387
Name: count, dtype: int64
{'ex2_avg_income': np.float64(1723646.2703978634), 'ex3_share_making_9999999': np.float64(0.1689665333350052), 'ex3_share_making_zero': np.float64(0.10557547867738336), 'ex5_avg_income': np.float64(40890.177564946454), 'ex8_avg_income_black': nan, 'ex8_avg_income_white': np.float64(60473.15372747098), 'ex8_racial_difference': None, 'ex9_avg_income_white': np.float64(58361.48196061399), 'ex9_avg_income_black': np.float64(nan)}


  (black["inctot"] * black["perwt"]).sum() / black["perwt"].sum()


Exercise 11


In [12]:
assert set(results.keys()) == {
    "ex2_avg_income",
    "ex3_share_making_9999999",
    "ex3_share_making_zero",
    "ex5_avg_income",
    "ex8_avg_income_black",
    "ex8_avg_income_white",
    "ex8_racial_difference",
    "ex9_avg_income_black",
    "ex9_avg_income_white",
    "ex10_wage_gap",
}

AssertionError: 