In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Step 1: read the person-level extract from disk and record the starting
# row count, which later filter reports use as their baseline.
file_path = "../data/usa_per_0923.csv"
df = pd.read_csv(file_path)
original_count = df.shape[0]
print(f"Initial number of records: {original_count}")

Initial number of records: 47203336


In [3]:
# Step 2a: drop every row whose INCTOT equals 9999998 or 9999999.
# The reported percentage is relative to the originally loaded row count.
mask_inctot_bad = df['INCTOT'].isin([9999998, 9999999])
removed_count = mask_inctot_bad.sum()
df = df.loc[~mask_inctot_bad]
current_count = len(df)
percent_removed = removed_count / original_count * 100
print(f"Removed {removed_count} instances with INCTOT == 9999998 or 9999999. That is {percent_removed:.2f}% of the data.")

# Step 2b: keep only rows with AGE >= 15.
# Here the percentage is relative to the rows that survived step 2a.
count_before_age = len(df)
is_age_ok = df['AGE'] >= 15
df = df.loc[is_age_ok]
removed_count_age = count_before_age - len(df)
percent_removed_age = removed_count_age / count_before_age * 100
print(f"Removed {removed_count_age} instances with AGE < 15. That is {percent_removed_age:.2f}% of the data (after previous filter).")

Removed 7947616 instances with INCTOT == 9999998 or 9999999. That is 16.84% of the data.
Removed 0 instances with AGE < 15. That is 0.00% of the data (after previous filter).


In [4]:
# Step 3: build a per-YEAR summary table of INCTOT.
def _inctot_summary(year, inctot):
    """Return one summary row (a dict) for a single YEAR.

    Parameters
    ----------
    year : the YEAR group key.
    inctot : the INCTOT values belonging to that YEAR.

    The row holds the count of values <= 1, the count of values < 0,
    the mean/median of the values > 1 (NaN when there are none), and the
    mean/median of all values.
    """
    above_one = inctot[inctot.gt(1)]
    return {
        'YEAR': year,
        'count_INCTOT_<=1': inctot.le(1).sum(),
        'count_INCTOT_<0': inctot.lt(0).sum(),
        'avg_INCTOT_>1': above_one.mean(),
        'median_INCTOT_>1': above_one.median(),
        'avg_INCTOT_all': inctot.mean(),
        'median_INCTOT_all': inctot.median(),
    }


grouped = df.groupby('YEAR')

# One dict per YEAR, in the order the groupby yields the groups.
insights = [_inctot_summary(year, group['INCTOT']) for year, group in grouped]

insights_df = pd.DataFrame(insights)
print("\nData insights grouped by YEAR:")
print(insights_df)


Data insights grouped by YEAR:
    YEAR  count_INCTOT_<=1  count_INCTOT_<0  avg_INCTOT_>1  median_INCTOT_>1  \
0   2009            298182             2848   39439.429449           26000.0   
1   2010            325499             2665   38854.338430           25000.0   
2   2011            352759             2325   38009.142122           24300.0   
3   2012            349685             2092   39509.808419           25000.0   
4   2013            347240             1826   41306.260724           26000.0   
5   2014            343041             1724   42106.821431           26700.0   
6   2015            344350             1821   44093.474655           28000.0   
7   2016            337795             1756   45324.715535           29000.0   
8   2017            338592             3005   46812.474673           30000.0   
9   2018            333886             3057   48542.906077           30000.0   
10  2019            325444             2818   51474.128886           33000.0   
11  2020

In [5]:
# Step 4: keep only rows with INCTOT strictly greater than 1 and report
# how many rows that removed.
final_count_before = len(df)
has_income_over_1 = df['INCTOT'].gt(1)
df = df.loc[has_income_over_1]
removed_final = final_count_before - len(df)
print(f"\nRemoved {removed_final} instances with INCTOT <= 1 from the dataset.")


Removed 5047327 instances with INCTOT <= 1 from the dataset.


In [7]:
# 5. For the filtered data, compute the INCTOT quantiles per YEAR at every
#    0.01% step.
# 10001 evenly spaced levels from 0 to 1: 0, 0.0001, 0.0002, ..., 1.0.
# "Percentile" is stored as a fraction (0..1), not multiplied by 100.
quantile_levels = np.linspace(0, 1, 10001)

# Build one frame per YEAR with columns YEAR / Percentile / INCTOT_value.
percentile_dfs = []
for year, group in df.groupby("YEAR"):
    # Series.quantile with an array of levels returns one value per level,
    # in the same order as quantile_levels.
    quantiles = group["INCTOT"].quantile(quantile_levels)
    percentile_dfs.append(
        pd.DataFrame({
            "YEAR": year,  # scalar, broadcast to every row of this frame
            "Percentile": quantile_levels,
            "INCTOT_value": quantiles.values,
        })
    )

# Concatenate all years and sort by YEAR then Percentile. The sorted frame is
# reassigned rather than sorted with inplace=True, so the statement reads as a
# single expression and never mutates an object another name may refer to.
percentile_df = (
    pd.concat(percentile_dfs, ignore_index=True)
    .sort_values(["YEAR", "Percentile"])
)

# The saved output of this cell shows a preview under this heading, but no
# statement produced it; print it explicitly so a fresh Restart & Run All
# reproduces what the reader sees.
# NOTE(review): the row count of the original preview is unknown - 30 is a guess.
print("Percentile DataFrame (first few rows):")
print(percentile_df.head(30))


Percentile DataFrame (first few rows):
    YEAR  Percentile  INCTOT_value
0   2009      0.0000           4.0
1   2009      0.0001           4.0
2   2009      0.0002           4.0
3   2009      0.0003           4.0
4   2009      0.0004           4.0
5   2009      0.0005           4.0
6   2009      0.0006          10.0
7   2009      0.0007          10.0
8   2009      0.0008          20.0
9   2009      0.0009          20.0
10  2009      0.0010          20.0
11  2009      0.0011          30.0
12  2009      0.0012          30.0
13  2009      0.0013          40.0
14  2009      0.0014          40.0
15  2009      0.0015          50.0
16  2009      0.0016          50.0
17  2009      0.0017          50.0
18  2009      0.0018          60.0
19  2009      0.0019          60.0
20  2009      0.0020          70.0
21  2009      0.0021          80.0
22  2009      0.0022          80.0
23  2009      0.0023          90.0
24  2009      0.0024         100.0
25  2009      0.0025         100.0
26  2009      0

In [8]:
# Collapse rows sharing the same (YEAR, INCTOT_value): each pair is kept once,
# with Percentile set to the median of all levels that mapped to that value.
by_year_and_value = percentile_df.groupby(["YEAR", "INCTOT_value"], as_index=False)
percentile_df = by_year_and_value["Percentile"].agg("median")

In [None]:
# Persist both result tables as CSV files, leaving the index column out.
for frame, csv_path in (
    (insights_df, "../data/usa_inc_stats.csv"),
    (percentile_df, "../data/usa_percentiles.csv"),
):
    frame.to_csv(csv_path, index=False)