In [2]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

# Load the ACS PUMS parquet file into a DataFrame.
# The location is relative to the notebook's working directory.
PUMS_PARQUET_PATH = "../data/processed/parquet_pums/pums_all_cleaned.parquet"
df = pd.read_parquet(PUMS_PARQUET_PATH)

In [3]:
# Quick orientation: shape, column names and dtypes, then a preview of the rows.
overview_lines = [
    f"Dataset shape: {df.shape}",
    f"\nColumns: {list(df.columns)}",
    "\nData types:",
]
print("\n".join(overview_lines))
print(df.dtypes)
print("\nFirst few rows:")
# Last expression of the cell, so the notebook renders it as a rich table.
df.head()


Dataset shape: (18600624, 21)

Columns: ['ST', 'PUMA', 'AGEP', 'SEX', 'RAC1P', 'HISP', 'MAR', 'SCHL', 'ESR', 'OCCP', 'SOCP', 'WAGP', 'PINCP', 'WKW', 'DIS', 'DEAR', 'DEYE', 'DOUT', 'CIT', 'NATIVITY', 'YEAR']

Data types:
ST           int32
PUMA         int32
AGEP         int32
SEX          int32
RAC1P        int32
HISP         int32
MAR          int32
SCHL         int32
ESR          int32
OCCP         int32
SOCP        object
WAGP         int32
PINCP        int32
WKW          int32
DIS          int32
DEAR         int32
DEYE         int32
DOUT         int32
CIT          int32
NATIVITY     int32
YEAR         int32
dtype: object

First few rows:


Unnamed: 0,ST,PUMA,AGEP,SEX,RAC1P,HISP,MAR,SCHL,ESR,OCCP,...,WAGP,PINCP,WKW,DIS,DEAR,DEYE,DOUT,CIT,NATIVITY,YEAR
0,1,2701,56,1,2,1,1,14,1,2040,...,2800,3450,1,1,2,2,2,1,1,2015
1,1,2701,61,2,2,1,1,19,6,0,...,0,14000,0,2,2,2,2,1,1,2015
2,1,2701,61,1,2,1,3,19,1,9600,...,35000,35000,1,2,2,2,2,1,1,2015
3,1,1000,52,2,9,1,3,11,6,0,...,0,8800,0,1,2,2,2,1,1,2015
4,1,1000,84,1,1,1,2,21,6,0,...,0,13200,0,2,2,2,2,1,1,2015


In [4]:
# Calculate the pairwise correlation matrix over every numeric column.
#
# "number" selects all numeric dtypes rather than a hand-written list of
# widths, so columns are not silently dropped from the analysis if the frame
# is ever stored with a different width (e.g. int8/int16 after downcasting).
# For the current frame the selection is unchanged.
numeric_cols = df.select_dtypes(include="number").columns.tolist()

print(f"Numeric columns for correlation: {numeric_cols}")
print(f"\nNumber of numeric columns: {len(numeric_cols)}")

# DataFrame.corr() defaults to the Pearson coefficient.
# NOTE(review): several of these columns look like coded categories rather
# than quantities (e.g. ST, PUMA) -- Pearson coefficients on such codes may
# not be meaningful; confirm which columns should be interpreted.
corr_matrix = df[numeric_cols].corr()

print(f"\nCorrelation matrix shape: {corr_matrix.shape}")
print("\nCorrelation matrix:")
print(corr_matrix)

Numeric columns for correlation: ['ST', 'PUMA', 'AGEP', 'SEX', 'RAC1P', 'HISP', 'MAR', 'SCHL', 'ESR', 'OCCP', 'WAGP', 'PINCP', 'WKW', 'DIS', 'DEAR', 'DEYE', 'DOUT', 'CIT', 'NATIVITY', 'YEAR']

Number of numeric columns: 20

Correlation matrix shape: (20, 20)

Correlation matrix:
                ST      PUMA      AGEP       SEX     RAC1P      HISP  \
ST        1.000000  0.235100 -0.001032 -0.000163 -0.101644 -0.040033   
PUMA      0.235100  1.000000 -0.013497  0.000280  0.017754  0.010933   
AGEP     -0.001032 -0.013497  1.000000  0.050775 -0.137578 -0.062153   
SEX      -0.000163  0.000280  0.050775  1.000000  0.000179  0.000668   
RAC1P    -0.101644  0.017754 -0.137578  0.000179  1.000000  0.222451   
HISP     -0.040033  0.010933 -0.062153  0.000668  0.222451  1.000000   
MAR      -0.020172 -0.001815 -0.486363 -0.035286  0.083874  0.041548   
SCHL      0.004916  0.018968  0.005302  0.029259 -0.107418 -0.071755   
ESR      -0.009889 -0.008895  0.345328  0.083701 -0.033054 -0.017974   


In [5]:
# Interactive Plotly heatmap of the correlation matrix.
# A diverging palette centred on zero separates positive from negative values.
axis_labels = corr_matrix.columns
fig = px.imshow(
    corr_matrix,
    x=axis_labels,
    y=axis_labels,
    labels={"x": "Variable", "y": "Variable", "color": "Correlation"},
    title="ACS PUMS Data - Correlation Matrix",
    aspect="auto",
    color_continuous_scale="RdBu",
    color_continuous_midpoint=0,
)

# Print each coefficient (rounded to two decimals) inside its cell;
# the hover box shows the same value with three decimals.
cell_text = np.round(corr_matrix.values, 2)
fig.update_traces(
    text=cell_text,
    texttemplate="%{text}",
    textfont=dict(size=10),
    hovertemplate="<b>%{y}</b> vs <b>%{x}</b><br>Correlation: %{z:.3f}<extra></extra>",
)

# Fixed square canvas; axis titles are blanked because the tick labels
# already name the variables.
fig.update_layout(
    width=1000,
    height=1000,
    xaxis_title="",
    yaxis_title="",
    font={"size": 10},
)

fig.show()


In [None]:
# Rank variable pairs by the strength of their correlation.

# Strict upper triangle (k=1): drops the diagonal and keeps each unordered
# pair exactly once.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool), k=1)

# np.nonzero walks the mask in row-major order, so pairs come out in the same
# order as a nested row/column scan.
corr_pairs = [
    {
        'Variable 1': corr_matrix.columns[row],
        'Variable 2': corr_matrix.columns[col],
        'Correlation': corr_matrix.iloc[row, col],
    }
    for row, col in zip(*np.nonzero(mask))
]

corr_df = pd.DataFrame(corr_pairs)

# Order by magnitude so strong negative and strong positive pairs rank together.
corr_df['Abs_Correlation'] = corr_df['Correlation'].abs()
corr_df_sorted = corr_df.sort_values(by='Abs_Correlation', ascending=False)

# Each report section: (heading, rows to draw from, number of rows to show).
is_positive = corr_df_sorted['Correlation'] > 0
is_negative = corr_df_sorted['Correlation'] < 0
report_sections = [
    ("Top 20 Strongest Correlations:", corr_df_sorted, 20),
    ("\n\nTop 10 Strongest Positive Correlations:", corr_df_sorted[is_positive], 10),
    ("\n\nTop 10 Strongest Negative Correlations:", corr_df_sorted[is_negative], 10),
]

for heading, section_rows, top_n in report_sections:
    print(heading)
    print("=" * 80)
    print(section_rows.head(top_n).to_string(index=False))


Top 20 Strongest Correlations:
Variable 1 Variable 2  Correlation  Abs_Correlation
       CIT   NATIVITY     0.973383         0.973383
      WAGP      PINCP     0.847475         0.847475
       DIS       DOUT     0.607177         0.607177
       DIS       DEAR     0.510123         0.510123
      AGEP        MAR    -0.486363         0.486363
       ESR       WAGP    -0.439590         0.439590
       ESR       OCCP    -0.438616         0.438616
       WKW       YEAR    -0.390288         0.390288
       DIS       DEYE     0.389808         0.389808
     RAC1P        CIT     0.383436         0.383436
     RAC1P   NATIVITY     0.376688         0.376688
      AGEP        ESR     0.345328         0.345328
       ESR        DIS    -0.333831         0.333831
      AGEP        DIS    -0.307026         0.307026
      SCHL      PINCP     0.292425         0.292425
       ESR      PINCP    -0.291174         0.291174
      AGEP       OCCP    -0.278134         0.278134
       ESR       DOUT    -0.27669