In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the CSV into a DataFrame and preview its first rows.
# Later cells read the PUMA, PINCP and AGEP columns from this frame.
df = pd.read_csv("csv_pca/psam_p06.csv")
df.head()

In [None]:
# Number of rows loaded from the CSV.
len(df)

In [None]:
# How many distinct PUMA codes appear in the data.
np.unique(df["PUMA"].to_numpy()).size

In [None]:
# Names of the DataFrame columns the rest of the notebook reads.
INCOME_COL = "PINCP"  # column used as the income value
AGE_COL = "AGEP"  # column used as the age value

In [None]:
# Keep only rows whose income is not exactly zero (rows with NaN income stay).
df = df.loc[df[INCOME_COL].ne(0.0)]

In [None]:
# Scatter age against income for a few hand-picked PUMAs, one colour per PUMA.
# Each code is listed once so that no series is drawn twice on top of itself.
for puma in (8103, 3100, 2901):
    currdf = df[df["PUMA"] == puma]
    plt.scatter(currdf[AGE_COL], currdf[INCOME_COL], label=f"PUMA {puma}")

# Log-scale the income axis; non-positive incomes cannot be drawn on it.
plt.yscale("log")
plt.xlabel(AGE_COL)
plt.ylabel(INCOME_COL)
plt.legend()

In [None]:
# Per-PUMA mean and standard deviation of income and age, computed in a
# single grouped pass instead of re-filtering the whole frame for every PUMA.
# Groups come back sorted by PUMA code, NaN values are skipped, and ddof=0
# gives the population standard deviation (the same convention as np.nanstd).
puma_groups = df.groupby("PUMA")[[INCOME_COL, AGE_COL]]
puma_means = puma_groups.mean()
puma_stds = puma_groups.std(ddof=0)

avg_incomes = puma_means[INCOME_COL].tolist()
avg_ages = puma_means[AGE_COL].tolist()
std_incomes = puma_stds[INCOME_COL].tolist()
std_ages = puma_stds[AGE_COL].tolist()

In [None]:
# One red marker per PUMA: mean age on x, mean income on y.
fig, ax = plt.subplots()
ax.scatter(avg_ages, avg_incomes, color="r")
ax.set_title("AVG AGE vs AVG INCOME per PUMA")
plt.show()

In [None]:
# One marker per PUMA: age spread on x, income spread on y.
fig, ax = plt.subplots()
ax.scatter(std_ages, std_incomes)
ax.set_title("STD AGE vs STD INCOME per PUMA")
plt.show()

In [None]:
import seaborn as sns

# Overlay one kernel-density curve of log-income per PUMA on a shared axis.
fig, ax = plt.subplots()

for puma in np.unique(df["PUMA"].to_numpy()):
    incomes = df.loc[df["PUMA"] == puma, INCOME_COL].to_numpy()
    # The log is only taken of strictly positive incomes; NaN, zero and
    # negative values fail the comparison and are left out.
    positive_incomes = incomes[incomes > 0]
    sns.kdeplot(np.log(positive_incomes), ax=ax, alpha=0.3)

In [None]:
import shapefile as shp

In [None]:
# Import pyshp under its own module name here: the `shp` alias is rebound to a
# GeoDataFrame further down the notebook, so `shp.Reader` would fail if this
# cell were re-run after that point.
import shapefile

sf = shapefile.Reader("csv_pca/ipums_puma_2010/ipums_puma_2010.shp")

plt.figure()
kept = 0  # number of shape records that passed the California filter
ids = []  # integer ids of the kept records, in file order

for shape in sf.shapeRecords():
    # Field 4 is compared against the state name; everything else is skipped.
    if shape.record[4] != "California":
        continue

    kept += 1
    # Field 5 is stored as the record's integer id (presumably the PUMA code;
    # verify against the shapefile's field list).
    ids.append(int(shape.record[5]))

    # NOTE(review): all points of a shape are drawn as one polyline, so a
    # multi-part shape would get a connecting stroke between parts — confirm
    # whether that matters for this preview.
    points = shape.shape.points
    x = [point[0] for point in points]
    y = [point[1] for point in points]
    plt.plot(x, y)
plt.show()

In [None]:
# Fetch only the first shape/record pair for inspection, rather than parsing
# every record in the file and then indexing [0].
tmp = sf.shapeRecord(0)

In [None]:
# Show the attribute record of the sample shape to see which positional field
# holds which value.
tmp.record

In [None]:
# Ids collected from the shapefile's California records, in ascending order.
sorted(ids)

In [None]:
# Distinct PUMA codes present in the CSV data, in ascending order.
np.unique(df["PUMA"].to_numpy())

In [None]:
import geopandas as gpd

# Read the PUMA shapefile (same path as the pyshp reader above) as a GeoDataFrame.
# NOTE(review): this rebinds `shp`, which `import shapefile as shp` bound to
# the pyshp module earlier in the notebook; after this cell runs, `shp` no
# longer refers to that module.
shp = gpd.read_file("csv_pca/ipums_puma_2010/ipums_puma_2010.shp")

In [None]:
# Keep only the California rows. Take an explicit copy because a later cell
# adds and fills a column on this frame; writing to a filtered slice can
# otherwise trigger pandas' SettingWithCopy warning.
shp = shp[shp["State"] == "California"].copy()

In [None]:
# Display the filtered GeoDataFrame.
shp

In [None]:
import geopandas as gpd

# `gdf` is a second name for the same GeoDataFrame as `shp` (not a copy), so
# the NEIGHBORS column written below is visible through `shp` as well.
gdf = shp

# Start with an empty NEIGHBORS column and fill it row by row.
gdf["NEIGHBORS"] = None

for idx, region in gdf.iterrows():
    # Rows whose geometry is not disjoint from this one share at least a point with it.
    not_disjoint = ~gdf.geometry.disjoint(region.geometry)

    # A geometry is never disjoint from itself, so drop the row's own PUMA value.
    other_puma = gdf.PUMA != region.PUMA

    # Store the neighbouring PUMA values as a single ", "-separated string.
    adjacent = gdf.loc[not_disjoint & other_puma, "PUMA"].tolist()
    gdf.at[idx, "NEIGHBORS"] = ", ".join(adjacent)


In [None]:
# Display the GeoDataFrame again, now including the NEIGHBORS column.
shp

In [None]:
# Map every distinct PUMA code, taken in ascending order, to its 0-based position.
puma_codes = np.unique(df["PUMA"].to_numpy())
puma2id = dict(zip(puma_codes, range(len(puma_codes))))

In [None]:
# Fix the global NumPy seed so the subsample is reproducible.
np.random.seed(100)

# Draw 200 incomes (with replacement, np.random.choice's default) per PUMA,
# visiting PUMAs in ascending code order — the same order puma2id uses.
# NOTE(review): NaN incomes are not filtered out before sampling, so they can
# end up in the sample — confirm that downstream consumers expect this.
income_data = []
for puma in np.unique(df["PUMA"].to_numpy()):
    puma_incomes = df.loc[df["PUMA"] == puma, INCOME_COL].to_numpy()
    income_data.append(np.random.choice(puma_incomes, size=200))

In [None]:
import os
import pickle

# Stack the per-PUMA samples into one array with a row per PUMA.
income_data = np.array(income_data)

# Create the output directory if needed so the write does not fail with
# FileNotFoundError on a fresh checkout.
os.makedirs("income_data", exist_ok=True)
with open("income_data/california_income_subsampled.pickle", "wb") as fp:
    pickle.dump(income_data, fp)

In [None]:
# Build an n x n matrix over the PUMAs found in the CSV data: entry
# [row, col] is incremented once for each time PUMA `col` is listed in the
# NEIGHBORS string of PUMA `row`. Indices come from puma2id.
n = len(np.unique(df.PUMA.values))

neighbors = np.zeros((n, n))
for index, puma in shp.iterrows():
    row = puma2id[int(puma.PUMA)]
    # NEIGHBORS is a ", "-separated string. Splitting an empty string yields
    # [""], and int("") raises ValueError, so empty tokens are skipped; a
    # PUMA without neighbours then leaves its row all zeros.
    curr_neigh = [int(x) for x in puma.NEIGHBORS.split(", ") if x]
    for neigh in curr_neigh:
        col = puma2id[neigh]
        neighbors[row, col] += 1

In [None]:
import os

# Create the output directory if needed so the write does not fail with
# FileNotFoundError on a fresh checkout, then pickle the neighbour matrix.
os.makedirs("income_data", exist_ok=True)
with open("income_data/california_puma_neighbors.pickle", "wb") as fp:
    pickle.dump(neighbors, fp)