In [2]:
import requests
import pandas as pd
import geopandas as gpd
from io import BytesIO
from zipfile import ZipFile

# ============================================
# STEP 1: Fetch ACS 5-Year Median Income Data
# ============================================

print("üì° Fetching ACS 5-Year 2023 median household income data for LA County...")

base_url = "https://api.census.gov/data/2023/acs/acs5"
get_vars = ["NAME", "B19013_001E"]  # Median household income
params = {
    "get": ",".join(get_vars),
    "for": "tract:*",
    "in": "state:06+county:037"  # California, Los Angeles County
}

response = requests.get(base_url, params=params)
response.raise_for_status()

data = response.json()
columns = data[0]
rows = data[1:]
income_df = pd.DataFrame(rows, columns=columns)

# Clean and convert data types
income_df["B19013_001E"] = pd.to_numeric(income_df["B19013_001E"], errors="coerce")
income_df["GEOID"] = income_df["state"] + income_df["county"] + income_df["tract"]

print(f"‚úÖ Retrieved {len(income_df)} census tracts of income data")

# ============================================
# STEP 2: Download Census Tract Shapefile
# ============================================

print("üó∫Ô∏è Downloading LA County Census Tract shapefile (TIGER/Line)...")

# TIGER/Line URL for 2023 tracts, California (state FIPS 06)
shapefile_url = "https://www2.census.gov/geo/tiger/TIGER2023/TRACT/tl_2023_06_tract.zip"

r = requests.get(shapefile_url)
r.raise_for_status()

# Extract ZIP in memory
with ZipFile(BytesIO(r.content)) as z:
    z.extractall("tl_2023_06_tract")

tracts_gdf = gpd.read_file("tl_2023_06_tract/tl_2023_06_tract.shp")

# Filter for Los Angeles County (FIPS 037)
tracts_gdf = tracts_gdf[tracts_gdf["COUNTYFP"] == "037"]

print(f"‚úÖ Loaded {len(tracts_gdf)} tract polygons for Los Angeles County")

# ============================================
# STEP 3: Merge Census Tract Shapes with Income Data
# ============================================

merged = tracts_gdf.merge(income_df, left_on="GEOID", right_on="GEOID", how="left")

# ============================================
# STEP 4: Export the Results
# ============================================

# Save to GeoJSON
merged.to_file("la_tracts_median_income.geojson", driver="GeoJSON")

# Save also to CSV (no geometry)
merged.drop(columns="geometry").to_csv("la_tracts_median_income.csv", index=False)

print("üéâ Done!")
print("Saved files:")
print(" - la_tracts_median_income.geojson (for mapping/GIS)")
print(" - la_tracts_median_income.csv (for analysis)")


üì° Fetching ACS 5-Year 2023 median household income data for LA County...
‚úÖ Retrieved 2498 census tracts of income data
üó∫Ô∏è Downloading LA County Census Tract shapefile (TIGER/Line)...
‚úÖ Loaded 2498 tract polygons for Los Angeles County
üéâ Done!
Saved files:
 - la_tracts_median_income.geojson (for mapping/GIS)
 - la_tracts_median_income.csv (for analysis)


In [None]:

from shapely.geometry import Point

# === Step 1: Load Census tract GeoJSON ===
tracts = gpd.read_file("la_tracts_median_income.geojson")

# === Step 2: Load your other dataset with lon/lat ===
crimes = pd.read_csv("Crime_Data_from_2020_to_Present.csv") 

# === Step 3: Convert crimes DataFrame to GeoDataFrame ===
geometry = [Point(xy) for xy in zip(crimes["LON"], crimes["LAT"])]
crimes_gdf = gpd.GeoDataFrame(crimes, geometry=geometry)

# Ensure both GeoDataFrames use the same CRS
crimes_gdf.set_crs(tracts.crs, inplace=True)
tracts = tracts.to_crs(crimes_gdf.crs)

# === Step 4: Spatial join - assign each crime to a Census tract ===
joined = gpd.sjoin(crimes_gdf, tracts, how="left", predicate="within")



# The joined GeoDataFrame now has median income from tract for each point
# e.g., 'B19013_001E' column from Census



# === Step 5: Optional - save result ===
joined.to_csv("lapd_crimes_with_median_income.csv", index=False)
joined.to_file("lapd_crimes_with_median_income.geojson", driver="GeoJSON")

print("‚úÖ Merged points with Census tract median income successfully!")


‚úÖ Merged points with Census tract median income successfully!


In [8]:
merged_data.to_csv("la_tracts_median_income_with_crime.csv", index=False)

### NEW CONTEXTUAL FEATURES 

In [27]:
df = pd.read_csv("lapd_crimes_with_median_income.csv")

premis_codes = [101,108,104,707,123]

premis_filtered = df[df['Premis Cd'].apply(lambda x: x in premis_codes)]
premis_filtered = pd.DataFrame(premis_filtered)
premis_filtered = premis_filtered[premis_filtered['B19013_001E'].apply(lambda x: x >= 0)]



premis_filtered.info()



<class 'pandas.core.frame.DataFrame'>
Index: 367301 entries, 3 to 1004988
Data columns (total 48 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   DR_NO           367301 non-null  int64  
 1   Date Rptd       367301 non-null  object 
 2   DATE OCC        367301 non-null  object 
 3   TIME OCC        367301 non-null  int64  
 4   AREA            367301 non-null  int64  
 5   AREA NAME       367301 non-null  object 
 6   Rpt Dist No     367301 non-null  int64  
 7   Part 1-2        367301 non-null  int64  
 8   Crm Cd          367301 non-null  int64  
 9   Crm Cd Desc     367301 non-null  object 
 10  Mocodes         229046 non-null  object 
 11  Vict Age        367301 non-null  int64  
 12  Vict Sex        230186 non-null  object 
 13  Vict Descent    230179 non-null  object 
 14  Premis Cd       367301 non-null  float64
 15  Premis Desc     367301 non-null  object 
 16  Weapon Used Cd  91457 non-null   float64
 17  Weapon Desc   

In [28]:
premis_filtered.groupby("AREA NAME")["B19013_001E"].mean()


AREA NAME
77th Street     60017.217854
Central         75573.686444
Devonshire     111689.994111
Foothill        80475.678399
Harbor          73889.895960
Hollenbeck      62489.600111
Hollywood       72106.543542
Mission         76769.618205
N Hollywood     87068.055903
Newton          57002.731005
Northeast       99015.598973
Olympic         57018.363871
Pacific        117936.181424
Rampart         57570.290100
Southeast       55429.316604
Southwest       58181.728584
Topanga        100876.307233
Van Nuys        87537.575956
West LA        127968.145007
West Valley     94775.693601
Wilshire        98287.787204
Name: B19013_001E, dtype: float64

In [None]:
premis_filtered['thefts_by_income'] = premis_filtered['B19013_001E'] / premis_filtered[premis_filtered['Crm Cd']==510]

KeyError: 'Crm Cc'