# Predictive Analytics - Individual Assignment

Data wrangling and analysis of King County property sales and Seattle crime data.

In [12]:
!pip install pandas geopandas scipy

Collecting scipy
  Downloading scipy-1.17.1-cp312-cp312-macosx_14_0_arm64.whl.metadata (62 kB)
Downloading scipy-1.17.1-cp312-cp312-macosx_14_0_arm64.whl (20.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.3/20.3 MB[0m [31m41.5 MB/s[0m  [33m0:00:00[0m eta [36m0:00:01[0m
[?25hInstalling collected packages: scipy
Successfully installed scipy-1.17.1


In [5]:
import pandas as pd
import numpy as np

## Unique Years in Each Dataset

In [6]:
# Date formats to avoid parsing warnings
KINGCO_DATE_FORMAT = '%Y-%m-%d'
SPD_DATE_FORMAT = '%m/%d/%Y %I:%M:%S %p'

# Load sales data - read only date column for efficiency (sales_temp is a temporary dataframe used only in this cell for the year analysis)
sales_temp = pd.read_csv('kingco_sales.csv', usecols=['sale_date'])
sales_temp['sale_date'] = pd.to_datetime(sales_temp['sale_date'], format=KINGCO_DATE_FORMAT, errors='coerce')
sales_temp['year'] = sales_temp['sale_date'].dt.year
sales_years = np.unique(sales_temp['year'].dropna()).astype(int).tolist()

# Count records per year
counts_per_year = sales_temp['year'].value_counts().sort_index()

print('kingco_sales.csv (sales_df)')
print(f'Unique years (sale_date): {sales_years}')
print(f'Year range: {min(sales_years)} - {max(sales_years)}')
print(f'\nRecords per year:')
print(counts_per_year.to_string())

kingco_sales.csv (sales_df)
Unique years (sale_date): [1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025]
Year range: 1999 - 2025

Records per year:
year
1999    24392
2000    22738
2001    21932
2002    23289
2003    28080
2004    31419
2005    32502
2006    29794
2007    23228
2008    14260
2009    12974
2010    13730
2011    12717
2012    16877
2013    21603
2014    22021
2015    25050
2016    26899
2017    27079
2018    23957
2019    24282
2020    26129
2021    30234
2022    21109
2023    15409
2024    17077
2025    16833


In [7]:
# Load crimes data - use chunks for large file
crimes_years_list = []
for chunk in pd.read_csv('SPD_Crime_Data__2008-Present.csv', chunksize=100000, usecols=['Offense Date'], low_memory=False):
    chunk['Offense Date'] = pd.to_datetime(chunk['Offense Date'], format=SPD_DATE_FORMAT, errors='coerce')
    chunk['year'] = chunk['Offense Date'].dt.year
    crimes_years_list.append(chunk['year'])

crimes_years_series = pd.concat(crimes_years_list)
counts_per_year_crimes = crimes_years_series.value_counts().sort_index()
crimes_years = np.unique(crimes_years_series.dropna()).astype(int).tolist()

print('SPD_Crime_Data__2008-Present.csv (crimes_df)')
print(f'Unique years (Offense Date): {crimes_years}')
print(f'Year range: {min(crimes_years)} - {max(crimes_years)}')
print(f'\nRecords per year:')
print(counts_per_year_crimes.to_string())

SPD_Crime_Data__2008-Present.csv (crimes_df)
Unique years (Offense Date): [1900, 1908, 1915, 1920, 1929, 1934, 1951, 1953, 1957, 1960, 1964, 1966, 1968, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025]
Year range: 1900 - 2025

Records per year:
year
1900        5
1908        1
1915        1
1920        1
1929        1
1934        1
1951        1
1953        1
1957        1
1960        1
1964        4
1966        2
1968        1
1973        1
1974        3
1975        3
1976        5
1977        3
1978        1
1979        3
1980        7
1981        3
1982        1
1983        4
1984        2
1985        3
1986        1
1987        5
1988        3
1989        4
1990        6
1991       19
1992        4
1993       11
1994 

## Filter Data to Matching Years

Keep only records from 2021 onwards that appear in both datasets.

In [19]:
# Find years that appear in both datasets, from 2021 onwards
START_YEAR = 2021
matching_years = [y for y in np.intersect1d(sales_years, crimes_years) if y >= START_YEAR]
print(f'Matching years (from {START_YEAR}): {matching_years}')

# Filter sales data to matching years only
sales_raw = pd.read_csv('kingco_sales.csv')
sales_raw['sale_date'] = pd.to_datetime(sales_raw['sale_date'], format=KINGCO_DATE_FORMAT, errors='coerce')
sales_raw['year'] = sales_raw['sale_date'].dt.year
sales_df = sales_raw[sales_raw['year'].isin(matching_years)].copy()

# Filter crimes data to matching years only (load in chunks)
crimes_chunks = []
for chunk in pd.read_csv('SPD_Crime_Data__2008-Present.csv', chunksize=100000, low_memory=False):
    chunk['Offense Date'] = pd.to_datetime(chunk['Offense Date'], format=SPD_DATE_FORMAT, errors='coerce')
    chunk['year'] = chunk['Offense Date'].dt.year
    chunk_filtered = chunk[chunk['year'].isin(matching_years)]
    crimes_chunks.append(chunk_filtered)

crimes_df = pd.concat(crimes_chunks, ignore_index=True)

print(f'\nsales_df: {len(sales_df):,} rows')
print(f'crimes_df: {len(crimes_df):,} rows')

Matching years (from 2021): [np.int64(2021), np.int64(2022), np.int64(2023), np.int64(2024), np.int64(2025)]

sales_df: 90,167 rows
crimes_df: 360,445 rows
END_DATE (min max - 1 day): 2025-05-24 00:00:00


## Filter to Seattle City Limits (Point-in-Polygon)

Use official Seattle boundary (ArcGIS Open Data) with Point-in-Polygon for precise geographic filtering. Convex hull of boundary lines creates the polygon (lines don't form closed loops).

In [20]:
import geopandas as gpd
from shapely.geometry import Point
from shapely.ops import unary_union

# Load Seattle city limits (official boundary from ArcGIS Open Data)
# Source: https://opendata.arcgis.com/datasets/c5f3575dd7d545ada27064c74ac74f52
boundary_gdf = gpd.read_file('seattle_city_limits.geojson')

# Extract line segments (boundary is MultiLineString - lines don't form closed loops)
# Use convex hull to create polygon for Point-in-Polygon
all_lines = []
for geom in boundary_gdf.geometry:
    if geom.geom_type == 'MultiLineString':
        for line in geom.geoms:
            all_lines.append(line)
    elif geom.geom_type == 'LineString':
        all_lines.append(geom)

lines_union = unary_union(all_lines)
seattle_boundary = lines_union.convex_hull  # Polygon from boundary envelope

# Point-in-Polygon: keep only properties within Seattle boundary
sales_df['geometry'] = sales_df.apply(lambda r: Point(r['longitude'], r['latitude']), axis=1)
sales_gdf = gpd.GeoDataFrame(sales_df, geometry='geometry', crs='EPSG:4326')
in_seattle = sales_gdf.geometry.within(seattle_boundary)
sales_df = sales_df[in_seattle].drop(columns=['geometry']).reset_index(drop=True)

# crimes_df is already Seattle-only (SPD jurisdiction)
print(f'sales_df (Seattle only, PIP): {len(sales_df):,} rows')
print(f'crimes_df: {len(crimes_df):,} rows')

sales_df (Seattle only, PIP): 22,803 rows
crimes_df: 360,445 rows
Max crime date: 2025-05-23 23:57:00
Max sales date (Seattle): 2025-05-24 00:00:00


## Spatial Join: Properties + Crime Counts

Join properties with crime data by counting crimes within a radius of each sale location (same year).

## Check 2025 matching months/days

Ensure overlapping date coverage for Seattle data: use the earlier of the two max dates (crimes vs sales) minus 1 day as cutoff. Reproducible—no hardcoded dates.

In [None]:
# Dynamic end date: min of both max dates minus 1 day
max_crime_date = crimes_df['Offense Date'].max()
max_sales_date = sales_df['sale_date'].max()
END_DATE = (min(max_crime_date, max_sales_date) - pd.Timedelta(days=1)).normalize()
sales_df = sales_df[sales_df['sale_date'] <= END_DATE].copy()
crimes_df = crimes_df[crimes_df['Offense Date'] <= END_DATE].copy()

print(f'Max crime date (before cutoff): {max_crime_date}')
print(f'Max sales date (Seattle, before cutoff): {max_sales_date}')
print(f'END_DATE (min - 1 day): {END_DATE}')
print(f'\nsales_df: {len(sales_df):,} rows')
print(f'crimes_df: {len(crimes_df):,} rows')

In [14]:
from scipy.spatial import cKDTree

# Filter valid coordinates (crimes_df uses -1.0 for missing; sales_df may have NaN)
sales_df = sales_df.dropna(subset=['latitude', 'longitude'])
RADIUS_KM = 1.0  # crimes within 1 km of property
# At Seattle latitude (~47°): 1° lat ≈ 111 km, 1° long ≈ 76 km
# 1 km ≈ 0.009° lat, 0.013° long - use 0.01 as approx for both
radius_deg = RADIUS_KM / 111

# Prepare crime data - valid coords only
crimes_df['Latitude'] = pd.to_numeric(crimes_df['Latitude'], errors='coerce')
crimes_df['Longitude'] = pd.to_numeric(crimes_df['Longitude'], errors='coerce')
crimes_valid = crimes_df[
    (crimes_df['Latitude'] > 40) & (crimes_df['Latitude'] < 50) &
    (crimes_df['Longitude'] < -100) & (crimes_df['Longitude'] > -125)
].copy()
crime_coords = crimes_valid[['Latitude', 'Longitude']].values
crime_years = crimes_valid['year'].values

# Build spatial index on crimes
crime_tree = cKDTree(crime_coords)

# Process in batches - vectorized query for speed
BATCH_SIZE = 50000
crime_counts = []
for i in range(0, len(sales_df), BATCH_SIZE):
    batch = sales_df.iloc[i:i+BATCH_SIZE]
    coords = batch[['latitude', 'longitude']].values
    years = batch['year'].values
    all_indices = crime_tree.query_ball_point(coords, r=radius_deg)
    counts = [sum(1 for idx in inds if crime_years[idx] == years[j]) for j, inds in enumerate(all_indices)]
    crime_counts.extend(counts)
    print(f'Processed {min(i+BATCH_SIZE, len(sales_df)):,} / {len(sales_df):,} properties...')

sales_df['crime_count_1km'] = crime_counts

# Create joined df (sales with crime feature)
df_joined = sales_df.copy()
print(f'\ndf_joined: {len(df_joined):,} rows')
print(f'Crime count stats: min={df_joined["crime_count_1km"].min()}, max={df_joined["crime_count_1km"].max()}, mean={df_joined["crime_count_1km"].mean():.1f}')

Processed 25,414 / 25,414 properties...

df_joined: 25,414 rows
Crime count stats: min=0, max=5405, mean=594.2
