In [0]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from pyspark.sql.functions import col

# --- Setup ---
# Switch the session's current catalog once, up front, and log the switch.
# `spark` is not defined in this file; presumably it is the session object
# injected by the notebook runtime.
catalog_name = "final_project"
spark.sql(f"USE CATALOG {catalog_name}")
print(f"Catalog set to {catalog_name}.")

# ==============================================================================
# SECTION 1: Temporal Trends (Daily Crime Volume)
# ==============================================================================
print("Generating Visualization: Daily Crime Trends...")

# Load the daily-trends gold table and collect it to the driver as pandas.
df_daily = spark.read.table("final_project.gold.daily_trends")
pdf_daily = df_daily.toPandas()

# Parse the date column so the x-axis is plotted as datetimes.
pdf_daily['crime_date'] = pd.to_datetime(pdf_daily['crime_date'])

# One line per crime category, drawn on an explicitly created axes.
fig_daily, ax_daily = plt.subplots(figsize=(14, 6))
sns.lineplot(
    data=pdf_daily,
    x='crime_date',
    y='total_crimes',
    hue='Crime_Category',
    ax=ax_daily,
)
ax_daily.set_title('Daily Crime Trends by Category (2025)')
ax_daily.set_xlabel('Date')
ax_daily.set_ylabel('Number of Incidents')
ax_daily.legend(title='Category')
ax_daily.grid(True, alpha=0.3)
plt.show()

# ==============================================================================
# SECTION 2: Spatiotemporal Heatmap (When do crimes happen?)
# ==============================================================================
print("Generating Visualization: Weekly vs Hourly Heatmap...")

# Load the hourly-heatmap gold table and collect it to the driver as pandas.
df_heatmap = spark.read.table("final_project.gold.hourly_heatmap")
pdf_heatmap = df_heatmap.toPandas()

# Pivot into a day-of-week (rows) x hour-of-day (columns) grid of summed
# incident counts. fill_value=0 makes a day/hour combination that has no rows
# show up as zero incidents instead of NaN, which would otherwise render as a
# blank cell and force the whole grid to float.
heatmap_data = pdf_heatmap.pivot_table(
    index='day_of_week',
    columns='crime_hour',
    values='incident_count',
    aggfunc='sum',
    fill_value=0,
)

# No explicit sort is needed: pivot_table sorts the row and column labels by
# default, so rows appear in ascending day_of_week order.
# NOTE(review): which weekday the label 1 stands for depends on how
# day_of_week was derived upstream (Spark's dayofweek() uses 1 = Sunday) --
# confirm against the gold-table definition before relabelling the axis.
plt.figure(figsize=(16, 6))
sns.heatmap(heatmap_data, cmap='Reds', annot=False, fmt='d')
plt.title('Crime Intensity Heatmap: Day of Week vs. Hour of Day')
plt.xlabel('Hour of Day (0-23)')
plt.ylabel('Day of Week (1-7)')
plt.show()

# ==============================================================================
# SECTION 3: District Safety Analysis
# ==============================================================================
print("Generating Visualization: Crimes by Police District...")

# Load per-district statistics, ranked from most to fewest crimes.
df_district = spark.read.table("final_project.gold.district_stats")
pdf_district = df_district.orderBy(col("crime_count").desc()).toPandas()

# Pin the bar order to the ranking computed above. Without an explicit
# `order`, seaborn sorts numeric category values itself, so numeric district
# IDs would be drawn in ID order and the crime_count ranking would be lost.
# For string IDs this matches the order seaborn would have used anyway.
district_order = pdf_district['District'].dropna().drop_duplicates().tolist()

plt.figure(figsize=(14, 6))
sns.barplot(
    data=pdf_district,
    x='District',
    y='crime_count',
    order=district_order or None,  # empty table: fall back to seaborn's default
    palette='viridis',
)
plt.title('Total Crime Volume by Police District')
plt.xlabel('District ID')
plt.ylabel('Total Incidents')
plt.xticks(rotation=45)
plt.show()

# ==============================================================================
# SECTION 4: Arrest Efficiency
# ==============================================================================
print("Generating Visualization: Arrest Rates by District...")

# Reuses `pdf_district` loaded in Section 3: one point per row, positioned by
# crime volume vs. arrest rate, with marker size driven by violent-crime count.
fig_arrest, ax_arrest = plt.subplots(figsize=(14, 6))
sns.scatterplot(
    data=pdf_district,
    x='crime_count',
    y='arrest_rate',
    size='violent_crimes_count',
    sizes=(20, 500),
    alpha=0.7,
    ax=ax_arrest,
)
ax_arrest.set_title('District Analysis: Crime Volume vs. Arrest Rate (Size = Violent Crimes)')
ax_arrest.set_xlabel('Total Crime Count')
ax_arrest.set_ylabel('Arrest Rate (0-1)')
ax_arrest.grid(True)
plt.show()