In [0]:
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.sql.functions import col, avg
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# --- Setup ---
# Make `final_project` the current catalog for this Spark session.
# NOTE(review): `spark` is not defined in the visible part of this file;
# presumably it is the SparkSession injected by the notebook runtime — confirm.
spark.sql("USE CATALOG final_project")

# ==============================================================================
# PART 1: K-Means Clustering (District Risk Profiling)
# ==============================================================================
print("Starting K-Means Clustering on Districts...")

df_district = spark.read.table("final_project.gold.district_stats")

# Pack the three per-district metrics into a single vector column, which is
# the input format the Spark ML estimators below operate on.
assembler = VectorAssembler(
    outputCol="features",
    inputCols=[
        "violent_crimes_count",
        "property_crimes_count",
        "arrest_rate",
    ],
)
data_vec = assembler.transform(df_district)

# Rescale every feature to unit standard deviation (no mean-centering) so the
# two count columns do not dominate the rate column in the distance metric.
scaler = StandardScaler(
    withMean=False,
    withStd=True,
    inputCol="features",
    outputCol="scaledFeatures",
)
scalerModel = scaler.fit(data_vec)
scaledData = scalerModel.transform(data_vec)

# Fit three clusters on the scaled vectors; the fixed seed keeps the
# initialisation repeatable between runs.
# NOTE(review): the intended reading is Low / Medium / High risk, but nothing
# here ties a cluster id to a risk level — inspect the centers before
# labelling them.
kmeans = KMeans(k=3, seed=1, featuresCol="scaledFeatures")
model = kmeans.fit(scaledData)

# Append the assigned cluster id to every district row.
predictions = model.transform(scaledData)
print("Clustering complete. Cluster centers displayed below:")

# The model was fitted on "scaledFeatures", so the centers printed here are
# expressed in scaled units, not raw counts/rates.
centers = model.clusterCenters()
for center in centers:
    print(center)

# Visualizing Clusters
# Bring only the columns needed for the chart to the driver as a pandas frame.
pdf_clusters = predictions.select(
    "District",
    "violent_crimes_count",
    "property_crimes_count",
    "prediction",
).toPandas()

# Scatter of raw property vs. violent counts, coloured by assigned cluster id.
cluster_fig, cluster_ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=pdf_clusters,
    x='property_crimes_count',
    y='violent_crimes_count',
    hue='prediction',
    palette='deep',
    s=100,
    ax=cluster_ax,
)
cluster_ax.set_title('District Segmentation: K-Means Clustering (k=3)')
cluster_ax.set_xlabel('Property Crimes')
cluster_ax.set_ylabel('Violent Crimes')
cluster_ax.grid(True)
plt.show()

# ==============================================================================
# PART 2: Hypothesis Testing (Weekend vs. Weekday Effect)
# ==============================================================================
# Compares daily crime volume on weekends against weekdays using a mean per
# group and a boxplot. No formal significance test is run in this section.
print("Performing Statistical Hypothesis Analysis: Weekend vs Weekday...")

df_ml = spark.read.table("final_project.gold.ml_features")

# Daily crime counts: one row per calendar date, tagged with its weekend flag.
# Grouping on "crime_date" is what makes the downstream mean a true *daily*
# average; grouping on (crime_month, day_of_week) instead would pool every
# date that shares a month/weekday into one bucket and inflate the figure.
# NOTE(review): assumes each row of ml_features is a single crime record, so
# that a row count equals a crime count — confirm against the table schema.
daily_counts = df_ml.groupBy("crime_date", "is_weekend").count()

# Average of the daily counts for Weekends (1) vs Weekdays (0). Sorted so the
# printed rows follow the 0/1 order stated in the header line below.
stats = daily_counts.groupBy("is_weekend") \
    .agg(avg("count").alias("avg_daily_crimes")) \
    .orderBy("is_weekend") \
    .toPandas()

print("Average Daily Crimes (0=Weekday, 1=Weekend):")
print(stats)

# Visualization of distribution: the same per-date counts that feed the
# averages above, so the summary table and the boxplot describe one dataset.
pdf_dist = daily_counts.toPandas()

plt.figure(figsize=(8, 5))
sns.boxplot(data=pdf_dist, x='is_weekend', y='count')
plt.title('Distribution of Daily Crime Volume: Weekday (0) vs Weekend (1)')
plt.ylabel('Daily Crimes')
plt.show()