In [None]:
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns

# Load the sample extract and print a quick textual overview of it.
# NOTE(review): the relative path presumably assumes the working directory is
# one level below the project root (e.g. a notebooks/ folder) -- confirm.
df = pd.read_csv("../data/raw/chicago_crimes_sample.csv")

# read_csv is called without parse_dates, so 'date' arrives as plain strings.
# min()/max() on strings compare lexicographically, which gives a wrong range
# for formats such as MM/DD/YYYY. Parse once here so the reported range (and
# every later .dt access) works on real timestamps.
df['date'] = pd.to_datetime(df['date'])

print("=== CHICAGO CRIME DATA OVERVIEW ===")
print(f"Dataset shape: {df.shape}")
print(f"Date range: {df['date'].min()} to {df['date'].max()}")
print("\nColumns:")
print(df.columns.tolist())

# Per-column count of missing values.
print("\n=== DATA QUALITY CHECK ===")
print("Missing values:")
print(df.isnull().sum())

# Ten most frequent values of 'primary_type'.
print("\n=== CRIME TYPES ===")
print(df['primary_type'].value_counts().head(10))

# Bounding box of the recorded coordinates (min/max skip missing values).
print("\n=== GEOGRAPHIC COVERAGE ===")
print(f"Latitude range: {df['latitude'].min():.4f} to {df['latitude'].max():.4f}")
print(f"Longitude range: {df['longitude'].min():.4f} to {df['longitude'].max():.4f}")

# Build a 2x2 static overview figure (crime types, monthly counts, hourly
# counts, raw locations), save it as a PNG and display it.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# 1. Crime types distribution: ten most frequent 'primary_type' values.
df['primary_type'].value_counts().head(10).plot(kind='bar', ax=axes[0,0])
axes[0,0].set_title('Top 10 Crime Types')
axes[0,0].tick_params(axis='x', rotation=45)

# 2. Temporal distribution.
# Convert to datetime so the .dt accessors below work; this is harmless if
# the column already holds datetimes.
df['date'] = pd.to_datetime(df['date'])
df['month'] = df['date'].dt.month
df['hour'] = df['date'].dt.hour
# Counts are grouped by calendar month only, so the same month from
# different years is pooled together.
df.groupby('month').size().plot(kind='line', ax=axes[0,1])
axes[0,1].set_title('Crimes by Month')

# 3. Hourly distribution: number of records per hour of day.
df.groupby('hour').size().plot(kind='bar', ax=axes[1,0])
axes[1,0].set_title('Crimes by Hour of Day')

# 4. Geographic scatter: tiny, semi-transparent markers so dense areas
# remain readable.
axes[1,1].scatter(df['longitude'], df['latitude'], alpha=0.5, s=1)
axes[1,1].set_title('Crime Locations')
axes[1,1].set_xlabel('Longitude')
axes[1,1].set_ylabel('Latitude')

plt.tight_layout()

# savefig does not create missing directories; make sure the export folder
# exists so a fresh checkout does not fail after all plots were rendered.
export_path = Path('../data/exports/eda_overview.png')
export_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(export_path, dpi=300, bbox_inches='tight')
plt.show()

# Interactive map using plotly.
# Only rows with both coordinates can be placed on the map.
geo_points = df.dropna(subset=['latitude', 'longitude'])

# Plot at most 1000 random points for performance. DataFrame.sample raises
# ValueError when asked for more rows than exist (without replacement), so
# the sample size is capped at the number of available rows.
map_sample = geo_points.sample(n=min(1000, len(geo_points)))

# NOTE(review): recent Plotly releases appear to deprecate scatter_mapbox in
# favour of scatter_map -- confirm against the installed version before
# switching, since the keyword names differ.
fig = px.scatter_mapbox(
    map_sample,
    lat='latitude',
    lon='longitude',
    color='primary_type',
    hover_data=['date', 'description'],
    mapbox_style='open-street-map',
    title='Chicago Crime Locations (Sample)',
    zoom=10,
    center={'lat': 41.8781, 'lon': -87.6298}
)
fig.show()

# Save processed sample: keep only rows that have coordinates and a crime
# type, then write them to the processed-data folder.
df_clean = df.dropna(subset=['latitude', 'longitude', 'primary_type'])

# to_csv does not create missing directories; create the target folder first
# so the export does not fail on a fresh checkout.
clean_path = Path("../data/processed/crimes_sample_clean.csv")
clean_path.parent.mkdir(parents=True, exist_ok=True)
df_clean.to_csv(clean_path, index=False)
print(f"Cleaned dataset saved: {len(df_clean)} records")