In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Model imports
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import PoissonRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error

# Seed NumPy's global random number generator (reproducibility)
np.random.seed(42)

# Report header: a rule, the two title lines, and a closing rule, one per line
print(
    "="*80,
    "PARKING TICKET HOTSPOT PREDICTION ANALYSIS",
    "Thunder Pandas - DS 4002",
    "="*80,
    sep="\n",
)

In [None]:
# =============================================================================
# STEP 1: LOAD AND CLEAN DATA
# =============================================================================
# Reads the raw ticket CSV and narrows it to usable rows: parsable issue date,
# known street, non-void violation, issued in 2010 or later, one row per
# ticket number.
print("\n[STEP 1] Loading and cleaning parking ticket data...")

# Load the data
# NOTE: Update this path to match your data location
df = pd.read_csv('Parking_Tickets.csv')

print(f"Initial dataset shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")

# Parse datetime columns. errors='coerce' turns anything unparsable into NaT
# instead of raising.
df['DateIssued'] = pd.to_datetime(df['DateIssued'], errors='coerce')
# Only values matching 'HH:MM' survive this parse; every other value becomes
# missing. NOTE(review): confirm the raw TimeIssued column really uses 'HH:MM'.
df['TimeIssued'] = pd.to_datetime(df['TimeIssued'], format='%H:%M', errors='coerce').dt.time

# Remove rows with missing critical information
initial_count = len(df)
df = df.dropna(subset=['DateIssued', 'StreetName'])
print(f"Removed {initial_count - len(df)} rows with missing DateIssued or StreetName")

# Standardize street names (remove extra spaces, convert to uppercase)
df['StreetName'] = df['StreetName'].str.strip().str.upper()

# Remove "Void" violations as they are likely administrative errors
# (exact, case-sensitive match; rows with a missing description are kept)
df = df[df['ViolationDescription'] != 'Void']
print(f"Removed void tickets. Current shape: {df.shape}")

# Filter to reasonable date range (2010 onwards for better data quality)
df = df[df['DateIssued'] >= '2010-01-01']
print(f"Filtered to 2010 onwards. Current shape: {df.shape}")

# Remove duplicate tickets.
# pandas treats missing values as equal to each other when looking for
# duplicates, so a plain drop_duplicates(subset=['TicketNumber']) would keep
# only ONE of all the tickets that have no ticket number. Restrict the
# de-duplication to rows that actually carry a ticket number.
is_repeat = df['TicketNumber'].notna() & df.duplicated(subset=['TicketNumber'])
# .copy() gives later cells an independent frame to add columns to.
df = df.loc[~is_repeat].copy()
print(f"After removing duplicates: {df.shape}")

print(f"\nCleaned dataset shape: {df.shape}")
print(f"Date range: {df['DateIssued'].min()} to {df['DateIssued'].max()}")


In [None]:
# =============================================================================
# STEP 2: CREATE TEMPORAL AND STREET-LEVEL FEATURES
# =============================================================================
print("\n[STEP 2] Creating temporal and street-level features...")

# Calendar components come from the issue date; the hour comes from the issue
# time, which is rendered as text and re-parsed so that .dt.hour can be used.
issued_on = df['DateIssued'].dt
weekday = issued_on.dayofweek  # 0=Monday, 6=Sunday
time_text = df['TimeIssued'].astype(str)

# Add all temporal columns in one pass (same column order as listed here)
df = df.assign(
    Year=issued_on.year,
    Month=issued_on.month,
    Day=issued_on.day,
    DayOfWeek=weekday,
    DayName=issued_on.day_name(),
    Hour=pd.to_datetime(time_text, format='%H:%M:%S', errors='coerce').dt.hour,
    IsWeekend=weekday.isin([5, 6]).astype(int),  # 1 for Saturday/Sunday, else 0
)

# Create season feature
def get_season(month):
    """Return the season label for a month number.

    Args:
        month: Month number; 12, 1, 2 map to 'Winter', 3-5 to 'Spring',
            and 6-8 to 'Summer'.

    Returns:
        One of 'Winter', 'Spring', 'Summer' or 'Fall'. Every value not listed
        above (months 9-11, but also anything else) yields 'Fall'.
    """
    season_months = (
        ('Winter', (12, 1, 2)),
        ('Spring', (3, 4, 5)),
        ('Summer', (6, 7, 8)),
    )
    for season, months in season_months:
        if month in months:
            return season
    # Fallback for everything that matched no group above
    return 'Fall'

# Label every ticket with its season, derived from the month number
df['Season'] = df['Month'].map(get_season)

# Calendar date (time of day dropped), used as the aggregation key
issue_dates = df['DateIssued'].dt.date
df['Date'] = issue_dates

print("Created temporal features: Year, Month, Day, DayOfWeek, Hour, IsWeekend, Season")


In [None]:
# =============================================================================
# STEP 3: AGGREGATE DATA BY STREET AND TIME
# =============================================================================
print("\n[STEP 3] Aggregating ticket counts by street and date...")

# One row per (date, street) pair that has at least one ticket, with the
# number of tickets issued there on that date.
ticket_counts = df.groupby(['Date', 'StreetName']).size()

# Turn the counts into a frame, convert the date key to datetime64, and
# re-derive the temporal features at the aggregated level.
daily_street = (
    ticket_counts
    .reset_index(name='TicketCount')
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .assign(
        Year=lambda frame: frame['Date'].dt.year,
        Month=lambda frame: frame['Date'].dt.month,
        DayOfWeek=lambda frame: frame['Date'].dt.dayofweek,
        IsWeekend=lambda frame: frame['DayOfWeek'].isin([5, 6]).astype(int),
        Season=lambda frame: frame['Month'].apply(get_season),
    )
)

print(f"Aggregated dataset shape: {daily_street.shape}")
print(f"Date range: {daily_street['Date'].min()} to {daily_street['Date'].max()}")

# Calculate street-level features: total tickets per street and the average
# number of tickets per calendar day over the observed period.
street_stats = df.groupby('StreetName').size().reset_index(name='TotalTickets')

# Length of the observation window in calendar days, counting BOTH the first
# and the last day. The bare difference (max - min).days leaves one end out,
# which overstates the average and is zero (division by zero -> inf) when all
# tickets fall on a single date.
observed_days = (df['Date'].max() - df['Date'].min()).days + 1
street_stats['AvgTicketsPerDay'] = street_stats['TotalTickets'] / observed_days

# NOTE(review): this average is computed from the full dataset. If the model
# is later evaluated on a time-based split, the feature carries information
# from the evaluation period - consider computing it on training data only.

# Merge street stats (left join keeps every daily row)
daily_street = daily_street.merge(street_stats[['StreetName', 'AvgTicketsPerDay']],
                                   on='StreetName', how='left')

# Keep only the 50 streets with the most tickets overall, to reduce noise and
# improve model performance.
busiest = street_stats.nlargest(50, 'TotalTickets')
top_streets = busiest['StreetName'].tolist()
in_top = daily_street['StreetName'].isin(top_streets)
daily_street = daily_street[in_top]
print(f"Filtered to top 50 streets. Shape: {daily_street.shape}")


In [None]:
# =============================================================================
# STEP 4: PREPARE FEATURES FOR MODELING
# =============================================================================
print("\n[STEP 4] Preparing features for modeling...")

# One encoder per categorical column; both stay available for later use
le_street, le_season = LabelEncoder(), LabelEncoder()

# Attach the integer codes, then order the rows chronologically (needed for a
# time series split) and renumber the index.
daily_street = (
    daily_street
    .assign(
        StreetName_Encoded=le_street.fit_transform(daily_street['StreetName']),
        Season_Encoded=le_season.fit_transform(daily_street['Season']),
    )
    .sort_values('Date')
    .reset_index(drop=True)
)

# Model inputs (X) and the daily ticket count to predict (y)
feature_cols = [
    'Month',
    'DayOfWeek',
    'IsWeekend',
    'Season_Encoded',
    'StreetName_Encoded',
    'AvgTicketsPerDay',
]
X = daily_street[feature_cols]
y = daily_street['TicketCount']

print(f"Feature matrix shape: {X.shape}")
print(f"Target variable shape: {y.shape}")
print(f"Features used: {feature_cols}")