<a href="https://colab.research.google.com/github/marcochow1026/LA_Crime_Time_Series/blob/main/LA_Crime_Time_Series_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"></a>

In [None]:
import os, sys, time
# Install the forecasting/plotting dependencies with pip (shell escape),
# constraining numpy to <2.0 as announced in the message below.
print("Installing pmdarima with numpy<2.0 ...")
!pip install --quiet --force-reinstall "numpy<2.0" pmdarima prophet plotly -U

# After a short pause, send signal 9 to this kernel process so the runtime
# restarts -- presumably so the freshly installed packages are the ones that
# get imported afterwards.
# NOTE(review): execution stops at this cell; the cells below have to be run
# again once the runtime is back.
print("Restarting runtime...")
time.sleep(3)
os.kill(os.getpid(), 9)


In [None]:
import pandas as pd
import numpy as np
import networkx as nx
import time
import matplotlib.pyplot as plt
import seaborn as sns
from contextlib import contextmanager
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import StackingClassifier

# Raw crime-record exports: one file covering 2010-2019, one covering 2020 onward.
PAST_CSV_PATH = '/content/Crime_Data_from_2010_to_2019_20250929.csv'
CURRENT_CSV_PATH = '/content/Crime_Data_from_2020_to_Present_20250929.csv'

past = pd.read_csv(PAST_CSV_PATH)
current = pd.read_csv(CURRENT_CSV_PATH)


In [None]:
# Stack the two exports into a single frame with a fresh 0..n-1 index
crime_frames = [past, current]
combined_df = pd.concat(crime_frames, ignore_index=True)

# Parse the two date columns. No explicit format is passed, so pandas infers it;
# values that cannot be parsed become NaT instead of raising (errors='coerce').
date_columns = ('Date Rptd', 'DATE OCC')
for date_col in date_columns:
    parsed_dates = pd.to_datetime(combined_df[date_col], errors='coerce')
    combined_df[date_col] = parsed_dates


In [None]:
# Remove unused columns, selected by position.
# NOTE: because the selection is positional and combined_df is reassigned,
# re-running this cell on the already-trimmed frame would drop different
# columns (or fail); re-run the concat cell first.
unused_positions = [0, 7, 10, 18, 19, 24, 25, 28]
combined_df = combined_df.drop(columns=combined_df.columns[unused_positions])

# Part I Violent Crimes filter
part1_codes = [110, 113, 121, 122, 815, 820, 821, 210, 220, 230, 231, 235, 236, 250, 251, 761, 926]

is_part1 = combined_df['Crm Cd'].isin(part1_codes)
violent_crimes = combined_df.loc[is_part1].copy()

# Make sure the occurrence date is datetime, then derive the monthly period
occurred_on = pd.to_datetime(violent_crimes['DATE OCC'])
violent_crimes['DATE OCC'] = occurred_on
violent_crimes['year_month'] = violent_crimes['DATE OCC'].dt.to_period('M')

# Keep only records that occurred before March 2024
# (rows whose date is NaT fail the comparison and are dropped as well)
before_cutoff = violent_crimes['DATE OCC'] < '2024-03-01'
violent_crimes = violent_crimes.loc[before_cutoff].copy()


In [None]:
# Monthly totals: number of violent-crime records per 'year_month', in chronological order
crime_counts_over_time = violent_crimes['year_month'].value_counts().sort_index()

# Line chart of the monthly totals
fig, ax = plt.subplots(figsize=(14, 7))
crime_counts_over_time.plot(ax=ax)
ax.set_title('Total number of Violent Crimes Over Time (Monthly)')
ax.set_xlabel('Date')
ax.set_ylabel('Number of Crimes')
plt.show()


In [None]:
# Count violent-crime records per month (Period index), oldest month first
crime_counts_over_time = violent_crimes['year_month'].value_counts().sort_index()

# Convert Period → first day of the month (datetime)
crime_counts_df = crime_counts_over_time.to_frame(name='count')
crime_counts_df.index = crime_counts_df.index.to_timestamp()

# value_counts() produces no row for a month without any record. The seasonal
# analysis below works positionally with period=12, so a missing month would
# silently shift every later observation into the wrong season. Put the series
# on a complete month-start grid and count absent months as zero.
if not crime_counts_df.empty:
    crime_counts_df = crime_counts_df.asfreq('MS', fill_value=0)
crime_counts_df.index.name = 'year_month'

# Calendar helper columns used by the seasonal plots
crime_counts_df['month']      = crime_counts_df.index.month
crime_counts_df['month_name'] = crime_counts_df.index.strftime('%b')
crime_counts_df['year']       = crime_counts_df.index.year

# Last expression -> rendered as a rich table by the notebook
crime_counts_df.head()


In [None]:
# Classical Decomposition: additive model with a 12-observation seasonal period
from statsmodels.tsa.seasonal import seasonal_decompose

monthly_counts = crime_counts_df['count']
decomp = seasonal_decompose(monthly_counts, model='additive', period=12)

# decomp.plot() builds its own figure; resize and title that figure directly
fig = decomp.plot()
fig.set_size_inches(14, 8)
fig.suptitle('Classical Decomposition', fontsize=16)
fig.tight_layout()
plt.show()


In [None]:
# Seasonal plot of monthly violent crime counts by year:
# one line per year, months on the x axis.

month_labels = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

fig, ax = plt.subplots(figsize=(14, 7))
sns.lineplot(
    data=crime_counts_df,
    x='month',
    y='count',
    hue='year',
    palette='viridis',
    # 'year' is numeric, so the default legend only shows a sample of evenly
    # spaced values; 'full' gives every plotted year its own legend entry.
    legend='full',
    ax=ax,
)
ax.set_title('Seasonal Plot of Monthly Violent Crime Counts by Year', fontsize=16)
ax.set_xlabel('Month', fontsize=12)
ax.set_ylabel('Number of Crimes', fontsize=12)
ax.set_xticks(range(1, 13))
ax.set_xticklabels(month_labels)
ax.legend(title='Year', bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(alpha=0.3)
fig.tight_layout()
plt.show()


### Seasonal Sub-series Plot: Monthly Violent Crimes

In [None]:
import calendar

# Seasonal Sub-series Plot: Monthly Violent Crimes
# One small panel per calendar month, each showing that month's count across years.

# 4 x 3 grid with shared axes so the panels are directly comparable
fig, axes = plt.subplots(4, 3, figsize=(15, 12), sharex=True, sharey=True)
axes = axes.flatten()  # flatten the 2D array of axes for easy iteration

# Panel k (1-based) shows calendar month k
for month_num, ax in enumerate(axes, start=1):
    month_data = crime_counts_df[crime_counts_df['month'] == month_num]

    # No `palette` argument: there is no `hue` variable, so seaborn would
    # ignore the palette and emit a warning for every panel.
    sns.lineplot(ax=ax, x='year', y='count', data=month_data, marker='o')
    ax.set_title(calendar.month_abbr[month_num])
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.grid(alpha=0.3)

# Add a common y-label
fig.text(0.04, 0.5, 'Number of Crimes', va='center', rotation='vertical', fontsize=12)
# Add a common x-label
fig.text(0.5, 0.04, 'Year', ha='center', fontsize=12)

plt.suptitle('Seasonal Sub-series Plot: Monthly Violent Crimes', fontsize=16)
plt.tight_layout(rect=[0.05, 0.05, 1, 0.96])  # leave room for the suptitle and common labels
plt.show()


In [None]:
# =====================================================
# KRUSKAL-WALLIS TEST
# =====================================================
# Tests whether the detrended counts differ between calendar months.

from scipy.stats import kruskal

# Remove the trend estimated by the additive decomposition
decomp = seasonal_decompose(crime_counts_df['count'], model='additive', period=12)
detrended = crime_counts_df['count'] - decomp.trend

# Clean and prepare (rows where the trend is NaN are dropped)
detrended_clean = detrended.dropna().to_frame(name='detrended')
detrended_clean['month'] = detrended_clean.index.month
detrended_clean['month_name'] = detrended_clean.index.strftime('%b')

# Prepare data for Kruskal-Wallis: one array of detrended values per month
monthly_data = [group['detrended'].values for month, group in detrended_clean.groupby('month')]

# Run the test
stat, p_value = kruskal(*monthly_data)

print("="*80)
print(f"KRUSKAL-WALLIS TEST: H = {stat:.2f},  p = {p_value:.2e}")
print()
# Report the result at graded significance levels. Previously every
# p >= 0.001 was reported as "no seasonality", which mislabels results that
# are still significant at the conventional 5% level.
if p_value < 0.001:
    print("→ EXTREMELY STRONG EVIDENCE OF MONTHLY SEASONALITY (p < 0.001)")
    print("→ Violent crime levels are NOT the same across months — even after removing trend")
elif p_value < 0.05:
    print("→ SIGNIFICANT EVIDENCE OF MONTHLY SEASONALITY (p < 0.05)")
    print("→ Violent crime levels are NOT the same across months — even after removing trend")
else:
    print("→ No statistically significant evidence of monthly seasonality (p >= 0.05)")
print("="*80)


In [None]:
# Boxplot of the detrended counts, one box per calendar month
month_order = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']

plt.figure(figsize=(13, 7))
sns.boxplot(
    x='month_name',
    y='detrended',
    data=detrended_clean,
    order=month_order,
    # Passing `palette` without `hue` is deprecated in seaborn 0.13; colouring
    # by the x variable itself (legend suppressed) keeps one colour per month.
    # NOTE(review): `legend=` on boxplot needs seaborn >= 0.13 — confirm the runtime version.
    hue='month_name',
    hue_order=month_order,
    legend=False,
    palette='coolwarm',
    linewidth=1.5
)
plt.axhline(0, color='black', linestyle='--', alpha=0.8)  # zero line = on trend
plt.title('Detrended Monthly Violent Crimes in Los Angeles', fontsize=16, pad=20)
plt.ylabel('Crimes Above/Below Long-Term Trend', fontsize=12)
plt.xlabel('Month', fontsize=12)
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

# Print summary stats: mean detrended value per month, in calendar order
print("\nAverage deviation from trend by month:")
print(detrended_clean.groupby('month_name')['detrended'].mean().round(1)
      .reindex(month_order))


In [None]:
# =====================================================
# FINAL PLOTS: PROVE STRONG SEASONALITY EXISTS
# =====================================================

# Recompute the additive decomposition and subtract its trend component
decomp = seasonal_decompose(crime_counts_df['count'], model='additive', period=12)

detrended = crime_counts_df['count'] - decomp.trend
detrended_clean = detrended.dropna()

# Frame version of the detrended series with calendar helper columns
detrended_df = detrended_clean.to_frame(name='detrended')
detrended_df['month'] = detrended_df.index.month
detrended_df['year'] = detrended_df.index.year
detrended_df['month_name'] = detrended_df.index.strftime('%b')

plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("coolwarm")

# Two stacked panels. The original cell only drew the lower panel of a
# 2 x 1 grid, which left the upper half of the figure blank; the upper panel
# now shows the observed series with the estimated trend for context.
fig, (ax_observed, ax_detrended) = plt.subplots(2, 1, figsize=(14, 8), sharex=True)

ax_observed.plot(crime_counts_df.index, crime_counts_df['count'],
                 label='Observed Monthly Count', color='steelblue', linewidth=1.8)
ax_observed.plot(decomp.trend.index, decomp.trend,
                 label='Estimated Trend', color='black', linewidth=1.8)
ax_observed.set_title('Observed Series and Trend', fontsize=14)
ax_observed.set_ylabel('Count')
ax_observed.legend()
ax_observed.grid(alpha=0.3)

ax_detrended.plot(detrended.index, detrended, label='Detrended Series (Trend Removed)', color='darkorange', linewidth=1.8)
ax_detrended.axhline(0, color='black', linestyle='--', alpha=0.7)
ax_detrended.set_title('Detrended Series', fontsize=14)
ax_detrended.set_xlabel('Year')
ax_detrended.set_ylabel('Detrended Count')
ax_detrended.legend()
ax_detrended.grid(alpha=0.3)

fig.tight_layout()
plt.show()


In [None]:
plt.figure(figsize=(14, 8))

# Plot from detrended_df rather than detrended_clean: the latter is assigned a
# DataFrame in the Kruskal-Wallis cell and a Series in the detrending cell, so
# its meaning depends on which cell ran last. detrended_df['detrended'] always
# holds the NaN-free detrended values.
detrended_values = detrended_df['detrended']
plt.plot(detrended_values.index, detrended_values,
         color='darkorange', linewidth=2.2, label='Detrended Series')
plt.axhline(0, color='black', linestyle='--', alpha=0.8, linewidth=1.2)

# Highlight summer peaks and winter troughs
summer_months = [6, 7, 8]
winter_months = [12, 1, 2]
summer = detrended_df[detrended_df['month'].isin(summer_months)]
winter = detrended_df[detrended_df['month'].isin(winter_months)]

plt.scatter(summer.index, summer['detrended'], color='red', s=60, zorder=5, edgecolors='darkred', label='Summer Peak (Jun–Aug)')
plt.scatter(winter.index, winter['detrended'], color='blue', s=60, zorder=5, edgecolors='navy', label='Winter Trough (Dec–Feb)')

plt.title('Detrended Monthly Violent Crime Series in Los Angeles (2010–2024)',
          fontsize=16, fontweight='bold', pad=20)
plt.ylabel('Crimes Above/Below Trend', fontsize=13)
plt.legend(fontsize=12)
plt.grid(alpha=0.4)
