In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Load the raw police reports.
df = pd.read_csv('PDReports_2003_2024.csv', low_memory=False)  # Assuming the data is in a CSV file


def _keep_shared_incidents(reports):
    """Keep only the rows whose 'Incident Number' appears on more than one row."""
    shares_number = reports.duplicated('Incident Number', keep=False)
    return reports[shares_number]


def _drop_matching_rows(reports, key_columns):
    """Drop every row that has a twin agreeing on all of ``key_columns``."""
    has_twin = reports.duplicated(subset=key_columns, keep=False)
    return reports[~has_twin]


# Start from incidents that were reported more than once.
df = _keep_shared_incidents(df)

# Two clean-up passes over reports that share an incident number:
#   1. discard those that also share the same 'Time' (the original author
#      attributes these to an officer filing multiple reports),
#   2. discard those that also share the same 'Longitude' and 'Latitude'.
# After each pass the "more than one row" requirement is re-applied, because
# dropping rows can leave an incident with a single remaining report.
for extra_keys in (['Time'], ['Longitude', 'Latitude']):
    df = _drop_matching_rows(df, ['Incident Number'] + extra_keys)
    df = _keep_shared_incidents(df)

# Number of remaining reports per crime category (computed before sorting).
counts = df['Category'].value_counts()


# Sort the reports chronologically by 'Date' and then 'Time'.
# `df` is a boolean-filtered slice at this point, so the columns are replaced
# through `assign` (which returns a new frame) rather than `df[col] = ...`,
# which emits SettingWithCopyWarning on pandas versions without copy-on-write.
df = df.assign(
    Date=pd.to_datetime(df['Date'], format='%Y-%m-%d').dt.date,
    Time=pd.to_datetime(df['Time'], format='%H:%M').dt.time,
).sort_values(by=['Date', 'Time'])

# Store both columns as plain strings again (str() of the date / time objects).
df = df.assign(Date=df['Date'].astype(str), Time=df['Time'].astype(str))

# Collapse the reports into one row per incident number. Every other output
# column holds a list with that incident's values, in the current row order of
# `df` (dates, times, weekdays, categories, descriptions, coordinates, districts).
unique_incidents = df['Incident Number'].unique()

# Output column name -> source column name, in output order.
_INCIDENT_LIST_COLUMNS = {
    'Dates': 'Date',
    'Times': 'Time',
    'DayOfWeeks': 'DayOfWeek',
    'Categories': 'Category',
    'Descriptions': 'Description',
    'Longitudes': 'Longitude',
    'Latitudes': 'Latitude',
    'Districts': 'District',
}

# A single groupby pass replaces re-filtering the whole frame once per incident
# number. sort=False keeps incidents in order of first appearance, and rows keep
# their order inside each group. Rows with a missing incident number are
# skipped by groupby; previously they produced a row of empty lists.
incident_records = []
for incident, incident_entries in df.groupby('Incident Number', sort=False):
    record = {'Incident Number': incident}
    for list_column, source_column in _INCIDENT_LIST_COLUMNS.items():
        record[list_column] = incident_entries[source_column].tolist()
    incident_records.append(record)

# Passing `columns` keeps the expected schema even when there are no records.
incident_data = pd.DataFrame(
    incident_records,
    columns=['Incident Number', *_INCIDENT_LIST_COLUMNS],
)

# How often each ordered sequence of categories occurs. The lists are turned
# into tuples first because value_counts() needs hashable values; the result is
# kept in a variable so it can be inspected.
category_sequence_counts = incident_data['Categories'].map(tuple).value_counts()

# Keep incidents whose first report is 'Vehicle Theft' and whose second report
# is 'Vehicle Related'. The length check guards against incidents with fewer
# than two reports instead of raising IndexError.
is_car_job = pd.Series(
    [
        len(categories) >= 2
        and categories[0] == 'Vehicle Theft'
        and categories[1] == 'Vehicle Related'
        for categories in incident_data['Categories']
    ],
    index=incident_data.index,
    dtype=bool,
)
carJobs = incident_data[is_car_job]


def _has_first_two_coordinates(longitudes, latitudes):
    """Return True when the first two reports each have a longitude and a latitude."""
    first_two = list(longitudes[:2]) + list(latitudes[:2])
    return len(first_two) == 4 and not any(pd.isna(value) for value in first_two)


# Drop incidents where either of the first two locations is incomplete.
# Latitudes are checked as well as longitudes, since both are needed to
# compute a distance between the two locations.
has_coordinates = pd.Series(
    [
        _has_first_two_coordinates(longitudes, latitudes)
        for longitudes, latitudes in zip(carJobs['Longitudes'], carJobs['Latitudes'])
    ],
    index=carJobs.index,
    dtype=bool,
)
carJobs = carJobs[has_coordinates]

In [None]:
import pandas as pd
import numpy as np
from bokeh.plotting import figure, show, output_file, save
from bokeh.models import ColumnDataSource, HoverTool, Select, CustomJS
from bokeh.io import output_notebook
from bokeh.layouts import column
from geopy.distance import geodesic
from bokeh.embed import components

# Enable Bokeh output in a notebook
output_notebook()

# Work on a copy of the `carJobs` frame so the columns added below do not
# modify it. Nothing is read from disk here: `carJobs` is not defined in this
# cell and must already exist in the kernel.
df = carJobs.copy()

# Convert string representation of lists into actual lists
def safe_eval(value):
    """Parse the string form of a Python literal; pass non-strings through.

    Parameters
    ----------
    value : object
        A string such as ``"[1.0, 2.0]"``, or any already-parsed value.

    Returns
    -------
    object
        The parsed literal for strings, ``[]`` when the string is not a valid
        literal, or ``value`` itself when it is not a string.
    """
    # Local import keeps this cell self-contained.
    import ast

    if not isinstance(value, str):
        return value
    try:
        # literal_eval only accepts literals (lists, numbers, strings, ...),
        # so text in the data can never execute code the way eval() could.
        return ast.literal_eval(value)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Malformed or non-literal text: fall back to an empty list.
        return []

# Run safe_eval over each list-valued column: string cells are parsed into
# Python objects, cells that are not strings are returned as they are.
for list_column in ('Latitudes', 'Longitudes', 'Districts'):
    df[list_column] = df[list_column].apply(safe_eval)

# Calculate distance between first and second locations
def calculate_distance(row):
    """Geodesic distance in kilometres between a row's first two locations.

    Parameters
    ----------
    row : mapping
        Must provide 'Latitudes' and 'Longitudes', each a sequence of
        coordinates; only the first two entries of each are used.

    Returns
    -------
    float
        Distance in kilometres, or ``np.nan`` when fewer than two locations
        are available or any of the four coordinates is missing, non-numeric
        or not finite.
    """
    latitudes, longitudes = row['Latitudes'], row['Longitudes']
    try:
        if len(latitudes) < 2 or len(longitudes) < 2:
            return np.nan
        coordinates = [
            float(latitudes[0]), float(longitudes[0]),
            float(latitudes[1]), float(longitudes[1]),
        ]
    except (TypeError, ValueError):
        # Not a sequence, or a coordinate that is None / not numeric.
        return np.nan

    # Missing (NaN) or infinite coordinates cannot describe a point, so they
    # are reported as "no distance" instead of being handed to geodesic().
    if not np.isfinite(coordinates).all():
        return np.nan

    start_coords = (coordinates[0], coordinates[1])
    end_coords = (coordinates[2], coordinates[3])
    return geodesic(start_coords, end_coords).kilometers

# Distance between the first and second location of every incident.
df['DistanceTravelled'] = df.apply(calculate_distance, axis=1)


def _district_at(districts, position):
    """Return the district name at ``position`` as a string, or "Unknown".

    "Unknown" is returned when ``districts`` is not a list, is too short, or
    holds a missing value (None / NaN) at that position. Always returning a
    string keeps the district columns sortable and usable as dropdown options.
    """
    if not isinstance(districts, list) or len(districts) <= position:
        return "Unknown"
    district = districts[position]
    if district is None or (isinstance(district, float) and np.isnan(district)):
        return "Unknown"
    return str(district)


# Extract stolen (first entry) and found (second entry) districts
df['StolenDistrict'] = df['Districts'].apply(lambda d: _district_at(d, 0))
df['FoundDistrict'] = df['Districts'].apply(lambda d: _district_at(d, 1))

# Data handed to the dropdown's JavaScript callback. Only the three columns the
# callback reads are serialised, so the nested list columns of `df` are not
# embedded in the notebook output and the saved HTML.
df_dict = df[['StolenDistrict', 'FoundDistrict', 'DistanceTravelled']].to_dict(orient='list')

# Unique list of stolen districts
district_list = sorted(df['StolenDistrict'].unique())
if not district_list:
    # Fail with a clear message rather than an IndexError on the next line.
    raise ValueError("No incidents available: cannot build the stolen-district plot.")

# Initial data for the first district
initial_district = district_list[0]
filtered_df = df[df['StolenDistrict'] == initial_district]
# Incidents per found district, most frequent first.
found_counts = filtered_df['FoundDistrict'].value_counts()
# Mean distance per found district.
avg_distances = filtered_df.groupby('FoundDistrict')['DistanceTravelled'].mean()

source = ColumnDataSource(data={
    'found_districts': found_counts.index.tolist(),
    'count': found_counts.values.tolist(),
    'avg_distance': [avg_distances.get(d, 0) for d in found_counts.index]
})

# Dropdown listing every stolen-from district, starting on the initial one.
district_select = Select(
    title="Select Stolen District:",
    value=initial_district,
    options=district_list,
)

# Categorical bar chart with one x position per found-in district.
initial_factors = list(found_counts.index)
p = figure(
    x_range=initial_factors,
    height=600,
    width=900,
    title="Number of Stolen Vehicles by Found District for Selected Stolen District",
    x_axis_label="Found District",
    y_axis_label="Number of Vehicles",
    tools="pan,wheel_zoom,box_zoom,reset",
)

# Bars are driven by `source`, so replacing its data redraws them.
bars = p.vbar(
    x='found_districts',
    top='count',
    source=source,
    width=0.5,
    color='navy',
)

# Hover shows the district, its count and the mean distance (two decimals).
tooltips = [
    ("Found District", "@found_districts"),
    ("Count", "@count"),
    ("Avg Distance Travelled (km)", "@avg_distance{0.2f}"),
]
hover = HoverTool(renderers=[bars], tooltips=tooltips)
p.add_tools(hover)

# JavaScript callback for the dropdown: recompute the bars for the newly
# selected stolen district, entirely in the browser.
#
# The plot's x range is passed in explicitly as `x_range`. The JavaScript can
# only see the names listed in `args`; referring to `p` there raised a
# ReferenceError, so the x-axis categories were never updated.
callback = CustomJS(
    args=dict(source=source, df_dict=df_dict, select=district_select, x_range=p.x_range),
    code="""
    var selected_district = select.value;

    var stolen = df_dict['StolenDistrict'];
    var found = df_dict['FoundDistrict'];
    var distances = df_dict['DistanceTravelled'];

    var counts = {};      // incidents per found district
    var total_dist = {};  // sum of known distances per found district
    var measured = {};    // incidents with a known distance per found district

    for (var i = 0; i < stolen.length; i++) {
        if (stolen[i] !== selected_district) {
            continue;
        }
        var fd = found[i] || "Unknown";
        counts[fd] = (counts[fd] || 0) + 1;

        // Missing distances are left out of the average (like the pandas mean
        // used for the initial view) instead of being counted as 0 km.
        var d = distances[i];
        if (typeof d === "number" && isFinite(d)) {
            total_dist[fd] = (total_dist[fd] || 0) + d;
            measured[fd] = (measured[fd] || 0) + 1;
        }
    }

    // Most frequent found district first, matching the initial ordering.
    var keys = Object.keys(counts).sort(function (a, b) {
        return counts[b] - counts[a];
    });

    var new_data = { found_districts: [], count: [], avg_distance: [] };
    for (var k = 0; k < keys.length; k++) {
        var key = keys[k];
        new_data.found_districts.push(key);
        new_data.count.push(counts[key]);
        new_data.avg_distance.push(measured[key] ? total_dist[key] / measured[key] : NaN);
    }

    // Update the categorical axis first so every new bar has a slot, then
    // replace the data (assigning source.data triggers the redraw).
    x_range.factors = new_data.found_districts;
    source.data = new_data;
""")

district_select.js_on_change("value", callback)

# Show interactive layout
# The dropdown is stacked above the chart in a single column layout.
layout = column(district_select, p)
show(layout)

# Also write the same layout to "bokeh_plot.html".
# NOTE(review): output_file() is called only after show(); presumably so the
# show() above is not affected by the file output setting - confirm before
# reordering these calls.
output_file("bokeh_plot.html")
save(layout)