# Models

Our data set only contains vehicles parked at parking meters and received a ticket, therefore, we are not able to use a ML classification model. Instead, we will attempt 3 different approaches:
* Density-based
* Synthetic negative sampling
* 2-step positive, unlabeled learning

## Import Datasets

In [47]:
from itertools import product

import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from shapely import wkt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report



In [48]:
def df_to_gdf(df: pd.DataFrame, geometry_key: str = "Geometry") -> gpd.GeoDataFrame:
    df[geometry_key] = df[geometry_key].apply(wkt.loads)
    return gpd.GeoDataFrame(df, geometry=geometry_key)

In [49]:
tickets_df = df_to_gdf(pd.read_csv("../data/cleaned_data/parking_tickets.csv"))
tickets_df["lat"] = tickets_df["Geometry"].apply(lambda point: point.coords[0][0])
tickets_df["lon"] = tickets_df["Geometry"].apply(lambda point: point.coords[0][1])

# only downtown for now
tickets_df = tickets_df[tickets_df["Neighbourhood"] == "Downtown"]

tickets_df.head()

Unnamed: 0,Block,Street,EntryDate,Status,Year,BI_ID,month,dayofweek,Geometry,Neighbourhood,lat,lon
7,800,RICHARDS ST,2023-05-01,VA,2023,4487120,5,0,POINT (49.284 -123.112),Downtown,49.283893,-123.112199
12,1200,W CORDOVA ST,2023-05-01,IS,2023,4487144,5,0,POINT (49.288 -123.117),Downtown,49.287755,-123.116633
49,800,RICHARDS ST,2023-03-08,VA,2023,4487432,3,2,POINT (49.284 -123.112),Downtown,49.283893,-123.112199
52,700,W PENDER ST,2023-03-08,IS,2023,4487448,3,2,POINT (49.285 -123.115),Downtown,49.284659,-123.11545
55,400,HOWE ST,2023-03-08,IS,2023,4487459,3,2,POINT (49.285 -123.117),Downtown,49.284786,-123.116643


In [50]:
meters_df = df_to_gdf(pd.read_csv("../data/cleaned_data/parking_meters.csv"))
meters_df["lat"] = meters_df["Geometry"].apply(lambda point: point.coords[0][0])
meters_df["lon"] = meters_df["Geometry"].apply(lambda point: point.coords[0][1])


# only downtown for now
meters_df = meters_df[meters_df["Geo Local Area"] == "Downtown"]

meters_df.head()

Unnamed: 0,METERHEAD,TIMEINEFFE,Geo Local Area,METERID,Geometry,lat,lon
9,Twin Bay Single,METER IN EFFECT: 9:00 AM TO 10:00 PM,Downtown,540409,POINT (49.281 -123.108),49.281454,-123.107519
17,Twin Bay Single,METER IN EFFECT: 9:00 AM TO 10:00 PM,Downtown,80031,POINT (49.281 -123.107),49.28106,-123.106932
28,Twin,METER IN EFFECT: 9:00 AM TO 10:00 PM,Downtown,170603,POINT (49.276 -123.127),49.275638,-123.126964
35,Twin,METER IN EFFECT: 9:00 AM TO 10:00 PM,Downtown,170716,POINT (49.277 -123.129),49.276562,-123.128648
36,Twin,METER IN EFFECT: 9:00 AM TO 10:00 PM,Downtown,170714,POINT (49.277 -123.129),49.276562,-123.128648


In [51]:

# Spatial join to find nearest meter to each ticket
tickets_with_meters = gpd.sjoin_nearest(
    tickets_df,
    meters_df,
    how='left',
    max_distance=50,  # meters
    distance_col='distance_to_meter'
)

# only downtown for now

tickets_with_meters = tickets_with_meters[tickets_with_meters["Neighbourhood"] == "Downtown"]

tickets_with_meters.head(5)

Unnamed: 0,Block,Street,EntryDate,Status,Year,BI_ID,month,dayofweek,Geometry,Neighbourhood,lat_left,lon_left,index_right,METERHEAD,TIMEINEFFE,Geo Local Area,METERID,lat_right,lon_right,distance_to_meter
7,800,RICHARDS ST,2023-05-01,VA,2023,4487120,5,0,POINT (49.284 -123.112),Downtown,49.283893,-123.112199,3332,Pay Station,METER IN EFFECT: 9:00 AM TO 10:00 PM,Downtown,590404,49.28377,-123.112159,0.000129
12,1200,W CORDOVA ST,2023-05-01,IS,2023,4487144,5,0,POINT (49.288 -123.117),Downtown,49.287755,-123.116633,3987,Pay Station,METER IN EFFECT: 9:00 AM TO 10:00 PM,Downtown,41019,49.287967,-123.117021,0.000442
49,800,RICHARDS ST,2023-03-08,VA,2023,4487432,3,2,POINT (49.284 -123.112),Downtown,49.283893,-123.112199,3332,Pay Station,METER IN EFFECT: 9:00 AM TO 10:00 PM,Downtown,590404,49.28377,-123.112159,0.000129
52,700,W PENDER ST,2023-03-08,IS,2023,4487448,3,2,POINT (49.285 -123.115),Downtown,49.284659,-123.11545,370,Twin Bay Single,METER IN EFFECT: 9:00 AM TO 10:00 PM,Downtown,620502,49.285108,-123.116001,0.000711
52,700,W PENDER ST,2023-03-08,IS,2023,4487448,3,2,POINT (49.285 -123.115),Downtown,49.284659,-123.11545,4221,Twin Bay Single,METER IN EFFECT: 9:00 AM TO 10:00 PM,Downtown,620504,49.285108,-123.116001,0.000711


## Generate Unlabeled Data

* All possible meter/time combinations

In [52]:
# define simulation window
unlabeled_df = meters_df.copy()
n = len(unlabeled_df)

# each day of 2023
dates = pd.date_range(start="2023-01-01", end="2023-12-31", freq="D")
dates_df = pd.DataFrame({"EntryDate": dates})

dates_df["Year"] = dates_df["EntryDate"].dt.year
dates_df["month"] = dates_df["EntryDate"].dt.month
dates_df["dayofweek"] = dates_df["EntryDate"].dt.dayofweek

meters_df['_key'] = 1
dates_df['_key'] = 1
unlabeled_df = pd.merge(meters_df, dates_df, on="_key").drop(columns=["_key", "EntryDate"], axis=1)

unlabeled_df.head(14)

Unnamed: 0,METERHEAD,TIMEINEFFE,Geo Local Area,METERID,Geometry,lat,lon,Year,month,dayofweek
0,Twin Bay Single,METER IN EFFECT: 9:00 AM TO 10:00 PM,Downtown,540409,POINT (49.281 -123.108),49.281454,-123.107519,2023,1,6
1,Twin Bay Single,METER IN EFFECT: 9:00 AM TO 10:00 PM,Downtown,540409,POINT (49.281 -123.108),49.281454,-123.107519,2023,1,0
2,Twin Bay Single,METER IN EFFECT: 9:00 AM TO 10:00 PM,Downtown,540409,POINT (49.281 -123.108),49.281454,-123.107519,2023,1,1
3,Twin Bay Single,METER IN EFFECT: 9:00 AM TO 10:00 PM,Downtown,540409,POINT (49.281 -123.108),49.281454,-123.107519,2023,1,2
4,Twin Bay Single,METER IN EFFECT: 9:00 AM TO 10:00 PM,Downtown,540409,POINT (49.281 -123.108),49.281454,-123.107519,2023,1,3
5,Twin Bay Single,METER IN EFFECT: 9:00 AM TO 10:00 PM,Downtown,540409,POINT (49.281 -123.108),49.281454,-123.107519,2023,1,4
6,Twin Bay Single,METER IN EFFECT: 9:00 AM TO 10:00 PM,Downtown,540409,POINT (49.281 -123.108),49.281454,-123.107519,2023,1,5
7,Twin Bay Single,METER IN EFFECT: 9:00 AM TO 10:00 PM,Downtown,540409,POINT (49.281 -123.108),49.281454,-123.107519,2023,1,6
8,Twin Bay Single,METER IN EFFECT: 9:00 AM TO 10:00 PM,Downtown,540409,POINT (49.281 -123.108),49.281454,-123.107519,2023,1,0
9,Twin Bay Single,METER IN EFFECT: 9:00 AM TO 10:00 PM,Downtown,540409,POINT (49.281 -123.108),49.281454,-123.107519,2023,1,1


## Positive Unlabeled Learning
* Ref: https://dtai.cs.kuleuven.be/tutorials/pulearning/modules/pu%20learning%20tutorial/two-step/


### Training

In [53]:
# Add label: 1 = ticket issued, 0 = unknown
tickets_df["label"] = 1
unlabeled_df["label"] = 0

tickets_df = tickets_df.drop(columns=["EntryDate", "Geometry", "Street", "Block", "Status", "BI_ID", "Neighbourhood"])
unlabeled_df = unlabeled_df.drop(columns=["METERHEAD", "TIMEINEFFE", "Geo Local Area", "METERID", "Geometry"])

# Combine datasets
combined_df = pd.concat([tickets_df, unlabeled_df])
X = combined_df.drop(columns="label")
y = combined_df["label"]

In [54]:
# Step 1: Split positives into training + spy set
P_train, P_spy = train_test_split(tickets_df, test_size=0.2, random_state=42)

In [55]:
# Combine spies with the unlabeled data
spy_unlabeled = pd.concat([P_spy, unlabeled_df])
spy_unlabeled["label"] = 0  # mark all as unlabeled

In [None]:
# Train a classifier to distinguish P_train (1) vs spy_unlabeled (0)
X_train = pd.concat([
    P_train.drop(columns=["label", "Geometry"], errors="ignore"), 
    spy_unlabeled.drop(columns=["label", "Geometry"], errors="ignore")
])
y_train = pd.concat([P_train["label"], spy_unlabeled["label"]])

clf = RandomForestClassifier()
clf.fit(X_train, y_train)

In [None]:
# Predict probabilities on the original unlabeled set
X_train_columns = X_train.columns

unlabeled_probs = clf.predict_proba(unlabeled_df.drop(columns="label"))[:, 1]  # prob of being positive

KeyError: "['label'] not found in axis"

In [None]:
# Select reliable negatives
threshold = 0.1  # examples with prob < 10% are considered "safe negatives"
reliable_negatives = unlabeled_df[unlabeled_probs < threshold]
reliable_negatives["label"] = 0

In [None]:
# Final training set: true positives + reliable negatives
final_train = pd.concat([tickets_df, reliable_negatives])
X_final = final_train.drop(columns="label")
y_final = final_train["label"]

In [None]:
# Final classifier
model = RandomForestClassifier()
model.fit(X_final, y_final)

## Validation