In [6]:
import requests
import json
import time
import pickle
from gensim.utils import simple_preprocess
import pandas as pd
from sklearn.model_selection import train_test_split
from gensim.models.doc2vec import TaggedDocument
from lbl2vec import Lbl2Vec
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import os
from requests.auth import HTTPBasicAuth
from urllib.parse import quote_plus
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt 

# API key for the Boston 311 service, read from the environment so it is never
# hardcoded in the notebook. os.environ.get returns None when the variable is unset.
KEY_311 = os.environ.get('311_API_KEY')
# HTTP Basic credentials attached to every API request (username 'key', password = the API key).
AUTH_311 = HTTPBasicAuth('key', KEY_311)
# Open311 "services" endpoint.
# NOTE(review): not referenced by the cells visible here — confirm it is still needed.
API_URL_311 = 'https://boston2-production.spotmobile.net/open311/v2/services.json'
# URL-encoded service code used as the `service_code` query parameter when requesting parking-enforcement cases.
ILLEGAL_PARKING_SERVICE_CODE = quote_plus("Transportation - Traffic Division:Enforcement & Abandoned Vehicles:Parking Enforcement")

# Outline:
0. Data Collection
1. Data Preparation
2. Data Preprocessing
3. Model Training
4. Model Evaluation
5. Model Prediction

# 0. Data Collection

## Methods to gather service request JSON objects from the Boston Open311 API Endpoint

These methods will request all available service requests (limited to last 90 days) and write the resulting JSON array of service requests to the given filename.

In [None]:
def execute_single_query(page_num:int, timeout:float=30):
    '''
    Sends an HTTP GET request to the Boston 311 API for Illegal Parking.
    Requests 100 (maximum allowed) service requests for a given page.

    Parameters:
        page_num: 1-based index of the result page to fetch.
        timeout: seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely on a stalled connection.

    Returns: HTTP Response Object

    Raises: requests.exceptions.RequestException (e.g. Timeout, ConnectionError)
        when the request cannot be completed.
    '''
    query_url = f'https://boston2-production.spotmobile.net/open311/v2/requests.json?service_code={ILLEGAL_PARKING_SERVICE_CODE}&per_page=100&page={page_num}'

    return requests.get(url=query_url, auth=AUTH_311, timeout=timeout)

def retreive_all_service_requests(filename:str):
    '''
    Continually sends HTTP Requests to the Boston 311 API for Illegal Parking Service Requests.
    Concatenates responses into a single JSON Array containing Service Request JSON Objects.
    Updates are sent to the console throughout the process.

    Paging stops when the API returns an empty page, a status code other than
    200/429, or when a request fails at the network level. Whatever has been
    collected up to that point is still written out.

    Parameters:
        filename: path of the file the JSON Array is written to.

    Returns: None. Writes the JSON Array to the given filename once querying has finished.
        Nothing is written if the very first query does not return status 200.
    '''

    # Executes first query
    page = 1
    current_response = execute_single_query(page)

    # Ensure HTTP Error is captured
    if current_response.status_code != 200:
        print(f'First Query failed. Error code: {current_response.status_code}')
        return
    page += 1

    # Creates JSON Array Object on response
    all_service_requests_json = current_response.json()

    # Continually execute queries for the next page until there are no more
    while True:
        try:
            current_response = execute_single_query(page)
        except requests.RequestException as error:
            # A network failure should not throw away the pages already received.
            print(f'Request failed: {error}')
            break
        status_code = current_response.status_code

        # Successful response has service request JSON Objects concatenated to end of overall JSON Array
        if status_code == 200:
            page_results = current_response.json()
            # An empty page means the listing is exhausted; without this check the
            # loop would keep requesting ever-higher page numbers forever.
            if not page_results:
                print(f'Empty page received: {page}')
                break
            print(f'Query Received: {page}')
            all_service_requests_json.extend(page_results)
            page += 1

        # Error code 429 indicates rate limiting (10 GET Requests per min)
        elif status_code == 429:
            print(f'Rate Limited: Waiting 1 minute. \n Current Page {page}')
            # Wait 60 seconds, then retry the same page
            time.sleep(60)
            print('Resuming Query...')

        # Any other error code, break loop.
        else:
            print(f'Error recieved: {status_code}')
            break

    # `page` is the next page that would have been requested, so the number of
    # pages actually received is one less.
    print(f'Finished querying. Total pages queried: {page - 1}\n Printing...')

    # Write received service requests to specified filename; `with` guarantees the
    # handle is closed even if serialization fails.
    with open(filename, 'w') as all_service_requests_file:
        json.dump(all_service_requests_json, all_service_requests_file)

In [None]:
# File that receives the raw JSON array of service requests; read again by the data-preparation step.
SERVICE_REQ_JSON_FILE_NAME = 'service_requests_last_90_days.json'
# Page through the API and write all results to disk. The function sleeps 60 seconds
# on every rate-limit (429) response, so this cell can be slow to finish.
retreive_all_service_requests(SERVICE_REQ_JSON_FILE_NAME)

# 1. Data Preparation

The following method prepares the JSON array of service request JSON objects, extracting the free-form text description and service request ID from each object. These description and ID pairs are stored in a dictionary which is then pickled for later use.

In [None]:
def extract_request_id_with_description(filename:str) -> dict:
    '''
    For a given file containing a JSON Array of Service Request JSON Objects, extract the service number and the description
    (if it exists) into a dictionary for further preprocessing.

    Service requests whose 'description' is missing or empty are skipped.

    Parameters:
        filename: path to a file holding a JSON Array of Service Request JSON Objects.

    Returns: Dict (Service Request ID -> Service Request Description)

    Raises: KeyError if a service request with a description has no 'service_request_id'.
    '''
    # `with` closes the file handle deterministically instead of leaking it.
    with open(filename, 'r') as service_requests_file:
        service_requests_json = json.load(service_requests_file)

    request_id_descr_dict = {}
    for service_request in service_requests_json:
        # Ensure the description exists (and is non-empty) for this service request
        description = service_request.get('description')
        if not description:
            continue

        # Add to dictionary using service request ID as key and service request description as value.
        request_id_descr_dict[service_request['service_request_id']] = description

    return request_id_descr_dict

In [None]:
# Path of the pickle file holding the service request ID -> description dictionary.
PICKLED_SERVICE_REQ_DESC = 'service_req_last_90_days.pkl'

In [None]:
# Create dictionary and store it as a python pickle file for later use.
# The extraction runs before the output file is opened, so a failure while reading
# the JSON does not leave a truncated pickle behind. The result is bound to a
# descriptive name rather than `dict`, which would shadow the builtin for every
# later cell.
request_id_to_description = extract_request_id_with_description(SERVICE_REQ_JSON_FILE_NAME)

with open(PICKLED_SERVICE_REQ_DESC, 'wb') as service_requests_with_description_file:
    pickle.dump(request_id_to_description, service_requests_with_description_file)

# 2. Data Preprocessing

## Tokenization

The `simple_preprocess` method from gensim lowercases and tokenizes the document, strips accents and special characters, and discards any token shorter than 2 or longer than 15 characters.

In [None]:
# Turn a raw description into a list of normalised tokens
def tokenize(doc):
    '''
    Tokenizes a single document with gensim's simple_preprocess.

    Accents are stripped (deacc=True) and only tokens between 2 and 15
    characters long are kept.

    Returns: list of token strings
    '''
    tokens = simple_preprocess(doc, deacc=True, min_len=2, max_len=15)
    return tokens

## Training, Validation, Testing Data Split

After retrieving the pickled service request ID to description dictionary, create a DataFrame to store the data.

In [None]:
# Load pickled service descriptions
# NOTE: pickle.load can execute arbitrary code; only load pickle files produced by this notebook.
with open(PICKLED_SERVICE_REQ_DESC, 'rb') as file:
    service_id_to_descriptions:dict = pickle.load(file)

# One row per service request: 'ID' holds the dictionary key, 'Description' the free-form text.
service_drescriptions_df = pd.DataFrame(data=service_id_to_descriptions.items(), columns=['ID', 'Description'])
# Preview the first rows as the cell output.
service_drescriptions_df.head()

Split the data between training, validation, and testing using sklearn's train_test_split() method. The data is split 70/15/15 between training, validation, and testing, respectively.

In [None]:
# Fixed seed so the same rows land in the same partition on every run
# (Restart & Run All reproduces the same model inputs).
SPLIT_RANDOM_STATE = 42

# Train=.70, Validate=.15, Test=.15
# Split all data between train, (test AND validate)
desc_train, desc_testval = train_test_split(service_drescriptions_df, test_size=.3, random_state=SPLIT_RANDOM_STATE)

# Split (test AND validate) between test, validate
# .3 * .5 = 0.15
desc_val, desc_test = train_test_split(desc_testval, test_size=0.5, random_state=SPLIT_RANDOM_STATE)

After splitting the data, mark its data type and re-concatenate into a DataFrame.

In [None]:
# Label each partition. `.assign` returns a new DataFrame instead of writing a column
# into the frames returned by train_test_split, which avoids pandas'
# SettingWithCopyWarning and keeps the cell safe to re-run.
desc_train = desc_train.assign(data_type='train')
desc_test = desc_test.assign(data_type='test')
desc_val = desc_val.assign(data_type='validate')

# Stack the partitions back into a single frame with a fresh 0..n-1 index.
all_descriptions = pd.concat([desc_train, desc_test, desc_val]).reset_index(drop=True)
all_descriptions.head()

## Document Tagging

The Lbl2Vec model takes in tagged documents, where each document has an integer tag which is used to keep track of the document, its vector, and labels. In order to use these labels after the model is trained, a function is created which maps the service request ID to a document tag integer. The service request ID cannot be used as a document tag because when the Lbl2Vec model sees a tag like 101005244674, it assumes there are documents with tags from [0, 101005244674], resulting in too much memory being allocated. Currently, the function simply uses the index of a list of service request IDs as the document tags, which is not a very sustainable hash function at scale, but it does ensure the document tags are as small as possible, avoiding the memory allocation error.

In [None]:
# Position in this list is the document tag, so service_id_list[tag] recovers the Service Request ID.
service_id_list = []
# Reverse lookup (Service Request ID -> tag) so repeated lookups do not scan the list.
_service_id_to_index = {}
def service_id_to_tag(service_id:int) -> int:
    '''
    Simple hash function for converting Service Request IDs to document tags.
    Uses the index in the list of Service Request IDs as the document tag, which
    keeps the document tag integers as small as possible (0, 1, 2, ...).

    The same Service Request ID always maps to the same tag: the tag returned when
    an ID is first registered equals its index in service_id_list, which is also
    what later lookups return.

    Parameters:
        service_id: Service Request ID to convert.

    Returns: the document tag, i.e. the index of service_id in service_id_list.
    '''
    tag = _service_id_to_index.get(service_id)
    if tag is None:
        # New ID: its tag is the position it is about to occupy in the list.
        tag = len(service_id_list)
        service_id_list.append(service_id)
        _service_id_to_index[service_id] = tag
    return tag

For each description in the DataFrame, apply tokenization and tag the document.

In [None]:
# Row by row: tokenize the description and wrap it in a TaggedDocument whose single
# tag is the small integer produced by service_id_to_tag for that row's ID.
all_descriptions['tagged_desc'] = all_descriptions.apply(lambda row: TaggedDocument(tokenize(row['Description']), [service_id_to_tag(int(row['ID']))]), axis=1)
# Preview the first rows, now including the new 'tagged_desc' column.
all_descriptions.head()

# 3. Model Training

## Keyword Definition

Labels (initial keyword ideas per category; the exact keyword lists used for training are defined in the next cell and differ slightly):

    1.  Bike Lane Obstruction: bike, cycle, path
    2.  Bus Lane Obstruction: bus, stop
    3.  Non-resident Parking: resident, state
    4.  Blocked Fire Hydrant: fire, hydrant
    5.  Blocked Sidewalk: sidewalk, side
    6.  Blocked Driveway: driveway, drive, way, private
    7.  Blocked Crosswalk: crosswalk, cross
    8.  Blocked Handicap Spot: handicap, placard
    9.  Double Parking: double, triple
    10. No Stopping Zone: stopping, zone
    11. Visitor Spot: visitor, hour

In [None]:
# Seed keywords per category; each entry's list becomes one element of the
# keywords_list handed to Lbl2Vec.
category_keywords_dict = {
    'bike lane': ['bike', 'cycle', 'cycling', 'path', 'sharrow'],
    'bus lane': ['bus'],
    'resident parking': ['resident', 'state', 'plate', 'plates'],
    'fire hydrant': ['fire', 'hydrant'],
    'sidewalk': ['sidewalk', 'side'],
    'driveway': ['driveway', 'drive', 'way', 'private'],
    'crosswalk': ['crosswalk', 'cross'],
    'handicap': ['handicap', 'placard'],
    'double parking': ['double', 'triple'],
    'no stopping': ['stopping', 'loading'],
    'visitor parking': ['visitor', 'hour'],
}

# Number of categories (one label per dictionary entry).
NUM_LABELS = len(category_keywords_dict)

# Tabular view: one row per category with its keyword list.
category_keyword_rows = list(category_keywords_dict.items())
category_keywords = pd.DataFrame(category_keyword_rows, columns=['category', 'keywords'])
category_keywords

## Model Definition

Through hyperparameter tuning, the optimal hyperparameters for the Lbl2Vec model are as follows:
   
    min_count=1
    window=10
    similarity_threshold=0.3

In [None]:
# Build the Lbl2Vec model from the per-category keyword lists and the training
# documents only (validation/test rows are held out).
# All three tuned hyperparameters are passed explicitly; omitting `window` would
# silently train with the library default instead of the tuned value of 10.
model = Lbl2Vec(
    keywords_list=list(category_keywords['keywords']),
    tagged_documents=all_descriptions['tagged_desc'][all_descriptions['data_type'] == 'train'],
    min_count=1,
    window=10,
    similarity_threshold=0.3
    )

## Fitting the model

In [None]:
# Train the model on the tagged training documents supplied at construction.
model.fit()

## Labeling training documents

In [None]:
# Label the documents the model was trained on; the returned value is displayed as the cell output.
model.predict_model_docs()

# 4. Model Evaluation

Run label prediction on the testing dataset to generate labels.

In [None]:
# Held-out rows are labelled 'test' when the split is tagged (not 'testing');
# filtering on the wrong value selects no documents at all.
testing_documents = all_descriptions['tagged_desc'][all_descriptions.data_type == 'test']

# predict_new_docs has no default document set: the unseen documents to label
# must be passed in explicitly.
testing_label_predictions = model.predict_new_docs(tagged_docs=testing_documents)

## PCA

PCA reduces the per-label score columns of the prediction DataFrame (one `label_i` column per category) to 2 dimensions, allowing for plotting and visual inspection of clustering along with analytical evaluation like Silhouette scoring.

In [None]:
# Select the per-label columns of the test-set predictions as the feature matrix.
features = [f'label_{i}' for i in range(NUM_LABELS)]  # column names label_0 .. label_{NUM_LABELS-1}
X = testing_label_predictions[features]

# Apply PCA to reduce dimensions
pca = PCA(n_components=2)  # two components so the result can be drawn as a 2-D scatter plot
X_pca = pca.fit_transform(X)

## K-means Clustering

In [None]:
# Clustering with K-means
# One cluster per label category; random_state is fixed so the clustering is repeatable.
kmeans = KMeans(n_clusters=NUM_LABELS, random_state=42)
# Fit on the 2-D PCA projection (not on the original feature columns).
kmeans.fit(X_pca)

## Silhouette Scoring

In [None]:
# Calculate silhouette score of the K-means assignment on the PCA projection
# (ranges from -1 to 1; higher means better separated clusters).
silhouette_avg = silhouette_score(X_pca, kmeans.labels_)
# Display the score as the cell output; otherwise it is computed but never shown.
silhouette_avg


## Plotting the vector clusterings

In [None]:
# Scatter plot of the first two PCA components, coloured by K-means cluster
fig, ax = plt.subplots(figsize=(10, 8))
cluster_points = ax.scatter(X_pca[:, 0], X_pca[:, 1], c=kmeans.labels_, cmap='viridis', marker='o', edgecolor='k', s=50, alpha=0.6)

# Plot centroids
centers = kmeans.cluster_centers_
ax.scatter(centers[:, 0], centers[:, 1], c='red', s=200, alpha=0.9, marker='X')

ax.set_title('Clusters of Documents')
ax.set_xlabel('PCA 1')
ax.set_ylabel('PCA 2')
# Bind the colorbar to the document scatter explicitly. Without a mappable,
# pyplot uses the most recently drawn artist (the red centroids), whose colour
# scale is unrelated to the cluster labels.
fig.colorbar(cluster_points, ax=ax, label='Cluster Label')

plt.show()

# 5. Model Prediction