In [25]:
from gensim.utils import simple_preprocess
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from lbl2vec import Lbl2Vec
from gensim.downloader import load
import numpy as np

# Plan Overview:
0. Keyword Definition
1. Data Preparation
2. Model Training

### 0. Keyword Definition

#### Categories:

    1.  Bike Lane Obstruction: bike, cycle, path,  
    2.  Bus Lane Obstruction: bus, stop, 
    3.  Non-resident Parking: resident, state
    4.  Blocked Fire Hydrant: fire, hydrant
    5.  Blocked Sidewalk: sidewalk, side, walk
    6.  Blocked Driveway: driveway, drive, way, private
    7.  Blocked Crosswalk: crosswalk, cross, walk
    8.  Blocked Handicap spot: handicap, placard,
    9.  Double Parking: double, triple
    10. No stopping zone: stopping, zone
    11. visitor spot: visitor, hour

In [2]:
category_keywords_dict = {
    'bike lane' :           ['bike', 'cycle', 'cycling', 'path', 'sharrow'],
    'bus lane' :            ['bus'],
    'resident parking' :    ['resident', 'state', 'plate', 'plates'],
    'fire hydrant' :        ['fire', 'hydrant'],
    'sidewalk' :            ['sidewalk', 'side'],
    'driveway' :            ['driveway', 'drive', 'way', 'private'],
    'crosswalk' :           ['crosswalk', 'cross'],
    'handicap' :            ['handicap', 'placard'],
    'double parking' :      ['double', 'triple'],
    'no stopping' :         ['stopping', 'loading'],
    'visitor parking' :     ['visitor', 'hour']
} 

category_keywords = pd.DataFrame(data=category_keywords_dict.items(), columns=['category', 'keywords'])
category_keywords

Unnamed: 0,category,keywords
0,bike lane,"[bike, cycle, cycling, path, sharrow]"
1,bus lane,[bus]
2,resident parking,"[resident, state, plate, plates]"
3,fire hydrant,"[fire, hydrant]"
4,sidewalk,"[sidewalk, side]"
5,driveway,"[driveway, drive, way, private]"
6,crosswalk,"[crosswalk, cross]"
7,handicap,"[handicap, placard]"
8,double parking,"[double, triple]"
9,no stopping,"[stopping, loading]"


### 1. Data Preparation (Add Lemmatization)

In [4]:
# Load pickled service descriptions
with open('pre-processing/API-responses/service_req_desc_03-02-24_01_09_24.pkl', 'rb') as file:
    service_id_to_descriptions:dict = pickle.load(file)

# Remove special characters and downcase, then tokenize
def tokenize(doc):
    return simple_preprocess(doc, deacc=True, min_len=2, max_len=15)

service_drescriptions_df = pd.DataFrame(data=service_id_to_descriptions.items(), columns=['ID', 'Description'])
service_drescriptions_df.head()

Unnamed: 0,ID,Description
0,101005335950,Illegal parking in the commercial loading Zone...
1,101005335949,Red Honda w New York plates illegally parked o...
2,101005335934,Car completely blocking turn in fire lane
3,101005335925,Double parking alone mass ave. No enforcement....
4,101005335924,truck on sidewalk


#### Test, Training, Validate Split

In [4]:
# Train=.70, Validate=.15, Test=.15
# Split all data between train, (test AND validate)
desc_train, desc_testval = train_test_split(service_drescriptions_df, test_size=.3)

# Split (test AND validate) between test, validate
# .3 * .5 = 0.15
desc_val, desc_test = train_test_split(desc_testval, test_size=0.5)

In [5]:
desc_train['data_type'] = 'train'
desc_test['data_type'] = 'test'
desc_val['data_type'] = 'validate'

all_descriptions = pd.concat([desc_train, desc_test, desc_val]).reset_index(drop=True)
all_descriptions.head()

Unnamed: 0,ID,Description,data_type
0,101005296957,Blue Toyota exp. Inspection,train
1,101005191577,Parking in a school bus zone,train
2,101005228840,Blocking driveway,train
3,101005329064,How is the car parked illegally: [Handicapped ...,train
4,101005266315,A large silver truck with oregon plates always...,train


#### Service ID to Tag function

In [6]:
service_id_list = []
def service_id_to_tag(service_id:int) -> int:
    if service_id in service_id_list:
        return service_id_list.index(service_id)
    service_id_list.append(service_id)
    return len(service_id_list)

#### Tagging Descriptions

In [7]:
all_descriptions['tagged_desc'] = all_descriptions.apply(lambda row: TaggedDocument(tokenize(row['Description']), [service_id_to_tag(int(row['ID']))]), axis=1)
all_descriptions.head()

Unnamed: 0,ID,Description,data_type,tagged_desc
0,101005296957,Blue Toyota exp. Inspection,train,"([blue, toyota, exp, inspection], [1])"
1,101005191577,Parking in a school bus zone,train,"([parking, in, school, bus, zone], [2])"
2,101005228840,Blocking driveway,train,"([blocking, driveway], [3])"
3,101005329064,How is the car parked illegally: [Handicapped ...,train,"([how, is, the, car, parked, illegally, handic..."
4,101005266315,A large silver truck with oregon plates always...,train,"([large, silver, truck, with, oregon, plates, ..."


### 2. Model Training

In [8]:
model = Lbl2Vec(
    keywords_list=list(category_keywords['keywords']),
    tagged_documents=all_descriptions['tagged_desc'][all_descriptions['data_type'] == 'train'],
    vector_size=300, 
    min_count=1, 
    similarity_threshold=0.3
    )

model.fit()

2024-03-24 14:44:19,192 - Lbl2Vec - INFO - Train document and word embeddings
2024-03-24 14:44:34,716 - Lbl2Vec - INFO - Train label embeddings


#### Predict Description Labels

In [16]:
model.predict_model_docs()

2024-03-24 15:04:14,697 - Lbl2Vec - INFO - Get document embeddings from model
2024-03-24 15:04:14,714 - Lbl2Vec - INFO - Calculate document<->label similarities


Unnamed: 0,doc_key,most_similar_label,highest_similarity_score,label_0,label_1,label_2,label_3,label_4,label_5,label_6,label_7,label_8,label_9,label_10
0,0,label_10,0.030059,0.030044,0.029812,0.030044,0.030041,0.030045,0.030045,0.030045,0.030044,0.030046,0.030043,0.030059
1,1,label_8,0.891777,0.891746,0.891280,0.891738,0.891760,0.891750,0.891740,0.891714,0.891701,0.891777,0.891738,0.891652
2,2,label_10,0.925019,0.924969,0.924613,0.924977,0.924966,0.924966,0.924971,0.924978,0.924982,0.924956,0.924966,0.925019
3,3,label_8,0.425022,0.424991,0.424655,0.424978,0.425001,0.424993,0.424983,0.424952,0.424942,0.425022,0.424979,0.424882
4,4,label_1,0.952883,0.952849,0.952883,0.952850,0.952850,0.952850,0.952849,0.952853,0.952853,0.952845,0.952848,0.952859
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6838,6838,label_8,0.976301,0.976290,0.976172,0.976287,0.976296,0.976291,0.976286,0.976277,0.976273,0.976301,0.976286,0.976248
6839,6839,label_1,0.983041,0.982897,0.983041,0.982900,0.982894,0.982897,0.982900,0.982908,0.982911,0.982889,0.982900,0.982928
6840,6840,label_10,0.925339,0.925228,0.924340,0.925246,0.925211,0.925222,0.925233,0.925255,0.925260,0.925201,0.925231,0.925339
6841,6841,label_8,0.882172,0.882154,0.881970,0.882149,0.882164,0.882157,0.882151,0.882137,0.882129,0.882172,0.882146,0.882101


####  Manually Catagorize Data

In [8]:
def getDesAndIDOfKeyWord(dataframe, column, keyword, n_entries):
    """
    Filters the dataframe to only include rows where the specified column contains the keyword.

    Parameters:
    - dataframe: The pandas DataFrame to filter.
    - column: The name of the column in the DataFrame to search for the keyword.
    - keyword: The keyword to search for within the specified column.
    - n_entries: Number of entries to display.

    Prints the ID and Description of entries where the specified column contains the keyword
    """
    # Filter the DataFrame
    filtered_df = dataframe[dataframe[column].astype(str).str.contains(keyword, case=False, na=False)]
    
    # Get the first n_entries
    n_entDF = filtered_df.head(n_entries)

    # Print the desired output
    for index, row in n_entDF.iterrows():
        print(f"ID: {row['ID']}, {column}: {row[column]}")
    return n_entDF

In [19]:
busIDS = getDesAndIDOfKeyWord(service_drescriptions_df, "Description", " bus ", 15)
# IDs of 10 Bus Lane Incidents:
gt_bus_ids = [101005335507, 101005334964, 101005334964, 101005334186, 101005333847, 101005333973, 101005333169, 101005332709, 101005332197, 101005331547]

ID: 101005335507, Description: -- auto translated (en) -- Parked in bus lane…almost daily occurrence -- original (en) -- Parked in bus lane…almost daily occurrence
ID: 101005335153, Description: Black ford with Maine license plates parked in active Bus stop.
ID: 101005335041, Description: This red truck and a few other cars frequently park in this spot and leaves it for more than an hour. Fidelis way and Washington are busy streets and it makes for cars coming out of Fidelis incredibly hard to turn when you have these monstrosities blocking and using them as long term parking. These spaces are for 10-15 minutes only and this is a bps school bus stop!
ID: 101005334964, Description: Cars blocking the pm bus lane, Washington St in Roslindale please ticket
ID: 101005334810, Description: Constituent requests Copley Square be monitored regularly for ongoing violations of double parking and parking in bus lanes. States this creates safety issues for the public. | How is the car parked illegal

In [18]:
getDesAndIDOfKeyWord(service_drescriptions_df, "Description", " bike ", 15)
# IDs of 10 Bike Lane Incidents:
gt_bike_ids = [101005335795, 101005335096, 101005334866, 101005334683, 101005334584, 101005332739, 101005333947, 101005333057, 101005332081, 101005331429]

ID: 101005335795, Description: -- auto translated (en) -- Cars parked in “separated” bike lane -- original (en) -- Cars parked in “separated” bike lane
ID: 101005335096, Description: This red Camry is consistently parked in the bike lane across from the West Roxbury Branch of the Public Library, in front of the pizza joint and the dry cleaners.
ID: 101005334866, Description: Truck parked in bike lane
ID: 101005334723, Description: Truck parked in bike lane
ID: 101005334683, Description: SUV parked on bike lane
ID: 101005334584, Description: Illegally parked cars in no parking zone and bike lane
ID: 101005334083, Description: Car parked in bike lane
ID: 101005332739, Description: MBTA truck parked in bike lane, blocking fire hydrant, mailbox, and crosswalk. The quadradecta! Tow this pos
ID: 101005333947, Description: White SUV parked 2 feet from the curb in the bike lane.
ID: 101005333057, Description: Illegal bike lane obstruction. Cars need to be ticketed and towed. Need flex posts to

In [17]:
getDesAndIDOfKeyWord(service_drescriptions_df, "Description", " resident ", 15)
# IDs of non-resident parking Incidents:
gt_resident_ids = [101005335792, 101005335731, 101005335399, 101005335228, 101005335225, 101005335054, 101005334908, 101005334840, 101005333487, 101005334182]

ID: 101005335792, Description: No resident sticker parked in resident only
ID: 101005335731, Description: Non-resident vehicle parked for several days in resident parking. Segel Street
ID: 101005335399, Description: Red car parked in front of 1940 Commonwealth Ave without resident parking sticker.
ID: 101005335384, Description: Illegal parking in resident only spots
ID: 101005335228, Description: -- auto translated (en) -- White Toyota Corolla has been parked here in a resident only part of Emmons Street in East Boston. The car has a Mission Hill resident sticker. Please come to ticket or even better, tow this car as it hasn’t moved for two weeks now. Thanks.  -- original (en) -- White Toyota Corolla has been parked here in a resident only part of Emmons Street in East Boston. The car has a Mission Hill resident sticker. Please come to ticket or even better, tow this car as it hasn’t moved for two weeks now. Thanks.
ID: 101005335225, Description: Black Toyota SUV parked illegally in re

In [16]:
getDesAndIDOfKeyWord(service_drescriptions_df, "Description", " hydrant ", 15)
# IDs of fire hydrant parking Incidents:
gt_hydrant_ids = [101005335836, 101005335320, 101005335227, 101005334484, 101005334236, 101005331891, 101005333965, 101005333882, 101005333064, 101005331333]

ID: 101005335836, Description: Vehicle near hydrant  MKE: HNDA  MDEL: CIVIC 3XYK69
ID: 101005335320, Description: Vechile parked too close to fire Hydrant, less than 5 to 10 from the Hydrant dk color sedan.
ID: 101005335227, Description: Truck parked in front of fire hydrant on Athol Street.
ID: 101005334484, Description: Two motor vehicles blocking fire hydrant behind 11 Channel Ctr. St. On Medallion Ave. ongoing problem!
ID: 101005334236, Description: Same car has been parked blocking the fire hydrant overnight! License plate reads 3XFB78. I have noticed this car does this frequently.  This is not safe for neighbors who live here. Please remedy this!
ID: 101005331891, Description: White Honda less than 1000 ft from hydrant as per 700CMR ORDINANCE
ID: 101005333965, Description: -- auto translated (en) -- In front of 94 Litchfield St/Brighton. White 4 door car blocking fire hydrant & trash cans (trash pickup b/w 7-8am today/Friday’s). -- original (en) -- In front of 94 Litchfield St/Br

In [15]:
getDesAndIDOfKeyWord(service_drescriptions_df, "Description", " sidewalk ", 15)
# IDs of sidewalk blocking Incidents:
gt_sidewalk_ids = [101005335787, 101005335744, 101005335226, 101005335233, 101005335226, 101005335135, 101005335120, 101005334919, 101005334755, 101005334258]

ID: 101005335787, Description: Tow zone & parked on sidewalk | How is the car parked illegally: [Other]  Details: [black nissian and grey sedan]  Make: [Hyundai]  Color: [Blue]
ID: 101005335744, Description: White Cadillac suv on sidewalk 30 baker st WR
ID: 101005335283, Description: Illegal parking blocking sidewalk at a handicapped space.
ID: 101005335233, Description: -- auto translated (en) -- Date 3/2/24 12:30am  Front of 14 porter st east Boston. I work for public safety.  My client is 14 porter st east Boston. North Suffolk community service.  The client told me to try to keep the sidewalk handicap ramp open. Due to having medical treatment, appt, as well for the First responder.  There is a 24/7 crisis team here. Sometime is difficult to bring patients and due to the blocking of the handicap ramp.  As well if there can be a sign and pole. Place there as well. This has been on going issues. Plus at night when people park.  Because they go to the night clubs.. I know because I se

In [20]:
getDesAndIDOfKeyWord(service_drescriptions_df, 'Description', ' driveway ', 15)['ID'].array

# IDs of hydrant blocking incidents
gt_hydrant_ids = [101005335798, 101005335658, 101005335501, 101005335488,
 101005335375, 101005335201, 101005335039, 101005334906,
 101005334816, 101005334636, 101005334480, 101005332880,
 101005332587, 101005332712, 101005332575]

ID: 101005335798, Description: W53069 overhanging driveway at 838 Dot Ave
ID: 101005335658, Description: PARTIALLY BLOCKING DRIVEWAY | How is the car parked illegally: [Other]  Make: [Jeep]  Model: [SUV]  Color: [RedOrange]  Vehicle License Plate Registration: [7MEN50]
ID: 101005335501, Description: Can the city please enforce this on weekends? Car in marked tow zone, residents cant exit driveway on one way street.
ID: 101005335488, Description: PARTIALLTY BLOCKING DRIVEWAY | How is the car parked illegally: [Other]  Make: [Ford]  Model: [SUV]  Color: [Black]  Vehicle License Plate Registration: [H12315]  Vehicle License Plate State: [MA]
ID: 101005335375, Description: Car parked in front of driveway 50 Woodward street
ID: 101005335201, Description: Illegal parking. White Subaru blocking wheelchair access to my driveway and garage
ID: 101005335039, Description: Illegally parked car blocking driveway for 5 hours
ID: 101005334906, Description: Illegal driveway being used yet again
ID: 10

In [27]:
getDesAndIDOfKeyWord(service_drescriptions_df, 'Description', ' crosswalk ', 15)['ID'].array.astype(np.int64)

gt_crosswalk_ids = [101005333519, 101005329346, 101005328951, 101005326287,
       101005324592, 101005325131, 101005303753, 101005299140,
       101005299206, 101005299868, 101005297355, 101005297872,
       101005296848, 101005296131, 101005284078]

ID: 101005333519, Description: Car blocking crosswalk for over 24 hours.
ID: 101005329346, Description: Car parked on crosswalk blocking handicap ramp.
ID: 101005328951, Description: Cars parked in crosswalk everyday. Is this legal? Thanks
ID: 101005326287, Description: White Nissan suv parked blocking crosswalk completely
ID: 101005324592, Description: Parking in crosswalk and bike lane.
ID: 101005325131, Description: This idiot has been parked in the crosswalk for over two hours. Please send BTD to check our neighborhood. This is a busy crosswalk going to the park.
ID: 101005303753, Description: Please ticket. This same McLane truck blocks the entire crosswalk every week. We need our crosswalk clear and safe.
ID: 101005299140, Description: White Hyundai with CT plate parked in the crosswalk at O & 4th
ID: 101005299206, Description: Same Car covering crosswalk and hydrant
ID: 101005299868, Description: -- auto translated (en) -- Please note this Classic Crosswalk Catastrophe™. Vehicle

In [33]:
gt_handicap_ids = getDesAndIDOfKeyWord(service_drescriptions_df, 'Description', ' handicap ', 15)['ID'].array.astype(np.int64)

ID: 101005335352, Description: The constituent states that this vehicle is parked in the handicap spot. The constituetn who had a plaque couldnt park there and called us to report it. | How is the car parked illegally: [Handicapped parking space]  Details: [Black vehicle]  Make: [Unknown]  Model: [4DR]  Color: [Black]  Vehicle License Plate Registration: [3HAT53]  Vehicle License Plate State: [MA]
ID: 101005335255, Description: -- auto translated (en) -- Here we go again. My sister is handicapped and all these vehicles keep parking here illegally! No handicap plate or placard.  My sister is handicap and can’t walk far so if she goes out that’s where the van or cars pull up. She has braces on both her ankles.  I shouldn’t have to even have talk about her disability. -- original (en) -- Here we go again. My sister is handicapped and all these vehicles keep parking here illegally! No handicap plate or placard.  My sister is handicap and can’t walk far so if she goes out that’s where the v

In [36]:
gt_double_parking_ids = getDesAndIDOfKeyWord(service_drescriptions_df, 'Description', ' double ', 15)['ID'].array.astype(np.int64)

ID: 101005335675, Description: Parking and double parking in no stopping zone, across the street from a parking garage
ID: 101005334898, Description: Another double parked car on Boylston Street
ID: 101005334810, Description: Constituent requests Copley Square be monitored regularly for ongoing violations of double parking and parking in bus lanes. States this creates safety issues for the public. | How is the car parked illegally: [Other]
ID: 101005334627, Description: White car double parked for an hour
ID: 101005334300, Description: Illegally double parked
ID: 101005333973, Description: -- auto translated (en) -- Illegally Double parked blocking the bus lane Dunkin’ Donuts customers -- original (en) -- Illegally Double parked blocking the bus lane Dunkin’ Donuts customers
ID: 101005333847, Description: Bedlam in Copley square.  Busses stopped in the bus lanes.  Double parking, motor scooters traveling on the sidewalks.   Someone is going to get hurt in all this craziness.  Mayor Wu,

In [37]:
gt_stopping_ids = getDesAndIDOfKeyWord(service_drescriptions_df, 'Description', ' stopping ', 15)['ID'].array.astype(np.int64)

ID: 101005335675, Description: Parking and double parking in no stopping zone, across the street from a parking garage
ID: 101005335373, Description: No Stopping | How is the car parked illegally: [Other]  Type of vehicle: [SUV]  Make: [Ford]  Model: [SUV]
ID: 101005335207, Description: Car parked in no stopping zone and very far from curb. Hard for cars to pass
ID: 101005334804, Description: large amount of cars in no stopping tow zone in front of the hodspital. At least 10. | How is the car parked illegally: [Fire lane]  Make: [Unknown]  Model: [4DR]  Color: [Unknown]
ID: 101005333987, Description: Green mini cooper parked in no stopping zone
ID: 101005333943, Description: Mercedes parked in No stopping zone
ID: 101005333811, Description: Red bmw parked in No stopping zone going up brooks
ID: 101005332081, Description: Can BTD please enforce all the cars blocking the No Stopping areas around Longwood.  Brookline Ave is filled with illegally parked cars blocking bike lane and travel l

In [38]:
gt_visitor_ids = getDesAndIDOfKeyWord(service_drescriptions_df, 'Description', ' visitor ', 15).ID.array.astype(np.int64)

ID: 101005335815, Description: Mar-2-2024, cars using this spot for garage parking. Area is intersection between Mansur St & Grew Ave Roslindale. Iron-gate/shed area please refer to picture.  There is garbage in that same area as well. Dark blue sedan plate # 3KAG26 uses this place for extended periods of time Grey Camry 3AAB49 frequent visitor to Unit 137B who  plays EXTREMELY LOUD music     Please put signage as requested before.
ID: 101005334932, Description: Vehicle with RI plates parked in 2 hour visitor spot for 24 hours now. No ticket.
ID: 101005332804, Description: Cars without resident stickers parking all day in 2-hour visitor zone, leaving visitors no place to park. Noted especially Blue Infiniti SUV George plate SBE6559 that has become a regular. Need to be overtimed.
ID: 101005329589, Description: Out of state car parked in 2 hour visitor spot for +24 hours
ID: 101005328000, Description: -- auto translated (en) -- There is silver Toyota SUV (plate #5RW 714) that doesn’t ha