In [25]:
from gensim.utils import simple_preprocess
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from lbl2vec import Lbl2Vec
from gensim.downloader import load
import numpy as np

# Plan Overview:
0. Keyword Definition
1. Data Preparation
2. Model Training

### 0. Keyword Definition

#### Categories:

    1.  Bike Lane Obstruction: bike, cycle, path,  
    2.  Bus Lane Obstruction: bus, stop, 
    3.  Non-resident Parking: resident, state
    4.  Blocked Fire Hydrant: fire, hydrant
    5.  Blocked Sidewalk: sidewalk, side, walk
    6.  Blocked Driveway: driveway, drive, way, private
    7.  Blocked Crosswalk: crosswalk, cross, walk
    8.  Blocked Handicap spot: handicap, placard,
    9.  Double Parking: double, triple
    10. No stopping zone: stopping, zone
    11. visitor spot: visitor, hour

In [2]:
category_keywords_dict = {
    'bike lane' :           ['bike', 'cycle', 'cycling', 'path', 'sharrow'],
    'bus lane' :            ['bus'],
    'resident parking' :    ['resident', 'state', 'plate', 'plates'],
    'fire hydrant' :        ['fire', 'hydrant'],
    'sidewalk' :            ['sidewalk', 'side'],
    'driveway' :            ['driveway', 'drive', 'way', 'private'],
    'crosswalk' :           ['crosswalk', 'cross'],
    'handicap' :            ['handicap', 'placard'],
    'double parking' :      ['double', 'triple'],
    'no stopping' :         ['stopping', 'loading'],
    'visitor parking' :     ['visitor', 'hour']
} 

category_keywords = pd.DataFrame(data=category_keywords_dict.items(), columns=['category', 'keywords'])
category_keywords

Unnamed: 0,category,keywords
0,bike lane,"[bike, cycle, cycling, path, sharrow]"
1,bus lane,[bus]
2,resident parking,"[resident, state, plate, plates]"
3,fire hydrant,"[fire, hydrant]"
4,sidewalk,"[sidewalk, side]"
5,driveway,"[driveway, drive, way, private]"
6,crosswalk,"[crosswalk, cross]"
7,handicap,"[handicap, placard]"
8,double parking,"[double, triple]"
9,no stopping,"[stopping, loading]"


### 1. Data Preparation (Add Lemmatization)

In [4]:
# Load pickled service descriptions
with open('pre-processing/API-responses/service_req_desc_03-02-24_01_09_24.pkl', 'rb') as file:
    service_id_to_descriptions:dict = pickle.load(file)

# Remove special characters and downcase, then tokenize
def tokenize(doc):
    return simple_preprocess(doc, deacc=True, min_len=2, max_len=15)

service_drescriptions_df = pd.DataFrame(data=service_id_to_descriptions.items(), columns=['ID', 'Description'])
service_drescriptions_df.head()

Unnamed: 0,ID,Description
0,101005335950,Illegal parking in the commercial loading Zone...
1,101005335949,Red Honda w New York plates illegally parked o...
2,101005335934,Car completely blocking turn in fire lane
3,101005335925,Double parking alone mass ave. No enforcement....
4,101005335924,truck on sidewalk


#### Test, Training, Validate Split

In [40]:
# Train=.70, Validate=.15, Test=.15
# Split all data between train, (test AND validate)
desc_train, desc_testval = train_test_split(service_drescriptions_df, test_size=.3)

# Split (test AND validate) between test, validate
# .3 * .5 = 0.15
desc_val, desc_test = train_test_split(desc_testval, test_size=0.5)

In [41]:
desc_train['data_type'] = 'train'
desc_test['data_type'] = 'test'
desc_val['data_type'] = 'validate'

all_descriptions = pd.concat([desc_train, desc_test, desc_val]).reset_index(drop=True)
all_descriptions.head()

Unnamed: 0,ID,Description,data_type
0,101005273058,Constituent reports a white tesla parked in fr...,train
1,101005239921,More none RPP vehicles with CT plates with no ...,train
2,101005282207,sidewalk blocked,train
3,101005300480,Vehicle parked in emergency vehicle access zon...,train
4,101005274751,Dumpster permit expired a week ago. Dumpster s...,train


#### Service ID to Tag function

In [42]:
service_id_list = []
def service_id_to_tag(service_id:int) -> int:
    if service_id in service_id_list:
        return service_id_list.index(service_id)
    service_id_list.append(service_id)
    return len(service_id_list)

#### Tagging Descriptions

In [43]:
all_descriptions['tagged_desc'] = all_descriptions.apply(lambda row: TaggedDocument(tokenize(row['Description']), [service_id_to_tag(int(row['ID']))]), axis=1)
all_descriptions.head()

Unnamed: 0,ID,Description,data_type,tagged_desc
0,101005273058,Constituent reports a white tesla parked in fr...,train,"([constituent, reports, white, tesla, parked, ..."
1,101005239921,More none RPP vehicles with CT plates with no ...,train,"([more, none, rpp, vehicles, with, ct, plates,..."
2,101005282207,sidewalk blocked,train,"([sidewalk, blocked], [3])"
3,101005300480,Vehicle parked in emergency vehicle access zon...,train,"([vehicle, parked, in, emergency, vehicle, acc..."
4,101005274751,Dumpster permit expired a week ago. Dumpster s...,train,"([dumpster, permit, expired, week, ago, dumpst..."


### 2. Model Training

In [8]:
model = Lbl2Vec(
    keywords_list=list(category_keywords['keywords']),
    tagged_documents=all_descriptions['tagged_desc'][all_descriptions['data_type'] == 'train'],
    vector_size=300, 
    min_count=1, 
    similarity_threshold=0.3
    )

model.fit()

2024-03-24 14:44:19,192 - Lbl2Vec - INFO - Train document and word embeddings
2024-03-24 14:44:34,716 - Lbl2Vec - INFO - Train label embeddings


#### Predict Description Labels

In [16]:
model.predict_model_docs()

2024-03-24 15:04:14,697 - Lbl2Vec - INFO - Get document embeddings from model
2024-03-24 15:04:14,714 - Lbl2Vec - INFO - Calculate document<->label similarities


Unnamed: 0,doc_key,most_similar_label,highest_similarity_score,label_0,label_1,label_2,label_3,label_4,label_5,label_6,label_7,label_8,label_9,label_10
0,0,label_10,0.030059,0.030044,0.029812,0.030044,0.030041,0.030045,0.030045,0.030045,0.030044,0.030046,0.030043,0.030059
1,1,label_8,0.891777,0.891746,0.891280,0.891738,0.891760,0.891750,0.891740,0.891714,0.891701,0.891777,0.891738,0.891652
2,2,label_10,0.925019,0.924969,0.924613,0.924977,0.924966,0.924966,0.924971,0.924978,0.924982,0.924956,0.924966,0.925019
3,3,label_8,0.425022,0.424991,0.424655,0.424978,0.425001,0.424993,0.424983,0.424952,0.424942,0.425022,0.424979,0.424882
4,4,label_1,0.952883,0.952849,0.952883,0.952850,0.952850,0.952850,0.952849,0.952853,0.952853,0.952845,0.952848,0.952859
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6838,6838,label_8,0.976301,0.976290,0.976172,0.976287,0.976296,0.976291,0.976286,0.976277,0.976273,0.976301,0.976286,0.976248
6839,6839,label_1,0.983041,0.982897,0.983041,0.982900,0.982894,0.982897,0.982900,0.982908,0.982911,0.982889,0.982900,0.982928
6840,6840,label_10,0.925339,0.925228,0.924340,0.925246,0.925211,0.925222,0.925233,0.925255,0.925260,0.925201,0.925231,0.925339
6841,6841,label_8,0.882172,0.882154,0.881970,0.882149,0.882164,0.882157,0.882151,0.882137,0.882129,0.882172,0.882146,0.882101


####  Manually Catagorize Data

In [8]:
def getDesAndIDOfKeyWord(dataframe, column, keyword, n_entries):
    """
    Filters the dataframe to only include rows where the specified column contains the keyword.

    Parameters:
    - dataframe: The pandas DataFrame to filter.
    - column: The name of the column in the DataFrame to search for the keyword.
    - keyword: The keyword to search for within the specified column.
    - n_entries: Number of entries to display.

    Prints the ID and Description of entries where the specified column contains the keyword
    """
    # Filter the DataFrame
    filtered_df = dataframe[dataframe[column].astype(str).str.contains(keyword, case=False, na=False)]
    
    # Get the first n_entries
    n_entDF = filtered_df.head(n_entries)

    # Print the desired output
    for index, row in n_entDF.iterrows():
        print(f"ID: {row['ID']}, {column}: {row[column]}")
    return n_entDF

In [57]:
gt_bus_ids = getDesAndIDOfKeyWord(all_descriptions[all_descriptions.data_type == 'test'], "Description", " bus ", 15).ID.array.astype(np.int64)
# IDs of 10 Bus Lane Incidents:

ID: 101005231551, Description: Tow cars parked in bus stop
ID: 101005284921, Description: Blocking bus due to double parked donut patrons inbound and outbound sides!!!  No enforcement cars been parked 20 minutes here.
ID: 101005297064, Description: bus stop, caller has to take the 47 bus to BMC.  Every day there are people parked in the bus stop | The closest intersecting street: [harrison]  How is the car parked illegally: [Other]  Type of vehicle: [Truck]  Make: [Unknown]  Model: [Unknown]  Color: [Unknown]
ID: 101005324559, Description: Parked in bus stop. Please ticket.
ID: 101005265025, Description: Illegally parked vehicle in bus stop at intersection of E 8th st and N st.  Bus stop has two signs.
ID: 101005327864, Description: Illegally parked vehicle in bus stop at intersection of E 8th st and N st.  The bus stop has two signs.
ID: 101005276653, Description: -- auto translated (en) -- STATE TROOPER CAR PARKED IN BUS STOP! Please enforce or he’ll think this is okay. -- original (

In [58]:
gt_bike_ids = getDesAndIDOfKeyWord(all_descriptions[all_descriptions.data_type == 'test'], "Description", " bike ", 15).ID.array.astype(np.int64)
# IDs of 10 Bike Lane Incidents:


ID: 101005186135, Description: Truck parked in bike lane
ID: 101005288825, Description: 3 SUVs parked on the bike lane. Blocking access for bikes.
ID: 101005332805, Description: Same car same bike lane 1SJV93
ID: 101005186133, Description: Truck blocking bike lane
ID: 101005266206, Description: illegal parking in bike lane. Recurring issue in this area. Centre St. West Roxbury
ID: 101005231210, Description: Car parked on bike lane
ID: 101005186468, Description: Party trolley parked in the bike lane in front of the Liberty Hotel for a long time.
ID: 101005187322, Description: Car illegally parked in bike lane
ID: 101005296483, Description: Cars parked in no stopping and bike lanes. Intersection of Washington and milk streets
ID: 101005333057, Description: Illegal bike lane obstruction. Cars need to be ticketed and towed. Need flex posts to prevent illegal parking.
ID: 101005281217, Description: Truck parked right over the crosswalk, in front of Sugar Bakery, (1) blocking the crosswalk, 

In [59]:
gt_resident_ids = getDesAndIDOfKeyWord(all_descriptions[all_descriptions.data_type == 'test'], "Description", " resident ", 15)
# IDs of non-resident parking Incidents:

ID: 101005278709, Description: Non-resident parked in resident parking
ID: 101005285222, Description: Black Toyota Corolla VA plate UEN-6940 parked in resident permit parking zone during enforced hours without resident parking permit for the last 3 weeks without moving.
ID: 101005203879, Description: -- auto translated (en) -- The truck is not getting any permission and permission on resident parking space for more than 6 weeks! Is it legal? I saw 101 Hudson construction workers wanted some from truck!   -- original (zh-CN) -- The truck is not get any permission and stopping on resident parking space for more than 6 weeks !  Is it legal? I saw 101 Hudson construction workers took some from truck!
ID: 101005293710, Description: Several cars without resident stickers on our street
ID: 101005329804, Description: This car is parked all day and is not a resident of East Boston. 2/27/2014 open 9:01 am
ID: 101005301125, Description: Constituent reports a commericsl car parked at this location

In [60]:
gt_hydrant_ids = getDesAndIDOfKeyWord(all_descriptions[all_descriptions.data_type == 'test'], "Description", " hydrant ", 15).ID.array.astype(np.int64)

ID: 101005274669, Description: -- auto translated (en) -- Not 10’ from hydrant  -- original (en) -- Not 10’ from hydrant
ID: 101005277471, Description: Too close to hydrant - third time reporting this issue
ID: 101005274476, Description: Constituent states her case keeps getting closed out and the car is still blocking the fire hydrant. Constituent states the car has not moved in 3 days and if there is an emergency the hydrant is blocked. | How is the car parked illegally: [Blocking hydrant]  Details: [Constituent states her case keeps getting closed out and the car is still blocking the fire hydrant. Constituent states the car has not moved in 3 days and if there is an emergency the hydrant is blocked.]  Make: [Nissan]  Model: [4DR]  Color: [White]  Vehicle License Plate Registration: [21509]  Vehicle License Plate State: [IL]
ID: 101005293832, Description: Commercial truck constantly parked in front of fire hydrant for residential buliding. White Ford F150 with License Plate X53247. 

In [61]:
gt_sidewalk_ids = getDesAndIDOfKeyWord(all_descriptions[all_descriptions.data_type == 'test'], "Description", " sidewalk ", 15).ID.array.astype(np.int64)

ID: 101005275229, Description: Individual still continuous to park in sidewalk. Can someone speak to this individual. Sidewalk width requirements are especially important for wheelchair-bound individuals. For ADA compliance,
ID: 101005334258, Description: Car illegally parked on the sidewalk and a person in a wheelchair had to go out in the street to get around it
ID: 101005288957, Description: Tow car parked in road and blocking sidewalk completely
ID: 101005298907, Description: Truck parked on sidewalk in front of Napa again. This person pulled in front of me while I was walking up the sidewalk parked his truck completely blocking the sidewalk got out and walked away. The city really needs to do a better job enforcing this because this happens multiple times a week, almost every day there is a car parked on the sidewalk in front of Napa
ID: 101005190454, Description: TRUCK PARKED ON SIDEWALK ILLEGALLY
ID: 101005328152, Description: Construction Lift blocking sidewalk and impeding mot

In [62]:
gt_driveway_ids = getDesAndIDOfKeyWord(all_descriptions[all_descriptions.data_type == 'test'], 'Description', ' driveway ', 15).ID.array.astype(np.int64)

ID: 101005229843, Description: 610 Walk Hill Street. White GMC parks in front of driveway every day. No one patrols the street so they continue to do it.
ID: 101005203145, Description: Kia Forte MA Plates 5HRZ79 obstructing driveway between 211 & 219 W 3RD ST
ID: 101005299474, Description: partially blocking driveway | How is the car parked illegally: [Other]  Details: [accord]  Make: [Honda]  Model: [4DR]  Color: [GreySilver]
ID: 101005289336, Description: Car blocking our driveway on Linden street
ID: 101005292884, Description: Auto from Hudeifa Auto is partially blocking my driveway at 25 Centre Street, Roxbury.
ID: 101005334906, Description: Illegal driveway being used yet again
ID: 101005281960, Description: Truck parked blocking an active driveway again. In front of format fitness on Hanover street
ID: 101005284454, Description: A black jeep 4SRK37 has been blocking the driveway adjacent to 10 Roach Street for many hours. The driver was informed about the violation but did not co

In [63]:
gt_crosswalk_ids = getDesAndIDOfKeyWord(all_descriptions[all_descriptions.data_type == 'test'], 'Description', ' crosswalk ', 15)['ID'].array.astype(np.int64)

ID: 101005253008, Description: Car parked in a crosswalk at a handicap ramp.
ID: 101005279365, Description: -- auto translated (en) -- Corner of 7th & Nst car in crosswalk also blocking part of HP ramp. It is still in ur system from 4days ago. Not 1 ticket because it’s outside my kitchen window so I look starting at5am when I’m up. Let’s do better  -- original (en) -- Corner of 7th & Nst car in crosswalk also blocking part of HP ramp. It is still in ur system from 4days ago. Not 1 ticket because it’s outside my kitchen window so I look starting at5am when I’m up. Let’s do better
ID: 101005187394, Description: -- auto translated (en) -- Truck repeatedly parks in tow zone at intersection of Walter and Hewlett street. It block the line-of-sight to the crosswalk and you can’t see children at all. Please come ticket and tow, someone is going to get hit. MA plate V755 -- original (en) -- Truck repeatedly parks in tow zone at intersection of Walter and Hewlett street. It block the line-of-sig

In [64]:
gt_handicap_ids = getDesAndIDOfKeyWord(all_descriptions[all_descriptions.data_type == 'test'], 'Description', ' handicap ', 15)['ID'].array.astype(np.int64)

ID: 101005279450, Description: Parked in handicap spot with no placard. The area needs enforcement please
ID: 101005187324, Description: Car blocking handicap ramp
ID: 101005288647, Description: White car parked in handicap spot
ID: 101005253008, Description: Car parked in a crosswalk at a handicap ramp.
ID: 101005239288, Description: This vehicle was parked legally before the handicap space was installed on 1/4 for my disabled husband (needs car to wheelchair transfer). It foes not have a placard so is now parked illegally. I have tried reaching out to neighbors with no luck. It has been parked here for 3 days. I am not sure how to proceed, as we really need to start using the space, especially with the impending snow on Sunday.
ID: 101005275621, Description: Illegal parking in hc. 6 Trenton. East Boston. 02128. No placard in handicap spot.
ID: 101005333171, Description: Car blocking handicap ramp
ID: 101005300503, Description: Vehicle is parked on sidewalk & in a Handicap parking spa

In [65]:
gt_double_parking_ids = getDesAndIDOfKeyWord(all_descriptions[all_descriptions.data_type == 'test'], 'Description', ' double ', 15)['ID'].array.astype(np.int64)

ID: 101005284921, Description: Blocking bus due to double parked donut patrons inbound and outbound sides!!!  No enforcement cars been parked 20 minutes here.
ID: 101005274372, Description: Constituent states multiple cars double parked here everynight and nobody is ticketed. Constituent states the traffic is backed up for miles because they just leave there car's double parked all night. | How is the car parked illegally: [Other]
ID: 101005243542, Description: Cars double parked
ID: 101005329443, Description: Red bmw double parked with nobody in it. These people like to leave their cars in the street all night
ID: 101005278934, Description: -- auto translated (en) -- I I just gotthe attached close from 311 saying all the double parked cars are not there, but I just looked out and the whole street is lined with double parked cars! It is clear no one came to look. Can you please send BTD traffic enforcement to ticket. It’s like this every day ad they never come and people have figured t

In [66]:
gt_stopping_ids = getDesAndIDOfKeyWord(all_descriptions[all_descriptions.data_type == 'test'], 'Description', ' stopping ', 15)['ID'].array.astype(np.int64)

ID: 101005203879, Description: -- auto translated (en) -- The truck is not getting any permission and permission on resident parking space for more than 6 weeks! Is it legal? I saw 101 Hudson construction workers wanted some from truck!   -- original (zh-CN) -- The truck is not get any permission and stopping on resident parking space for more than 6 weeks !  Is it legal? I saw 101 Hudson construction workers took some from truck!
ID: 101005324187, Description: -- auto translated (en) -- No Chinatown residents parking restrictions in residents parking space for more than 5 days  -- original (zh-CN) -- No Chinatown residents parking permits stopping  in residents parking space for more than 5 days
ID: 101005324539, Description: No stopping anytime
ID: 101005278740, Description: White suv parked in No stopping zone
ID: 101005296483, Description: Cars parked in no stopping and bike lanes. Intersection of Washington and milk streets
ID: 101005280160, Description: Car parked in No Stopping 

In [67]:
gt_visitor_ids = getDesAndIDOfKeyWord(all_descriptions[all_descriptions.data_type == 'test'], 'Description', ' visitor ', 15).ID.array.astype(np.int64)

ID: 101005266551, Description: Car parked in 2 hour visitor parking for multiple days
ID: 101005285260, Description: 51-71 Robinwood Avenue, JP: Cars without permit parking in permit only areas and allay in 2-hour visitor area. the last will need to be overtimed.
ID: 101005287505, Description: -- auto translated (en) -- Car with New Jersey plates parked in residential space in South Boston. And he’s taking up two spaces. Car remains parked there for weeks at a time and never gets ticketed despite numerous calls to 311 because he’s parked in two-hour visitor spot and apparently traffic enforcement can’t swing by twice in one evening. Opposite 428 East 8th Street.  -- original (en) -- Car with New Jersey plates parked in residential space in South Boston. And he’s taking up two spaces. Car remains parked there for weeks at a time and never gets ticketed despite numerous calls to 311 because he’s parked in two-hour visitor spot and apparently traffic enforcement can’t swing by twice in on

##### Evaluation Function:
idea: take in list of ground truths, label id, evaluate a models predictions on 