In [48]:
from gensim.utils import simple_preprocess
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from lbl2vec import Lbl2Vec
from gensim.downloader import load

# Plan Overview:
0. Keyword Definition
1. Data Preparation
2. Model Training

### 0. Keyword Definition

#### Categories:

    1.  Bike Lane Obstruction: bike, cycle, path,  
    2.  Bus Lane Obstruction: bus, stop, 
    3.  Non-resident Parking: resident, state
    4.  Blocked Fire Hydrant: fire, hydrant
    5.  Blocked Sidewalk: sidewalk, side, walk
    6.  Blocked Driveway: driveway, drive, way, private
    7.  Blocked Crosswalk: crosswalk, cross, walk
    8.  Blocked Handicap spot: handicap, placard,
    9.  Double Parking: double, triple
    10. No stopping zone: stopping, zone
    11. visitor spot: visitor, hour

In [61]:
category_keywords_dict = {
    'bike lane' :           ['bike', 'cycle', 'cycling', 'path', 'sharrow'],
    'bus lane' :            ['bus'],
    'resident parking' :    ['resident', 'state', 'plate', 'plates'],
    'fire hydrant' :        ['fire', 'hydrant'],
    'sidewalk' :            ['sidewalk', 'side'],
    'driveway' :            ['driveway', 'drive', 'way', 'private'],
    'crosswalk' :           ['crosswalk', 'cross'],
    'handicap' :            ['handicap', 'placard'],
    'double parking' :      ['double', 'triple'],
    'no stopping' :         ['stopping', 'loading'],
    'visitor parking' :     ['visitor', 'hour']
} 

category_keywords = pd.DataFrame(data=category_keywords_dict.items(), columns=['category', 'keywords'])
category_keywords

Unnamed: 0,category,keywords
0,bike lane,"[bike, cycle, cycling, path, sharrow]"
1,bus lane,[bus]
2,resident parking,"[resident, state, plate, plates]"
3,fire hydrant,"[fire, hydrant]"
4,sidewalk,"[sidewalk, side]"
5,driveway,"[driveway, drive, way, private]"
6,crosswalk,"[crosswalk, cross]"
7,handicap,"[handicap, placard]"
8,double parking,"[double, triple]"
9,no stopping,"[stopping, loading]"


### 1. Data Preparation

In [50]:
# Load pickled service descriptions
with open('pre-processing/API-responses/service_req_desc_03-02-24_01_09_24.pkl', 'rb') as file:
    service_id_to_descriptions:dict = pickle.load(file)

# Remove special characters and downcase, then tokenize
def tokenize(doc):
    return simple_preprocess(doc, deacc=True, min_len=2, max_len=15)

service_drescriptions_df = pd.DataFrame(data=service_id_to_descriptions.items(), columns=['ID', 'Description'])
service_drescriptions_df.head()

Unnamed: 0,ID,Description
0,101005335950,Illegal parking in the commercial loading Zone...
1,101005335949,Red Honda w New York plates illegally parked o...
2,101005335934,Car completely blocking turn in fire lane
3,101005335925,Double parking alone mass ave. No enforcement....
4,101005335924,truck on sidewalk


#### Test, Training, Validate Split

In [51]:
# Train=.70, Validate=.15, Test=.15
# Split all data between train, (test AND validate)
desc_train, desc_testval = train_test_split(service_drescriptions_df, test_size=.3)

# Split (test AND validate) between test, validate
# .3 * .5 = 0.15
desc_val, desc_test = train_test_split(desc_testval, test_size=0.5)

In [52]:
desc_train['data_type'] = 'train'
desc_test['data_type'] = 'test'
desc_val['data_type'] = 'validate'

all_descriptions = pd.concat([desc_train, desc_test, desc_val]).reset_index(drop=True)
all_descriptions.head()

Unnamed: 0,ID,Description,data_type
0,101005277280,White truck here again. Likely parking for the...,train
1,101005274364,Car parked in emergency vehicle access zone co...,train
2,101005294691,DO BTD WORK ANYMORE? PATROL? TICKET? ANYTHING....,train
3,101005244838,No parking,train
4,101005293463,tinted windows | How is the car parked illegal...,train


#### Service ID to Tag function

In [53]:
service_id_list = []
def service_id_to_tag(service_id:int) -> int:
    if service_id in service_id_list:
        return service_id_list.index(service_id)
    service_id_list.append(service_id)
    return len(service_id_list)

#### Tagging Descriptions

In [54]:
all_descriptions['tagged_desc'] = all_descriptions.apply(lambda row: TaggedDocument(tokenize(row['Description']), [service_id_to_tag(int(row['ID']))]), axis=1)
all_descriptions.head()

Unnamed: 0,ID,Description,data_type,tagged_desc
0,101005277280,White truck here again. Likely parking for the...,train,"([white, truck, here, again, likely, parking, ..."
1,101005274364,Car parked in emergency vehicle access zone co...,train,"([car, parked, in, emergency, vehicle, access,..."
2,101005294691,DO BTD WORK ANYMORE? PATROL? TICKET? ANYTHING....,train,"([do, btd, work, anymore, patrol, ticket, anyt..."
3,101005244838,No parking,train,"([no, parking], [4])"
4,101005293463,tinted windows | How is the car parked illegal...,train,"([tinted, windows, how, is, the, car, parked, ..."


### 2. Model Training

In [56]:
model = Lbl2Vec(
    keywords_list=list(category_keywords['keywords']),
    tagged_documents=all_descriptions['tagged_desc'][all_descriptions['data_type'] == 'train'],
    vector_size=300,
    min_count=1,
    similarity_threshold=0.43
    )

model.fit()

2024-03-23 21:58:17,664 - Lbl2Vec - INFO - Train document and word embeddings
2024-03-23 21:58:17,664 - Lbl2Vec - INFO - Train document and word embeddings
2024-03-23 21:58:17,664 - Lbl2Vec - INFO - Train document and word embeddings
2024-03-23 21:58:17,664 - Lbl2Vec - INFO - Train document and word embeddings
2024-03-23 21:58:17,664 - Lbl2Vec - INFO - Train document and word embeddings
2024-03-23 21:58:17,664 - Lbl2Vec - INFO - Train document and word embeddings
2024-03-23 21:58:25,834 - Lbl2Vec - INFO - Train label embeddings
2024-03-23 21:58:25,834 - Lbl2Vec - INFO - Train label embeddings
2024-03-23 21:58:25,834 - Lbl2Vec - INFO - Train label embeddings
2024-03-23 21:58:25,834 - Lbl2Vec - INFO - Train label embeddings
2024-03-23 21:58:25,834 - Lbl2Vec - INFO - Train label embeddings
2024-03-23 21:58:25,834 - Lbl2Vec - INFO - Train label embeddings


#### Predict Description Labels

In [57]:
model.predict_model_docs()

2024-03-23 21:58:26,259 - Lbl2Vec - INFO - Get document embeddings from model
2024-03-23 21:58:26,259 - Lbl2Vec - INFO - Get document embeddings from model
2024-03-23 21:58:26,259 - Lbl2Vec - INFO - Get document embeddings from model
2024-03-23 21:58:26,259 - Lbl2Vec - INFO - Get document embeddings from model
2024-03-23 21:58:26,259 - Lbl2Vec - INFO - Get document embeddings from model
2024-03-23 21:58:26,259 - Lbl2Vec - INFO - Get document embeddings from model
2024-03-23 21:58:26,273 - Lbl2Vec - INFO - Calculate document<->label similarities
2024-03-23 21:58:26,273 - Lbl2Vec - INFO - Calculate document<->label similarities
2024-03-23 21:58:26,273 - Lbl2Vec - INFO - Calculate document<->label similarities
2024-03-23 21:58:26,273 - Lbl2Vec - INFO - Calculate document<->label similarities
2024-03-23 21:58:26,273 - Lbl2Vec - INFO - Calculate document<->label similarities
2024-03-23 21:58:26,273 - Lbl2Vec - INFO - Calculate document<->label similarities


Unnamed: 0,doc_key,most_similar_label,highest_similarity_score,label_0,label_1,label_2,label_3,label_4,label_5,label_6,label_7,label_8,label_9,label_10
0,0,label_2,-0.025171,-0.026041,-0.027399,-0.025171,-0.025248,-0.026047,-0.026153,-0.026138,-0.025579,-0.025925,-0.025702,-0.036905
1,1,label_10,0.959610,0.951955,0.950389,0.949800,0.949050,0.951392,0.951914,0.951926,0.950234,0.950271,0.950207,0.959610
2,2,label_1,0.940282,0.938889,0.940282,0.938595,0.938900,0.939025,0.938780,0.938789,0.939032,0.939207,0.938785,0.921954
3,3,label_10,0.820377,0.810840,0.806562,0.808606,0.806903,0.809739,0.810648,0.810718,0.807964,0.808009,0.807919,0.820377
4,4,label_10,-0.063215,-0.076994,-0.076809,-0.078101,-0.078335,-0.077203,-0.076899,-0.076895,-0.077802,-0.077550,-0.077738,-0.063215
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6838,6838,label_10,0.826236,0.816383,0.814295,0.813110,0.812130,0.815247,0.815920,0.815997,0.813319,0.813510,0.813052,0.826236
6839,6839,label_10,0.907834,0.902611,0.907658,0.901583,0.902246,0.903256,0.903158,0.903097,0.902915,0.903731,0.903055,0.907834
6840,6840,label_3,0.893190,0.886889,0.882184,0.892960,0.893190,0.887662,0.886779,0.886805,0.890373,0.889404,0.890531,0.816888
6841,6841,label_10,0.981110,0.980747,0.980809,0.978871,0.978746,0.980391,0.980645,0.980647,0.979637,0.979692,0.979550,0.981110
