In [26]:
from gensim.utils import simple_preprocess
from gensim.models.doc2vec import TaggedDocument
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from lbl2vec import Lbl2Vec

# Plan Overview:
## 0. Keyword Definition
## 1. Data Preparation
### a. remove special characters
### b. downcase
### c. tag and tokenize
## 2.
## 3. Model Training

Reused Functions:
1. Description, Service ID -> Tagged Document

## 0. Keyword Definition
### Categories:
    - Bike Lane Obstruction: bike, cycle, path,  
    - Bus Lane Obstruction: bus, stop, 
    - Non-resident Parking: resident, state
    - Blocked Fire Hydrant: fire, hydrant
    - Blocked Sidewalk: sidewalk, side, walk
    - Blocked Driveway: driveway, drive, way, private
    - Blocked Crosswalk: crosswalk, cross, walk
    - Blocked Handicap spot: handicap, placard,
    - Double Parking: double, triple
    - No stopping zone: stopping, zone
    - visitor spot: visitor, hour

In [11]:
category_keywords_dict = {
    'bike lane' :           ['bike', 'cycle', 'cycling', 'path', 'sharrow'],
    'bus lane' :            ['bus'],
    'resident parking' :    ['resident', 'state'],
    'fire hydrant' :        ['fire', 'hydrant'],
    'sidewalk' :            ['sidewalk', 'side'],
    'driveway' :            ['driveway', 'drive', 'way', 'private'],
    'crosswalk' :           ['crosswalk', 'cross'],
    'handicap' :            ['handicap', 'placard'],
    'double parking' :      ['double', 'triple'],
    'no stopping' :         ['stopping',],
    'visitor parking' :     ['visitor', 'hour']
}
category_keywords = pd.DataFrame(data=category_keywords_dict.items(), columns=['category', 'keywords'])
category_keywords

Unnamed: 0,category,keywords
0,bike lane,"[bike, cycle, cycling, path, sharrow]"
1,bus lane,[bus]
2,resident parking,"[resident, state]"
3,fire hydrant,"[fire, hydrant]"
4,sidewalk,"[sidewalk, side]"
5,driveway,"[driveway, drive, way, private]"
6,crosswalk,"[crosswalk, cross]"
7,handicap,"[handicap, placard]"
8,double parking,"[double, triple]"
9,no stopping,[stopping]


## 1. Data Preparation

In [15]:
# Load pickled service descriptions
with open('pre-processing/API-responses/service_req_desc_03-02-24_01_09_24.pkl', 'rb') as file:
    service_id_to_descriptions:dict = pickle.load(file)

# Remove special characters and downcase, then tokenize
def tokenize(doc):
    return simple_preprocess(doc, deacc=True, min_len=1, max_len=15)

service_drescriptions_df = pd.DataFrame(data=service_id_to_descriptions.items(), columns=['ID', 'Description'])
service_drescriptions_df.head()

Unnamed: 0,ID,Description
0,101005335950,Illegal parking in the commercial loading Zone...
1,101005335949,Red Honda w New York plates illegally parked o...
2,101005335934,Car completely blocking turn in fire lane
3,101005335925,Double parking alone mass ave. No enforcement....
4,101005335924,truck on sidewalk


In [17]:
# Train=.70, Validate=.15, Test=.15
# Split all data between train, (test AND validate)
desc_train, desc_testval = train_test_split(service_drescriptions_df, test_size=.30)

# Split (test AND validate) between test, validate
# .3 * .5 = 0.15
desc_val, desc_test = train_test_split(desc_testval, test_size=0.5)

In [23]:
desc_train['data_type'] = 'train'
desc_test['data_type'] = 'test'
desc_val['data_type'] = 'validate'

all_descriptions = pd.concat([desc_train, desc_test, desc_val]).reset_index(drop=True)
all_descriptions.head()

Unnamed: 0,ID,Description,data_type
0,101005281684,Crane blocking alley,train
1,101005330303,Abandoned vehicle,train
2,101005244644,Non resident car parked in residential parking,train
3,101005244696,Car parked in front of hydrant for the past 24...,train
4,101005190454,TRUCK PARKED ON SIDEWALK ILLEGALLY,train


In [25]:
all_descriptions['tagged_desc'] = all_descriptions.apply(lambda row: TaggedDocument(tokenize(row['Description']), [int(row['ID'])]), axis=1)
all_descriptions.head()

Unnamed: 0,ID,Description,data_type,tagged_desc
0,101005281684,Crane blocking alley,train,"([crane, blocking, alley], [101005281684])"
1,101005330303,Abandoned vehicle,train,"([abandoned, vehicle], [101005330303])"
2,101005244644,Non resident car parked in residential parking,train,"([non, resident, car, parked, in, residential,..."
3,101005244696,Car parked in front of hydrant for the past 24...,train,"([car, parked, in, front, of, hydrant, for, th..."
4,101005190454,TRUCK PARKED ON SIDEWALK ILLEGALLY,train,"([truck, parked, on, sidewalk, illegally], [10..."


In [31]:
model = Lbl2Vec(keywords_list=list(category_keywords['keywords']), tagged_documents=all_descriptions['tagged_desc'][all_descriptions['data_type'] == 'train'])

model.fit()

2024-03-21 14:18:02,492 - Lbl2Vec - INFO - Train document and word embeddings
2024-03-21 14:18:02,492 - Lbl2Vec - INFO - Train document and word embeddings
2024-03-21 14:18:02,492 - Lbl2Vec - INFO - Train document and word embeddings
2024-03-21 14:18:02,492 - Lbl2Vec - INFO - Train document and word embeddings
2024-03-21 14:18:02,492 - Lbl2Vec - INFO - Train document and word embeddings


MemoryError: 