In [55]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.model_selection import train_test_split
import pickle
import re
import pandas as pd
import itertools

# Plan Overview:
## 1. Data Preparation
### a. remove special characters
### b. downcase
## 2. TF-IDF Representation
### a. Train / Validate / Test Data Split
### b. TF-IDF Vectorization
## 3. Model Training
### a. Affinity Propagation: https://scikit-learn.org/stable/modules/clustering.html#affinity-propagation
### b. Mean-shift: https://scikit-learn.org/stable/modules/clustering.html#mean-shift

## 1. Data Preparation

In [18]:
# Load pickled service descriptions
with open('API-responses/service_req_desc_03-02-24_01_09_24.pkl', 'rb') as file:
    service_drescriptions:dict = pickle.load(file)

# Remove special characters and downcase
def remove_special_characters(input:str) -> str:
    return re.sub(pattern=r'[^A-Za-z ]', repl='', string=input)

service_drescriptions_clean = {}
for key, value in service_drescriptions.items():
    service_drescriptions_clean[key] = remove_special_characters(value).lower()

list(service_drescriptions_clean.values())[:5]

['illegal parking in the commercial loading zone and the  minute drop off zone   same cars i beg of you again to ticket  thank you for last time all  cars',
 'red honda w new york plates illegally parked on all weekend',
 'car completely blocking turn in fire lane',
 'double parking alone mass ave no enforcement really a two lane becomes one lane are you all sleeping',
 'truck on sidewalk']

In [22]:
service_drescriptions_df = pd.DataFrame(data=service_drescriptions_clean.items(), columns=['ID', 'Description'])
service_drescriptions_df.head()

Unnamed: 0,ID,Description
0,101005335950,illegal parking in the commercial loading zone...
1,101005335949,red honda w new york plates illegally parked o...
2,101005335934,car completely blocking turn in fire lane
3,101005335925,double parking alone mass ave no enforcement r...
4,101005335924,truck on sidewalk


## 2. TF-IDF Representation

In [43]:
# Train=.70, Validate=.15, Test=.15
# Split all data between train, (test AND validate)
desc_train, desc_testval = train_test_split(service_drescriptions_df, test_size=.30)

# Split (test AND validate) between test, validate
# .3 * .5 = 0.15
desc_val, desc_test = train_test_split(desc_testval, test_size=0.5)

In [42]:
desc_val

Unnamed: 0,ID,Description,upcase_test
734,101005330582,no parking,NO PARKING
3671,101005288189,parking in commercial spot now parents have to...,PARKING IN COMMERCIAL SPOT NOW PARENTS HAVE TO...
7376,101005203870,car parked in crosswalk,CAR PARKED IN CROSSWALK
9301,101005251004,illegally parked car blocking buildings garage...,ILLEGALLY PARKED CAR BLOCKING BUILDINGS GARAGE...
1075,101005328291,auto translated en white toyota corolla stil...,AUTO TRANSLATED EN WHITE TOYOTA COROLLA STIL...
...,...,...,...
8631,101005246959,non resident gray audi parked over the hour l...,NON RESIDENT GRAY AUDI PARKED OVER THE HOUR L...
8117,101005248902,numerous cars double parke busses cant make th...,NUMEROUS CARS DOUBLE PARKE BUSSES CANT MAKE TH...
4597,101005287544,no sticker parked by lot,NO STICKER PARKED BY LOT
9688,101005244980,type of vehicle other make toyota model pick...,TYPE OF VEHICLE OTHER MAKE TOYOTA MODEL PICK...


In [48]:
# Initialize Count Vectorizer
cv = CountVectorizer(stop_words='english', lowercase=False, strip_accents='ascii', analyzer='word')

# Initialize Word Count Vector
word_count_vector = cv.fit_transform(desc_train['Description'])

# Initialize Transformer
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(word_count_vector)

# Create Test, Validate, and Train TF-IDF Matrices
desc_train_matrix = tfidf_transformer.transform(cv.transform(desc_train['Description']))
desc_test_matrix = tfidf_transformer.transform(cv.transform(desc_test['Description']))
desc_val_matrix = tfidf_transformer.transform(cv.transform(desc_val['Description']))

In [56]:
# Create Dictionaries to Map Service Request IDs to TF-IDF Matrices
def id_to_matrix(ids, matrices) -> dict:
    id_to_matrix = {}
    for i, id in zip(range(matrices.get_shape()[0]), ids):
        id_to_matrix[id] = matrices.getrow(i)
    return id_to_matrix

train_id_to_matrix:dict = id_to_matrix(desc_train['ID'], desc_train_matrix)
test_id_to_matrix:dict = id_to_matrix(desc_test['ID'], desc_test_matrix)
val_id_to_matrix:dict = id_to_matrix(desc_val['ID'], desc_val_matrix)

## 3. Model Training

### a. Affinity Propagation

In [62]:
from sklearn.cluster import AffinityPropagation
import matplotlib.pyplot as plt
import numpy as np

In [61]:
af = AffinityPropagation(preference=-50, random_state=0).fit(desc_train_matrix)
cluster_centers_indices = af.cluster_centers_indices_
labels = af.labels_

n_clusters_ = len(cluster_centers_indices)

print("Estimated number of clusters: %d" % n_clusters_)

Estimated number of clusters: 0




### b. Mean-Shift

In [63]:
from sklearn.cluster import MeanShift, estimate_bandwidth

In [69]:
# Estimate Bandwidth
bandwidth = estimate_bandwidth(desc_test_matrix.toarray(), quantile=0.2, n_samples=500)

meanshift = MeanShift(bandwidth=bandwidth, bin_seeding=True)
meanshift.fit(desc_test_matrix.toarray())

labels = meanshift.labels_
cluster_centers = meanshift.cluster_centers_

unique_labels = np.unique(labels)
n_clusters_ = len(unique_labels)

print("number of estimated clusters : %d" % n_clusters_)

number of estimated clusters : 1


### c. K-Means

In [70]:
from sklearn.cluster import KMeans