# Tensoflow NLP Model for Text Classification

## Imports

In [8]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.model_selection import train_test_split
import pickle

## Pre-Processing

In [9]:
with open('../data/processed_21_data.pkl', 'rb') as file:
    processed_21_data = pickle.load(file)

In [10]:
#only keep relevant columns
processed_21_data = processed_21_data[['detailed_type', 'public_description']]

In [11]:
#split train and test data
(requests_train, requests_test) = train_test_split(processed_21_data, test_size = 0.2, random_state = 42)

## Tensorflow Model

### Imports

In [12]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf

import os
import datetime
import tensorflow_hub as hub

In [13]:
pd.set_option('display.max_colwidth', None)

In [14]:
requests_train.sample(10)

Unnamed: 0,detailed_type,public_description
156493,Graffiti Removal - Commercial,Graffiti on utility box in Ally just off Howard Avenue
259241,Graffiti Removal,Graffiti on concrete wall on the underpass of the I-5. Contact is Bill at 619-990-0130.
123278,Shared Mobility Device,Scooter blocking sidewalk
206830,Graffiti Removal,Graffiti
145022,Encampment,Homeless encampment
254909,Shared Mobility Device,ADA
312349,Missed Collection,Entire alley
140604,Graffiti Removal,Graffiti
321176,Shared Mobility Device,Blocking Sidewalk
275949,Encampment,Homeless camp


In [15]:
X_train, X_test = train_test_split(requests_train, test_size=0.2, random_state=42)

In [16]:
from sklearn.utils import class_weight

In [17]:
class_weights = list(class_weight.compute_class_weight(class_weight = 'balanced',
                                                       classes= np.unique(X_train['detailed_type']),
                                                      y=X_train['detailed_type']
                                                      ))

In [20]:
len(X_train['detailed_type'].value_counts())

51

In [21]:
class_weights.sort()

In [23]:
class_weights

[0.1321752389562574,
 0.18064980430210786,
 0.18905638719047954,
 0.20010141132831483,
 0.2511350517035889,
 0.31791099161914893,
 0.32816299477069966,
 0.5369534987391631,
 0.541825701347137,
 1.1408810099382218,
 1.3856409463960715,
 1.4437974094747388,
 1.4497498495504477,
 1.4604540477935521,
 1.6330257593233373,
 1.6491943311978257,
 1.7533539731682146,
 1.8616493656286044,
 2.21469833559736,
 2.3777640282258656,
 2.4400955447714936,
 2.5605209721429025,
 2.5793022995669337,
 2.6220413600402876,
 2.684312062399175,
 2.6888733403301845,
 2.709592398603277,
 2.9330898253648074,
 3.0460095491517105,
 3.22609981811277,
 3.282991619884468,
 3.785650623885918,
 4.126211110258967,
 4.371276134763297,
 4.521148459383753,
 4.5536747072929895,
 4.674747299215107,
 5.063686274509804,
 5.137668703845175,
 5.188203150112504,
 6.279372860255213,
 6.380653067678685,
 6.8650844285653525,
 8.507537423571579,
 10.046996576408342,
 11.678243253020765,
 11.942656307806141,
 23.975787284610814,
 38.13

In [24]:
num_options = len(class_weights)
num_options

51

In [25]:
weights = {}

In [26]:
for index, weight in enumerate(class_weights) :
    weights[index]=weight

In [27]:
weights

{0: 0.1321752389562574,
 1: 0.18064980430210786,
 2: 0.18905638719047954,
 3: 0.20010141132831483,
 4: 0.2511350517035889,
 5: 0.31791099161914893,
 6: 0.32816299477069966,
 7: 0.5369534987391631,
 8: 0.541825701347137,
 9: 1.1408810099382218,
 10: 1.3856409463960715,
 11: 1.4437974094747388,
 12: 1.4497498495504477,
 13: 1.4604540477935521,
 14: 1.6330257593233373,
 15: 1.6491943311978257,
 16: 1.7533539731682146,
 17: 1.8616493656286044,
 18: 2.21469833559736,
 19: 2.3777640282258656,
 20: 2.4400955447714936,
 21: 2.5605209721429025,
 22: 2.5793022995669337,
 23: 2.6220413600402876,
 24: 2.684312062399175,
 25: 2.6888733403301845,
 26: 2.709592398603277,
 27: 2.9330898253648074,
 28: 3.0460095491517105,
 29: 3.22609981811277,
 30: 3.282991619884468,
 31: 3.785650623885918,
 32: 4.126211110258967,
 33: 4.371276134763297,
 34: 4.521148459383753,
 35: 4.5536747072929895,
 36: 4.674747299215107,
 37: 5.063686274509804,
 38: 5.137668703845175,
 39: 5.188203150112504,
 40: 6.27937286025521

In [28]:
#Instantiate
dataset_train = tf.data.Dataset.from_tensor_slices((X_train['public_description'].values, X_train['detailed_type'].values))
dataset_test = tf.data.Dataset.from_tensor_slices((X_test['public_description'].values, X_test['detailed_type'].values))

In [29]:
for text, target in dataset_train.take(5):
    print('description: {}, Target: {}'.format(text, target))

description: b'Graffiti', Target: b'Graffiti Removal'
description: b'Illegal Dumpling', Target: b'Illegal Dumping'
description: b'Pot holes, cracks in pavement', Target: b'Pothole'
description: b'Our street is never cleaned. It needs a scheduled day for cleaning', Target: b'Street Sweeping'
description: b'Prohibited parking always parked on entrance', Target: b'Parking Zone Violation'


In [30]:
for text, target in dataset_test.take(5):
    print('description: {}, Target: {}'.format(text, target))

description: b'Scooter', Target: b'Shared Mobility Device'
description: b'http://www.reliablepipe.com/contact.html', Target: b'Graffiti Removal'
description: b'Telephone boxes damaged', Target: b'Other'
description: b'It looks like What may be an abandoned vehicle. Possibly stolen.', Target: b'72 Hour Violation'
description: b'FECES AND OTHER FLUIDS BLOCKING THE SIDEWALK FROM KNOW FENTANYL DEALERS', Target: b'Illegal Dumping'


In [31]:
detailed_types = list(np.unique(requests_train['detailed_type']))

In [35]:
vals = list(range(num_options))

In [36]:
num_options == len(vals)

True

In [37]:
table = tf.lookup.StaticHashTable(
    initializer=tf.lookup.KeyValueTensorInitializer(
        keys=tf.constant(detailed_types),
        values=tf.constant(vals),
    ),
        default_value=tf.constant(-1),
        name="target_encoding"
)


In [38]:
@tf.function
def target(x):
    return table.lookup(x)

In [39]:
def show_batch(dataset, size=5):
    for batch, label in dataset.take(size):
        print(batch.numpy())
        print(target(label).numpy())

In [41]:
show_batch(dataset_test, 6)

b'Scooter'
29
b'http://www.reliablepipe.com/contact.html'
13
b'Telephone boxes damaged'
18
b'It looks like What may be an abandoned vehicle. Possibly stolen.'
0
b'FECES AND OTHER FLUIDS BLOCKING THE SIDEWALK FROM KNOW FENTANYL DEALERS'
15
b'Sign in median tagged'
13


In [42]:
 def fetch(text, labels):
        return text, tf.one_hot(target(labels),162)

In [43]:
train_data_f = dataset_train.map(fetch)
test_data_f=dataset_test.map(fetch)

In [44]:
next(iter(train_data_f))

(<tf.Tensor: shape=(), dtype=string, numpy=b'Graffiti'>,
 <tf.Tensor: shape=(162,), dtype=float32, numpy=
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)>)

In [45]:
train_data, train_labels = next(iter(train_data_f.batch(2)))
train_data, train_labels

(<tf.Tensor: shape=(2,), dtype=string, numpy=array([b'Graffiti', b'Illegal Dumpling'], dtype=object)>,
 <tf.Tensor: shape=(2, 162), dtype=float32, numpy=
 array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
         0., 0., 0., 0., 

In [46]:
embedding = "https://tfhub.dev/google/tf2-preview/nnlm-en-dim128/1"
hub_layer = hub.KerasLayer(embedding, output_shape=[128], input_shape=[],
                           dtype=tf.string, trainable=True)
hub_layer(train_data[:1])



<tf.Tensor: shape=(1, 128), dtype=float32, numpy=
array([[-1.39179647e-01,  9.52817723e-02, -3.63736078e-02,
         7.33999759e-02, -6.32884502e-02,  4.36592810e-02,
        -2.29714476e-02,  4.14458998e-02, -4.11153920e-02,
        -6.08860562e-03,  9.22431499e-02, -4.87491265e-02,
         1.05019417e-02, -1.54585522e-02,  1.27385616e-01,
         1.49793131e-02, -1.13783076e-01,  9.32408795e-02,
        -4.50370908e-02, -3.31810936e-02, -7.40331113e-02,
        -1.03484608e-01, -2.43988354e-02, -1.58706561e-01,
        -1.52567342e-01,  2.98491446e-04, -1.05073124e-01,
        -9.02146548e-02,  1.40838390e-02, -6.45939633e-02,
        -1.94969643e-02,  2.48357281e-02, -1.87971115e-01,
        -9.87707153e-02, -5.50029874e-02, -1.35416985e-01,
         4.75696214e-02, -1.40368447e-01, -5.28071634e-02,
         9.52177402e-03, -4.54429984e-02,  2.86252256e-02,
         2.51011681e-02,  1.88235510e-02,  3.26904953e-02,
         3.77090764e-03, -1.11999363e-01, -5.04006408e-02,
      

In [47]:
model = tf.keras.Sequential()
model.add(hub_layer)
for units in [128, 128, 64, 32]:
    model.add(tf.keras.layers.Dense(units, activation='relu'))
    model.add(tf.keras.layers.Dropout(0.3))
model.add(tf.keras.layers.Dense(162, activation='softmax'))

model.summary()

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 keras_layer (KerasLayer)    (None, 128)               124642688 
                                                                 
 dense (Dense)               (None, 128)               16512     
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 128)               16512     
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense_2 (Dense)             (None, 64)                8256      
                                                                 
 dropout_2 (Dropout)         (None, 64)                0

In [48]:
model.compile(optimizer='adam',
             loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
             metrics=['accuracy'])

In [49]:
train_data_f=train_data_f.shuffle(70000).batch(512)
test_data_f=test_data_f.batch(512)

In [50]:
history = model.fit(train_data_f,
                   epochs=5,
                   validation_data=test_data_f,
                   verbose=1,
                   class_weight=weights)

Epoch 1/5


  output, from_logits = _get_logits(


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
len(list(dataset_test))

In [None]:
results = model.evaluate(dataset_test.map(fetch).batch(40406), verbose=2)

In [None]:
print(results)

In [None]:
test_data, test_labels = next(iter(dataset_test.map(fetch).batch(45963)))

In [None]:
y_pred=model.predict(test_data)

In [None]:
from sklearn.metrics import classification_report

In [None]:
print(classification_report(test_labels.numpy().argmax(axis=1), y_pred.argmax(axis=1), zero_division = 1))

In [None]:
from sklearn.metrics import confusion_matrix
confusion_matrix(test_labels.numpy().argmax(axis=1), y_pred.argmax(axis=1))