In [None]:
#fifi_inference
# Calls retrained model to reclassify data.  Uses TensorFlow instead of PyTorch.

In [1]:
!pip install datasets transformers evaluate sentencepiece accelerate huggingface_hub



In [17]:
# Get the model from HuggingFace and classify some examples
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
import tensorflow as tf
import numpy as np

# A handful of sample service-request descriptions to run through the classifier
texts = [
    "There is a dead dog.",
    "Somebody tagged the wall of my store",
    "There are tents blocking the sidewalk of my business",
    "The streetlight in front of my house is out",
]

# Fetch the retrained tokenizer and the retrained model from the same checkpoint
tokenizer = AutoTokenizer.from_pretrained("mjbeattie/fifi_classification")
model = TFAutoModelForSequenceClassification.from_pretrained("mjbeattie/fifi_classification")

# Tokenize the samples (padded to a common length, truncated at 512 tokens,
# returned as TensorFlow tensors) and run them through the model
inputs = tokenizer(texts, truncation=True, padding=True, max_length=512, return_tensors="tf")
logits = model(**inputs).logits

# The predicted class for each text is the index of its largest logit
predicted_labels = tf.math.argmax(logits, axis=1).numpy().tolist()

# Show each sample next to its predicted label
for text, label in zip(texts, predicted_labels):
    print(f"Text: {text} | Predicted Label: {label}")


All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


Text: There is a dead dog. | Predicted Label: 14
Text: Somebody tagged the wall of my store | Predicted Label: 0
Text: There are tents blocking the sidewalk of my business | Predicted Label: 1
Text: The streetlight in front of my house is out | Predicted Label: 13


In [2]:
# Function to document runtime stats
def append_line_to_file(file_path, line):
    """Append ``line`` to the file at ``file_path``, creating the file if needed.

    When the file already has content, a newline is written first so the new
    entry starts on its own line; an empty file receives ``line`` with no
    leading newline.  No trailing newline is added after ``line``.

    Args:
        file_path: Path of the file to append to.
        line: Text to append.
    """
    with open(file_path, 'a+') as handle:
        # Jump to the end of the file so tell() reports its current size
        handle.seek(0, 2)
        separator = '\n' if handle.tell() > 0 else ''
        handle.write(separator + line)

In [3]:
# Reclassify the service requests from prior to Unauthorized Encampments
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
import tensorflow as tf
import numpy as np
import pandas as pd, time

# Record the run times to a log file
logf = 'fifi_inference_runtimes.txt'
start_time = time.time()

file_path = ''
unlabelledf = 'shuffled_unlabelled.csv'

unldf = pd.read_csv(file_path + unlabelledf)

# Run the routine on one block of rows at a time -- memory can't handle the whole file.
# NOTE: iloc slicing excludes `endrow`, so this block holds rows startrow .. endrow - 1.
# Start the next block at startrow = endrow to avoid skipping a row.
blocknum = 0
startrow = 0
endrow = 999
unlsubset = unldf.iloc[startrow:endrow]

# Count the rows actually selected.  The previous `endrow - startrow + 1` assumed an
# inclusive end row and over-reported the block size by one in the runtime log; it
# also did not account for a final block that runs past the end of the file.
samplesize = len(unlsubset)

texts = unlsubset['text'].tolist()
servreqids = unlsubset['servreqid'].tolist()

# Load the retrained tokenizer and tokenize the input
tokenizer = AutoTokenizer.from_pretrained("mjbeattie/fifi_classification")
inputs = tokenizer(texts, truncation=True, padding=True, max_length=512, return_tensors="tf")

# Predict the classification of the test text
model = TFAutoModelForSequenceClassification.from_pretrained("mjbeattie/fifi_classification")
logits = model(**inputs).logits

# Record the end time
end_time = time.time()

# Calculate the elapsed time (covers reading the CSV, loading the tokenizer/model, and inference)
elapsed_time = end_time - start_time
runtime_entry = (
    f"Blocknum: {blocknum} Startrow: {startrow} Endrow: {endrow}"
    f" Classified texts: {samplesize} Elapsed time: {elapsed_time}"
)
append_line_to_file(logf, runtime_entry)

print(f"Elapsed time: {elapsed_time:.6f} seconds")

# Get predicted labels: the index of the largest logit for each text
predicted_labels = tf.math.argmax(logits, 1).numpy().tolist()
assert len(predicted_labels) == samplesize, "expected exactly one prediction per input row"

# Lookup of service request id -> predicted label, kept for inspection
relabelleddf = pd.DataFrame({'servreqid': servreqids, 'newlabel': predicted_labels})

# Attach the new labels to the original rows by position.  The predictions are in the
# same order as the rows of unlsubset, so a positional assignment is exact; joining on
# servreqid would multiply rows whenever an id appears more than once in the block.
unlsubset = unlsubset.reset_index(drop=True).assign(newlabel=predicted_labels)


2024-04-22 04:33:19.911623: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-04-22 04:33:20.717341: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/intel/compilers_and_libraries_2018.1.163/linux/tbb/lib/intel64_lin/gcc4.7:/opt/intel/compilers_and_libraries_2018.1.163/linux/compiler/lib/intel64_lin:/opt/intel/compilers_and_libraries_2018.1.163/linux/mkl/lib/intel64_lin::/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64/
2024-04-22 04:33:20.717466: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not

Elapsed time: 162.145568 seconds


In [58]:
# Add the new labels and IDs back to the sample and save
# Request-type names in class order: a name's position in this list is its integer label.
request_types = [
    'SPU-Graffiti Report',               # 0
    'SEA-Unauthorized Encampment',       # 1
    'SDOT-Abandoned Vehicle',            # 2
    'SPU-Illegal Dumping Report',        # 3
    'SPD-Parking Enforcement',           # 4
    'SPU-Clogged Drains',                # 5
    'SPR-Maintenance',                   # 6
    'CSB-General Inquiry',               # 7
    'SDOT-Sign and Signal Maintenance',  # 8
    'SPU-Public Litter Cans',            # 9
    'SDOT-Shared Micromobility',         # 10
    'SDOT-Pothole',                      # 11
    'SEA-Overgrown Vegetation',          # 12
    'SCL-Streetlight Report',            # 13
    'FAS-SAS-Dead Animal',               # 14
]
label2id = {name: idx for idx, name in enumerate(request_types)}

# Lookup table with one row per request type: 'newid' holds the request-type name
# and 'newlabel' holds its integer class, which is the key joined on below.
label2iddf = pd.DataFrame({
    'newid': list(label2id.keys()),
    'newlabel': list(label2id.values()),
})

# Attach the request-type name to every classified row, then write the block to its own file
unlsubset = unlsubset.merge(label2iddf, on='newlabel', how='inner')
savef = 'fifi_inference_reclass_' + str(blocknum) + '.csv'
unlsubset.to_csv(file_path + savef, index=False)

In [61]:
# Read in the reclassified files and consolidate into one dataframe
flead = 'fifi_inference_reclass_'
fclose = '.csv'
numblocks = 32  # block files are numbered 0 .. numblocks - 1

# Collect every block first and concatenate once at the end; concatenating inside
# the loop re-copies the accumulated frame on every iteration.
# The loop variable is deliberately NOT called `blocknum`, so it does not overwrite
# the `blocknum` that the inference and save cells use to name their output file.
blockdfs = []
rowcount = 0
for readblock in range(numblocks):
    fname = flead + str(readblock) + fclose
    # Read from the same location the block files were written to (file_path + name)
    nextclassdf = pd.read_csv(file_path + fname)
    if readblock == 0:
        print(nextclassdf.shape)
    else:
        print('Reading in blocknum', readblock, 'Shape is:', nextclassdf.shape)
    rowcount += nextclassdf.shape[0]
    blockdfs.append(nextclassdf)

finalreclassdf = pd.concat(blockdfs, ignore_index=True)

# Sanity check: the consolidated frame has exactly as many rows as were read in
assert finalreclassdf.shape[0] == rowcount, "row count changed during consolidation"

finalreclassdf.head()

(999, 6)
Reading in blocknum 1 Shape is: (999, 6)
Reading in blocknum 2 Shape is: (999, 6)
Reading in blocknum 3 Shape is: (499, 6)
Reading in blocknum 4 Shape is: (499, 6)
Reading in blocknum 5 Shape is: (499, 6)
Reading in blocknum 6 Shape is: (499, 6)
Reading in blocknum 7 Shape is: (499, 6)
Reading in blocknum 8 Shape is: (499, 6)
Reading in blocknum 9 Shape is: (999, 6)
Reading in blocknum 10 Shape is: (499, 6)
Reading in blocknum 11 Shape is: (499, 6)
Reading in blocknum 12 Shape is: (749, 6)
Reading in blocknum 13 Shape is: (599, 6)
Reading in blocknum 14 Shape is: (649, 6)
Reading in blocknum 15 Shape is: (749, 6)
Reading in blocknum 16 Shape is: (649, 6)
Reading in blocknum 17 Shape is: (499, 6)
Reading in blocknum 18 Shape is: (599, 6)
Reading in blocknum 19 Shape is: (499, 6)
Reading in blocknum 20 Shape is: (599, 6)
Reading in blocknum 21 Shape is: (599, 6)
Reading in blocknum 22 Shape is: (599, 6)
Reading in blocknum 23 Shape is: (699, 6)
Reading in blocknum 24 Shape is: (

Unnamed: 0,servreqid,text,id,label,newlabel,newid
0,225303109,Unauthorized camping,CSB-General Inquiry,7,7,CSB-General Inquiry
1,208175246,Broken glass and a pile of cigarette butts un...,CSB-General Inquiry,7,7,CSB-General Inquiry
2,224720640,Due to 72 hr moratorium by Mayor Durkan this r...,SPD-Parking Enforcement,4,7,CSB-General Inquiry
3,228696411,The tree pictured has some very large limbs t...,CSB-General Inquiry,7,7,CSB-General Inquiry
4,216719653,The street cleaner may have pushed them away f...,SEA-Overgrown Vegetation,12,7,CSB-General Inquiry


In [62]:
# Count the distinct values in every column, to check that service request ids
# are unique and to see how many request types appear after reclassification
unique_counts = finalreclassdf.nunique()
print("No. of unique values in each column:\n", unique_counts)

No. of unique values in each column:
 servreqid    20000
text         16840
id              14
label           14
newlabel        15
newid           15
dtype: int64


In [63]:
# Write the consolidated reclassified data to disk as a single CSV (row index omitted)
fname = 'final_fifi_inference_reclass.csv'
finalpath = file_path + fname
finalreclassdf.to_csv(finalpath, index=False)