# findit_fixit_classification
## April 13, 2024
## University of Oklahoma

This script reads verbatim request details and their classifications from Seattle Find It, Fix It data and prepares the data as a HuggingFace dataset. It then uses the dataset to retrain (fine-tune) a pretrained language-model classifier to recognize Service Request Types from the detailed text. The retrained model is used to reclassify requests from the period before the Unauthorized Encampment service type value was introduced.

In [1]:
# Install the third-party packages this notebook depends on (IPython shell escape)
!pip install datasets transformers evaluate sentencepiece accelerate huggingface_hub

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 510.5/510.5 kB 10.1 MB/s eta 0:00:00
Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 84.1/84.1 kB 14.1 MB/s eta 0:00:00
Collecting accelerate
  Downloading accelerate-0.29.2-py3-none-any.whl (297 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 297.4/297.4 kB 36.9 MB/s eta 0:00:00
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 116.3/116.3 kB 14.4 MB/s eta 0:00:00
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# Check working directory; as the last expression of the cell, the path is
# echoed by the notebook rather than printed explicitly
import os
os.getcwd()

'/content'

# Start here if the dataset is not already in the HuggingFace hub

In [7]:
# Read in FindIt_FixIt data and transform appropriately
import pandas as pd

# Specify the path to your CSV files (relative to the working directory)
file_path = 'drive/MyDrive/FindItFixIt/'
labelledf = 'after_unauthorized_encampment.csv'
unlabelledf = 'before_unauthorized_encampment.csv'

# Source column -> working column.  The Service Request Type *name* is kept
# in 'id'; the integer class is attached later under 'label' by merging on
# 'id', so the name column must not be called 'label' here.
column_mapping = {
    'Service Request ID': 'servreqid',
    'alldetails': 'text',
    'Service Request Type': 'id'
}

# Set the data types explicitly (all three columns are handled as strings)
dtypes = {'servreqid': str,
          'text': str,
          'id': str}

# Read the labelled CSV file, keep only the mapped columns and drop any row
# with a missing value before renaming and casting
df = pd.read_csv(file_path + labelledf)
df = df.loc[:, list(column_mapping)].dropna()
df = df.rename(columns=column_mapping).astype(dtypes)

# Display the first few rows of the DataFrame
print(df.head())


   servreqid                                               text  \
0  250398107   It's almost midnight and I am still hearing m...   
1  250398152  Complaint came in 1 day ago. Vehicle would hav...   
2  250398300   RVs back where recently removed. Trash, sidew...   
3  250398410                          Duplicate to CSR #154229    
4  250398442                          Duplicate to CSR #154229    

                        id  
0      CSB-General Inquiry  
1  SPD-Parking Enforcement  
2      CSB-General Inquiry  
3   SDOT-Abandoned Vehicle  
4   SDOT-Abandoned Vehicle  


In [8]:
# Integer class for every Service Request Type name
label2id = {
    'SPU-Graffiti Report': 0,
    'SEA-Unauthorized Encampment': 1,
    'SDOT-Abandoned Vehicle': 2,
    'SPU-Illegal Dumping Report': 3,
    'SPD-Parking Enforcement': 4,
    'SPU-Clogged Drains': 5,
    'SPR-Maintenance': 6,
    'CSB-General Inquiry': 7,
    'SDOT-Sign and Signal Maintenance': 8,
    'SPU-Public Litter Cans': 9,
    'SDOT-Shared Micromobility': 10,
    'SDOT-Pothole': 11,
    'SEA-Overgrown Vegetation': 12,
    'SCL-Streetlight Report': 13,
    'FAS-SAS-Dead Animal': 14
}

# Reverse lookup (integer -> name), built from label2id so the two mappings
# cannot drift apart
id2label = {class_id: name for name, class_id in label2id.items()}

# Two-column lookup table: 'id' holds the type name, 'label' the integer class
label2iddf = pd.DataFrame(list(label2id.items()), columns=['id', 'label'])

# Attach the integer label to each request; the inner join keeps only rows
# whose type name appears in the lookup table
df = df.merge(label2iddf, how='inner', on='id')

df.head()


Unnamed: 0,servreqid,text,id,label
0,250398107,It's almost midnight and I am still hearing m...,CSB-General Inquiry,7
1,250398300,"RVs back where recently removed. Trash, sidew...",CSB-General Inquiry,7
2,250399387,Re-populated site. Homeless encampment. 1 ten...,CSB-General Inquiry,7
3,250399851,Tents,CSB-General Inquiry,7
4,250400121,Illegal camping on trails near golden gardens...,CSB-General Inquiry,7


In [9]:
# Output file names for the prepared data
trainsavef = 'labelled_train.csv'
testsavef = 'labelled_test.csv'
unlabelledsavef = 'unlabelled.csv'

# 80/20 split: draw the training rows at random with a fixed seed and use
# every remaining row as the test set
traindf = df.sample(frac=0.8, random_state=42)
testdf = df.drop(index=traindf.index)
print('Original row count =', df.shape, 'Train count =', traindf.shape, 'Test count =', testdf.shape)

# Save both splits to drive without the index column
for split_frame, split_name in ((traindf, trainsavef), (testdf, testsavef)):
    split_frame.to_csv(file_path + split_name, index=False)

Original row count = (59481, 4) Train count = (47585, 4) Test count = (11896, 4)


In [12]:
# Repeat the process for the unlabelled dataset
column_mapping = {
    'Service Request ID': 'servreqid',
    'alldetails': 'text',
    'Service Request Type': 'id'
}

# Every renamed column is handled as a string
dtypes = {new_name: str for new_name in column_mapping.values()}

# Read, keep the mapped columns, drop incomplete rows, rename, cast, and
# attach the integer label via an inner join on the type name
unldf = (
    pd.read_csv(file_path + unlabelledf)
    .loc[:, list(column_mapping)]
    .dropna()
    .rename(columns=column_mapping)
    .astype(dtypes)
    .merge(label2iddf, on='id', how='inner')
)

# Display the first few rows of the DataFrame
print(unldf.head())

# Save the file back to drive
unldf.to_csv(file_path + unlabelledsavef, index=False)


   servreqid                                               text  \
0  190226346  Sign down on sidewalk. SE corner of 75th St NE...   
1  190229680                          scl property closed sign    
2  190230845  No parking signs on the northbound lane of Fai...   
3  190233656                    Stop sign down after collision    
4  190234544  The street sign showing directions to Magnuson...   

                                 id  label  
0  SDOT-Sign and Signal Maintenance      8  
1  SDOT-Sign and Signal Maintenance      8  
2  SDOT-Sign and Signal Maintenance      8  
3  SDOT-Sign and Signal Maintenance      8  
4  SDOT-Sign and Signal Maintenance      8  


In [13]:
# Create a HuggingFace dataset from the train and test labelled data
from datasets import load_dataset
from google.colab import userdata
from huggingface_hub import login

# Log in with the token fetched from Colab userdata under the key HF_TOKEN
hftoken = userdata.get('HF_TOKEN')
login(token=hftoken)

# One CSV file per split
data_files = dict(train=file_path + trainsavef, test=file_path + testsavef)
fifidataset = load_dataset("csv", data_files=data_files)

# Publish dataset to HuggingFace (note that a write access token has to be loaded)
fifidataset.push_to_hub("finditfixit")

# Note:  I changed the data type for servreqid from int to string on the huggingface dataset card

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/48 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mjbeattie/finditfixit/commit/bf3276a7f275a7ec05f2a743d30f15a2504cb3ac', commit_message='Upload dataset', commit_description='', oid='bf3276a7f275a7ec05f2a743d30f15a2504cb3ac', pr_url=None, pr_revision=None, pr_num=None)

# Start here if dataset is already loaded onto HuggingFace hub

In [3]:
from datasets import load_dataset

# Load dataset from HuggingFace hub
fifids = load_dataset('mjbeattie/finditfixit')

# View a record.  A bare expression is only echoed by the notebook when it is
# the last statement of a cell, so print it explicitly.
print(fifids["test"][0])

# Integer class for every Service Request Type name
label2id = {
    'SPU-Graffiti Report': 0,
    'SEA-Unauthorized Encampment': 1,
    'SDOT-Abandoned Vehicle': 2,
    'SPU-Illegal Dumping Report': 3,
    'SPD-Parking Enforcement': 4,
    'SPU-Clogged Drains': 5,
    'SPR-Maintenance': 6,
    'CSB-General Inquiry': 7,
    'SDOT-Sign and Signal Maintenance': 8,
    'SPU-Public Litter Cans': 9,
    'SDOT-Shared Micromobility': 10,
    'SDOT-Pothole': 11,
    'SEA-Overgrown Vegetation': 12,
    'SCL-Streetlight Report': 13,
    'FAS-SAS-Dead Animal': 14
}

# Reverse lookup (integer -> name), built from label2id so the two mappings
# cannot drift apart
id2label = {class_id: name for name, class_id in label2id.items()}


Downloading readme:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.32M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/739k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/47585 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11896 [00:00<?, ? examples/s]

In [4]:
from transformers import AutoTokenizer

# Tokenizer that belongs to the pre-trained distilbert checkpoint
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")


def preprocess_function(examples):
    """Tokenize the "text" field of examples, truncating to at most 512 tokens."""
    token_options = {"truncation": True, "max_length": 512}
    return tokenizer(examples["text"], **token_options)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [5]:
# Tokenize the FindItFixIt dataset with preprocess_function; batched=True is
# used to speed the mapping up
tokenized_fifids = fifids.map(preprocess_function, batched=True)

Map:   0%|          | 0/47585 [00:00<?, ? examples/s]

Map:   0%|          | 0/11896 [00:00<?, ? examples/s]

In [6]:
# Build the collator that assembles tokenized examples into batches (note:  using PyTorch)
# NOTE(review): DataCollatorWithPadding presumably pads each batch to a common
# length using the tokenizer passed in — confirm against the transformers docs
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [7]:
# Load an evaluation function
import evaluate
import numpy as np

accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score one evaluation pass with the accuracy metric.

    eval_pred unpacks into (predictions, labels).  The index of the largest
    value along axis 1 of the predictions is taken as the predicted class and
    compared against the labels.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [8]:
# Load the pretrained model
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Size the classification head from the label mapping (15 classes) and store
# both lookup directions in the model configuration
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Set training hyperparameters
# DataLoaderConfiguration is used at the bottom of this cell; it is not
# imported anywhere else in the notebook, so bring it into scope here.
from accelerate.utils import DataLoaderConfiguration

training_args = TrainingArguments(
    output_dir="fifi_classification",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch so the best epoch can be restored
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)

# Wire the model, the tokenized splits, the padding collator and the accuracy
# metric into a single Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_fifids["train"],
    eval_dataset=tokenized_fifids["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# NOTE(review): this configuration object is created but never handed to the
# Trainer above, so as written it does not affect training — confirm whether
# it is still needed.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [10]:
# Train the model using the trainer configured earlier; the returned
# TrainOutput is echoed by the notebook as the cell's last expression
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.6326,0.603122,0.796066


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6326,0.603122,0.796066
2,0.4962,0.583323,0.802875
3,0.4335,0.611337,0.801362
4,0.3552,0.632328,0.798672


TrainOutput(global_step=11900, training_loss=0.517760793221097, metrics={'train_runtime': 1967.0384, 'train_samples_per_second': 96.765, 'train_steps_per_second': 6.05, 'total_flos': 5005555242868890.0, 'train_loss': 0.517760793221097, 'epoch': 4.0})

In [11]:
# Push the model to the hub (the training arguments were created with
# push_to_hub=True and output_dir="fifi_classification")
trainer.push_to_hub()

CommitInfo(commit_url='https://huggingface.co/mjbeattie/fifi_classification/commit/0497ca17c0af480cb1e795d0df561931b659ec3e', commit_message='End of training', commit_description='', oid='0497ca17c0af480cb1e795d0df561931b659ec3e', pr_url=None, pr_revision=None, pr_num=None)

In [17]:
# Get the model from HuggingFace and classify an example
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

text = "There is a dead dog."

# Fetch the retrained tokenizer and model from the hub
tokenizer = AutoTokenizer.from_pretrained("mjbeattie/fifi_classification")
model = AutoModelForSequenceClassification.from_pretrained("mjbeattie/fifi_classification")

# Encode the text as PyTorch tensors and run a forward pass without gradients
inputs = tokenizer(text, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

# The position of the largest logit is the class id; look up its name in the
# mapping stored with the model
predicted_class_id = int(logits.argmax())
model.config.id2label[predicted_class_id]


'FAS-SAS-Dead Animal'

# Start here if the retrained model has been created and published to the HuggingFace hub

In [22]:
# Get the model from HuggingFace and classify some examples
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

texts = [
    "There is a dead dog.",
    "Somebody tagged the wall of my store",
    "There are tents blocking the sidewalk of my business",
    "The streetlight in front of my house is out",
]

# Fetch the retrained tokenizer and model from the hub
tokenizer = AutoTokenizer.from_pretrained("mjbeattie/fifi_classification")
model = AutoModelForSequenceClassification.from_pretrained("mjbeattie/fifi_classification")

# Encode all texts as one padded batch, truncating anything over 512 tokens
inputs = tokenizer(texts, truncation=True, padding=True, max_length=512, return_tensors="pt")

# Forward pass without gradients
with torch.no_grad():
    logits = model(**inputs).logits

# One predicted class id per text (largest logit in each row)
predicted_labels = logits.argmax(dim=1)

# Print each text next to its predicted class id
for text, predicted_label in zip(texts, predicted_labels):
    print(f"Text: {text} | Predicted Label: {predicted_label}")


Text: There is a dead dog. | Predicted Label: 14
Text: Somebody tagged the wall of my store | Predicted Label: 0
Text: There are tents blocking the sidewalk of my business | Predicted Label: 1
Text: The streetlight in front of my house is out | Predicted Label: 13


In [23]:
# Reclassify the service requests from prior to Unauthorized Encampments
import pandas as pd

file_path = 'drive/MyDrive/FindItFixIt/'
unlabelledf = 'unlabelled.csv'

# Work on a reproducible random sample of 10 of the prepared requests
unldf = pd.read_csv(file_path + unlabelledf)
unlsubset = unldf.sample(n=10, random_state=42)

servreqids = unlsubset['servreqid'].tolist()
texts = unlsubset['text'].tolist()

# Fetch the retrained tokenizer and model from the hub
tokenizer = AutoTokenizer.from_pretrained("mjbeattie/fifi_classification")
model = AutoModelForSequenceClassification.from_pretrained("mjbeattie/fifi_classification")

# Encode the sampled texts as one padded batch and run a forward pass
# without gradients
inputs = tokenizer(texts, truncation=True, padding=True, max_length=512, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

# One predicted class id per request, joined back to the sample on servreqid
predicted_labels = logits.argmax(dim=1)
relabelleddf = pd.DataFrame({'servreqid': servreqids, 'newlabel': predicted_labels})
unlsubset = unlsubset.merge(relabelleddf, how='inner', on='servreqid')


Unnamed: 0,servreqid,text,id,label
0,190226346,Sign down on sidewalk. SE corner of 75th St NE...,SDOT-Sign and Signal Maintenance,8
1,190229680,scl property closed sign,SDOT-Sign and Signal Maintenance,8
2,190230845,No parking signs on the northbound lane of Fai...,SDOT-Sign and Signal Maintenance,8
3,190233656,Stop sign down after collision,SDOT-Sign and Signal Maintenance,8
4,190234544,The street sign showing directions to Magnuson...,SDOT-Sign and Signal Maintenance,8
5,190234882,Bubbling water to make pool.,SDOT-Sign and Signal Maintenance,8
6,190236990,Light out at Aurora and Winnona,SDOT-Sign and Signal Maintenance,8
7,190242116,Crosswalk broken,SDOT-Sign and Signal Maintenance,8
8,190242725,Sign down,SDOT-Sign and Signal Maintenance,8
9,190250357,Pedestrian crossing button doesn't work.,SDOT-Sign and Signal Maintenance,8


In [None]:
# Add the new labels and IDs back to the sample and save
label2id = {
    'SPU-Graffiti Report': 0,
    'SEA-Unauthorized Encampment': 1,
    'SDOT-Abandoned Vehicle': 2,
    'SPU-Illegal Dumping Report': 3,
    'SPD-Parking Enforcement': 4,
    'SPU-Clogged Drains': 5,
    'SPR-Maintenance': 6,
    'CSB-General Inquiry': 7,
    'SDOT-Sign and Signal Maintenance': 8,
    'SPU-Public Litter Cans': 9,
    'SDOT-Shared Micromobility': 10,
    'SDOT-Pothole': 11,
    'SEA-Overgrown Vegetation': 12,
    'SCL-Streetlight Report': 13,
    'FAS-SAS-Dead Animal': 14
}

# Attach the predicted integer labels only if the sample does not carry them
# yet.  Merging a second frame that also has a 'newlabel' column would leave
# 'newlabel_x'/'newlabel_y' behind, and the merge on 'newlabel' below would
# then fail with a KeyError.
if 'newlabel' not in unlsubset.columns:
    relabelleddf = pd.DataFrame({
        'servreqid': servreqids,
        # plain Python ints, so the join key below compares as integers
        'newlabel': [int(label) for label in predicted_labels],
    })
    unlsubset = pd.merge(unlsubset, relabelleddf, on='servreqid', how='inner')

# Lookup table for the predictions: 'newid' is the type name, 'newlabel' the
# integer class
label2iddf = pd.DataFrame(label2id.items(), columns=['newid', 'newlabel'])

# Add the predicted type name next to the predicted integer and save
unlsubset = pd.merge(unlsubset, label2iddf, on='newlabel', how='inner')
savef = 'reclassified_fifi_reqs.csv'
unlsubset.to_csv(file_path + savef, index=False)