# findit_fixit_classification
## First Commit:  April 13, 2024
## Update:  June 11, 2024
## University of Oklahoma

This script reads in verbatims and classifications from Seattle findit/fixit data and prepares the data as a HuggingFace dataset.  It then uses the dataset to retrain an LLM classifier to recognize Service Request Types from detailed text.  The retrained model is used to reclassify data from the period of time prior to the inclusion of the Unauthorized Encampment service type value.

__Notes for this version:__ The previous version didn't include all potential request types in the ID-to-label mapping and consequently lost some records.  This problem is fixed.  Also, the SPD and SDOT designations in the Abandoned Vehicle service request type values are dropped, so reclassification of these items is not performed.  Model retraining is done on Google Colab.  However, the reclassification work in Step 3 of this script is not carried out here; instead, it is done in a separate script, _fifi_reclass_torch.py_, on OU OSCER.

In [None]:
# Mount Google Drive so the notebook can read and write the project files
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install datasets transformers evaluate sentencepiece accelerate huggingface_hub

Collecting datasets
  Downloading datasets-2.19.2-py3-none-any.whl (542 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/542.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m204.8/542.1 kB[0m [31m6.1 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m532.5/542.1 kB[0m [31m8.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.1/542.1 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate
  Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
     ━━━━━━━━━━━━━

In [None]:
# Check working directory (the cells below work out of the FindItFixIt folder
# on the mounted drive)
import os
os.getcwd()

'/content/drive/MyDrive/FindItFixIt'

In [None]:
%cd /content/drive/MyDrive/FindItFixIt

/content/drive/MyDrive/FindItFixIt


In [None]:
%ls

[0m[01;34mfifi_classification[0m/               labelled_test.csv   post_encamp_value_reqs.csv  unlabelled.csv
findit_fixit_classification.ipynb  labelled_train.csv  pre_encamp_value_reqs.csv


# Step 1: Start here if dataset is not already in HuggingFace hub

In [None]:
# Locations of the raw FindIt_FixIt exports that are read and transformed below
import pandas as pd

# Folder holding the CSV files (keep the trailing slash: file names are
# appended to it with +)
file_path = '/content/drive/MyDrive/FindItFixIt/'
# Source of the labelled training/test data (see the "Read the labelled CSV" cell)
labelledf = 'post_encamp_value_reqs.csv'
# Source of the records that are to be reclassified by the retrained model
unlabelledf = 'pre_encamp_value_reqs.csv'


In [None]:
# Load the labelled requests, keeping only the ID, free-text and type columns
column_mapping = {
    'Service Request ID': 'servreqid',
    'alldetails': 'text',
    'Service Request Type': 'origid'
}

# Every retained column is handled as a string
dtypes = {'servreqid': str,
          'text': str,
          'origid': str}

# Read -> select columns -> drop incomplete rows -> rename -> cast, in one chain
df = (
    pd.read_csv(file_path + labelledf)
    .loc[:, list(column_mapping)]
    .dropna()
    .rename(columns=column_mapping)
    .astype(dtypes)
)

# Collapse the SDOT/SPD Abandoned Vehicle variants into a single class;
# 'origid' keeps the raw value, 'id' holds the collapsed one
df['id'] = df['origid'].replace(['SDOT-Abandoned Vehicle', 'SPD-Abandoned Vehicle'], 'Abandoned Vehicle')

# Display the first few rows of the DataFrame
print(df.head())


   servreqid                                               text  \
0  250398107   It's almost midnight and I am still hearing m...   
1  250398152  Complaint came in 1 day ago. Vehicle would hav...   
2  250398300   RVs back where recently removed. Trash, sidew...   
3  250398410                          Duplicate to CSR #154229    
4  250398442                          Duplicate to CSR #154229    

                    origid                       id  
0      CSB-General Inquiry      CSB-General Inquiry  
1  SPD-Parking Enforcement  SPD-Parking Enforcement  
2      CSB-General Inquiry      CSB-General Inquiry  
3   SDOT-Abandoned Vehicle        Abandoned Vehicle  
4   SDOT-Abandoned Vehicle        Abandoned Vehicle  


In [None]:
# Map each Service Request Type name to its integer class number.
# Naming follows the HuggingFace model-config convention: label2id is
# name -> int and id2label is int -> name.  (In the dataframes the columns
# are named the other way round: 'id' holds the name, 'label' the integer.)
label2id = {
    'CSB-General Inquiry': 0,
    'FAS-CPD-Business Public Health Complaint': 1,
    'FAS-SAS-Dead Animal': 2,
    'SCL-Streetlight Report': 3,
    'Abandoned Vehicle': 4,
    'SDOT-Pothole': 5,
    'SDOT-Shared Micromobility': 6,
    'SDOT-Sign and Signal Maintenance': 7,
    'SEA-Overgrown Vegetation': 8,
    'SEA-Unauthorized Encampment': 9,
    'SPD-Parking Enforcement': 10,
    'SPR-Maintenance': 11,
    'SPU-Clogged Drains': 12,
    'SPU-Graffiti Report': 13,
    'SPU-Illegal Dumping Report': 14,
    'SPU-Public Litter Cans': 15
}

# Derive the reverse mapping from label2id instead of maintaining a second
# hand-written copy, so the two can never drift apart when a type is added
id2label = {class_id: name for name, class_id in label2id.items()}

# Report request types that have no entry in label2id: the inner merge below
# silently drops those rows, which is how an earlier version lost records
unmapped = sorted(set(df['id']) - set(label2id))
if unmapped:
    print('Warning: request types missing from label2id will be dropped:', unmapped)

# Lookup table with columns 'id' (request type name) and 'label' (integer class)
label2iddf = pd.DataFrame(label2id.items(), columns=['id', 'label'])

# Attach the integer label to every record
df = pd.merge(df, label2iddf, on='id', how='inner')

df.head()

Unnamed: 0,servreqid,text,origid,id,label
0,250398107,It's almost midnight and I am still hearing m...,CSB-General Inquiry,CSB-General Inquiry,0
1,250398300,"RVs back where recently removed. Trash, sidew...",CSB-General Inquiry,CSB-General Inquiry,0
2,250399387,Re-populated site. Homeless encampment. 1 ten...,CSB-General Inquiry,CSB-General Inquiry,0
3,250399851,Tents,CSB-General Inquiry,CSB-General Inquiry,0
4,250400121,Illegal camping on trails near golden gardens...,CSB-General Inquiry,CSB-General Inquiry,0


In [None]:
# Output file names (unlabelledsavef is consumed by the unlabelled-data cell)
trainsavef = 'labelled_train.csv'
testsavef = 'labelled_test.csv'
unlabelledsavef = 'unlabelled.csv'

# 80/20 split: draw the training rows with a fixed seed; whatever was not
# drawn becomes the test set
traindf = df.sample(frac=0.8, random_state=42)
testdf = df.drop(index=traindf.index)
print('Original row count =', df.shape, 'Train count =', traindf.shape, 'Test count =', testdf.shape)

# Persist both splits to drive
for split_df, split_name in ((traindf, trainsavef), (testdf, testsavef)):
    split_df.to_csv(file_path + split_name, index=False)

Original row count = (59481, 5) Train count = (47585, 5) Test count = (11896, 5)


In [None]:
# Prepare the unlabelled dataset with the same steps used for the labelled one
column_mapping = {
    'Service Request ID': 'servreqid',
    'alldetails': 'text',
    'Service Request Type': 'origid'
}

# Set the data types explicitly (all three columns are strings)
dtypes = {'servreqid': str,
          'text': str,
          'origid': str}

# Read -> keep the three source columns -> drop incomplete rows -> rename -> cast
unldf = (
    pd.read_csv(file_path + unlabelledf)
    .loc[:, list(column_mapping)]
    .dropna()
    .rename(columns=column_mapping)
    .astype(dtypes)
)

# Remove the SDOT vs SPD distinction for Abandoned Vehicle request types
unldf['id'] = unldf['origid'].replace(['SDOT-Abandoned Vehicle', 'SPD-Abandoned Vehicle'], 'Abandoned Vehicle')

# Add the integer labels (inner join: rows whose type is not in the lookup are dropped)
unldf = unldf.merge(label2iddf, on='id', how='inner')

# Display the first few rows of the DataFrame
print(unldf.head())

# Save the file back to drive
unldf.to_csv(file_path + unlabelledsavef, index=False)


   servreqid                                               text  \
0  190226346  Sign down on sidewalk. SE corner of 75th St NE...   
1  190229680                          scl property closed sign    
2  190230845  No parking signs on the northbound lane of Fai...   
3  190233656                    Stop sign down after collision    
4  190234544  The street sign showing directions to Magnuson...   

                             origid                                id  label  
0  SDOT-Sign and Signal Maintenance  SDOT-Sign and Signal Maintenance      7  
1  SDOT-Sign and Signal Maintenance  SDOT-Sign and Signal Maintenance      7  
2  SDOT-Sign and Signal Maintenance  SDOT-Sign and Signal Maintenance      7  
3  SDOT-Sign and Signal Maintenance  SDOT-Sign and Signal Maintenance      7  
4  SDOT-Sign and Signal Maintenance  SDOT-Sign and Signal Maintenance      7  


In [None]:
# Create a HuggingFace dataset from the train and test labelled data
from datasets import load_dataset
from huggingface_hub import login

# Read the HuggingFace access token from the Colab secrets store rather than
# hard-coding it in the notebook
from google.colab import userdata
hftoken = userdata.get('HF_TOKEN')

login(token=hftoken)

# Build a dataset with "train" and "test" splits from the two CSVs saved earlier
data_files = {"train": file_path + trainsavef, "test": file_path + testsavef}
fifidataset = load_dataset("csv", data_files=data_files)

# Publish dataset to HuggingFace (note that a write access token has to be loaded)
fifidataset.push_to_hub("finditfixit")

# Note:  I changed the data type for servreqid from int to string on the huggingface dataset card

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/48 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mjbeattie/finditfixit/commit/0253a8f237efd8b545c16fd560de44497173fe07', commit_message='Upload dataset', commit_description='', oid='0253a8f237efd8b545c16fd560de44497173fe07', pr_url=None, pr_revision=None, pr_num=None)

# Step 2: Start here if dataset is already loaded onto HuggingFace hub

In [None]:
from datasets import load_dataset

# Load dataset from HuggingFace hub
fifids = load_dataset('mjbeattie/finditfixit')

# View a record
# NOTE(review): this is not the last expression in the cell, so the notebook
# does not display it; wrap it in print() if the record should be shown
fifids["test"][0]

# Class mappings handed to the model configuration.
# label2id: Service Request Type name -> integer class number
# id2label: integer class number -> Service Request Type name
label2id = {
    'CSB-General Inquiry': 0,
    'FAS-CPD-Business Public Health Complaint': 1,
    'FAS-SAS-Dead Animal': 2,
    'SCL-Streetlight Report': 3,
    'Abandoned Vehicle': 4,
    'SDOT-Pothole': 5,
    'SDOT-Shared Micromobility': 6,
    'SDOT-Sign and Signal Maintenance': 7,
    'SEA-Overgrown Vegetation': 8,
    'SEA-Unauthorized Encampment': 9,
    'SPD-Parking Enforcement': 10,
    'SPR-Maintenance': 11,
    'SPU-Clogged Drains': 12,
    'SPU-Graffiti Report': 13,
    'SPU-Illegal Dumping Report': 14,
    'SPU-Public Litter Cans': 15
}

# Derive the reverse mapping from label2id instead of maintaining a second
# hand-written copy, so the two can never drift apart when a type is added
id2label = {class_id: name for name, class_id in label2id.items()}


Downloading readme:   0%|          | 0.00/1.32k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.37M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/743k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/47585 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11896 [00:00<?, ? examples/s]

In [None]:
from transformers import AutoTokenizer

# Tokenizer that matches the distilbert checkpoint fine-tuned below
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")


def preprocess_function(examples):
    """Tokenize the "text" field of ``examples``.

    Inputs longer than 512 tokens are truncated.  Returns whatever the
    module-level ``tokenizer`` returns for that field.
    """
    raw_text = examples["text"]
    return tokenizer(raw_text, truncation=True, max_length=512)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# Tokenize every split of the FindItFixIt dataset; batched=True hands
# preprocess_function batches of examples rather than one at a time, which
# speeds tokenization up
tokenized_fifids = fifids.map(preprocess_function, batched=True)

Map:   0%|          | 0/47585 [00:00<?, ? examples/s]

Map:   0%|          | 0/11896 [00:00<?, ? examples/s]

In [None]:
# Collator the Trainer uses to assemble batches (note:  using PyTorch).
# Padding is applied here, per batch, because tokenization above only
# truncates and leaves the examples at different lengths.
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Accuracy metric used to score the model during evaluation
import evaluate
import numpy as np

accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Return the accuracy of the predictions in ``eval_pred``.

    ``eval_pred`` unpacks into ``(predictions, labels)``.  The predicted
    class is the argmax along axis 1 of the predictions (presumably one
    score per class for each example -- confirm against the Trainer).
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
# Load the pretrained distilbert checkpoint for sequence classification.
# The class count is taken from id2label rather than hard-coded so it cannot
# fall out of step with the mappings when a request type is added or removed.
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Set training hyperparameters
training_args = TrainingArguments(
    # Local checkpoint folder
    output_dir="fifi_classification",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch so the best epoch can be restored.
    # NOTE(review): newer transformers releases rename evaluation_strategy to
    # eval_strategy -- confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # NOTE(review): no metric_for_best_model is set, so "best" presumably
    # means lowest validation loss rather than highest accuracy -- confirm
    load_best_model_at_end=True,
    push_to_hub=True,
)

# Wire the model, the tokenized train/test splits, the padding collator and
# the accuracy metric together
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_fifids["train"],
    # The "test" split doubles as the per-epoch evaluation set
    eval_dataset=tokenized_fifids["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)



In [None]:
# Train the model (fine-tunes with the settings in training_args; evaluation
# and checkpointing happen once per epoch)
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.6323,0.598683,0.797747
2,0.4925,0.586038,0.802707
3,0.4248,0.609174,0.800101
4,0.353,0.635266,0.800017


TrainOutput(global_step=11900, training_loss=0.5166572955476135, metrics={'train_runtime': 2147.4048, 'train_samples_per_second': 88.637, 'train_steps_per_second': 5.542, 'total_flos': 5005644489150048.0, 'train_loss': 0.5166572955476135, 'epoch': 4.0})

In [None]:
# Push the model to the hub (requires the write-enabled HuggingFace login)
trainer.push_to_hub()

CommitInfo(commit_url='https://huggingface.co/mjbeattie/fifi_classification/commit/2d048079a2d4c18d4645a5f961eebaed40bb7138', commit_message='End of training', commit_description='', oid='2d048079a2d4c18d4645a5f961eebaed40bb7138', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Sanity-check the published model: pull it back from the hub and classify
# a single sentence
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

text = "There is an abandoned vehicle at my house."

# Fetch the retrained tokenizer and model from the same hub repository
tokenizer = AutoTokenizer.from_pretrained("mjbeattie/fifi_classification")
model = AutoModelForSequenceClassification.from_pretrained("mjbeattie/fifi_classification")

# Encode the sentence and run a forward pass without tracking gradients
inputs = tokenizer(text, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
logits = outputs.logits

# Highest-scoring class number -> request type name (shown as the cell result)
predicted_class_id = logits.argmax().item()
model.config.id2label[predicted_class_id]


'Abandoned Vehicle'

# Step 3: Start here if the retrained model has been created and published to the HuggingFace hub

In [None]:
# Pull the retrained model from HuggingFace and classify a few hand-written
# examples as a smoke test
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

texts = ["There is a dead dog.", "Somebody tagged the wall of my store", "There are tents blocking the sidewalk of my business",
         "The streetlight in front of my house is out"]

# Fetch the retrained tokenizer and model from the hub
tokenizer = AutoTokenizer.from_pretrained("mjbeattie/fifi_classification")
model = AutoModelForSequenceClassification.from_pretrained("mjbeattie/fifi_classification")

# Encode all examples as one padded batch, truncating anything over 512 tokens
inputs = tokenizer(texts, truncation=True, padding=True, max_length=512, return_tensors="pt")

# Forward pass without gradient tracking
with torch.no_grad():
    outputs = model(**inputs)
logits = outputs.logits

# One predicted class number per example
predicted_labels = torch.argmax(logits, dim=1)

# Print each example next to its predicted class number
for text, label in zip(texts, predicted_labels):
    print(f"Text: {text} | Predicted Label: {label}")


Text: There is a dead dog. | Predicted Label: 2
Text: Somebody tagged the wall of my store | Predicted Label: 13
Text: There are tents blocking the sidewalk of my business | Predicted Label: 9
Text: The streetlight in front of my house is out | Predicted Label: 3


In [None]:
def append_line_to_file(file_path, line):
    """Append ``line`` to the file at ``file_path``, creating it if needed.

    A newline is written *before* the new entry whenever the file already
    has content, so entries are newline-separated and no newline is added
    after the last one.
    """
    with open(file_path, 'a+') as handle:
        # Jump to the end of the file (whence=2) so tell() reports its size
        handle.seek(0, 2)
        separator = '\n' if handle.tell() > 0 else ''
        handle.write(separator + line)

In [None]:
# Shuffle the unlabelled records and save to a new file so that it can be
# chunked and reclassified
# Imported here because Step 3 can be the entry point of a fresh session and
# the other pandas imports live in cells that may not have been run yet
import pandas as pd

file_path = ''
unlabelledf = 'unlabelled.csv'

unldf = pd.read_csv(file_path + unlabelledf)
# Fixed seed: re-running this cell reproduces the same order, so block
# boundaries chosen for the chunked reclassification stay valid
shuffled_unldf = unldf.sample(frac=1, random_state=42).reset_index(drop=True)
savef = 'shuffled_unlabelled.csv'
shuffled_unldf.to_csv(file_path + savef, index=False)

# Count the rows in the dataset to estimate time to complete
print("Number of rows in the DataFrame:", shuffled_unldf.shape[0])


In [None]:
# Reclassify one block of the service requests from prior to Unauthorized
# Encampments.  Edit blocknum/startrow/endrow and re-run for each block.
# Uses tokenizer, model, torch and append_line_to_file from the earlier
# Step 3 cells.
import time

import pandas as pd

# Record the run times to a log file
logf = 'fifi_running_times.txt'
start_time = time.time()

# Read the shuffled file in for reclassification
file_path = ''
unlabelledf = 'shuffled_unlabelled.csv'

unldf = pd.read_csv(file_path + unlabelledf)

# Rows startrow..endrow (both inclusive) form one block; shrink the range if
# the session runs short of memory or time
blocknum = 0
startrow = 0
endrow = 999
# iloc excludes its stop position, so add 1 to keep endrow in the block
# (slicing to endrow dropped the last row of every block).  The fresh
# 0..n-1 index matches what the former merge-based join produced.
unlsubset = unldf.iloc[startrow:endrow + 1].reset_index(drop=True)
# Log the number of rows actually classified, not the nominal block size
samplesize = len(unlsubset)

# read_csv turns cells such as "NA" or "null" into NaN floats, which the
# tokenizer rejects; force every entry back to a string
texts = unlsubset['text'].astype(str).tolist()

# Classify in small batches: padding is then only to the longest text of
# each batch, and activations are held for batch_size texts at a time
# instead of the whole block
batch_size = 32
predicted_labels = []
with torch.no_grad():
    for offset in range(0, len(texts), batch_size):
        batch_texts = texts[offset:offset + batch_size]
        inputs = tokenizer(batch_texts, truncation=True, padding=True, max_length=512, return_tensors="pt")
        logits = model(**inputs).logits
        predicted_labels.extend(torch.argmax(logits, dim=1).tolist())

# Record the end time
end_time = time.time()

# Calculate the elapsed time
elapsed_time = end_time - start_time
runtime_entry = f"Blocknum: {blocknum} Startrow: {startrow} Endrow: {endrow} Classified texts: {samplesize} Elapsed time: {elapsed_time}"
append_line_to_file(logf, runtime_entry)

print(f"Elapsed time: {elapsed_time:.6f} seconds")

# Attach the predictions positionally; a join on servreqid would multiply
# rows if an ID ever appeared twice in a block
unlsubset = unlsubset.assign(newlabel=predicted_labels)


Elapsed time: 165.903333 seconds


In [None]:
# Translate the predicted class numbers back to request type names and save
# the block.  The position of a name in this list is its class number.
request_types = [
    'CSB-General Inquiry',
    'FAS-CPD-Business Public Health Complaint',
    'FAS-SAS-Dead Animal',
    'SCL-Streetlight Report',
    'Abandoned Vehicle',
    'SDOT-Pothole',
    'SDOT-Shared Micromobility',
    'SDOT-Sign and Signal Maintenance',
    'SEA-Overgrown Vegetation',
    'SEA-Unauthorized Encampment',
    'SPD-Parking Enforcement',
    'SPR-Maintenance',
    'SPU-Clogged Drains',
    'SPU-Graffiti Report',
    'SPU-Illegal Dumping Report',
    'SPU-Public Litter Cans',
]
label2id = {name: class_id for class_id, name in enumerate(request_types)}

# Lookup table: 'newid' holds the request type name, 'newlabel' its class number
label2iddf = pd.DataFrame(list(label2id.items()), columns=['newid', 'newlabel'])

# Attach the predicted request type name to each row, then write the block
# to its own numbered file
unlsubset = unlsubset.merge(label2iddf, on='newlabel', how='inner')
savef = f'reclassified_fifi_reqs_{blocknum}.csv'
unlsubset.to_csv(file_path + savef, index=False)

In [None]:
# Read in the reclassified files and consolidate into one dataframe
flead = 'reclassified_fifi_reqs_'
fclose = '.csv'
# Number of block files to read (blocks are numbered 0 .. numblocks - 1)
numblocks = 32

# Collect every block first and concatenate once at the end; concatenating
# inside the loop re-copies the accumulated frame on every iteration
blockdfs = []
for blocknum in range(numblocks):
    fname = flead + str(blocknum) + fclose
    nextclassdf = pd.read_csv(fname)
    if blocknum == 0:
        print(nextclassdf.shape)
    else:
        print('Reading in blocknum', blocknum, 'Shape is:', nextclassdf.shape)
    blockdfs.append(nextclassdf)

finalreclassdf = pd.concat(blockdfs, ignore_index=True)

# Total number of rows across all blocks
rowcount = finalreclassdf.shape[0]

finalreclassdf.head()

(999, 6)
Reading in blocknum 1 Shape is: (999, 6)
Reading in blocknum 2 Shape is: (999, 6)
Reading in blocknum 3 Shape is: (499, 6)
Reading in blocknum 4 Shape is: (499, 6)
Reading in blocknum 5 Shape is: (499, 6)
Reading in blocknum 6 Shape is: (499, 6)
Reading in blocknum 7 Shape is: (499, 6)
Reading in blocknum 8 Shape is: (499, 6)
Reading in blocknum 9 Shape is: (999, 6)
Reading in blocknum 10 Shape is: (499, 6)
Reading in blocknum 11 Shape is: (499, 6)
Reading in blocknum 12 Shape is: (749, 6)
Reading in blocknum 13 Shape is: (599, 6)
Reading in blocknum 14 Shape is: (649, 6)
Reading in blocknum 15 Shape is: (749, 6)
Reading in blocknum 16 Shape is: (649, 6)
Reading in blocknum 17 Shape is: (499, 6)
Reading in blocknum 18 Shape is: (599, 6)
Reading in blocknum 19 Shape is: (499, 6)
Reading in blocknum 20 Shape is: (599, 6)
Reading in blocknum 21 Shape is: (599, 6)
Reading in blocknum 22 Shape is: (599, 6)
Reading in blocknum 23 Shape is: (699, 6)
Reading in blocknum 24 Shape is: (

Unnamed: 0,servreqid,text,id,label,newlabel,newid
0,225303109,Unauthorized camping,CSB-General Inquiry,7,7,CSB-General Inquiry
1,208175246,Broken glass and a pile of cigarette butts un...,CSB-General Inquiry,7,7,CSB-General Inquiry
2,224720640,Due to 72 hr moratorium by Mayor Durkan this r...,SPD-Parking Enforcement,4,7,CSB-General Inquiry
3,228696411,The tree pictured has some very large limbs t...,CSB-General Inquiry,7,7,CSB-General Inquiry
4,216719653,The street cleaner may have pushed them away f...,SEA-Overgrown Vegetation,12,7,CSB-General Inquiry


In [None]:
# Sanity check: count distinct values per column.  servreqid should have one
# distinct value per row, and newlabel/newid show how many request types the
# model assigned compared with the original id/label columns.
unique_counts = finalreclassdf.nunique()
print("No. of unique values in each column:\n", unique_counts)

No. of unique values in each column:
 servreqid    20000
text         16840
id              14
label           14
newlabel        15
newid           15
dtype: int64


In [None]:
# Save the final reclassified data into a file on disk (file_path was set to
# '' in the Step 3 cells above, so this is written to the working directory)
fname = 'final_reclassified_fifi_reqs.csv'
finalreclassdf.to_csv(file_path + fname, index=False)