In [6]:
from finetune import Classifier
import pandas
from sklearn.model_selection import train_test_split
import time
import re, string

Get Data

In [7]:
filePath = "../Datasets/combined_trainingdata_20181013.tsv"
data = pandas.read_csv(filePath,sep='\t')
print(data.shape)
print(data.loc[0])

(1278129, 10)
index                                                             0
COMPLAINT_ID                                        US_CHICAGO_1725
CITY                                                     US_CHICAGO
COMPLAINT DATE                                           10/03/2011
DEPT_311                                         health_environment
CODE_311                           permits issued by doe work order
CATEGORY_MAIN                                           environment
CATEGORY_SUB                                    environment_general
COMPLAINT_1       QUESTIONABLE BUSINESS PRACTICES REGARDING OILS...
COMPLAINT_2       [INSPECTION LOG #: 1723 03-OCT-11 18:55:00] TH...
Name: 0, dtype: object


Remove rows with no complaint 1 data. Create Complaint column that has (complaint 2 or complaint 1).
We are also doing pre-processing. This involves removing punctuation, making everything lowercase, replacing numbers with N, cutting everything after 512 chars, concatenating two types of complaints when they are for Chicago.

In [8]:
print(data.columns[data.isna().any()].tolist())
print(data[data.COMPLAINT_1.isna()].shape)
print(data[data.COMPLAINT_2.isna()].shape)
print(data[(data.COMPLAINT_1.isna()) & (data.COMPLAINT_2.isna())].shape)

['COMPLAINT DATE', 'DEPT_311', 'COMPLAINT_1', 'COMPLAINT_2']
(14025, 10)
(964291, 10)
(13579, 10)


In [9]:
dataFiltered = data.dropna(subset = ["COMPLAINT_1"])
print(dataFiltered[(dataFiltered.COMPLAINT_1.isna())].shape)
print(dataFiltered[(dataFiltered.COMPLAINT_1.isna()) & (dataFiltered.COMPLAINT_2.isna())].shape)
print(dataFiltered[dataFiltered.COMPLAINT_1 == ""].shape)

(0, 10)
(0, 10)
(0, 10)


In [10]:
translator = str.maketrans('', '', string.punctuation) # To remove punctuation

def preProcess(complaintStart):
    complaint = complaintStart[:512] # cut to 512 characters max
    complaint = re.sub("\d","N", complaint) # remove numbers
    complaint = complaint.lower().translate(translator) # lower case and remove the punctuation
    complaint = complaint.replace("\n"," ").strip() # remove starting and trailing white spaces
    if re.search('[a-zA-Z]', complaint) is None:# if there are no letters in the complaint, return empty, will be removed in later processing
        return ""
    return complaint

def getComplaint(row):
    complaint2 = row.get("COMPLAINT_2")
    if not pandas.isnull(complaint2):
        if "[INSPECTION LOG #:" in complaint2: # Remove inspection log section from C2
            complaintStrippedList = complaint2.split("]")[1:]
            complaintFinal = "]".join(complaintStrippedList)
        else:
            complaintFinal = complaint2
        if row.get("CITY")=="US_CHICAGO": # if Chicago, concatenate the two
            complaintFinal = row.get("COMPLAINT_1") + " "+ complaintFinal
        complaintProcessed = preProcess(complaintFinal)
        if complaintProcessed == "" or re.search('[a-zA-Z]', complaintProcessed) is None: # if nothing or no letters
            return preProcess(row.get("COMPLAINT_1"))
        return complaintProcessed
    complaintProcessed = preProcess(row.get("COMPLAINT_1"))
    return complaintProcessed

results = dataFiltered.apply(lambda row: getComplaint (row),axis=1)
print(results[results.isna()].shape)

(0,)


In [11]:
dataFiltered["complaint"] = results
print(dataFiltered[dataFiltered.complaint.isna()].shape)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


(0, 11)


Strip white spaces from CATEGORY_SUB

In [12]:
dataFiltered["CATEGORY_SUB"] = dataFiltered["CATEGORY_SUB"].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Filter data only to ones with at least 10 characters

In [13]:
mask = (dataFiltered["complaint"].str.len() >=10)
dataFiltered = dataFiltered.loc[mask]
print(dataFiltered.shape)

(1084456, 11)


Filter the classes to only ones with more than 100 samples

In [14]:
print(dataFiltered.CATEGORY_SUB.unique())
aggregation = {"complaint":"count"}
aggregatedByLabel = dataFiltered.groupby("CATEGORY_SUB").agg(aggregation)

['environment_general' 'environment_dumping' 'environment_air_pollution'
 'environment_abandoned_site' 'publicorder_noise_complaint'
 'environment_asbestos' 'environment_hazardous_material'
 'planning_general' 'infrastructure_general' 'environment_water_pollution'
 'environment_recycling' 'street_repair' 'housing_general'
 'environment_overgrowth' 'housing_health_code' 'street_sewar'
 'environment_garbage_collection' 'housing_safety'
 'environment_abandoned_vehicle' 'governance_signage' 'street_general'
 'environment_litter' 'street_urgent_repair' 'street_sidewalk'
 'environment_dead_animal' 'street_cleaning' 'environemnt_dead_animal'
 'vandalism_general' 'planning_construction' 'planning_unsafe_environment'
 'infrastructure_water' 'infrastructure_water_repair'
 'environnment_hazardous_material' 'vandalism_graffiti' 'street_lighting'
 'publichealth_general' 'publichealth_school_hygiene' 'fire_general'
 'street_parking' 'housing_mold' 'publichealth_pests' 'housing_pests'
 'street_slippe

In [15]:
print(aggregatedByLabel.sort_values(("complaint")))

                                      complaint
CATEGORY_SUB                                   
publicorder_suspicious_behavior               1
publicorder_drug_activity                     6
publicorder_dangerous_driving                 7
environnment_hazardous_material              15
publichealth_public_building_hygiene         20
publictransit_bus_service                    26
fire_code_violation                          27
environment_water_pollution                  34
publichealth_school_hygiene                  38
goveranance_general                          49
governance_community                         61
governance_it                                81
fire_risks                                  103
housing_pests                               157
environemnt_dead_animal                     268
street_roadkill                             415
planning_construction                       422
vandalism_general                           453
publichealth_animal_feces               

In [16]:
goodLabels = aggregatedByLabel[aggregatedByLabel["complaint"]>100]
goodLabelsList = goodLabels.index.tolist()
print(dataFiltered.shape)
dataGoodLabels = dataFiltered[dataFiltered["CATEGORY_SUB"].isin(goodLabelsList)]
print(dataGoodLabels.shape)

(1084456, 11)
(1084091, 11)


Get SUB to MAIN mapping

In [34]:
labelsMap = dataGoodLabels[["CATEGORY_MAIN", "CATEGORY_SUB"]].drop_duplicates()
labelsMap = labelsMap.set_index("CATEGORY_SUB").to_dict()["CATEGORY_MAIN"]
labelsMap

{'environemnt_dead_animal': 'environment',
 'environment_abandoned_site': 'environment',
 'environment_abandoned_vehicle': 'environment',
 'environment_air_pollution': 'environment',
 'environment_asbestos': 'environment',
 'environment_dead_animal': 'environment',
 'environment_dumping': 'environment',
 'environment_garbage_collection': 'environment',
 'environment_general': 'environment',
 'environment_hazardous_material': 'environment',
 'environment_litter': 'environment',
 'environment_overgrowth': 'environment',
 'environment_recycling': 'environment',
 'fire_equipment_broken': 'fire',
 'fire_general': 'fire',
 'fire_risks': 'fire',
 'governance_general': 'governance',
 'governance_parks_and_rec': 'governance',
 'governance_signage': 'governance',
 'housing_general': 'housing',
 'housing_health_code': 'housing',
 'housing_mold': 'housing',
 'housing_pests': 'housing',
 'housing_safety': 'housing',
 'infrastructure_general': 'infrastructure',
 'infrastructure_power': 'infrastructu

Prepare Training Data

In [35]:
trainingData = dataGoodLabels[["complaint", "CATEGORY_SUB"]]
print(type(trainingData))
print(trainingData.shape)

<class 'pandas.core.frame.DataFrame'>
(1084091, 2)


Get stratified sample to test with (use 10% of the data)

In [36]:
_, sampleX, _, sampleY = train_test_split(trainingData.complaint, trainingData.CATEGORY_SUB, test_size=0.1, random_state=42, stratify=trainingData.CATEGORY_SUB)
print(sampleY.shape)

(108410,)


Checking stratification (it seems to work)

In [37]:
stratSampleDF = pandas.concat([sampleX,sampleY], axis = 1)
stratSampleDF.head()
print(stratSampleDF.CATEGORY_SUB.unique())
aggregationStrat = {"complaint":"count"}
aggregatedByLabelStrat = stratSampleDF.groupby("CATEGORY_SUB").agg(aggregationStrat)
print(aggregatedByLabelStrat.sort_values(("complaint")))

['environment_garbage_collection' 'street_repair' 'housing_general'
 'street_urgent_repair' 'street_lighting' 'environment_overgrowth'
 'environment_dumping' 'fire_general' 'street_drainage' 'planning_general'
 'street_general' 'governance_signage' 'environment_air_pollution'
 'environment_recycling' 'housing_health_code' 'vandalism_graffiti'
 'environment_abandoned_site' 'publichealth_restaurant_hygiene'
 'infrastructure_water_repair' 'infrastructure_general' 'street_slippery'
 'street_sewar' 'environment_hazardous_material'
 'environment_abandoned_vehicle' 'street_sidewalk' 'infrastructure_water'
 'publichealth_general' 'publicorder_general' 'street_parking'
 'housing_mold' 'environment_litter' 'environment_asbestos'
 'governance_general' 'environment_dead_animal' 'housing_safety'
 'infrastructure_power' 'publicorder_noise_complaint'
 'environment_general' 'street_cleaning' 'planning_unsafe_environment'
 'fire_risks' 'governance_parks_and_rec' 'vandalism_general'
 'publichealth_pests

Split sample into train/test (80/20)

In [38]:
trainX, testX, trainY, testY = train_test_split(sampleX, sampleY, test_size=0.2, random_state=42, stratify=sampleY)
print(trainX.shape)

(86728,)


Train model in finetune

In [55]:
start = time.time()
model = Classifier(max_length=512, val_interval=3000, verbose = True)               # Load base model
model.fit(trainX.tolist(), trainY.tolist())          # Finetune base model on custom data
duration = time.time()-start
print("It took :"+str(duration)+ " seconds")

Instructions for updating:
Shapes are always computed; don't use the compute_shapes as it has no effect.


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Instructions for updating:
Use `tf.variables_initializer` instead.


Epoch 0:   0%|                             | 15/41198 [00:12<2:57:57,  3.86it/s]

KeyboardInterrupt: 

Save Model

In [None]:
model.save("combined_model_20181018")                   # Serialize the model to disk

Test model and see prediction

In [39]:
model = Classifier.load("../models/combined_model_20181018")
print(model)
predictions = model.predict(testX.tolist())

<finetune.classifier.Classifier object at 0x7fbab7c25ef0>
Instructions for updating:
Shapes are always computed; don't use the compute_shapes as it has no effect.




In [40]:
mainPredictions = []
for pred in predictions:
    mainPredictions.append(labelsMap[pred])

mainTestY = []
for testLabel in testY.tolist():
    mainTestY.append(labelsMap[testLabel])
    
correctMain = 0
countMain = 0
for i, complaint in enumerate(testX.tolist()):
    correctMain += int(mainPredictions[i] == mainTestY[i])
    countMain +=1
print(correctMain)
print(countMain)
print("Accuracy on Main: "+str(correctMain*1.0/countMain))

18102
21682
Accuracy on Main: 0.834886080619869


In [41]:


correct = 0
count = 0
testYList = testY.tolist()
for i, complaint in enumerate(testX.tolist()):
    correct += int(predictions[i] == testYList[i])
    count +=1
print(correct)
print(count)
print("Accuracy on Sub: "+str(correct*1.0/count))

16240
21682
Accuracy on Sub: 0.7490083940595886
