In [1]:
import os
import re
import string
import time
from pprint import pprint

import nltk
import numpy as np
import pandas as pd
nltk.download('stopwords')
from nltk.corpus import stopwords

# from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# import plotly.graph_objs as go
# init_notebook_mode(connected=True)

# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = "all"

from sklearn.model_selection import train_test_split
from finetune import Classifier

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alexl\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


  return f(*args, **kwds)
  return f(*args, **kwds)


# EDA of Full Training Dataset
Adapted from `combined-dataset-full-finetune.ipynb`

In [2]:
# Load the combined training data (tab-separated).
# Forward slashes resolve on Windows as well as POSIX, unlike the backslash form.
dat = pd.read_csv('data/combined_trainingdata_20181013.tsv', sep='\t', low_memory=False)

In [3]:
# Normalise column names: lowercase, with spaces turned into underscores
dat.columns = [name.lower().replace(' ', '_') for name in dat.columns]
dat.columns

Index(['index', 'complaint_id', 'city', 'complaint_date', 'dept_311',
       'code_311', 'category_main', 'category_sub', 'complaint_1',
       'complaint_2'],
      dtype='object')

# Clean Data
Remove rows where both complaint fields are empty. Create a `merged_complaint` column by joining `complaint_1` and `complaint_2` with a space (this matters most for Chicago records, which can carry both complaint types). We also pre-process the text: strip known boilerplate phrases, remove digits, replace punctuation with spaces, lowercase everything, drop English stop words, discard very short complaints, and keep only the first 100 words of each complaint.

## New Changes

In [4]:
dat.shape

(1278129, 10)

In [5]:
# Replace missing complaint text with empty strings in both complaint columns
for complaint_col in ('complaint_1', 'complaint_2'):
    dat[complaint_col] = dat[complaint_col].fillna('')

In [6]:
# Merge complaint_1 & _2 into one space-separated text column.
# Element-wise string concatenation yields the same text as joining each row
# with ' ', without a Python-level function call per row.
dat['merged_complaint'] = dat['complaint_1'] + ' ' + dat['complaint_2']

In [7]:
# Character count of each merged complaint
merged_text = dat['merged_complaint']
dat['complaint_len'] = merged_text.map(len)

In [8]:
dat.shape

(1278129, 12)

In [9]:
# Drop blank tickets: a length of exactly 1 means the merged text is only the
# single space inserted when the two complaint columns were joined
has_text = dat['complaint_len'] != 1
dat = dat.loc[has_text]

In [10]:
dat.shape

(1264550, 12)

In [11]:
def clean_specifics(complaint):
    """Strip known boilerplate fragments from a raw complaint string.

    Removes, in order:
      * the fixed web-intake sentence ("Request entered through the Web. ..."),
      * any "Transfer:<anything>/<UPPERCASE LETTERS>" routing note,
      * every occurrence of the literal tokens "ACCT " and "RTC ".

    Args:
        complaint (str): raw complaint text.

    Returns:
        str: the text with those fragments deleted; surrounding whitespace is
        left untouched.
    """
    # Fixed sentence: plain str.replace, so the '.' characters are matched
    # literally rather than as regex wildcards.
    complaint = complaint.replace(
        'Request entered through the Web. Refer to Intake Questions for further description.',
        '')
    # Greedy on purpose: everything from "Transfer:" up to the LAST "/CODE" is dropped.
    complaint = re.sub(r'Transfer:.+/[A-Z]+', '', complaint)
    complaint = complaint.replace('ACCT ', '')
    complaint = complaint.replace('RTC ', '')
    return complaint

In [12]:
# Preprocess `merged_complaint`
cachedStopWords = stopwords.words("english")
# Set copy of the stop words: membership is tested once per word of every complaint
_stopWordSet = frozenset(cachedStopWords)

def preprocess(complaint):
    """Normalise one raw complaint string for modelling.

    Steps, in order: strip known boilerplate via ``clean_specifics``, delete
    digits, replace every character that is neither a word character nor
    whitespace with a space, lowercase, and drop English stop words. The
    surviving words are joined with single spaces.

    Args:
        complaint (str): raw merged complaint text.

    Returns:
        str: cleaned text; an empty string if no word survives.
    """
    complaint = clean_specifics(complaint)
    complaint = re.sub(r'\d', '', complaint) # Remove numbers completely
    complaint = re.sub(r'[^\w\s]', ' ', complaint) # Sub punctuation with space
    complaint = complaint.lower()
    # split() with no argument already trims the ends and collapses whitespace
    # runs, so no separate strip / duplicate-space pass is needed.
    return ' '.join(word for word in complaint.split() if word not in _stopWordSet)

In [13]:
# Run the text clean-up over every merged complaint and store it back in place
cleaned_complaints = dat['merged_complaint'].map(preprocess)
dat['merged_complaint'] = cleaned_complaints

In [14]:
dat.shape

(1264550, 12)

In [15]:
# Character count again, now measured on the preprocessed text
preprocessed_text = dat['merged_complaint']
dat['complaint_len'] = preprocessed_text.map(len)

In [16]:
# Get word lengths: number of space-separated tokens (an empty string counts as 1).
# Mapping over the single text column avoids building a row object per record.
dat['word_len'] = dat['merged_complaint'].map(lambda text: len(text.split(' ')))

In [17]:
dat.shape

(1264550, 13)

In [18]:
# Remove zero and one char length tickets
too_short = (dat['complaint_len'] == 0) | (dat['complaint_len'] == 1)
dat = dat.loc[~too_short]

In [19]:
dat.shape

(1014309, 13)

In [20]:
# Remove two char length tickets, keeping the one meaningful exception: 'tv'
two_chars_not_tv = (dat['complaint_len'] == 2) & (dat['merged_complaint'] != 'tv')
dat = dat.loc[~two_chars_not_tv]

In [21]:
dat.shape

(1013776, 13)

In [22]:
# Remove three char length tickets
is_three_chars = dat['complaint_len'] == 3
dat = dat.loc[~is_three_chars]

In [23]:
dat.shape

(1004413, 13)

In [24]:
# Remove single-word tickets whose text contains 'acct'
single_word = dat['word_len'] == 1
mentions_acct = dat['merged_complaint'].str.contains('acct')
dat = dat.loc[~(single_word & mentions_acct)]

In [25]:
dat.shape

(1003871, 13)

In [26]:
# Share of tickets whose cleaned text is at most 500 characters long
len(dat.loc[dat['complaint_len'] < 501]) / len(dat)

0.9932112791384551

In [27]:
# Share of tickets whose cleaned text is at most 100 words long
len(dat.loc[dat['word_len'] < 101]) / len(dat)

0.9973711761770188

In [28]:
# Keep only the first 100 words of each complaint.
# Complaints with 100 words or fewer pass through unchanged, because splitting
# on ' ' and re-joining with ' ' reproduces the original string; no per-row
# length check (and no row-wise DataFrame.apply) is needed.
dat['merged_complaint'] = dat['merged_complaint'].map(lambda text: ' '.join(text.split(' ')[:100]))

In [29]:
# Recalc word lens on the truncated text (space-separated token count).
# Mapping over the single text column avoids building a row object per record.
dat['word_len'] = dat['merged_complaint'].map(lambda text: len(text.split(' ')))

# Short Complaints

In [30]:
dat.shape

(1003871, 13)

In [31]:
dat.shape

(1003871, 13)

In [32]:
# Frequency of each distinct one-word complaint
one_word_complaints = dat.loc[dat['word_len'] == 1, 'merged_complaint']
one_word_complaints.value_counts()

sticker          3543
tires            3055
pothole          2797
couch            2307
potholes         1983
stickers         1970
mattress         1300
roaches          1185
graffiti         1031
flash             955
tags              927
tire              728
tv                703
mold              700
heat              656
sorted            568
chair             512
matress           506
cleaning          505
mice              500
mattresses        392
rehab             306
dresser           302
water             288
recliner          253
carpet            247
heavy             242
trash             235
stump             234
carts             226
                 ... 
polthole            1
hall                1
chaircushions       1
focus               1
thing               1
carousel            1
audi                1
vergrown            1
wheelbarrel         1
berning             1
paleolicious        1
couchcouch          1
spouts              1
headboards          1
boats     

In [33]:
# Frequency of each distinct complaint that is exactly six characters long
six_char_complaints = dat.loc[dat['complaint_len'] == 6, 'merged_complaint']
six_char_complaints.value_counts()

sorted    568
carpet    247
toilet    221
permit    189
signal    176
litter    148
chairs    143
inside    109
missed     82
washer     75
couchs     53
entire     38
family     37
issues     36
vacate     33
vacant     31
weight     28
blight     20
church     20
frames     19
tables     18
yellow     17
closet     16
inches     15
school     15
lights     14
rivate     13
fridge     13
gallon     13
garage     13
         ... 
lounge      1
parcel      1
auto n      1
roches      1
analog      1
unit b      1
inadeq      1
set pm      1
masala      1
cuches      1
toyota      1
outage      1
frisch      1
proper      1
detail      1
ford e      1
zeored      1
larosa      1
caribe      1
camero      1
osrted      1
alamir      1
jehehe      1
stereo      1
f home      1
screen      1
grecos      1
stcker      1
schiff      1
maress      1
Name: merged_complaint, Length: 326, dtype: int64

# Prep Training Data

In [35]:
dat.shape

(1003871, 13)

In [55]:
# Inspect category_sub frequencies, rarest first, to spot labels too small to train on.
# (Display only; nothing is filtered here.)
dat['category_sub'].value_counts(ascending=True)

publicorder_suspicious_behavior              1
publicorder_drug_activity                    6
publicorder_dangerous_driving                7
environnment_hazardous_material             15
publichealth_public_building_hygiene        20
fire_code_violation                         21
publictransit_bus_service                   26
governance_it                               26
environment_water_pollution                 34
publichealth_school_hygiene                 39
goveranance_general                         50
governance_community                        63
fire_risks                                 104
housing_pests                              158
environemnt_dead_animal                    270
planning_construction                      422
street_roadkill                            427
vandalism_general                          454
fire_equipment_broken                      660
publichealth_animal_feces                  804
publichealth_animal                        833
infrastructur

In [56]:
# Keep only rows whose category_sub label is frequent enough to learn from.
# A label is kept when it occurs strictly more than MIN_LABEL_COUNT times.
MIN_LABEL_COUNT = 100

# Count once and reuse the result for both the mask and the label list
label_counts = dat['category_sub'].value_counts(ascending=True)
good_label_mask = label_counts.values > MIN_LABEL_COUNT
good_labels_list = label_counts[good_label_mask].index.tolist()

# NOTE(review): the counts displayed earlier include misspelled variants of the
# same label (e.g. 'environemnt_dead_animal' vs 'environment_dead_animal'),
# which are counted and filtered as separate classes here. Confirm whether
# they should be merged upstream.
dat = dat.loc[dat['category_sub'].isin(good_labels_list)]
dat.shape

(1003563, 13)

In [57]:
# Sub to Main mapping: one entry per category_sub, valued by its category_main
label_pairs = dat[["category_main", "category_sub"]].drop_duplicates()
labels_map = dict(zip(label_pairs["category_sub"], label_pairs["category_main"]))
labels_map

{'environment_general': 'environment',
 'environment_dumping': 'environment',
 'environment_air_pollution': 'environment',
 'environment_abandoned_site': 'environment',
 'publicorder_noise_complaint': 'public_order',
 'environment_asbestos ': 'environment',
 'environment_hazardous_material': 'environment',
 'planning_general': 'planning',
 'infrastructure_general': 'infrastructure',
 'environment_recycling': 'environment',
 'street_repair': 'street',
 'housing_general': 'housing',
 'environment_overgrowth': 'environment',
 'housing_health_code': 'housing',
 'street_sewar': 'street',
 'environment_garbage_collection': 'environment',
 'housing_safety': 'housing',
 'environment_abandoned_vehicle': 'environment',
 'governance_signage': 'governance',
 'street_general': 'street',
 'environment_litter': 'environment',
 'street_urgent_repair': 'street',
 'street_sidewalk': 'street',
 'environment_dead_animal': 'environment',
 'street_cleaning': 'street',
 'environemnt_dead_animal': 'environmen

In [58]:
# Prep training data: the model input text and its sub-category label
training_data = dat[['merged_complaint', 'category_sub']]
training_data.shape

(1003563, 2)

In [59]:
# Stratified sample
# _, sampleX, _, sampleY = train_test_split(trainingData.complaint, trainingData.CATEGORY_SUB, test_size=0.1, random_state=42, stratify=trainingData.CATEGORY_SUB)
# print(sampleY.shape)

In [60]:
# Check stratified sample
# stratSampleDF = pandas.concat([sampleX,sampleY], axis = 1)
# stratSampleDF.head()
# print(stratSampleDF.CATEGORY_SUB.unique())
# aggregationStrat = {"complaint":"count"}
# aggregatedByLabelStrat = stratSampleDF.groupby("CATEGORY_SUB").agg(aggregationStrat)
# print(aggregatedByLabelStrat.sort_values(("complaint")))

In [61]:
# Split train/test: 80/20 hold-out, stratified on the sub-category label,
# with a fixed seed so the split is repeatable
trainX, testX, trainY, testY = train_test_split(
    training_data['merged_complaint'],
    training_data['category_sub'],
    test_size=0.2,
    random_state=42,
    stratify=training_data['category_sub'],
)
del training_data
print(trainX.shape)
print("Split into train and test")

(802850,)
Split into train and test


## Finetune

In [62]:
import tensorflow as tf

In [63]:
tf.test.is_gpu_available()

True

In [64]:
from finetune import config

# import logging

# import os
# import subprocess
# import traceback
# import warnings


In [66]:
config.all_gpus()

{0: ' GeForce GTX 1070 Ti (UUID: GPU-d82ccac6-f7d3-1739-5501-da35bf928ab4)'}

In [143]:
# LOGGER = logging.getLogger('finetune')
# PAD_TOKEN = '<PAD>'

# sp = subprocess.Popen([r'C:\Program Files\NVIDIA Corporation\NVSMI\nvidia-smi.exe', '-L'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
# response = sp.communicate()[0]
# gpu_list = response.decode('utf-8').strip().split('\n')
# device_ids = {}
# for i, gpu in enumerate(gpu_list):
#     # May be worth logging GPU description
#     device_id_str, _, description = gpu.partition(':')
#     assert int(device_id_str.split(' ')[-1]) == i
#     device_ids[i] = description

# cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
# if cuda_visible_devices:
#     device_ids = {
#         device_id: description 
#         for device_id, description in device_ids.items()
#         if str(device_id) in cuda_visible_devices.split(',')
#     }
# LOGGER.info(" Visible Devices: {{{}}}".format(
#     ", ".join([
#         "{}:{}".format(device_id, description.split('(')[0]).strip()
#         for device_id, description in device_ids.items()
#     ])
# ))

INFO:finetune: Visible Devices: {0: GeForce GTX 1070 Ti}


In [None]:
# Train Finetune
# Times the whole run (model construction plus fitting) with wall-clock time.
# NOTE(review): complaints were cut to 100 words earlier, yet max_length is 512.
# Presumably max_length is a token limit; confirm against the finetune docs
# whether a smaller value would be sufficient.
start = time.time()
model = Classifier(max_length=512, val_interval=3000, verbose = True)               # Load base model
model.fit(trainX.tolist(), trainY.tolist())          # Finetune base model on custom data
duration = time.time()-start
print("It took :"+str(duration)+ " seconds")

Epoch 1/3:   1%|▍                                                             | 6002/802750 [15:07<33:19:33,  6.64it/s]
Validation:   0%|                                                                              | 0/100 [00:00<?, ?it/s]
Validation:   8%|█████▌                                                                | 8/100 [00:00<00:01, 48.32it/s]
Validation:  12%|████████▎                                                            | 12/100 [00:00<00:02, 37.07it/s]
Validation:  16%|███████████                                                          | 16/100 [00:00<00:02, 31.87it/s]
Validation:  20%|█████████████▊                                                       | 20/100 [00:00<00:02, 29.09it/s]
Validation:  24%|████████████████▌                                                    | 24/100 [00:00<00:02, 27.35it/s]
Validation:  28%|███████████████████▎                                                 | 28/100 [00:00<00:02, 26.26it/s]
Validation:  32%|██████████████████████ 

Validation:  24%|████████████████▌                                                    | 24/100 [00:00<00:02, 27.92it/s]
Validation:  28%|███████████████████▎                                                 | 28/100 [00:00<00:02, 26.73it/s]
Validation:  32%|██████████████████████                                               | 32/100 [00:01<00:02, 25.95it/s]
Validation:  36%|████████████████████████▊                                            | 36/100 [00:01<00:02, 25.44it/s]
Validation:  40%|███████████████████████████▌                                         | 40/100 [00:01<00:02, 25.18it/s]
Validation:  44%|██████████████████████████████▎                                      | 44/100 [00:01<00:02, 24.96it/s]
Validation:  48%|█████████████████████████████████                                    | 48/100 [00:01<00:02, 24.85it/s]
Validation:  52%|███████████████████████████████████▉                                 | 52/100 [00:01<00:01, 24.73it/s]
Validation:  56%|███████████████████████

Validation:  48%|█████████████████████████████████                                    | 48/100 [00:01<00:02, 24.74it/s]
Validation:  52%|███████████████████████████████████▉                                 | 52/100 [00:01<00:01, 24.61it/s]
Validation:  56%|██████████████████████████████████████▋                              | 56/100 [00:02<00:01, 24.56it/s]
Validation:  60%|█████████████████████████████████████████▍                           | 60/100 [00:02<00:01, 24.44it/s]
Validation:  64%|████████████████████████████████████████████▏                        | 64/100 [00:02<00:01, 24.44it/s]
Validation:  68%|██████████████████████████████████████████████▉                      | 68/100 [00:02<00:01, 24.40it/s]
Validation:  72%|█████████████████████████████████████████████████▋                   | 72/100 [00:02<00:01, 24.42it/s]
Validation:  76%|████████████████████████████████████████████████████▍                | 76/100 [00:02<00:00, 24.39it/s]
Validation:  80%|███████████████████████

Validation:  72%|█████████████████████████████████████████████████▋                   | 72/100 [00:02<00:01, 24.44it/s]
Validation:  76%|████████████████████████████████████████████████████▍                | 76/100 [00:02<00:00, 24.40it/s]
Validation:  80%|███████████████████████████████████████████████████████▏             | 80/100 [00:03<00:00, 24.37it/s]
Validation:  84%|█████████████████████████████████████████████████████████▉           | 84/100 [00:03<00:00, 24.35it/s]
Validation:  88%|████████████████████████████████████████████████████████████▋        | 88/100 [00:03<00:00, 24.34it/s]
Validation:  92%|███████████████████████████████████████████████████████████████▍     | 92/100 [00:03<00:00, 24.37it/s]
Validation:  96%|██████████████████████████████████████████████████████████████████▏  | 96/100 [00:03<00:00, 24.40it/s]
Validation: 100%|████████████████████████████████████████████████████████████████████| 100/100 [00:03<00:00, 24.37it/s]
Epoch 1/3:  13%|███████▎                

Validation:  96%|██████████████████████████████████████████████████████████████████▏  | 96/100 [00:03<00:00, 24.45it/s]
Validation: 100%|████████████████████████████████████████████████████████████████████| 100/100 [00:03<00:00, 24.41it/s]
Epoch 1/3:  16%|█████████▌                                                | 132002/802750 [5:34:30<27:41:43,  6.73it/s]
Validation:   0%|                                                                              | 0/100 [00:00<?, ?it/s]
Validation:   8%|█████▌                                                                | 8/100 [00:00<00:01, 48.91it/s]
Validation:  12%|████████▎                                                            | 12/100 [00:00<00:02, 37.62it/s]
Validation:  16%|███████████                                                          | 16/100 [00:00<00:02, 32.47it/s]
Validation:  20%|█████████████▊                                                       | 20/100 [00:00<00:02, 29.56it/s]
Validation:  24%|████████████████▌      

Validation:  16%|███████████                                                          | 16/100 [00:00<00:02, 32.45it/s]
Validation:  20%|█████████████▊                                                       | 20/100 [00:00<00:02, 29.55it/s]
Validation:  24%|████████████████▌                                                    | 24/100 [00:00<00:02, 27.87it/s]
Validation:  28%|███████████████████▎                                                 | 28/100 [00:00<00:02, 26.70it/s]
Validation:  32%|██████████████████████                                               | 32/100 [00:01<00:02, 25.93it/s]
Validation:  36%|████████████████████████▊                                            | 36/100 [00:01<00:02, 25.47it/s]
Validation:  40%|███████████████████████████▌                                         | 40/100 [00:01<00:02, 25.11it/s]
Validation:  44%|██████████████████████████████▎                                      | 44/100 [00:01<00:02, 24.96it/s]
Validation:  48%|███████████████████████

In [None]:
# Save model to the same location the evaluation cell loads it from
MODEL_PATH = "../models/combined_model_20181105_alau"
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)  # create ../models if it is missing
model.save(MODEL_PATH)

In [None]:
# Test model and predict
# Reloads the saved classifier from disk; this rebinds `model`, replacing any
# in-memory model left over from training.
model = Classifier.load("../models/combined_model_20181105_alau")
print(model)
# Presumably one predicted category_sub label per held-out complaint; the
# accuracy cells below compare it position by position against testY.
predictions = model.predict(testX.tolist())

In [None]:
# Roll predicted and true sub-categories up to their main categories
mainPredictions = [labels_map[pred] for pred in predictions]
mainTestY = [labels_map[testLabel] for testLabel in testY.tolist()]

# Score position by position over the held-out complaints
countMain = len(testX)
correctMain = sum(int(mainPredictions[i] == mainTestY[i]) for i in range(countMain))
print(correctMain)
print(countMain)
print("Accuracy on Main: "+str(correctMain*1.0/countMain))

In [None]:
# Exact-match accuracy on the sub-category labels themselves
testYList = testY.tolist()
count = len(testX)
correct = sum(int(predictions[i] == testYList[i]) for i in range(count))
print(correct)
print(count)
print("Accuracy on Sub: "+str(correct*1.0/count))