# Finetune Testing

In [0]:
# load libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
import os
import time
import re, string

# Load sklearn packages
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

import seaborn as sns

from sklearn.metrics import confusion_matrix 

%matplotlib inline

### Google Credentials

In [0]:
# Code to read csv file into colaboratory:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# 1. Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [21]:
!python -m spacy download en

Collecting en_core_web_sm==2.0.0 from https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz#egg=en_core_web_sm==2.0.0
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz (37.4MB)
[K    100% |████████████████████████████████| 37.4MB 102.1MB/s 
[?25hInstalling collected packages: en-core-web-sm
  Running setup.py install for en-core-web-sm ... [?25l- \ | done
[?25hSuccessfully installed en-core-web-sm-2.0.0

[93m    Linking successful[0m
    /usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
    /usr/local/lib/python3.6/dist-packages/spacy/data/en

    You can now load the model via spacy.load('en')



### get the data

In [4]:
!wget https://s3.eu-west-2.amazonaws.com/w210-capstone/combined_model_20181021

--2018-10-25 09:12:36--  https://s3.eu-west-2.amazonaws.com/w210-capstone/combined_model_20181021
Resolving s3.eu-west-2.amazonaws.com (s3.eu-west-2.amazonaws.com)... 52.95.148.32
Connecting to s3.eu-west-2.amazonaws.com (s3.eu-west-2.amazonaws.com)|52.95.148.32|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1398967000 (1.3G) [application/x-www-form-urlencoded]
Saving to: ‘combined_model_20181021’


2018-10-25 09:13:41 (20.7 MB/s) - ‘combined_model_20181021’ saved [1398967000/1398967000]



In [0]:
from google.colab import files
#files.download(file)

In [10]:
!ls -lh

total 1.4G
-rw-r--r-- 1 root root 2.6K Oct 25 09:12 adc.json
-rw-r--r-- 1 root root 1.4G Oct 24 16:47 combined_model_20181021
drwxr-xr-x 2 root root 4.0K Oct 23 16:44 sample_data


### install finetune

In [11]:
!pip3 install finetune

Collecting finetune
[?25l  Downloading https://files.pythonhosted.org/packages/38/51/112a78d9b000c30261fe6c292cf4240c6981e6dcb82f4920f2bbd6f07b33/finetune-0.5.10.tar.gz (45kB)
[K    100% |████████████████████████████████| 51kB 2.0MB/s 
[?25hCollecting pandas>=0.23.1 (from finetune)
[?25l  Downloading https://files.pythonhosted.org/packages/e1/d8/feeb346d41f181e83fba45224ab14a8d8af019b48af742e047f3845d8cff/pandas-0.23.4-cp36-cp36m-manylinux1_x86_64.whl (8.9MB)
[K    100% |████████████████████████████████| 8.9MB 2.8MB/s 
[?25hCollecting IndicoIo>=1.1.5 (from finetune)
  Downloading https://files.pythonhosted.org/packages/83/e5/5bcef01976ed776c33fd1ab1ad57154bf4023e1e3670863488b74b3d2a40/IndicoIo-1.1.5.tar.gz
Collecting scipy>=1.1.0 (from finetune)
[?25l  Downloading https://files.pythonhosted.org/packages/a8/0b/f163da98d3a01b3e0ef1cab8dd2123c34aee2bafbb1c5bffa354cc8a1730/scipy-1.1.0-cp36-cp36m-manylinux1_x86_64.whl (31.2MB)
[K    100% |████████████████████████████████| 31.2MB 946

# Get test train data

In [12]:
# https://drive.google.com/open?id=1E_FoSbaboe5azcnUSAChYLyYUrj5xRYO

#2. Get the file
downloaded = drive.CreateFile({'id':'1E_FoSbaboe5azcnUSAChYLyYUrj5xRYO'}) # replace the id with id of file you want to access
downloaded.GetContentFile('combined_trainingdata_20181013.tsv')  

#3. Read file as panda dataframe
import pandas as pd
data = pd.read_csv('combined_trainingdata_20181013.tsv',sep='\t') 

  interactivity=interactivity, compiler=compiler, result=result)


#### look at data

In [13]:
print(data.columns[data.isna().any()].tolist())
print(data[data.COMPLAINT_1.isna()].shape)
print(data[data.COMPLAINT_2.isna()].shape)
print(data[(data.COMPLAINT_1.isna()) & (data.COMPLAINT_2.isna())].shape)

['COMPLAINT DATE', 'DEPT_311', 'COMPLAINT_1', 'COMPLAINT_2']
(14025, 10)
(964291, 10)
(13579, 10)


### preprocess data

In [14]:
dataFiltered = data.dropna(subset = ["COMPLAINT_1"])
print(dataFiltered[(dataFiltered.COMPLAINT_1.isna())].shape)
print(dataFiltered[(dataFiltered.COMPLAINT_1.isna()) & (dataFiltered.COMPLAINT_2.isna())].shape)
print(dataFiltered[dataFiltered.COMPLAINT_1 == ""].shape)

(0, 10)
(0, 10)
(0, 10)


In [16]:
translator = str.maketrans('', '', string.punctuation) # To remove punctuation

def preProcess(complaintStart):
    complaint = complaintStart[:512] # cut to 512 characters max
    complaint = re.sub("\d","N", complaint) # remove numbers
    complaint = complaint.lower().translate(translator) # lower case and remove the punctuation
    complaint = complaint.replace("\n"," ").strip() # remove starting and trailing white spaces
    if re.search('[a-zA-Z]', complaint) is None:# if there are no letters in the complaint, return empty, will be removed in later processing
        return ""
    return complaint

def getComplaint(row):
    complaint2 = row.get("COMPLAINT_2")
    if not pd.isnull(complaint2):
        if "[INSPECTION LOG #:" in complaint2: # Remove inspection log section from C2
            complaintStrippedList = complaint2.split("]")[1:]
            complaintFinal = "]".join(complaintStrippedList)
        else:
            complaintFinal = complaint2
        if row.get("CITY")=="US_CHICAGO": # if Chicago, concatenate the two
            complaintFinal = row.get("COMPLAINT_1") + " "+ complaintFinal
        complaintProcessed = preProcess(complaintFinal)
        if complaintProcessed == "" or re.search('[a-zA-Z]', complaintProcessed) is None: # if nothing or no letters
            return preProcess(row.get("COMPLAINT_1"))
        return complaintProcessed
    complaintProcessed = preProcess(row.get("COMPLAINT_1"))
    return complaintProcessed

results = dataFiltered.apply(lambda row: getComplaint (row),axis=1)
print(results[results.isna()].shape)

(0,)


In [17]:
dataFiltered["complaint"] = results
print(dataFiltered[dataFiltered.complaint.isna()].shape)

#Strip white spaces from CATEGORY_SUB
dataFiltered["CATEGORY_SUB"] = dataFiltered["CATEGORY_SUB"].str.strip()

#Filter data only to ones with at least 10 characters
mask = (dataFiltered["complaint"].str.len() >=10)
dataFiltered = dataFiltered.loc[mask]
print(dataFiltered.shape)

#Filter the classes to only ones with more than 100 samples
print(dataFiltered.CATEGORY_SUB.unique())
aggregation = {"complaint":"count"}
aggregatedByLabel = dataFiltered.groupby("CATEGORY_SUB").agg(aggregation)




goodLabels = aggregatedByLabel[aggregatedByLabel["complaint"]>100]
goodLabelsList = goodLabels.index.tolist()
print(dataFiltered.shape)
dataGoodLabels = dataFiltered[dataFiltered["CATEGORY_SUB"].isin(goodLabelsList)]
print(dataGoodLabels.shape)

#Get SUB to MAIN mapping
labelsMap = dataGoodLabels[["CATEGORY_MAIN", "CATEGORY_SUB"]].drop_duplicates()
labelsMap = labelsMap.set_index("CATEGORY_SUB").to_dict()["CATEGORY_MAIN"]
#display(labelsMap)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


(0, 11)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


(1084456, 11)
['environment_general' 'environment_dumping' 'environment_air_pollution'
 'environment_abandoned_site' 'publicorder_noise_complaint'
 'environment_asbestos' 'environment_hazardous_material'
 'planning_general' 'infrastructure_general' 'environment_water_pollution'
 'environment_recycling' 'street_repair' 'housing_general'
 'environment_overgrowth' 'housing_health_code' 'street_sewar'
 'environment_garbage_collection' 'housing_safety'
 'environment_abandoned_vehicle' 'governance_signage' 'street_general'
 'environment_litter' 'street_urgent_repair' 'street_sidewalk'
 'environment_dead_animal' 'street_cleaning' 'environemnt_dead_animal'
 'vandalism_general' 'planning_construction' 'planning_unsafe_environment'
 'infrastructure_water' 'infrastructure_water_repair'
 'environnment_hazardous_material' 'vandalism_graffiti' 'street_lighting'
 'publichealth_general' 'publichealth_school_hygiene' 'fire_general'
 'street_parking' 'housing_mold' 'publichealth_pests' 'housing_pests'
 

#### Prepare Training Data

In [18]:
trainingData = dataGoodLabels[["complaint", "CATEGORY_SUB"]]
print(type(trainingData))
print(trainingData.shape)


#Split sample into train/test (80/20)
trainX, testX, trainY, testY = train_test_split(trainingData.complaint, trainingData.CATEGORY_SUB, \
                                                test_size=0.2, random_state=42, stratify=trainingData.CATEGORY_SUB)
del trainingData
print(trainX.shape)
print("Split into train and test")

<class 'pandas.core.frame.DataFrame'>
(1084091, 2)
(867272,)
Split into train and test


### load model and start testing

In [0]:
# import finetune packages
from finetune import Classifier

In [23]:
model = Classifier.load("combined_model_20181021")
print(model)
predictions = model.predict(testX.tolist())

Inference:   0%|          | 0/216819 [00:00<?, ?it/s]

<finetune.classifier.Classifier object at 0x7fa3d47a8898>


FileNotFoundError: ignored

In [28]:
!ls -lh /usr/local/lib/python3.6/dist-packages/finetune/

total 216K
-rw-r--r-- 1 root staff  297 Oct 25 09:25 activations.py
-rw-r--r-- 1 root staff  20K Oct 25 09:25 base.py
-rw-r--r-- 1 root staff 3.0K Oct 25 09:25 classifier.py
-rw-r--r-- 1 root staff 3.2K Oct 25 09:25 comparison.py
-rw-r--r-- 1 root staff  11K Oct 25 09:25 config.py
-rw-r--r-- 1 root staff 1.8K Oct 25 09:25 crf.py
drwxr-sr-x 3 root staff 4.0K Oct 25 09:26 datasets
-rw-r--r-- 1 root staff 1.1K Oct 25 09:25 download.py
-rw-r--r-- 1 root staff  12K Oct 25 09:25 encoding.py
-rw-r--r-- 1 root staff   80 Oct 25 09:25 errors.py
-rw-r--r-- 1 root staff 3.2K Oct 25 09:25 estimator_utils.py
-rw-r--r-- 1 root staff 1.9K Oct 25 09:25 imbalance.py
-rw-r--r-- 1 root staff  693 Oct 25 09:25 __init__.py
-rw-r--r-- 1 root staff  12K Oct 25 09:25 input_pipeline.py
-rw-r--r-- 1 root staff 8.2K Oct 25 09:25 metrics.py
-rw-r--r-- 1 root staff 7.0K Oct 25 09:25 model.py
-rw-r--r-- 1 root staff 4.1K Oct 25 09:25 multifield.py
-rw-r--r-- 1 root staff 3.0K Oct 25 09:25 multi_label_classifier.py
