In [8]:
from finetune import Classifier
import pandas as pd
from sklearn.model_selection import train_test_split
import time
import re, string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [11]:

data=pd.read_csv(\
                     'https://s3.amazonaws.com/danicic-w210/combined_trainingdata_20181013.tsv',sep='\t')

print(data.shape)
print(data.loc[0])

print(data.columns[data.isna().any()].tolist())
print(data[data.COMPLAINT_1.isna()].shape)
print(data[data.COMPLAINT_2.isna()].shape)
print(data[(data.COMPLAINT_1.isna()) & (data.COMPLAINT_2.isna())].shape)

dataFiltered = data.dropna(subset = ["COMPLAINT_1"])
print(dataFiltered[(dataFiltered.COMPLAINT_1.isna())].shape)
print(dataFiltered[(dataFiltered.COMPLAINT_1.isna()) & (dataFiltered.COMPLAINT_2.isna())].shape)
print(dataFiltered[dataFiltered.COMPLAINT_1 == ""].shape)


# Alex's
def clean_specifics(complaint):
    complaint = re.sub('Request entered through the Web. Refer to Intake Questions for further description.',
                      '', complaint)
    complaint = re.sub('Transfer:.+/[A-Z]+', '', complaint)
    complaint = re.sub('ACCT ', '', complaint)
    complaint = re.sub('RTC ', '', complaint)
    return complaint
  
  
 # Preprocess `merged_complaint`
cachedStopWords = stopwords.words("english")


# eliminate stop phrases
# remove rows where complaint len <4

stop_phrases=['request entered web refer intake questions description',\
             'duplicate',\
             'batch created',\
             'issue reported city oakland public works department via phone email pwacallcenteroaklandnetcom web wwwoaklandpwcom',\
             'issue reported city oakland call center via phone email callcenteroaklandnetcom web wwwoaklandpwcom',\
             'issue reported city oakland public works agency via phone email pwacallcenteroaklandnetcom web wwwoaklandpwcom',\
             'issue reported oak via phone outside oakland email oakoaklandnetcom web oaklandcagov',\
             'request entered web refer intake questions description',\
             'waze user reported',\
             'council member smitherman reporting',\
             'issue reported city ann arbor customer service via phone email customerserviceagovorg web wwwagovorg',\
             'test report',\
             'reported mobile device httpmseeclickfixcom',\
             'information may available cdph environmental inspections dataset',\
             'ftc',\
             'resident reports',\
             'reported'\
             'rtc',\
             'acct',\
             'batch group'\
             'mt airy nep']


# eliminate entire row of these complaints

elim_list=['this issue was reported to the city of oakland public works agency via phone nnnnnnnnnn email pwacallcenteroaklandnetcom or web wwwoaklandpwcom',\
          'bu',\
          'rtc',\
          'mt airy nep nnnn',\
          'batch group nn',\
          'this issue was reported to oak nnn via phone nnn or nnnnnnnnnn from outside oakland email oaknnnoaklandnetcom or web nnnoaklandcagov',\
          'waze user reported',\
          'this issue was reported to the city of ann arbor customer service via phone nnn nnnnnnn email customerserviceangovorg or web wwwangovorg',\
          'reported by nn',\
          'council member smitherman reporting',\
          'nnn reported ps nnn down',\
          'test report',\
          'p c g s b',\
          'junkyard issues more information may be available in the cdph environmental inspections dataset',\
          'reissue ccnnnnnnnn to update']

# Stan's + Alex

translator = str.maketrans('', '', string.punctuation) # To remove punctuation

def preProcess(complaintStart):
    complaint = clean_specifics(complaintStart)
    complaint = ' '.join([word for word in complaint.split() if word not in cachedStopWords]) # remove stopwords (alex) early on to make 512 limit 
    complaint = complaintStart[:512] # cut to 512 characters max
    complaint = re.sub("\d","", complaint) # remove numbers completely (alex)
    complaint = complaint.lower().translate(translator) # lower case and remove the punctuation
    complaint = re.sub('[^\w\s]', ' ', complaint) # Sub puncuation with space (alex)
    complaint = complaint.strip() # (alex)
    complaint = re.sub(' +', ' ', complaint) # Remove dupe spaces (alex)
    complaint = complaint.replace("\n"," ").strip() # remove starting and trailing white spaces
    if re.search('[a-zA-Z]', complaint) is None:# if there are no letters in the complaint, return empty, will be removed in later processing
        return ""
    complaint = ' '.join([word for word in complaint.split() if word not in cachedStopWords]) # remove stopwords at end after preprocessing (alex) 
    return complaint

def getComplaint(row):
    complaint2 = row.get("COMPLAINT_2")
    if not pd.isnull(complaint2):
        if "[INSPECTION LOG #:" in complaint2: # Remove inspection log section from C2
            complaintStrippedList = complaint2.split("]")[1:]
            complaintFinal = "]".join(complaintStrippedList)
        else:
            complaintFinal = complaint2
        if row.get("CITY")=="US_CHICAGO": # if Chicago, concatenate the two
            complaintFinal = row.get("COMPLAINT_1") + " "+ complaintFinal
        complaintProcessed = preProcess(complaintFinal)
        if complaintProcessed == "" or re.search('[a-zA-Z]', complaintProcessed) is None: # if nothing or no letters
            return preProcess(row.get("COMPLAINT_1"))
        return complaintProcessed
    complaintProcessed = preProcess(row.get("COMPLAINT_1"))
    return complaintProcessed

results = dataFiltered.apply(lambda row: getComplaint (row),axis=1)
print(results[results.isna()].shape)

dataFiltered["complaint"] = results
print(dataFiltered[dataFiltered.complaint.isna()].shape)

dataFiltered["CATEGORY_SUB"] = dataFiltered["CATEGORY_SUB"].str.strip()

# eliminate stop_phrases
print('removing exclusion phrases- autogenerted text')
exclusions = '|'.join(stop_phrases)  #first_word = re.sub(exclusions, '', first_word)
dataFiltered['complaint']=dataFiltered['complaint'].map(lambda x: re.sub(exclusions,'',x))

# strip leading and trailing spaces
dataFiltered['complaint'] = dataFiltered['complaint'].str.strip()

# keep only len 4 and greater
print('dropping complaints less than 4 chars long')
dataFiltered = dataFiltered.loc[(dataFiltered["complaint"].str.len() >=4)]
print(dataFiltered.shape)

stop_complaints=['reported',\
                'reported ps',\
                #'locked',\  # one word locked doesn't say much, but all related to unsafe planning, so dont want to miss locked [door, fire escape, etc] with pic
                'batch group',\
                'mt airy nep']

dataFiltered=dataFiltered[~dataFiltered['complaint'].isin(stop_complaints)]
print('removing complaints that simply state: <<new >> ')
dataFiltered.drop(dataFiltered.loc[dataFiltered['complaint']=='new '].index, inplace=True)
dataFiltered.shape

(1278129, 10)
index                                                             0
COMPLAINT_ID                                        US_CHICAGO_1725
CITY                                                     US_CHICAGO
COMPLAINT DATE                                           10/03/2011
DEPT_311                                         health_environment
CODE_311                           permits issued by doe work order
CATEGORY_MAIN                                           environment
CATEGORY_SUB                                    environment_general
COMPLAINT_1       QUESTIONABLE BUSINESS PRACTICES REGARDING OILS...
COMPLAINT_2       [INSPECTION LOG #: 1723 03-OCT-11 18:55:00] TH...
Name: 0, dtype: object
['COMPLAINT DATE', 'DEPT_311', 'COMPLAINT_1', 'COMPLAINT_2']
(14025, 10)
(964291, 10)
(13579, 10)
(0, 10)
(0, 10)
(0, 10)
(0,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


(0, 11)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


removing exclusion phrases- autogenerted text


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


dropping complaints less than 4 chars long
(964228, 11)
removing complaints that simply state: <<new >> 


(960376, 11)

In [14]:
# extract only classes for which we have more than 500 complaints
aggregation = {"complaint":"count"}
aggregatedByLabel = dataFiltered.groupby("CATEGORY_SUB").agg(aggregation)

goodLabels = aggregatedByLabel[aggregatedByLabel["complaint"]>500]
goodLabelsList = goodLabels.index.tolist()
print(dataFiltered.shape)
dataFiltered = dataFiltered[dataFiltered["CATEGORY_SUB"].isin(goodLabelsList)]
print(dataFiltered.shape)
print("Removed labels with too few samples")

(960376, 11)
(958234, 11)
Removed labels with too few samples


In [16]:
# get mapping from sub to main category
labelsMap = dataFiltered[["CATEGORY_MAIN", "CATEGORY_SUB"]].drop_duplicates()
labelsMap = labelsMap.set_index("CATEGORY_SUB").to_dict()["CATEGORY_MAIN"]
print("Calculated Mapping from SUB to MAIN category")
print(labelsMap)

Calculated Mapping from SUB to MAIN category
{'street_urgent_repair': 'street', 'infrastructure_power': 'infrastructure', 'street_repair': 'street', 'publichealth_general': 'public_health', 'environment_recycling': 'environment', 'housing_mold': 'housing', 'environment_garbage_collection': 'environment', 'environment_overgrowth': 'environment', 'street_lighting': 'street', 'street_sewar': 'street', 'environment_dead_animal': 'environment', 'street_slippery': 'street', 'housing_general': 'housing', 'infrastructure_water_repair': 'infrastructure', 'governance_signage': 'governance', 'street_sidewalk': 'street', 'street_parking': 'street', 'housing_safety': 'housing', 'fire_general': 'fire', 'environment_air_pollution': 'environment', 'street_drainage': 'street', 'publichealth_animal': 'public_health', 'fire_equipment_broken': 'fire', 'governance_general': 'governance', 'housing_health_code': 'housing', 'publichealth_restaurant_hygiene': 'public_health', 'environment_dumping': 'environmen

In [17]:
# keep only columns we care about
dataFiltered = dataFiltered[["complaint", "CATEGORY_SUB", "CATEGORY_MAIN"]]

In [18]:
# write result to tsv
dataFiltered.to_csv("/W210_Gov_Complaints_Portal/Datasets/combined_trainingdata_filtered_20181108.tsv", sep='\t' )