In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sqlalchemy import create_engine
#engine = create_engine('postgresql://scott:tiger@localhost:5432/mydatabase')

# Tobin's code for loading in violation_codes

In [3]:
engine = create_engine('postgresql://postgres:postgres@localhost:5432/Nashville')
violation_types_df = pd.read_sql_query("SELECT * FROM violation_codes", engine)

# Cleaning violation_codes
### violation_id is supposed to be the unique identifier for each violation-code
     However, the data is "mangled" so we need to drop the garbage rows where violation_id is either duplicate or not an integer

In [4]:
violation_types_df.query('violation_id == "-"').head(5)

Unnamed: 0,violation_id,org_id,violation_type,violation_desc,violation_text,remedial_text,table_name,expired_flag,date_expired,created_by,date_created,modified_by,date_modified,fee_setup_id
60,-,,,N,,2,00:00.0,,,,,,,
155,-,,,N,,2,00:00.0,,,,,,,
238,-,,,N,,2,00:00.0,,,,,,,
322,-,,,N,,2,00:00.0,,,,,,,
324,-,,,N,,2,00:00.0,,,,,,,


In [5]:
violation_types_df.query('violation_id == "-"').tail(5)

Unnamed: 0,violation_id,org_id,violation_type,violation_desc,violation_text,remedial_text,table_name,expired_flag,date_expired,created_by,date_created,modified_by,date_modified,fee_setup_id
697,-,,,N,,2,00:00.0,,,,,,,
699,-,,,N,,2,00:00.0,,,,,,,
701,-,,,N,,2,00:00.0,,,,,,,
703,-,,,N,,2,00:00.0,,,,,,,
705,-,,,N,,2,00:00.0,,,,,,,


### collect the duplicate rows to be dropped in a new data frame

In [6]:
dropabledf = violation_types_df.query('violation_type == "NULL" and violation_desc == "N" and violation_text == "NULL"')
dropabledf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 137 entries, 15 to 926
Data columns (total 14 columns):
violation_id      131 non-null object
org_id            137 non-null object
violation_type    137 non-null object
violation_desc    137 non-null object
violation_text    137 non-null object
remedial_text     137 non-null object
table_name        137 non-null object
expired_flag      137 non-null object
date_expired      137 non-null object
created_by        137 non-null object
date_created      0 non-null object
modified_by       0 non-null object
date_modified     0 non-null object
fee_setup_id      0 non-null object
dtypes: object(14)
memory usage: 16.1+ KB


In [7]:
dropabledf.index

Int64Index([ 15,  60, 155, 238, 322, 324, 380, 400, 414, 416,
            ...
            708, 896, 903, 906, 908, 914, 917, 920, 923, 926],
           dtype='int64', length=137)

### assign new data frame: "cleaned_df1" that has filtered out previously identified rows with duplicate violation_id values

In [8]:
cleaned_df1 = violation_types_df.drop(dropabledf.index)

In [9]:
cleaned_df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 797 entries, 0 to 933
Data columns (total 14 columns):
violation_id      797 non-null object
org_id            788 non-null object
violation_type    788 non-null object
violation_desc    788 non-null object
violation_text    775 non-null object
remedial_text     651 non-null object
table_name        651 non-null object
expired_flag      651 non-null object
date_expired      651 non-null object
created_by        651 non-null object
date_created      651 non-null object
modified_by       651 non-null object
date_modified     651 non-null object
fee_setup_id      651 non-null object
dtypes: object(14)
memory usage: 93.4+ KB


### Tobin's function to identify the rows that don't have integers as violation_id; 
it's a boolean series

In [10]:
def conv(x):
    try:
        int(x)
        return True
    except:
        return False
    
numeric = cleaned_df1['violation_id'].apply(conv)
numeric

0       True
1       True
2       True
3       True
4       True
5       True
6       True
7       True
8       True
9       True
10      True
11      True
12      True
13      True
14      True
16      True
17      True
18      True
19      True
20      True
21      True
22      True
23      True
24      True
25      True
26      True
27      True
28      True
29      True
30      True
       ...  
895     True
897     True
898    False
899    False
900    False
901    False
902    False
904     True
905     True
907     True
909     True
910     True
911     True
912     True
913    False
915     True
916     True
918     True
919    False
921     True
922     True
924     True
925     True
927     True
928     True
929     True
930     True
931     True
932     True
933     True
Name: violation_id, Length: 797, dtype: bool

### assign new data frame: "cleaned_df2" that has filtered out rows where violation_id is not an integer

In [10]:
cleaned_df2 = cleaned_df1[numeric]

# Aggregating columns
### aggregating violation_text column by looping through it and creating a new column with aggregated types

In [11]:
def violation_text_startswith(s):
    if s == None:
        return 'None'
    elif s.startswith('T.C.A') or s.startswith ('TCA'):
        return 'TCA'
    elif s.startswith('M.C.L') or s.startswith ('MCL'):
        return 'MCL'
    elif s.startswith('Section') or s.startswith ('section'):
        return 'SEC'
    else:
        return 'Other'

In [12]:
# the following line of code generates a warning but not an error
cleaned_df2['text_type'] = cleaned_df2['violation_text'].apply(violation_text_startswith)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [13]:
cleaned_df2.text_type.value_counts(dropna = False)

SEC      429
Other    173
MCL      101
TCA       72
None      13
Name: text_type, dtype: int64

##### aggregating the TCA violation type rows into subtypes
because there were only 72 rows visual inspection of a sorted list of values was sufficient to determine how to aggregate the types into subtypes.

In [14]:
def Text_type_startswith(s):
    if s == None:
        return 'None'
    elif s.startswith('T.C.A 57-4'):
        return 'intoxicating liquors - consumption of alcoholic beverages on premises'
    elif s.startswith('T.C.A 57-5'):
        return 'intoxicating liquors - beer'
    elif s.startswith('Section 2'):
        return 'administration - urban forester - removal of hazard trees'
    elif s.startswith('Section 6.04'):
        return 'business license and regulations - advertising signs and other advertising methods'
    elif s.startswith('Section 6.28'):
        return 'business license and regulations - hotels and roominghouses'
    elif s.startswith('Section 10.20'):
        return 'health and safety - waste management'
    elif s.startswith('Section 10.24'):
        return 'health and safety - littering'
    elif s.startswith('Section 10.32'):
        return 'health and safety - rodents insects and other pests'
    elif s.startswith('Section 10.36'):
        return 'health and safety - housing sanitation'
    elif s.startswith('Section 10.52'):
        return 'health and safety - quarrying and mining operations'
    elif s.startswith('Section 12.08'):
        return 'vehicles and traffic - general regulations administration and eforcement'
    elif s.startswith('Section 16.04'):
        return 'buildings and construction - property standards'
    elif s.startswith('Section 16.16'):
        return 'buildings and construction - property standards'
    elif s.startswith('Section 16.24'):
        return 'buildings and construction - property standards'
    elif s.startswith('Section 16.28'):
        return 'buildings and construction - building permits'
    elif s.startswith('Section 16.34'):
        return 'buildings and construction - none'
    elif s.startswith('Section 17.04'):
        return 'zoning - general provisions and definitions'
    elif s.startswith('Section 17.08'):
        return 'zoning - zoning districts and land uses'
    elif s.startswith('Section 17.12'):
        return 'zoning - district bulk regulations'
    elif s.startswith('Section 17.16'):
        return 'zoning - land use development standards'
    elif s.startswith('Section 17.20'):
        return 'zoning - parking loading and access'
    elif s.startswith('Section 17.24'):
        return 'zoning - landscaping buffering and tree replacement'
    elif s.startswith('Section 17.28'):
        return 'zoning - environmental and operational performance standards'
    elif s.startswith('Section 17.32'):
        return 'zoning - sign regulations'
    elif s.startswith('Section 17.40'):
        return 'zoning - administration and procedures'
    elif s.startswith('MCL Chapter 7'):
        return 'alcoholic beverages'
    elif s.startswith('MCL Chapter 8'):
        return 'animals'
    elif s.startswith('MCL Chapter 10.20'):
        return 'health and safety - waste management'
    elif s.startswith('MCL Chapter 10.24'):
        return 'health and safety - littering'
    elif s.startswith('MCL Chapter 10.26'):
        return 'health and safety - high weeds and debris'
    elif s.startswith('MCL Chapter 10.32') or s.startswith('Section 10.32'):
        return 'health and safety - rodents insects and other pests'
    elif s.startswith('MCL Chapter 10.36'):
        return 'health and safety - housing sanitation'
    elif s.startswith('MCL Chapter 10.52'):
        return 'health and safety - quarrying and mining operations'
    elif s.startswith('MCL Chapter 15') or s.startswith('Section 15'):
        return 'water sewers and other public services'
    elif s.startswith('MCL Chapter 16.12'):
        return 'buildings and construction - plumbing code'
    elif s.startswith('MCL Chapter 16.16'):
        return 'buildings and construction - gas mechanical regulations'
    elif s.startswith('MCL Chapter 16.24'):
        return 'buildings and construction - property standards'
    elif s.startswith('MCL Chapter 16.18'):
        return 'buildings and construction - building permits'
    elif s.startswith('MCL Chapter 17.04'):
        return 'zoning - general provisions and definitions'
    elif s.startswith('MCL Chapter 17.08'):
        return 'zoning - zoning districts and land uses'
    elif s.startswith('MCL Chapter 17.12'):
        return 'zoning - district bulk regulations'
    elif s.startswith('MCL Chapter 17.16'):
        return 'zoning - land use development standards'
    elif s.startswith('MCL Chapter 17.20'):
        return 'zoning - parking, loading and access'
    elif s.startswith('MCL Chapter 17.24'):
        return 'zoning - landscaping buffering and tree replacement'
    elif s.startswith('MCL Chapter 17.28'):
        return 'zoning - environmental and operational performance standards'
    elif s.startswith('MCL Chapter 17.32'):
        return 'zoning - sign regulations'
    elif s.startswith('MCL Chapter 17.40'):
        return 'zoning - administration and procedures'
    else:
        return 'Other'

### storing the text subtypes into a new column

In [15]:
cleaned_df2['text_subtype_and_text_subsubtype'] = cleaned_df2['violation_text'].apply(Text_type_startswith)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [16]:
cleaned_df2.text_subtype_and_text_subsubtype.value_counts(dropna = False)

buildings and construction - property standards                                       292
Other                                                                                 249
intoxicating liquors - beer                                                            71
zoning - sign regulations                                                              33
zoning - administration and procedures                                                 16
health and safety - littering                                                          15
zoning - land use development standards                                                14
None                                                                                   13
health and safety - waste management                                                   11
water sewers and other public services                                                 10
zoning - district bulk regulations                                                      9
business l

# It's now necessary to aggregate the 'Other', 'None' rows by expanding the Text_subtype_starts with function