In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sqlalchemy import create_engine
#engine = create_engine('postgresql://scott:tiger@localhost:5432/mydatabase')

# Tobin's code for loading in violation_codes

In [4]:
engine = create_engine('postgresql://postgres:postgres@localhost:5432/Nashville')
violation_types_df = pd.read_sql_query("SELECT * FROM violation_codes", engine)

# Cleaning violation_codes
### violation_id is supposed to be the unique identifier for each violation-code
     However, the data is "mangled" so we need to drop the garbage rows where violation_id is either duplicate or not an integer

In [5]:
violation_types_df.query('violation_id == "-"').head(5)

Unnamed: 0,violation_id,org_id,violation_type,violation_desc,violation_text,remedial_text,table_name,expired_flag,date_expired,created_by,date_created,modified_by,date_modified,fee_setup_id
60,-,,,N,,2,00:00.0,,,,,,,
155,-,,,N,,2,00:00.0,,,,,,,
238,-,,,N,,2,00:00.0,,,,,,,
322,-,,,N,,2,00:00.0,,,,,,,
324,-,,,N,,2,00:00.0,,,,,,,


In [6]:
violation_types_df.query('violation_id == "-"').tail(5)

Unnamed: 0,violation_id,org_id,violation_type,violation_desc,violation_text,remedial_text,table_name,expired_flag,date_expired,created_by,date_created,modified_by,date_modified,fee_setup_id
697,-,,,N,,2,00:00.0,,,,,,,
699,-,,,N,,2,00:00.0,,,,,,,
701,-,,,N,,2,00:00.0,,,,,,,
703,-,,,N,,2,00:00.0,,,,,,,
705,-,,,N,,2,00:00.0,,,,,,,


### collect the duplicate rows to be dropped in a new data frame

In [7]:
dropabledf = violation_types_df.query('violation_type == "NULL" and violation_desc == "N" and violation_text == "NULL"')
dropabledf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 137 entries, 15 to 926
Data columns (total 14 columns):
violation_id      131 non-null object
org_id            137 non-null object
violation_type    137 non-null object
violation_desc    137 non-null object
violation_text    137 non-null object
remedial_text     137 non-null object
table_name        137 non-null object
expired_flag      137 non-null object
date_expired      137 non-null object
created_by        137 non-null object
date_created      0 non-null object
modified_by       0 non-null object
date_modified     0 non-null object
fee_setup_id      0 non-null object
dtypes: object(14)
memory usage: 16.1+ KB


In [8]:
dropabledf.index

Int64Index([ 15,  60, 155, 238, 322, 324, 380, 400, 414, 416,
            ...
            708, 896, 903, 906, 908, 914, 917, 920, 923, 926],
           dtype='int64', length=137)

### assign new data frame: "cleaned_df1" that has filtered out previously identified rows with duplicate violation_id values

In [9]:
cleaned_df1 = violation_types_df.drop(dropabledf.index)

In [10]:
cleaned_df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 797 entries, 0 to 933
Data columns (total 14 columns):
violation_id      797 non-null object
org_id            788 non-null object
violation_type    788 non-null object
violation_desc    788 non-null object
violation_text    775 non-null object
remedial_text     651 non-null object
table_name        651 non-null object
expired_flag      651 non-null object
date_expired      651 non-null object
created_by        651 non-null object
date_created      651 non-null object
modified_by       651 non-null object
date_modified     651 non-null object
fee_setup_id      651 non-null object
dtypes: object(14)
memory usage: 93.4+ KB


### Tobin's function to identify the rows that don't have integers as violation_id; 
it's a boolean series

In [12]:
def conv(x):
    try:
        int(x)
        return True
    except:
        return False
    
numeric = cleaned_df1['violation_id'].apply(conv)

### assign new data frame: "cleaned_df2" that has filtered out rows where violation_id is not an integer

In [16]:
cleaned_df2 = cleaned_df1[numeric]

# Aggregating columns
### aggregating violation_text column by looping through it and creating a new column with aggregated types

In [17]:
def violation_text_startswith(s):
    if s == None:
        return 'None'
    elif s.startswith('T.C.A') or s.startswith ('TCA'):
        return 'TCA'
    elif s.startswith('M.C.L') or s.startswith ('MCL'):
        return 'MCL'
    elif s.startswith('Section') or s.startswith ('section'):
        return 'SEC'
    else:
        return 'Other'

In [19]:
# the following line of code generates a warning but not an error
cleaned_df2['text_type'] = cleaned_df2['violation_text'].apply(violation_text_startswith)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [20]:
cleaned_df2.text_type.value_counts(dropna = False)

SEC      429
Other    173
MCL      101
TCA       72
None      13
Name: text_type, dtype: int64

##### aggregating the TCA violation type rows into subtypes
because there were only 72 rows visual inspection of a sorted list of values was sufficient to determine how to aggregate the types into subtypes.

In [26]:
def Text_type_startswith(s):
    if s == None:
        return 'None'
    elif s.startswith('T.C.A 57-4'):
        return '57-4, Intoxicating Liquors - Consumption of Alcoholic Beverages on Premises'
    elif s.startswith('T.C.A 57-5'):
        return '57-5, Intoxicating Liquors - Beer'
    elif s.startswith('MCL Chapter 7'):
        return '7- , Alcoholic Beverages'
    elif s.startswith ('MCL Chapter 8'):
        return '8- , Animals'
    elif s.startswith ('MCL Chapter 10'):
        return '10- , Health and Safety'
    elif s.startswith ('MCL Chapter 15'):
        return '15- , Water Sewers and Other Public Services'
    elif s.startswith ('MCL Chapter 16'):
        return '16- , Buildings and Construction'
    elif s.startswith ('MCL Chapter 17'):
        return '17-, zoning'
    else:
        return 'Other'

### storing the text subtypes into a new column

In [27]:
cleaned_df2['text_subtype_and_text_subsubtype'] = cleaned_df2['violation_text'].apply(Text_type_startswith)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [28]:
cleaned_df2.text_subtype_and_text_subsubtype.value_counts(dropna = False)

Other                                                                          678
57-5, Intoxicating Liquors - Beer                                               71
10- , Health and Safety                                                         16
None                                                                            13
15- , Water Sewers and Other Public Services                                     5
8- , Animals                                                                     2
16- , Buildings and Construction                                                 2
57-4, Intoxicating Liquors - Consumption of Alcoholic Beverages on Premises      1
Name: text_subtype_and_text_subsubtype, dtype: int64

# It's now necessary to aggregate the 'SEC', 'Other', 'None' rows by expanding the Text_subtype_starts with function