In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sqlalchemy import create_engine

In [3]:
# load data as dataframe from sql
engine = create_engine('postgresql://postgres:postgres@localhost:5432/Nashville')
violation_types_df = pd.read_sql_query("SELECT * FROM violation_codes", engine)
# visually inspect
violation_types_df.head(5)

Unnamed: 0,violation_id,org_id,violation_type,violation_desc,violation_text,remedial_text,table_name,expired_flag,date_expired,created_by,date_created,modified_by,date_modified,fee_setup_id
0,1,1,BANNERS,Banners,Section 17.32.060 & 17.32.070 - Banners: Bann...,CAAZ_BANNERS,,N,,1,03:52.6,,,
1,2,1,BGMATOWNER,Proper Maintenance Req - Owner,Section 16.16.030 (B) - Proper maintenance req...,CAAH_BG_MATOWNER,,N,,1,03:52.6,,,
2,4,1,BLDGMAINT,Proper Maintenance Req,Section 16.16.030 (A) - Proper maintenance req...,CAAH_BLDG_MAINT,,N,,1,03:52.6,,,
3,5,1,BLDGPERMIT,Building Permit Required,Section 16.28.010 - Building Permit Required: ...,CAAB_BLDG_PERMIT,,N,,1,03:52.6,,,
4,6,1,BLDGSCOPE,Scope of Building Code,Section 16.16.020 (B) - The provisions of this...,CAAH_BLDG_SCOPE,,N,,1,03:52.6,,,


In [4]:
violation_types_df.tail(5)

Unnamed: 0,violation_id,org_id,violation_type,violation_desc,violation_text,remedial_text,table_name,expired_flag,date_expired,created_by,date_created,modified_by,date_modified,fee_setup_id
929,965,1,BBR36,No motions for reconsideration,The Beer Permit Board shall not entertain moti...,,,N,,10635,04:56.0,,,
930,966,1,BBR37,Special Event Permits,Applicants for special event permits will be l...,,,N,,10635,08:59.4,,,
931,967,1,BBR38,Suspension Rules,"Upon suspension of a beer permit, the permitte...",,,N,,10635,10:16.9,,,
932,969,1,BBR41,No permit issued with complaints,A permit shall not be issued by the Executive ...,,,N,,10635,12:53.1,,,
933,970,1,BBR39,Sports Authority Facilities,This rule is applicable only to beer sales wit...,,,N,,10635,16:46.8,,,


In [5]:
violation_types_df.columns

Index(['violation_id', 'org_id', 'violation_type', 'violation_desc',
       'violation_text', 'remedial_text', 'table_name', 'expired_flag',
       'date_expired', 'created_by', 'date_created', 'modified_by',
       'date_modified', 'fee_setup_id'],
      dtype='object')

In [6]:
# Ok. So, what I'm trying to do is reduce these 900ish to 20ish types. So let's decide which column we're going to reduce. 
# The metadata docx said the violation_id is a unique identifier. Let's test that.
print(violation_types_df.shape)

(934, 14)


In [7]:
violation_types_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 934 entries, 0 to 933
Data columns (total 14 columns):
violation_id      928 non-null object
org_id            925 non-null object
violation_type    925 non-null object
violation_desc    925 non-null object
violation_text    912 non-null object
remedial_text     788 non-null object
table_name        788 non-null object
expired_flag      788 non-null object
date_expired      788 non-null object
created_by        788 non-null object
date_created      651 non-null object
modified_by       651 non-null object
date_modified     651 non-null object
fee_setup_id      651 non-null object
dtypes: object(14)
memory usage: 102.2+ KB


In [84]:
# So, there are 934 rows, there should be 934 unique violation_id values. And, they should all be integers. 
# Let's see if they are unique.
violation_types_df.violation_id.value_counts(dropna = False)
#violation_types_df['violation_id'].value_counts(dropna = False)

-                                                                                                                                                                                                                                     119
NaN                                                                                                                                                                                                                                     6
170                                                                                                                                                                                                                                     1
584                                                                                                                                                                                                                                     1
9                                                               

In [9]:
# Ok. So, 119 rows have a dash in them instead of an int. Let's look at them to seee if they are dropable
violation_types_df.query('violation_id == "-"').head(5)

Unnamed: 0,violation_id,org_id,violation_type,violation_desc,violation_text,remedial_text,table_name,expired_flag,date_expired,created_by,date_created,modified_by,date_modified,fee_setup_id
60,-,,,N,,2,00:00.0,,,,,,,
155,-,,,N,,2,00:00.0,,,,,,,
238,-,,,N,,2,00:00.0,,,,,,,
322,-,,,N,,2,00:00.0,,,,,,,
324,-,,,N,,2,00:00.0,,,,,,,


In [10]:
violation_types_df.query('violation_id == "-"').tail(5)

Unnamed: 0,violation_id,org_id,violation_type,violation_desc,violation_text,remedial_text,table_name,expired_flag,date_expired,created_by,date_created,modified_by,date_modified,fee_setup_id
697,-,,,N,,2,00:00.0,,,,,,,
699,-,,,N,,2,00:00.0,,,,,,,
701,-,,,N,,2,00:00.0,,,,,,,
703,-,,,N,,2,00:00.0,,,,,,,
705,-,,,N,,2,00:00.0,,,,,,,


## They definitely look dropable, but how do I confirm this programatically instead of relying on a visual inspection?
####      Well, I've confirmed that the head and tail match a pattern of "dropability" (i.e.:'-, NULL, NULL, N, etch..'). 
So, I could write a loop that looks inside each row, and confirms a pattern match for each cell in that row, if it's a match then I can either mark those rows for dropping, or just drop them directly.
     But all that seems complex. Instead of checking every category of every row, maybe I could just check a subset of 'critical categories' (save some time and effort). 
##### What are my best candidates for critical categories?
   If they don't have a type, description, or text, then they are effectively uncategorizeable. That's a nicer small set of 3. (Time data is irrelevant in this context since we're dealing with violation types, not actual instances of violations.) Ok so I'll try to write a loop that checks the values in those columns for each row.

In [11]:
# I should be able to query the dropable rows and store them as a new dataframe
dropabledf = violation_types_df.query('violation_type == "NULL" and violation_desc == "N" and violation_text == "NULL"')
dropabledf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 137 entries, 15 to 926
Data columns (total 14 columns):
violation_id      131 non-null object
org_id            137 non-null object
violation_type    137 non-null object
violation_desc    137 non-null object
violation_text    137 non-null object
remedial_text     137 non-null object
table_name        137 non-null object
expired_flag      137 non-null object
date_expired      137 non-null object
created_by        137 non-null object
date_created      0 non-null object
modified_by       0 non-null object
date_modified     0 non-null object
fee_setup_id      0 non-null object
dtypes: object(14)
memory usage: 16.1+ KB


it's interesting that I ended up with 137. That's more than the 119 that have a dash instead of a violiation_id but that's fine because the criteria for discrimination applies to the entirety of the original df. If any row is missing those three critical categories then they are uncategorizeable. And that's my primary goal with this df; to reduce it from 900ish to 20ish.

## Ok, so now I've created a df where I can put everything I want to drop. How do I drop it from the original df? 
Some sort of merge? An outermerge should eliminate all entries from the original df that are in the dropable df, since it is a proper subset of the original df. This is a defacto solution. Let's try it.

In [12]:
#help(pd.merge) makes me think that a merge may not be the right approach.

In [13]:
#let's try df.replace instead

Ok, so I'm stuck. I can drop rows by index, but I have to pass the index values as a list. Can I just get the index list from the dropable df? Let's see

In [14]:
dropabledf.index

Int64Index([ 15,  60, 155, 238, 322, 324, 380, 400, 414, 416,
            ...
            708, 896, 903, 906, 908, 914, 917, 920, 923, 926],
           dtype='int64', length=137)

In [15]:
type(dropabledf.index)

pandas.core.indexes.numeric.Int64Index

In [16]:
cleaned_df1 = violation_types_df.drop(dropabledf.index)

In [17]:
cleaned_df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 797 entries, 0 to 933
Data columns (total 14 columns):
violation_id      797 non-null object
org_id            788 non-null object
violation_type    788 non-null object
violation_desc    788 non-null object
violation_text    775 non-null object
remedial_text     651 non-null object
table_name        651 non-null object
expired_flag      651 non-null object
date_expired      651 non-null object
created_by        651 non-null object
date_created      651 non-null object
modified_by       651 non-null object
date_modified     651 non-null object
fee_setup_id      651 non-null object
dtypes: object(14)
memory usage: 93.4+ KB


In [21]:
cleaned_df1['violation_id'].value_counts(dropna = False)

848    1
81     1
952    1
395    1
690    1
847    1
560    1
384    1
893    1
170    1
50     1
435    1
576    1
451    1
40     1
172    1
138    1
390    1
820    1
746    1
100    1
535    1
421    1
9      1
584    1
837    1
951    1
483    1
851    1
849    1
      ..
129    1
564    1
474    1
802    1
28     1
193    1
677    1
118    1
742    1
791    1
832    1
153    1
861    1
82     1
629    1
768    1
869    1
158    1
615    1
161    1
74     1
936    1
748    1
827    1
538    1
51     1
672    1
394    1
784    1
41     1
Name: violation_id, Length: 797, dtype: int64

In [19]:
# Ok. Let's see if I can make a sub df that contains all the rows where the violation id
# isn't an integer
cleaned_df1.query(cleaned_df1.violation_id.is_integer())

AttributeError: 'Series' object has no attribute 'is_integer'

In [None]:
cleaned_df1['violation_id'].apply(is_integer())

In [22]:
help(is_integer)

NameError: name 'is_integer' is not defined

In [23]:
help(is_int)

NameError: name 'is_int' is not defined

In [24]:
help(pd.is_int)

AttributeError: module 'pandas' has no attribute 'is_int'

In [25]:
help(pd.is_integer)

AttributeError: module 'pandas' has no attribute 'is_integer'

In [26]:
help(np.is_int)

AttributeError: module 'numpy' has no attribute 'is_int'

In [27]:
help(isinstance)

Help on built-in function isinstance in module builtins:

isinstance(obj, class_or_tuple, /)
    Return whether an object is an instance of a class or of a subclass thereof.
    
    A tuple, as in ``isinstance(x, (A, B, ...))``, may be given as the target to
    check against. This is equivalent to ``isinstance(x, A) or isinstance(x, B)
    or ...`` etc.



In [35]:
cleaned_df1.query(cleaned_df1['violation_id'].apply(type == 'int'))

TypeError: 'bool' object is not callable

In [36]:
vid_series = cleaned_df1.violation_id

In [37]:
vid_series

0                                                      1
1                                                      2
2                                                      4
3                                                      5
4                                                      6
5                                                      7
6                                                      8
7                                                      9
8                                                     10
9                                                     11
10                                                    12
11                                                    13
12                                                    14
13                                                    15
14                                                    16
16                                                    17
17                                                    18
18                             

In [39]:
vid_series.apply(type)

0      <class 'str'>
1      <class 'str'>
2      <class 'str'>
3      <class 'str'>
4      <class 'str'>
5      <class 'str'>
6      <class 'str'>
7      <class 'str'>
8      <class 'str'>
9      <class 'str'>
10     <class 'str'>
11     <class 'str'>
12     <class 'str'>
13     <class 'str'>
14     <class 'str'>
16     <class 'str'>
17     <class 'str'>
18     <class 'str'>
19     <class 'str'>
20     <class 'str'>
21     <class 'str'>
22     <class 'str'>
23     <class 'str'>
24     <class 'str'>
25     <class 'str'>
26     <class 'str'>
27     <class 'str'>
28     <class 'str'>
29     <class 'str'>
30     <class 'str'>
           ...      
895    <class 'str'>
897    <class 'str'>
898    <class 'str'>
899    <class 'str'>
900    <class 'str'>
901    <class 'str'>
902    <class 'str'>
904    <class 'str'>
905    <class 'str'>
907    <class 'str'>
909    <class 'str'>
910    <class 'str'>
911    <class 'str'>
912    <class 'str'>
913    <class 'str'>
915    <class 'str'>
916    <class

In [61]:
np.sort(vid_series)

array(['1', '10', '100', '101', '102', '103', '104', '105', '106', '107',
       '108', '109', '11', '110', '111', '112', '113', '114', '115', '116',
       '117', '118', '119', '12', '120', '121', '122', '123', '124', '125',
       '126', '127', '128', '129', '13', '130', '131', '132', '133', '134',
       '135', '136', '137', '138', '139', '14', '140', '141', '142', '143',
       '144', '145', '146', '147', '148', '149', '15', '150', '151', '152',
       '153', '154', '155', '156', '157', '158', '159', '16', '160', '161',
       '162', '163', '164', '165', '166', '167', '168', '169', '17', '170',
       '171', '172', '173', '174', '175', '176', '177', '178', '179', '18',
       '180', '181', '182', '183', '184', '185', '186', '187', '188',
       '189', '19', '190', '191', '192', '193', '194', '195', '196', '197',
       '198', '199', '2', '20', '200', '201', '202', '203', '204', '205',
       '206', '207', '208', '209', '21', '210', '212', '213', '214', '22',
       '23', '24', '247

In [93]:
#vid_series.query('len =< 4') This didn't work

In [94]:
#cleaned_df1['violation_id'].query(len < 4) this also didn't work

In [95]:
#cleaned_df1.query('violation_id.len == 4') also didn't work

In [96]:
#vids_numeric = vid_series.apply(pd.to_numeric(errors = 'coerce')) also didn't work

In [99]:
vids_numeric = pd.to_numeric(vid_series, errors = 'coerce', downcast='integer')

In [90]:
vids_numeric

0        1.0
1        2.0
2        4.0
3        5.0
4        6.0
5        7.0
6        8.0
7        9.0
8       10.0
9       11.0
10      12.0
11      13.0
12      14.0
13      15.0
14      16.0
16      17.0
17      18.0
18      19.0
19      20.0
20      21.0
21      22.0
22      23.0
23      24.0
24      25.0
25      26.0
26      27.0
27      28.0
28      29.0
29      30.0
30      31.0
       ...  
895    946.0
897    947.0
898      NaN
899      NaN
900      NaN
901      NaN
902      NaN
904    948.0
905    949.0
907    951.0
909    952.0
910    953.0
911    954.0
912    955.0
913      NaN
915    956.0
916    957.0
918    959.0
919      NaN
921    960.0
922    968.0
924    961.0
925    962.0
927    963.0
928    964.0
929    965.0
930    966.0
931    967.0
932    969.0
933    970.0
Name: violation_id, Length: 797, dtype: float64

In [92]:
cleaned_df1.remedial_text.value_counts()

NULL                    443
CAAH_BLDG_SCOPE           2
CAAZ_SIGN_MOTION          1
CAAE_ELEC_EQ              1
CAMCL_HSAN_STD            1
CAAH_RESP_MAINT           1
CAAZ_PERMIT_BLD_REQ       1
CAMCL_LIT_BUSCIT          1
CAAH_EXT_OPNWIN           1
CAAH_RUB_GARFAC           1
CAAH_EXT_FENMAT           1
CAAE_ELECEQ_LIGHT         1
CAMCL_LIT_VACLOT          1
CAAH_EXT_VEHRP            1
CAAH_OCCUP_LIMPVY         1
CAMCL_QUAR_HOLE           1
CAAH_EXT_GLAZ             1
CAAZ_SIGN_PERMIT_REQ      1
CAAZ_SIGN_IPLS            1
CAMCL_HAZ_WST_PATH        1
CAAH_EXT_ROFDRAN          1
CAAZ_OVCROWDEDFM          1
CAAH_INT_BLDSUR           1
CAAH_SCH_BUS              1
CAAH_EXTERIOR_REPAIR      1
CAAZ_PAVING_PARK          1
CAAP_PLUM_DWLUT           1
CAAH_HEAT_WRK_SPC         1
CAMCL_LIT_NOTICE          1
CAMCL_LIT_HNBILL_VEH      1
                       ... 
CAAH_EXT_DOOR             1
CAAH_EXT_ROFSYS           1
CAAH_LIGHT_CHALL          1
CAAH_EXT_BALCO            1
CAAZ_USE_NOT_PERMIT 