In [239]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sqlalchemy import create_engine

In [240]:
# load data as dataframe from sql
engine = create_engine('postgresql://postgres:postgres@localhost:5432/Nashville')
violation_types_df = pd.read_sql_query("SELECT * FROM violation_codes", engine)
# visually inspect
violation_types_df.head(5)

Unnamed: 0,violation_id,org_id,violation_type,violation_desc,violation_text,remedial_text,table_name,expired_flag,date_expired,created_by,date_created,modified_by,date_modified,fee_setup_id
0,1,1,BANNERS,Banners,Section 17.32.060 & 17.32.070 - Banners: Bann...,CAAZ_BANNERS,,N,,1,03:52.6,,,
1,2,1,BGMATOWNER,Proper Maintenance Req - Owner,Section 16.16.030 (B) - Proper maintenance req...,CAAH_BG_MATOWNER,,N,,1,03:52.6,,,
2,4,1,BLDGMAINT,Proper Maintenance Req,Section 16.16.030 (A) - Proper maintenance req...,CAAH_BLDG_MAINT,,N,,1,03:52.6,,,
3,5,1,BLDGPERMIT,Building Permit Required,Section 16.28.010 - Building Permit Required: ...,CAAB_BLDG_PERMIT,,N,,1,03:52.6,,,
4,6,1,BLDGSCOPE,Scope of Building Code,Section 16.16.020 (B) - The provisions of this...,CAAH_BLDG_SCOPE,,N,,1,03:52.6,,,


In [241]:
violation_types_df.tail(5)

Unnamed: 0,violation_id,org_id,violation_type,violation_desc,violation_text,remedial_text,table_name,expired_flag,date_expired,created_by,date_created,modified_by,date_modified,fee_setup_id
929,965,1,BBR36,No motions for reconsideration,The Beer Permit Board shall not entertain moti...,,,N,,10635,04:56.0,,,
930,966,1,BBR37,Special Event Permits,Applicants for special event permits will be l...,,,N,,10635,08:59.4,,,
931,967,1,BBR38,Suspension Rules,"Upon suspension of a beer permit, the permitte...",,,N,,10635,10:16.9,,,
932,969,1,BBR41,No permit issued with complaints,A permit shall not be issued by the Executive ...,,,N,,10635,12:53.1,,,
933,970,1,BBR39,Sports Authority Facilities,This rule is applicable only to beer sales wit...,,,N,,10635,16:46.8,,,


In [242]:
violation_types_df.columns

Index(['violation_id', 'org_id', 'violation_type', 'violation_desc',
       'violation_text', 'remedial_text', 'table_name', 'expired_flag',
       'date_expired', 'created_by', 'date_created', 'modified_by',
       'date_modified', 'fee_setup_id'],
      dtype='object')

In [243]:
# Ok. So, what I'm trying to do is reduce these 900ish to 20ish types. So let's decide which column we're going to reduce. 
# The metadata docx said the violation_id is a unique identifier. Let's test that.
print(violation_types_df.shape)

(934, 14)


In [244]:
violation_types_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 934 entries, 0 to 933
Data columns (total 14 columns):
violation_id      928 non-null object
org_id            925 non-null object
violation_type    925 non-null object
violation_desc    925 non-null object
violation_text    912 non-null object
remedial_text     788 non-null object
table_name        788 non-null object
expired_flag      788 non-null object
date_expired      788 non-null object
created_by        788 non-null object
date_created      651 non-null object
modified_by       651 non-null object
date_modified     651 non-null object
fee_setup_id      651 non-null object
dtypes: object(14)
memory usage: 102.2+ KB


In [245]:
# So, there are 934 rows, there should be 934 unique violation_id values. And, they should all be integers. 
# Let's see if they are unique.
violation_types_df.violation_id.value_counts(dropna = False)
#violation_types_df['violation_id'].value_counts(dropna = False)

-                                                                                                                                                                                                                                                                                                                     119
NaN                                                                                                                                                                                                                                                                                                                     6
75                                                                                                                                                                                                                                                                                                                      1
425                                                       

In [246]:
# Ok. So, 119 rows have a dash in them instead of an int. Let's look at them to seee if they are dropable
violation_types_df.query('violation_id == "-"').head(5)

Unnamed: 0,violation_id,org_id,violation_type,violation_desc,violation_text,remedial_text,table_name,expired_flag,date_expired,created_by,date_created,modified_by,date_modified,fee_setup_id
60,-,,,N,,2,00:00.0,,,,,,,
155,-,,,N,,2,00:00.0,,,,,,,
238,-,,,N,,2,00:00.0,,,,,,,
322,-,,,N,,2,00:00.0,,,,,,,
324,-,,,N,,2,00:00.0,,,,,,,


In [247]:
violation_types_df.query('violation_id == "-"').tail(5)

Unnamed: 0,violation_id,org_id,violation_type,violation_desc,violation_text,remedial_text,table_name,expired_flag,date_expired,created_by,date_created,modified_by,date_modified,fee_setup_id
697,-,,,N,,2,00:00.0,,,,,,,
699,-,,,N,,2,00:00.0,,,,,,,
701,-,,,N,,2,00:00.0,,,,,,,
703,-,,,N,,2,00:00.0,,,,,,,
705,-,,,N,,2,00:00.0,,,,,,,


## They definitely look dropable, but how do I confirm this programatically instead of relying on a visual inspection?
####      Well, I've confirmed that the head and tail match a pattern of "dropability" (i.e.:'-, NULL, NULL, N, etc...'). 
So, I could write a loop that looks inside each row, and confirms a pattern match for each cell in that row, if it's a match then I can either mark those rows for dropping, or just drop them directly.
     But all that seems complex. Instead of checking every category of every row, maybe I could just check a subset of 'critical categories' (save some time and effort). 
##### What are my best candidates for critical categories?
   If they don't have a type, description, or text, then they are effectively uncategorizeable. That's a nicer small set of 3. (Time data is irrelevant in this context since we're dealing with violation types, not actual instances of violations.) Ok so I'll try to write a loop that checks the values in those columns for each row.

In [248]:
# I should be able to query the dropable rows and store them as a new dataframe
dropabledf = violation_types_df.query('violation_type == "NULL" and violation_desc == "N" and violation_text == "NULL"')
dropabledf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 137 entries, 15 to 926
Data columns (total 14 columns):
violation_id      131 non-null object
org_id            137 non-null object
violation_type    137 non-null object
violation_desc    137 non-null object
violation_text    137 non-null object
remedial_text     137 non-null object
table_name        137 non-null object
expired_flag      137 non-null object
date_expired      137 non-null object
created_by        137 non-null object
date_created      0 non-null object
modified_by       0 non-null object
date_modified     0 non-null object
fee_setup_id      0 non-null object
dtypes: object(14)
memory usage: 16.1+ KB


it's interesting that I ended up with 137. That's more than the 119 that have a dash instead of a violiation_id but that's fine because the criteria for discrimination applies to the entirety of the original df. If any row is missing those three critical categories then they are uncategorizeable. And that's my primary goal with this df; to reduce it from 900ish to 20ish.

## Ok, so now I've created a df where I can put everything I want to drop. How do I drop it from the original df? 
Some sort of merge? An outermerge should eliminate all entries from the original df that are in the dropable df, since it is a proper subset of the original df. This is a defacto solution. Let's try it.

In [249]:
#help(pd.merge) makes me think that a merge may not be the right approach.

In [250]:
#let's try df.replace instead

Ok, so I'm stuck. I can drop rows by index, but I have to pass the index values as a list. Can I just get the index list from the dropable df? Let's see

In [251]:
dropabledf.index

Int64Index([ 15,  60, 155, 238, 322, 324, 380, 400, 414, 416,
            ...
            708, 896, 903, 906, 908, 914, 917, 920, 923, 926],
           dtype='int64', length=137)

In [252]:
type(dropabledf.index)

pandas.core.indexes.numeric.Int64Index

In [253]:
cleaned_df1 = violation_types_df.drop(dropabledf.index)

In [254]:
cleaned_df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 797 entries, 0 to 933
Data columns (total 14 columns):
violation_id      797 non-null object
org_id            788 non-null object
violation_type    788 non-null object
violation_desc    788 non-null object
violation_text    775 non-null object
remedial_text     651 non-null object
table_name        651 non-null object
expired_flag      651 non-null object
date_expired      651 non-null object
created_by        651 non-null object
date_created      651 non-null object
modified_by       651 non-null object
date_modified     651 non-null object
fee_setup_id      651 non-null object
dtypes: object(14)
memory usage: 93.4+ KB


In [255]:
cleaned_df1['violation_id'].value_counts(dropna = False)

144                                                                                                                                                                        1
425                                                                                                                                                                        1
649                                                                                                                                                                        1
109                                                                                                                                                                        1
929                                                                                                                                                                        1
636                                                                                                                                    

In [256]:
# Ok. Let's see if I can make a sub df that contains all the rows where the violation id
# isn't an integer
# cleaned_df1.query(cleaned_df1.violation_id.is_integer()) didn't work

In [257]:
# cleaned_df1['violation_id'].apply(is_integer()) didn't work

In [258]:
# help(is_integer)

In [259]:
# help(is_int)

In [260]:
# help(pd.is_int)

In [261]:
# help(pd.is_integer)

In [262]:
# help(np.is_int)

In [263]:
# help(isinstance)

In [264]:
#cleaned_df1.query(cleaned_df1['violation_id'].apply(type == 'int'))

In [265]:
vid_series = cleaned_df1.violation_id

In [266]:
vid_series

0                                                      1
1                                                      2
2                                                      4
3                                                      5
4                                                      6
5                                                      7
6                                                      8
7                                                      9
8                                                     10
9                                                     11
10                                                    12
11                                                    13
12                                                    14
13                                                    15
14                                                    16
16                                                    17
17                                                    18
18                             

In [267]:
vid_series.apply(type)

0      <class 'str'>
1      <class 'str'>
2      <class 'str'>
3      <class 'str'>
4      <class 'str'>
5      <class 'str'>
6      <class 'str'>
7      <class 'str'>
8      <class 'str'>
9      <class 'str'>
10     <class 'str'>
11     <class 'str'>
12     <class 'str'>
13     <class 'str'>
14     <class 'str'>
16     <class 'str'>
17     <class 'str'>
18     <class 'str'>
19     <class 'str'>
20     <class 'str'>
21     <class 'str'>
22     <class 'str'>
23     <class 'str'>
24     <class 'str'>
25     <class 'str'>
26     <class 'str'>
27     <class 'str'>
28     <class 'str'>
29     <class 'str'>
30     <class 'str'>
           ...      
895    <class 'str'>
897    <class 'str'>
898    <class 'str'>
899    <class 'str'>
900    <class 'str'>
901    <class 'str'>
902    <class 'str'>
904    <class 'str'>
905    <class 'str'>
907    <class 'str'>
909    <class 'str'>
910    <class 'str'>
911    <class 'str'>
912    <class 'str'>
913    <class 'str'>
915    <class 'str'>
916    <class

In [268]:
np.sort(vid_series)

array(['1', '10', '100', '101', '102', '103', '104', '105', '106', '107',
       '108', '109', '11', '110', '111', '112', '113', '114', '115', '116',
       '117', '118', '119', '12', '120', '121', '122', '123', '124', '125',
       '126', '127', '128', '129', '13', '130', '131', '132', '133', '134',
       '135', '136', '137', '138', '139', '14', '140', '141', '142', '143',
       '144', '145', '146', '147', '148', '149', '15', '150', '151', '152',
       '153', '154', '155', '156', '157', '158', '159', '16', '160', '161',
       '162', '163', '164', '165', '166', '167', '168', '169', '17', '170',
       '171', '172', '173', '174', '175', '176', '177', '178', '179', '18',
       '180', '181', '182', '183', '184', '185', '186', '187', '188',
       '189', '19', '190', '191', '192', '193', '194', '195', '196', '197',
       '198', '199', '2', '20', '200', '201', '202', '203', '204', '205',
       '206', '207', '208', '209', '21', '210', '212', '213', '214', '22',
       '23', '24', '247

In [269]:
#vid_series.query('len =< 4') This didn't work

In [270]:
#cleaned_df1['violation_id'].query(len < 4) this also didn't work

In [271]:
#cleaned_df1.query('violation_id.len == 4') also didn't work

In [272]:
#vids_numeric = vid_series.apply(pd.to_numeric(errors = 'coerce')) also didn't work

In [273]:
vids_numeric = pd.to_numeric(vid_series, errors = 'coerce', downcast='integer')

In [274]:
vids_numeric

0        1.0
1        2.0
2        4.0
3        5.0
4        6.0
5        7.0
6        8.0
7        9.0
8       10.0
9       11.0
10      12.0
11      13.0
12      14.0
13      15.0
14      16.0
16      17.0
17      18.0
18      19.0
19      20.0
20      21.0
21      22.0
22      23.0
23      24.0
24      25.0
25      26.0
26      27.0
27      28.0
28      29.0
29      30.0
30      31.0
       ...  
895    946.0
897    947.0
898      NaN
899      NaN
900      NaN
901      NaN
902      NaN
904    948.0
905    949.0
907    951.0
909    952.0
910    953.0
911    954.0
912    955.0
913      NaN
915    956.0
916    957.0
918    959.0
919      NaN
921    960.0
922    968.0
924    961.0
925    962.0
927    963.0
928    964.0
929    965.0
930    966.0
931    967.0
932    969.0
933    970.0
Name: violation_id, Length: 797, dtype: float64

In [275]:
cleaned_df1.remedial_text.value_counts()

NULL                    443
CAAH_BLDG_SCOPE           2
CAAE_ELECEA_RECPT         1
CAAH_INT_INTDOOR          1
CAAP_PLUM_SYSHAZ          1
CAMCL_LIT_BUSREP          1
CAMCL_SEW_RUNOFF          1
CAAH_EXTER_MOCCUP         1
CAAH_EXT_DOOR             1
CAAP_WATER_WTRHEAT        1
CAMCL_LIT_PROPPL          1
CAAH_OCCUP_WCACC          1
CAAH_EXT_INSSCNRES        1
CAAH_EXT_TRLS             1
CAMCL_LIT_VEH             1
CAAH_BOARDING_REQU        1
CAAH_EXT_CHIMTW           1
CAAB_STOP_WORK            1
CAAH_INT_BLDSUR           1
CAAZ_SIGN_UNSAILL         1
CAAH_EXT_VEHRP            1
CAAH_EXT_WEED             1
CAAH_EXTER_REQ            1
CAAH_EXT_FDWALL           1
CAAH_EXT_FENMAT           1
CAAH_VENT_DRYER           1
CAAZ_CAR_ROW              1
CAAE_ELECFAC              1
CAAH_TRUCK                1
CAAP_PLUM_HOTEL           1
                       ... 
CAAZ_OVCROWDEDFM          1
CAAH_EXT_PTSID            1
CAAP_TOLRN_LOCEMFC        1
CAAE_ELECEQ_LIGHT         1
CAAH_OCCUP_RMWID    

In [276]:
def conv(x):
    try:
        int(x)
        return True
    except:
        return False
    
numeric = cleaned_df1['violation_id'].apply(conv)
cleaned_df2 = cleaned_df1[numeric]

In [277]:
cleaned_df2

Unnamed: 0,violation_id,org_id,violation_type,violation_desc,violation_text,remedial_text,table_name,expired_flag,date_expired,created_by,date_created,modified_by,date_modified,fee_setup_id
0,1,1,BANNERS,Banners,Section 17.32.060 & 17.32.070 - Banners: Bann...,CAAZ_BANNERS,,N,,1,03:52.6,,,
1,2,1,BGMATOWNER,Proper Maintenance Req - Owner,Section 16.16.030 (B) - Proper maintenance req...,CAAH_BG_MATOWNER,,N,,1,03:52.6,,,
2,4,1,BLDGMAINT,Proper Maintenance Req,Section 16.16.030 (A) - Proper maintenance req...,CAAH_BLDG_MAINT,,N,,1,03:52.6,,,
3,5,1,BLDGPERMIT,Building Permit Required,Section 16.28.010 - Building Permit Required: ...,CAAB_BLDG_PERMIT,,N,,1,03:52.6,,,
4,6,1,BLDGSCOPE,Scope of Building Code,Section 16.16.020 (B) - The provisions of this...,CAAH_BLDG_SCOPE,,N,,1,03:52.6,,,
5,7,1,BOARDINGRQ,Boarding Requirements,Section 16.24.340 (U) (2) – Boarding of Vacant...,CAAH_BOARDING_REQU,,N,,1,03:52.6,,,
6,8,1,BOARDVAC,Boarding of Vacant Buildings,Section 16.24.340 (U) (1) – Boarding of Vacant...,CAAH_BOARDING_VABLDG,,N,,1,03:52.6,,,
7,9,1,CARROW,Vehicle In The Right-Of-Way,Section 12.08.210 - Abandoned Vehicles: Abando...,CAMCL_SEW_CONCT,,N,,1,03:52.6,,,
8,10,1,CERTCOMP,Certificate of Compliance Required,Section 17.40.580 - Certificate of Compliance:...,CAAZ_CERT_COMP,,N,,1,03:52.6,,,
9,11,1,CERTOCC,Certificate of Occupancy Required,Section 16.24.130 – Certificate of Occupancy R...,CAAH_CERT_OCC,,N,,1,03:52.6,,,


In [278]:
cleaned_df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 788 entries, 0 to 933
Data columns (total 14 columns):
violation_id      788 non-null object
org_id            788 non-null object
violation_type    788 non-null object
violation_desc    788 non-null object
violation_text    775 non-null object
remedial_text     651 non-null object
table_name        651 non-null object
expired_flag      651 non-null object
date_expired      651 non-null object
created_by        651 non-null object
date_created      651 non-null object
modified_by       651 non-null object
date_modified     651 non-null object
fee_setup_id      651 non-null object
dtypes: object(14)
memory usage: 92.3+ KB


In [279]:
cleaned_df2.violation_id.value_counts(dropna=False).head(5)

144    1
412    1
953    1
551    1
852    1
Name: violation_id, dtype: int64

In [280]:
def violation_text_startswith(s):
    if s == None:
        return 'None'
    elif s.startswith('T.C.A') or s.startswith ('TCA'):
        return 'TCA'
    elif s.startswith('M.C.L') or s.startswith ('MCL'):
        return 'MCL'
    elif s.startswith('Section') or s.startswith ('section'):
        return 'SEC'
    else:
        return 'Other'

In [281]:
cleaned_df2['text_type'] = cleaned_df2['violation_text'].apply(violation_text_startswith)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [282]:
cleaned_df2

Unnamed: 0,violation_id,org_id,violation_type,violation_desc,violation_text,remedial_text,table_name,expired_flag,date_expired,created_by,date_created,modified_by,date_modified,fee_setup_id,text_type
0,1,1,BANNERS,Banners,Section 17.32.060 & 17.32.070 - Banners: Bann...,CAAZ_BANNERS,,N,,1,03:52.6,,,,SEC
1,2,1,BGMATOWNER,Proper Maintenance Req - Owner,Section 16.16.030 (B) - Proper maintenance req...,CAAH_BG_MATOWNER,,N,,1,03:52.6,,,,SEC
2,4,1,BLDGMAINT,Proper Maintenance Req,Section 16.16.030 (A) - Proper maintenance req...,CAAH_BLDG_MAINT,,N,,1,03:52.6,,,,SEC
3,5,1,BLDGPERMIT,Building Permit Required,Section 16.28.010 - Building Permit Required: ...,CAAB_BLDG_PERMIT,,N,,1,03:52.6,,,,SEC
4,6,1,BLDGSCOPE,Scope of Building Code,Section 16.16.020 (B) - The provisions of this...,CAAH_BLDG_SCOPE,,N,,1,03:52.6,,,,SEC
5,7,1,BOARDINGRQ,Boarding Requirements,Section 16.24.340 (U) (2) – Boarding of Vacant...,CAAH_BOARDING_REQU,,N,,1,03:52.6,,,,SEC
6,8,1,BOARDVAC,Boarding of Vacant Buildings,Section 16.24.340 (U) (1) – Boarding of Vacant...,CAAH_BOARDING_VABLDG,,N,,1,03:52.6,,,,SEC
7,9,1,CARROW,Vehicle In The Right-Of-Way,Section 12.08.210 - Abandoned Vehicles: Abando...,CAMCL_SEW_CONCT,,N,,1,03:52.6,,,,SEC
8,10,1,CERTCOMP,Certificate of Compliance Required,Section 17.40.580 - Certificate of Compliance:...,CAAZ_CERT_COMP,,N,,1,03:52.6,,,,SEC
9,11,1,CERTOCC,Certificate of Occupancy Required,Section 16.24.130 – Certificate of Occupancy R...,CAAH_CERT_OCC,,N,,1,03:52.6,,,,SEC


In [283]:
cleaned_df2.query('text_type == "TCA"').info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 72 entries, 807 to 882
Data columns (total 15 columns):
violation_id      72 non-null object
org_id            72 non-null object
violation_type    72 non-null object
violation_desc    72 non-null object
violation_text    72 non-null object
remedial_text     72 non-null object
table_name        72 non-null object
expired_flag      72 non-null object
date_expired      72 non-null object
created_by        72 non-null object
date_created      72 non-null object
modified_by       72 non-null object
date_modified     72 non-null object
fee_setup_id      72 non-null object
text_type         72 non-null object
dtypes: object(15)
memory usage: 9.0+ KB


In [284]:
cleaned_df2.query('text_type == "TCA"').violation_id.value_counts(dropna = False)

855    1
882    1
887    1
900    1
929    1
883    1
876    1
897    1
903    1
866    1
910    1
892    1
863    1
899    1
909    1
868    1
920    1
871    1
881    1
916    1
870    1
906    1
859    1
875    1
858    1
908    1
911    1
857    1
886    1
919    1
      ..
854    1
891    1
927    1
934    1
893    1
904    1
856    1
898    1
896    1
864    1
880    1
888    1
885    1
879    1
917    1
878    1
889    1
894    1
914    1
853    1
907    1
915    1
861    1
890    1
877    1
902    1
905    1
895    1
873    1
913    1
Name: violation_id, Length: 72, dtype: int64

In [285]:
violations_TCA = cleaned_df2.query('text_type == "TCA"')

In [286]:
violations_TCA.head(72)

Unnamed: 0,violation_id,org_id,violation_type,violation_desc,violation_text,remedial_text,table_name,expired_flag,date_expired,created_by,date_created,modified_by,date_modified,fee_setup_id,text_type
807,896,1,BT575416,Beer stored at other address,T.C.A 57-5-416: Unless authorized in writing b...,,,N,,2,00:00.0,,,,TCA
808,853,1,BT575101A2,Not Manufacturer and Wholesale Retail,T.C.A 57-5-101(a)(2): Except as otherwise prov...,,,N,,2,00:00.0,,,,TCA
809,854,1,BT575101A3,Not Wholesale and Manufacturer Retail,T.C.A 57-5-101(a)(3): Except as otherwise prov...,,,N,,2,00:00.0,,,,TCA
810,855,1,BT575103A1,Operating without county city permit,T.C.A 57-5-103(a)(1): It is unlawful to operat...,,,N,,2,00:00.0,,,,TCA
811,856,1,BT575103A10,Permits to citizen legal resident,"T.C.A 57-5-103(a)(10): After July 1, 2015, a c...",,,N,,2,00:00.0,,,,TCA
812,857,1,BT575103A2,Permit to business owner,T.C.A 57-5-103(a)(2): Permits shall be issued ...,,,N,,2,00:00.0,,,,TCA
813,858,1,BT575103A3A,Cannot transfer ownership,T.C.A 57-5-103(a)(3)(A): A permit shall be val...,,,N,,2,00:00.0,,,,TCA
814,859,1,BT575103A3B,Permit for single location only,T.C.A 57-5-103(a)(3)(B): A permit shall be val...,,,N,,2,00:00.0,,,,TCA
815,860,1,BT575103A3C,Business Name same as Permit Name,T.C.A 57-5-103(a)(3)(C): A permit shall be val...,,,N,,2,00:00.0,,,,TCA
816,861,1,BT575103A4,Two businesses one location one permit,T.C.A 57-5-103(a)(4): Where an owner operates ...,,,N,,2,00:00.0,,,,TCA


I expected to see more TCA sections than just 57, check to see if they're there.

In [287]:
sorted(violations_TCA.violation_text)

['T.C.A 57-4-203(B)(1)(A): any licensee or other person who sells, furnishes, disposes of, gives, or causes to be sold, furnished, disposed of, or given, any alcoholic beverage to any person under 21 years of age',
 'T.C.A 57-5-101(a)(2): Except as otherwise provided for in this part, no brewer or manufacturer of beer shall have any financial or ownership interest, direct or indirect, in the business of or a building containing a wholesale or retail licensee, including to furnish or loan any fixtures of any kind to a retail licensee, and no such brewer or manufacturer shall hold a wholesale or retail license. For purposes of this section, an indirect interest includes any interest acquired by affiliates, subsidiaries, corporate officials, partners, or employees of the brewer or manufacturer.',
 'T.C.A 57-5-101(a)(3): Except as otherwise provided for in this part, no wholesaler shall hold any financial or ownership interest, direct or indirect, in the business of or a building containin

They're no.
So, did something went wrong with my labeling strategy?

In [288]:
cleaned_df2.text_type.value_counts(dropna = False)

SEC      429
Other    173
MCL      101
TCA       72
None      13
Name: text_type, dtype: int64

In [289]:
print("the counta of the text types is ", 429+173+101+72+13)
cleaned_df2.shape

the counta of the text types is  788


(788, 15)

In [290]:
def TCA_text_startswith(s):
    if s == None:
        return 'None'
    elif s.startswith('T.C.A 57-4'):
        return '57-4, Intoxicating Liquors - Consumption of Alcoholic Beverages on Premises'
    elif s.startswith('T.C.A 57-5'):
        return '57-5, Intoxicating Liquors - Beer'
    else:
        return 'Other'

In [291]:
cleaned_df2['text_subtype_and_text_subsubtype'] = cleaned_df2['violation_text'].apply(TCA_text_startswith)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [292]:
cleaned_df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 788 entries, 0 to 933
Data columns (total 16 columns):
violation_id                        788 non-null object
org_id                              788 non-null object
violation_type                      788 non-null object
violation_desc                      788 non-null object
violation_text                      775 non-null object
remedial_text                       651 non-null object
table_name                          651 non-null object
expired_flag                        651 non-null object
date_expired                        651 non-null object
created_by                          651 non-null object
date_created                        651 non-null object
modified_by                         651 non-null object
date_modified                       651 non-null object
fee_setup_id                        651 non-null object
text_type                           788 non-null object
text_subtype_and_text_subsubtype    788 non-null object

Ok. I think I've successfully given meaningful categories to everything that is TCA. Lets look at it as a horizontal slice.

In [293]:
cleaned_df2.query('text_type == "TCA"')

Unnamed: 0,violation_id,org_id,violation_type,violation_desc,violation_text,remedial_text,table_name,expired_flag,date_expired,created_by,date_created,modified_by,date_modified,fee_setup_id,text_type,text_subtype_and_text_subsubtype
807,896,1,BT575416,Beer stored at other address,T.C.A 57-5-416: Unless authorized in writing b...,,,N,,2,00:00.0,,,,TCA,"57-5, Intoxicating Liquors - Beer"
808,853,1,BT575101A2,Not Manufacturer and Wholesale Retail,T.C.A 57-5-101(a)(2): Except as otherwise prov...,,,N,,2,00:00.0,,,,TCA,"57-5, Intoxicating Liquors - Beer"
809,854,1,BT575101A3,Not Wholesale and Manufacturer Retail,T.C.A 57-5-101(a)(3): Except as otherwise prov...,,,N,,2,00:00.0,,,,TCA,"57-5, Intoxicating Liquors - Beer"
810,855,1,BT575103A1,Operating without county city permit,T.C.A 57-5-103(a)(1): It is unlawful to operat...,,,N,,2,00:00.0,,,,TCA,"57-5, Intoxicating Liquors - Beer"
811,856,1,BT575103A10,Permits to citizen legal resident,"T.C.A 57-5-103(a)(10): After July 1, 2015, a c...",,,N,,2,00:00.0,,,,TCA,"57-5, Intoxicating Liquors - Beer"
812,857,1,BT575103A2,Permit to business owner,T.C.A 57-5-103(a)(2): Permits shall be issued ...,,,N,,2,00:00.0,,,,TCA,"57-5, Intoxicating Liquors - Beer"
813,858,1,BT575103A3A,Cannot transfer ownership,T.C.A 57-5-103(a)(3)(A): A permit shall be val...,,,N,,2,00:00.0,,,,TCA,"57-5, Intoxicating Liquors - Beer"
814,859,1,BT575103A3B,Permit for single location only,T.C.A 57-5-103(a)(3)(B): A permit shall be val...,,,N,,2,00:00.0,,,,TCA,"57-5, Intoxicating Liquors - Beer"
815,860,1,BT575103A3C,Business Name same as Permit Name,T.C.A 57-5-103(a)(3)(C): A permit shall be val...,,,N,,2,00:00.0,,,,TCA,"57-5, Intoxicating Liquors - Beer"
816,861,1,BT575103A4,Two businesses one location one permit,T.C.A 57-5-103(a)(4): Where an owner operates ...,,,N,,2,00:00.0,,,,TCA,"57-5, Intoxicating Liquors - Beer"


In [294]:
cleaned_df2.query('text_type == "TCA"').info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 72 entries, 807 to 882
Data columns (total 16 columns):
violation_id                        72 non-null object
org_id                              72 non-null object
violation_type                      72 non-null object
violation_desc                      72 non-null object
violation_text                      72 non-null object
remedial_text                       72 non-null object
table_name                          72 non-null object
expired_flag                        72 non-null object
date_expired                        72 non-null object
created_by                          72 non-null object
date_created                        72 non-null object
modified_by                         72 non-null object
date_modified                       72 non-null object
fee_setup_id                        72 non-null object
text_type                           72 non-null object
text_subtype_and_text_subsubtype    72 non-null object
dtypes: object

In [295]:
cleaned_df2.text_subtype_and_text_subsubtype.value_counts(dropna = False)

Other                                                                          703
57-5, Intoxicating Liquors - Beer                                               71
None                                                                            13
57-4, Intoxicating Liquors - Consumption of Alcoholic Beverages on Premises      1
Name: text_subtype_and_text_subsubtype, dtype: int64

That's interesting. Why are there 13 nones. Let's look at them as text subtypes

In [296]:
cleaned_df2.text_type.value_counts(dropna=False)

SEC      429
Other    173
MCL      101
TCA       72
None      13
Name: text_type, dtype: int64

They're still there, which means that the violation text was probably none for 13 rows. Lets take a look at those rows.

In [297]:
cleaned_df2.query('text_type == "None"')

Unnamed: 0,violation_id,org_id,violation_type,violation_desc,violation_text,remedial_text,table_name,expired_flag,date_expired,created_by,date_created,modified_by,date_modified,fee_setup_id,text_type,text_subtype_and_text_subsubtype
219,247,1,917,HOUSE DISREPAIR,,,,N,,2,00:00.0,,,,,
710,756,1,HD1032120,APPROVED GARBAGE & TRASH CONTAINERS,,,,N,,2,00:00.0,,,,,
711,757,1,HD1032130,ACCUMUALTION OF GARBAGE & TRASH,,,,N,,2,00:00.0,,,,,
712,758,1,HD1032140,"ACCUMUALTION OF LUMBER, BOXES, BARRELS",,,,N,,2,00:00.0,,,,,
713,759,1,HD1032160,STAGNANT WATER COLLECTION,,,,N,,2,00:00.0,,,,,
714,760,1,HD10360201,"DIRT, FILTH, RUBBISH & GARBAGE",,,,N,,2,00:00.0,,,,,
715,761,1,HD10360202,VERMIN AND RODENT INFESTATION,,,,N,,2,00:00.0,,,,,
716,762,1,HD10360203,EXCESS VEGETATION,,,,N,,2,00:00.0,,,,,
717,763,1,HD10360204,CLEAN AMINAL AREAS,,,,N,,2,00:00.0,,,,,
718,764,1,HD1572020,SEWAGE ON GROUND,,,,N,,2,00:00.0,,,,,


Weird. I'd drop them except that they seem to have meaningfull type and description... What I have noticed is that they all have types beginning with HD, and all the descriptions are germane to housing. Label their text_type as HD and look into them later

In [298]:
sorted(cleaned_df2.query('text_type == "None"').violation_type)

['917',
 'HD1032120',
 'HD1032130',
 'HD1032140',
 'HD1032160',
 'HD10360201',
 'HD10360202',
 'HD10360203',
 'HD10360204',
 'HD1572020',
 'HD157240',
 'HD1624110',
 'HDVIOL']

Yeah, definitely looks like HD is the text Type, 103,157,162 the sub types and the remaining numbe sequences indicative of the subsubtype. I guess write the relevant function. and categorize them.

Actually, before I do, I need to make sure that none of my other rows have HD types...

In [299]:
def violation_type_startswith(s):
    if s == None:
        return 'None'
    elif s.startswith('HD') or s.startswith('H.D') :
        return 'HD'
    elif s.startswith('BB') or s.startswith('B.B'):
        return 'BB'
    elif s.startswith('TCA') or s.startswith('T.C.A'):
        return 'TCA'
    elif s.startswith('MCL') or s.startswith('M.C.L'):
        return 'MCL'
    else:
        return 'Other'
cleaned_df2['violation_subtype'] = cleaned_df2['violation_type'].apply(violation_type_startswith)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [300]:
cleaned_df2.violation_subtype.value_counts(dropna = False)

Other    661
BB       111
HD        16
Name: violation_subtype, dtype: int64

That's interesting. There are 16 HD v_subtypes, that's 3 more than expected. Inspect

In [301]:
cleaned_df2.query('violation_subtype == "HD"')

Unnamed: 0,violation_id,org_id,violation_type,violation_desc,violation_text,remedial_text,table_name,expired_flag,date_expired,created_by,date_created,modified_by,date_modified,fee_setup_id,text_type,text_subtype_and_text_subsubtype,violation_subtype
710,756,1,HD1032120,APPROVED GARBAGE & TRASH CONTAINERS,,,,N,,2,00:00.0,,,,,,HD
711,757,1,HD1032130,ACCUMUALTION OF GARBAGE & TRASH,,,,N,,2,00:00.0,,,,,,HD
712,758,1,HD1032140,"ACCUMUALTION OF LUMBER, BOXES, BARRELS",,,,N,,2,00:00.0,,,,,,HD
713,759,1,HD1032160,STAGNANT WATER COLLECTION,,,,N,,2,00:00.0,,,,,,HD
714,760,1,HD10360201,"DIRT, FILTH, RUBBISH & GARBAGE",,,,N,,2,00:00.0,,,,,,HD
715,761,1,HD10360202,VERMIN AND RODENT INFESTATION,,,,N,,2,00:00.0,,,,,,HD
716,762,1,HD10360203,EXCESS VEGETATION,,,,N,,2,00:00.0,,,,,,HD
717,763,1,HD10360204,CLEAN AMINAL AREAS,,,,N,,2,00:00.0,,,,,,HD
718,764,1,HD1572020,SEWAGE ON GROUND,,,,N,,2,00:00.0,,,,,,HD
719,765,1,HD157240,IMPROPER MAINT SEWAGE DISPOSAL SYSTEM,,,,N,,2,00:00.0,,,,,,HD


alright so there seems to be a clue in the fact that they overlap with MCL. Hopefully that means that they are all from MCL. Anyway, that also means that I can keep going.

I bet the BB sub types are all MCL text types. Let's see how would I verify that?

In [302]:
cleaned_df2.query('violation_subtype == "BB"')

Unnamed: 0,violation_id,org_id,violation_type,violation_desc,violation_text,remedial_text,table_name,expired_flag,date_expired,created_by,date_created,modified_by,date_modified,fee_setup_id,text_type,text_subtype_and_text_subsubtype,violation_subtype
735,781,1,BB708020,Sales without a permit,M.C.L 7.08.020: No person shall sell beer with...,,,N,,2,00:00.0,,,,MCL,Other,BB
736,782,1,BB708030B,Off Sale - consume on or near premises,"M.C.L 7.08.030B: A retailer's ""off-sale"" permi...",,,N,,2,00:00.0,10635,44:37.3,,MCL,Other,BB
737,783,1,BB708030C,On sale required for on premise consume,"M.C.L 7.08.030C: A retailer's ""on-sale"" permit...",,,N,,2,00:00.0,10635,44:57.4,,MCL,Other,BB
738,784,1,BB708030D,Caterer Permit,M.C.L 7.08.030D: A caterer's permit shall be i...,,,N,,2,00:00.0,,,,MCL,Other,BB
739,785,1,BB708030E,Special Event - sales on premises,M.C.L 7.08.030E: A retailer's special events p...,,,N,,2,00:00.0,,,,MCL,Other,BB
740,786,1,BB708040B3,Owners convict beer laws moral turpitude,"M.C.L 7.08.040B3: That no person, firm, corpor...",,,N,,2,00:00.0,10635,45:50.3,,MCL,Other,BB
741,841,1,BB720010B,BB WMD contiguous locations,M.C.L 7.20.010B: If a wholesaler or distributo...,,,N,,2,00:00.0,,,,MCL,Other,BB
742,787,1,BB708040B5,Owners incarcerated moral turpitude,M.C.L 7.08.040B5: That no person having at lea...,,,N,,2,00:00.0,,,,MCL,Other,BB
743,788,1,BB708040G,Supplemental Application Information,M.C.L 7.08.040G: An applicant or permit holder...,,,N,,2,00:00.0,,,,MCL,Other,BB
744,789,1,BB708040H,False statement on application,M.C.L 7.08.040H: Any applicant making a false ...,,,N,,2,00:00.0,,,,MCL,Other,BB


In [303]:
cleaned_df2.query('violation_subtype == "BB"').info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 111 entries, 735 to 933
Data columns (total 17 columns):
violation_id                        111 non-null object
org_id                              111 non-null object
violation_type                      111 non-null object
violation_desc                      111 non-null object
violation_text                      111 non-null object
remedial_text                       102 non-null object
table_name                          102 non-null object
expired_flag                        102 non-null object
date_expired                        102 non-null object
created_by                          102 non-null object
date_created                        102 non-null object
modified_by                         102 non-null object
date_modified                       102 non-null object
fee_setup_id                        102 non-null object
text_type                           111 non-null object
text_subtype_and_text_subsubtype    111 non-null obje

In [304]:
cleaned_df2.query('violation_subtype == "BB"').text_type.value_counts(dropna = False)

MCL      76
Other    35
Name: text_type, dtype: int64

In [305]:
#weird... what are the ones with Other text type?

In [306]:
cleaned_df2.query('violation_subtype == "BB" and text_type == "Other"')

Unnamed: 0,violation_id,org_id,violation_type,violation_desc,violation_text,remedial_text,table_name,expired_flag,date_expired,created_by,date_created,modified_by,date_modified,fee_setup_id,text_type,text_subtype_and_text_subsubtype,violation_subtype
883,935,1,BBR11,Post laws and regulations,All retail permittees shall conspicuously post...,,,N,,10635.0,43:37.9,,,,Other,Other,BB
884,950,1,BBR18,Health Department,Upon making application for a beer permit from...,,,N,,10635.0,03:29.1,,,,Other,Other,BB
885,936,1,BBR1,Temp License Allowed,The Executive Secretary to the Metropolitan Be...,Said person shall state· under oath that he or...,,,N,,10635,44:50.5,,,Other,Other,BB
886,937,1,BBR2,Commercially sealed containers,An off-sale beer permittee shall sell beer onl...,,,N,,10635.0,45:45.1,,,,Other,Other,BB
887,938,1,BBR3,"Under 18 transport, sell, dispense",No person under 18 years of age shall transpor...,,,N,,10635.0,49:08.1,,,,Other,Other,BB
888,939,1,BBR6,No drive-through service,Beer shall not be sold through any drive-throu...,,,N,,10635.0,50:51.6,,,,Other,Other,BB
889,940,1,BBR8,Special Event bonafide charity political,Special Events held on public property can onl...,,,N,,10635.0,52:28.2,,,,Other,Other,BB
890,941,1,BBR9,All owners and partners,All applicants for beer permits must supply th...,,,N,,10635.0,53:23.1,,,,Other,Other,BB
891,942,1,BBR10,No delivery,No permittee nor any of his/her agents shall d...,,,N,,10635.0,54:21.2,,,,Other,Other,BB
892,943,1,BBR12,Permittee responsible,"In disciplinary proceedings, it shall be no de...",,,N,,10635.0,55:11.5,,,,Other,Other,BB


In [307]:
cleaned_df2.query('violation_subtype == "BB" and text_type == "Other"').info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35 entries, 883 to 933
Data columns (total 17 columns):
violation_id                        35 non-null object
org_id                              35 non-null object
violation_type                      35 non-null object
violation_desc                      35 non-null object
violation_text                      35 non-null object
remedial_text                       26 non-null object
table_name                          26 non-null object
expired_flag                        26 non-null object
date_expired                        26 non-null object
created_by                          26 non-null object
date_created                        26 non-null object
modified_by                         26 non-null object
date_modified                       26 non-null object
fee_setup_id                        26 non-null object
text_type                           35 non-null object
text_subtype_and_text_subsubtype    35 non-null object
violation_subt

Ok. So, I gave meaningful categories to everything with a text type TCA. Now I need to to the same for the other text types. What are they again?

In [313]:
cleaned_df2.text_type.value_counts(dropna = False)

SEC      429
Other    173
MCL      101
TCA       72
None      13
Name: text_type, dtype: int64

I'm pretty sure that I already looked into the 13 nones, and they turned out to be HD violation sub types (which in turn are likely MCL text types)