In [None]:
import os
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine

In [None]:
# Connection URL for the housing database.
# SQLAlchemy 1.4+ only recognises the "postgresql://" scheme; the legacy
# "postgres://" alias fails in create_engine with NoSuchModuleError.
# Set HOUSING_DB_URL to point at another server or to supply real credentials
# without committing them; the fallback is the local development default.
connection_housing = os.environ.get(
    "HOUSING_DB_URL",
    "postgresql://postgres:postgres@localhost:5432/housing",
)

In [None]:
# Build the SQLAlchemy engine from the connection URL; it is passed to
# pd.read_sql as the `con` argument when the permit query is run.
engine = create_engine(connection_housing)

In [5]:
# Permit rows inner-joined to property_updated on apn, so only permits with a
# matching property row are returned and each one carries that property's zip.
# Filters applied in SQL:
#   * p.status must not be one of the listed statuses
#     (presumably permits that were never validly issued - TODO confirm);
#   * pu.ludesc must be one of the listed land-use descriptions;
#   * p.dateissued must be on or after 2008-01-01.
query1 = '''
SELECT p.apn, p.permittype, p.permitsubtype, p.dateissued, p.purpose, p.contractor, pu.propzip
FROM permit AS p
INNER JOIN property_updated AS pu
USING (apn)
WHERE p.status NOT IN ('EXPIRED'
    , 'REJECTED'
    , 'PENDING'
    , 'WITHDRAWN'
    , 'CNCL'
    , 'REFUNDED'
    , 'NOT GRANTD'
    , 'REVOKED'
    , 'CHANCERY'
    , 'INREVIEW'
    , 'REQUESTED'
    , 'IGNORE')
AND pu.ludesc IN ('SINGLE FAMILY'
    , 'RESIDENTIAL CONDO'
    , 'APARTMENT: HIGH RISE (3 STORIES OR GREATER)'
    , 'APARTMENT: LOW RISE (BUILT SINCE 1960)'
    , 'APARTMENT: WALK UP (BUILT PRIOR TO 1960)'
    , 'DUPLEX'
    , 'TRIPLEX'
    , 'QUADPLEX'
    , 'ZERO LOT LINE'
    , 'VACANT RESIDENTIAL LAND'
    , 'MOBILE HOME PARK')
AND p.dateissued >= '2008-01-01'
'''

# Run the query through the engine and load the full result into a DataFrame.
test = pd.read_sql(query1, con = engine)

In [6]:
# Row and column counts of the raw query result.
test.shape

(357388, 7)

In [7]:
# Preview the first 50 rows of the raw query result.
test.head(50)

Unnamed: 0,apn,permittype,permitsubtype,dateissued,purpose,contractor,propzip
0,13209013100,ELECTRICAL PERMIT,ELECT SERVICE RELEASE,2020-07-13,NEW RESIDENTIAL CITY OF OAK HILL,CARDWELLS ELECTRIC,37220
1,13209015600,ELECTRICAL PERMIT,FULL ELECTRICAL PERMIT,2014-12-09,"12/9/14- MRP- EXISTING, RESIDENTIAL, ADDING PO...",Winston Electric Llc,37220
2,13209015600,ELECTRICAL PERMIT,ELECT SERVICE RELEASE,2014-12-09,OAK HILL BP # 4987 FULL PERMIT #2006526 THAT...,Empire Electric Llc,37220
3,13209015600,GAS / MECH UP TO 1500 BTUS - HC,GAS / MECH PERMIT (UP TO 1500 BTU),2014-10-23,OAK HILL PERMIT\r\n(2) WOOD-BURNING FIREPLACES...,Nashville Fireplace Distributors Inc,37220
4,13209015600,ELECTRICAL PERMIT,FULL ELECTRICAL PERMIT,2014-10-23,OAK HILL BUILDING PERMIT #4987 \r\n\r\nNEED FU...,Empire Electric Llc,37220
5,13209015600,GAS / MECH PERMIT - VC MC,GAS / MECH PERMIT,2014-09-30,INSTALL NEW HVAC SYSTEMS WITH DUCT WORK AND GA...,"Frick, M J Co Inc",37220
6,13209015600,CODES PLUMBING PERMIT,PLUMBING PERMIT,2014-08-14,INSTALL COMPLETE PLUMBING SYSTEM IN NEW CONSTR...,G & M Plumbing Llc,37220
7,13209015600,ELECTRICAL PERMIT,TEMPORARY ELEC SERVICE,2014-07-11,NEED A 60 TEMP PERMIT FOR NEW CONSTRUCTION\r\n...,Empire Electric Llc,37220
8,132090F00200CO,CODES PLUMBING PERMIT,PLUMBING PERMIT,2011-12-08,TO CONSTRUCT A 3083 SQ FT 2 STORY SINGLE FAMIL...,Benchmark Plumbing Inc,37204
9,132090F00200CO,ELECTRICAL LOW VOLTAGE PERMIT,,2011-11-09,TO CONSTRUCT A 3083 SQ FT 2 STORY SINGLE FAMIL...,Reha Enterprises Llc,37204


In [8]:
# Keep only fully populated permit rows: a row is dropped if any of its
# columns is missing. The original index labels are preserved.
permits = test.dropna(axis=0, how="any")

In [9]:
# Preview the first 20 rows that remain after dropping rows with missing values.
permits.head(20)

Unnamed: 0,apn,permittype,permitsubtype,dateissued,purpose,contractor,propzip
0,13209013100,ELECTRICAL PERMIT,ELECT SERVICE RELEASE,2020-07-13,NEW RESIDENTIAL CITY OF OAK HILL,CARDWELLS ELECTRIC,37220
1,13209015600,ELECTRICAL PERMIT,FULL ELECTRICAL PERMIT,2014-12-09,"12/9/14- MRP- EXISTING, RESIDENTIAL, ADDING PO...",Winston Electric Llc,37220
2,13209015600,ELECTRICAL PERMIT,ELECT SERVICE RELEASE,2014-12-09,OAK HILL BP # 4987 FULL PERMIT #2006526 THAT...,Empire Electric Llc,37220
3,13209015600,GAS / MECH UP TO 1500 BTUS - HC,GAS / MECH PERMIT (UP TO 1500 BTU),2014-10-23,OAK HILL PERMIT\r\n(2) WOOD-BURNING FIREPLACES...,Nashville Fireplace Distributors Inc,37220
4,13209015600,ELECTRICAL PERMIT,FULL ELECTRICAL PERMIT,2014-10-23,OAK HILL BUILDING PERMIT #4987 \r\n\r\nNEED FU...,Empire Electric Llc,37220
5,13209015600,GAS / MECH PERMIT - VC MC,GAS / MECH PERMIT,2014-09-30,INSTALL NEW HVAC SYSTEMS WITH DUCT WORK AND GA...,"Frick, M J Co Inc",37220
6,13209015600,CODES PLUMBING PERMIT,PLUMBING PERMIT,2014-08-14,INSTALL COMPLETE PLUMBING SYSTEM IN NEW CONSTR...,G & M Plumbing Llc,37220
7,13209015600,ELECTRICAL PERMIT,TEMPORARY ELEC SERVICE,2014-07-11,NEED A 60 TEMP PERMIT FOR NEW CONSTRUCTION\r\n...,Empire Electric Llc,37220
8,132090F00200CO,CODES PLUMBING PERMIT,PLUMBING PERMIT,2011-12-08,TO CONSTRUCT A 3083 SQ FT 2 STORY SINGLE FAMIL...,Benchmark Plumbing Inc,37204
10,132090F00200CO,ELECTRICAL PERMIT,FULL ELECTRICAL PERMIT,2011-11-07,NEW WIRING OF RESIDENCE- TO CONSTRUCT A 3083 S...,"Morgan, Michael J Dba Mr. Sparky Nashville-East",37204


In [10]:
# Row and column counts after dropping rows with missing values.
permits.shape

(336497, 7)

In [11]:
# The 20 most frequent exact `purpose` strings with their row counts, shown as
# a flat table. value_counts() already sorts most-frequent first by default.
(
    permits["purpose"]
    .value_counts()
    .head(20)
    .to_frame()
    .reset_index()
)

Unnamed: 0,index,purpose
0,SERVICE RELEASE,882
1,GAS FIREPLACE,871
2,HVAC CHANGE OUT,506
3,FLOOD DAMAGE,495
4,REPLACEMENT,473
5,FIREPLACE INSTALLATION,470
6,PLUMBING,425
7,GAS FIREPLACE\r\n\r\n,411
8,ROUGHIN AND FINAL,390
9,INSTALL HVAC,370


## Find the number of rows in which each word appears
### Trying to identify boilerplate language