In [243]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [295]:
# Import dataset from merged dataset
data = pd.read_csv("../../data/data_merged.csv", encoding='iso-8859-1', low_memory=False)

In [246]:
data.columns

Index(['Unnamed: 0', 'As_Of_Date', 'alias', 'permit_cap_status', 'pdox_b1_id',
       'Project_Status', 'Project_Description', 'Project_CreateDate',
       'ReviewCycle', 'TaskName', 'TaskStatus', 'GroupName',
       'elapsed_workdays', 'RC_ReviewStatus', 'FlowTask_CreatedDate',
       'FlowTask_DateUpdated', 'FlowTask_DateAccepted', 'RC_HowAssigned',
       'RC_ReviewComments_confidential', 'Fee_Assessed', 'job_class',
       'over_30', 'AGENCY', 'pdox', 'est_worktime', 'QA_Date', 'QA_Date_rc1',
       'Ward', 'DESC_OF_WORK', 'proposed_gross_floor_area', 'green_floor_area',
       'issued', 'Begin_review_dd', 'Resubmit_rec_dd', 'rc_1_time', 'rc_time',
       'FlowInstanceID', 'review_group', 'use_type', 'permit_type',
       'building_construction_type', 'existing_use_of_building',
       'proposed_use_of_building', 'proposed_number_stories',
       'proposed_number_units', 'existing_number_units', 'present_gfa',
       'create_review', 'ssl', 'perm_id', 'RC_assignedBy_anon',
       '

In [247]:
data.shape

(185372, 68)

## Filter the data: 
### It will be easier to interpert *elapsed_workdays* if *TaskStatus* = Complete and exclude customer operations

In [296]:
df = data[data['TaskStatus'] == 'Complete']
df = df[(df['AGENCY'] == 'SISTER')|(df['AGENCY'] == 'DCRA')]

In [297]:
df.shape

(162218, 68)

In [298]:
# Drop unrelavent variables
X = df.drop(['Unnamed: 0', 'As_Of_Date', 'permit_cap_status','Project_Description','Project_CreateDate',
             'TaskStatus', 'FlowTask_CreatedDate','FlowTask_DateUpdated', 'FlowTask_DateAccepted',
             'RC_ReviewStatus', 'RC_ReviewComments_confidential', 'Fee_Assessed','over_30','pdox',
             'est_worktime', 'QA_Date', 'QA_Date_rc1', 'DESC_OF_WORK', 'issued', 'Begin_review_dd',
             'Resubmit_rec_dd', 'rc_1_time', 'rc_time', 'FlowInstanceID', 'create_review', 'ssl',
             'RC_assignedBy_anon', 'UpdatedByEmail_anon', 'applicant_anon','owner_name_anon', 'NAME_anon',
             'OBJECTID', 'BLDG_NUM', 'SALEDATE', 'AYB', 'SALE_NUM', 'GIS_LAST_MOD_DTTM'], axis=1)

In [299]:
X.columns

Index(['alias', 'pdox_b1_id', 'Project_Status', 'ReviewCycle', 'TaskName',
       'GroupName', 'elapsed_workdays', 'RC_HowAssigned', 'job_class',
       'AGENCY', 'Ward', 'proposed_gross_floor_area', 'green_floor_area',
       'review_group', 'use_type', 'permit_type', 'building_construction_type',
       'existing_use_of_building', 'proposed_use_of_building',
       'proposed_number_stories', 'proposed_number_units',
       'existing_number_units', 'present_gfa', 'perm_id', 'PRICE', 'QUALIFIED',
       'YR_RMDL', 'EYB', 'LIVING_GBA', 'USECODE', 'LANDAREA'],
      dtype='object')

## Data Cleaning

In [300]:
# Clean GroupName variable
# Suppress chaining warning
pd.options.mode.chained_assignment = None  # default='warn'

In [301]:
X.GroupName[X.GroupName == 'DC Water Review Supervisor'] = "DC Water Review"
X.GroupName[X.GroupName == 'DDOT Review Supervisor'] = "DDOT Review"
X.GroupName[X.GroupName == 'DOH Review Supervisor'] = "DOH Review"
X.GroupName[X.GroupName == 'Electrical Review Supervisor'] = "Electrical Review"
X.GroupName[X.GroupName == 'Elevator Review Supervisor'] = "Elevator Review"
X.GroupName[X.GroupName == 'Energy Review Supervisor'] = "Energy Review"
X.GroupName[X.GroupName == 'Fire Review Supervisor'] = "Fire Review"
X.GroupName[X.GroupName == 'Green Review Supervisor'] = "Green Review"
X.GroupName[X.GroupName == 'Mechanical Review Supervisor'] = "Mechanical Review"
X.GroupName[X.GroupName == 'Plumbing Review Supervisor'] = "Plumbing Review"
X.GroupName[X.GroupName == 'Structural Review Supervisor'] = "Structural Review"
X.GroupName[X.GroupName == 'WASA Review Supervisor'] = "WASA Review"
X.GroupName[X.GroupName == 'Zoning Review Supervisor'] = "Zoning Review"

In [302]:
X.GroupName[X.GroupName == 'DDOE Review Supervisor'] = "DDOE Review"
X.GroupName[X.GroupName == 'DDOE SE-SW Review'] = "DDOE Review"
X.GroupName[X.GroupName == 'DDOE WSP Review'] = "DDOE Review"
X.GroupName[X.GroupName == 'DOEE FP Review'] = "DOEE Review"
X.GroupName[X.GroupName == 'DOEE GAR Review'] = "DOEE Review"
X.GroupName[X.GroupName == 'DOEE Review Supervisor'] = "DOEE Review"
X.GroupName[X.GroupName == 'DOEE SE-SW Review'] = "DOEE Review"

In [303]:
# Aggregate 'building_construction_type' variable 
X.building_construction_type[X.building_construction_type == 'Type I - Fire-Resistive Non-combustible'] = "Type I - Fire-Resistive Non-Combustible"
X.building_construction_type[X.building_construction_type == 'Type I-A - Fire-Resistive Non-Combustible'] = "Type I - Fire-Resistive Non-Combustible"
X.building_construction_type[X.building_construction_type == 'Type I-B - Fire-Resistive Non-Combustible'] = "Type I - Fire-Resistive Non-Combustible"
X.building_construction_type[X.building_construction_type == 'TYPE II-A - Non-Combustible Construction'] = "TYPE II - Non-Combustible Construction"
X.building_construction_type[X.building_construction_type == 'TYPE II-B - Non-Combustible Construction'] = "TYPE II - Non-Combustible Construction"
X.building_construction_type[X.building_construction_type == 'TYPE III-A - Non-Combustible Exterior Walls'] = "TYPE III - Non-Combustible Exterior Walls"
X.building_construction_type[X.building_construction_type == 'TYPE III-B - Non-Combustible Exterior Walls'] = "TYPE III - Non-Combustible Exterior Walls"
X.building_construction_type[X.building_construction_type == 'TYPE V-A - Any Materials Permitted'] = "TYPE V - Any Materials Permitted"
X.building_construction_type[X.building_construction_type == 'TYPE V-B - Any Materials Permitted'] = "TYPE V - Any Materials Permitted"

Change *price* and *EYB* == 0 to N.A

In [304]:
X['PRICE'] = X['PRICE'].apply(lambda x: None if x == 0 else x)

In [305]:
X['EYB'] = X['EYB'].apply(lambda x: None if x == 0 else x)

Change EYB to elapsed years from construction

In [306]:
X['EYFC'] = 2018 - X['EYB']
X = X.drop(['EYB'], axis = 1)

Evaluate the variables 'proposed_number_units' and 'existing_number_units' 
Convert to a dummy to indicate whether the num of units changing in the permit application or not: 1= change/ 0=no change

In [307]:
num_units = X['proposed_number_units'] != X['existing_number_units']
num_units = pd.DataFrame(num_units, columns=["num_units_change"])
num_units.groupby(by="num_units_change").size()

num_units_change
False    74863
True     87355
dtype: int64

Evaluate the variables 'existing_use_of_building' and 'proposed_use_of_building' 
Convert to a dummy to indicate whether the existing use changing in the permit application or not: 1= change/ 0=no change

In [308]:
use = X['existing_use_of_building'] != X['proposed_use_of_building']
use = pd.DataFrame(use, columns=["use_change"])
use.groupby(by="use_change").size()

use_change
False    76783
True     85435
dtype: int64

Evaluate the variables'present_gfa', 'proposed_gross_floor_area' 
Convert to a dummy to indicate whether the gfa changing in the permit application or not: 1= change/ 0=no change

In [309]:
gfa = X['proposed_gross_floor_area'] != X['present_gfa']
gfa = pd.DataFrame(gfa, columns=["gfa_change"])
gfa.groupby(by="gfa_change").size()

gfa_change
False     61745
True     100473
dtype: int64

In [310]:
temp = pd.concat([use, num_units, gfa], axis=1)

Create dummy variables for all "GroupName"

In [311]:
agencies = pd.get_dummies(X.GroupName)
agencies.columns

Index(['CFA Review', 'Chinatown Review', 'DC Water Review', 'DDOE Review',
       'DDOT Review', 'DOEE Review', 'DOH Review', 'EISF Review',
       'Electrical Review', 'Elevator Review', 'Energy Review', 'File Room',
       'Fire Review', 'Green Review', 'HPRB Review', 'Mechanical Review',
       'NCPC Review', 'PRC', 'Plumbing Review', 'QA Review',
       'Structural Review', 'WASA Review', 'WMATA Review',
       'White House Review', 'Zoning Review'],
      dtype='object')

In [312]:
temp = pd.concat([temp, agencies], axis=1)
X = pd.concat([X, temp], axis=1)

In [313]:
# Drop variables
X = X.drop(['existing_use_of_building', 'proposed_use_of_building','proposed_number_stories',
            'proposed_number_units','existing_number_units', 
            'proposed_gross_floor_area','present_gfa' ,'GroupName'], axis=1)

#### Log variables have big numbers: *green_floor_area*, *PRICE*, *LIVING_GBA*, *LANDAREA*

In [266]:
X.groupby(by="green_floor_area").size()

green_floor_area
0.000000e+00    63084
5.600000e-01       24
1.000000e+00       59
2.000000e+00      163
3.000000e+00      179
4.000000e+00       42
5.000000e+00       31
6.000000e+00       22
1.200000e+01       24
1.350000e+01       19
2.160000e+01       34
2.400000e+01       36
2.970000e+01       41
3.000000e+01       27
3.026000e+01       22
3.510000e+01       23
5.200000e+01       37
5.280000e+01       59
5.400000e+01       25
6.000000e+01      116
6.500000e+01       29
7.000000e+01       31
7.200000e+01       36
8.000000e+01       59
9.750000e+01       21
9.800000e+01       57
1.040000e+02       28
1.050000e+02       60
1.080000e+02       23
1.120000e+02       25
                ...  
1.918080e+05       36
1.927200e+05       32
1.936272e+05       37
1.977840e+05        9
2.000000e+05       42
2.007200e+05       37
2.018074e+05       31
2.057738e+05       28
2.084940e+05        7
2.364872e+05       45
2.373210e+05       28
2.409750e+05       68
2.433600e+05       34
2.457262e+05   

In [314]:
import math
X['green_floor_area'] = X['green_floor_area'].apply(lambda x: math.log(x,10) if x > 0 else x)

In [None]:
X.groupby(by="green_floor_area").size()

In [315]:
X['PRICE'] = X['PRICE'].apply(lambda x: math.log(x,10) if x > 0 else x)

In [None]:
X.groupby(by="PRICE").size()

In [316]:
X['LIVING_GBA'] = X['LIVING_GBA'].apply(lambda x: math.log(x,10) if x > 0 else x)

In [None]:
X.groupby(by="LIVING_GBA").size()

In [317]:
X['LANDAREA'] = X['LANDAREA'].apply(lambda x: math.log(x,10) if x > 0 else x)

In [None]:
X.groupby(by="LANDAREA").size()

In [318]:
X.columns

Index(['alias', 'pdox_b1_id', 'Project_Status', 'ReviewCycle', 'TaskName',
       'elapsed_workdays', 'RC_HowAssigned', 'job_class', 'AGENCY', 'Ward',
       'green_floor_area', 'review_group', 'use_type', 'permit_type',
       'building_construction_type', 'perm_id', 'PRICE', 'QUALIFIED',
       'YR_RMDL', 'LIVING_GBA', 'USECODE', 'LANDAREA', 'EYFC', 'use_change',
       'num_units_change', 'gfa_change', 'CFA Review', 'Chinatown Review',
       'DC Water Review', 'DDOE Review', 'DDOT Review', 'DOEE Review',
       'DOH Review', 'EISF Review', 'Electrical Review', 'Elevator Review',
       'Energy Review', 'File Room', 'Fire Review', 'Green Review',
       'HPRB Review', 'Mechanical Review', 'NCPC Review', 'PRC',
       'Plumbing Review', 'QA Review', 'Structural Review', 'WASA Review',
       'WMATA Review', 'White House Review', 'Zoning Review'],
      dtype='object')

## Build New Dataset: D

### Build Feature Matrix

In [334]:
X.sort_values("pdox_b1_id", ascending = True)
D = X.groupby('pdox_b1_id')['alias', 'Project_Status', 'job_class', 'Ward', 'green_floor_area', 'use_type',
                            'permit_type', 'PRICE', 'QUALIFIED', 'LIVING_GBA', 'USECODE',
                            'LANDAREA', 'EYFC', 'use_change', 'num_units_change', 'gfa_change'].head(1)
D = D.reset_index()
D

Unnamed: 0,index,alias,Project_Status,job_class,Ward,green_floor_area,use_type,permit_type,PRICE,QUALIFIED,LIVING_GBA,USECODE,LANDAREA,EYFC,use_change,num_units_change,gfa_change
0,0,Addition Alteration Repair Permit,Approved,B,6.0,0.000000,R,Construction,6.209515,U,3.399328,24.0,3.379124,50.0,False,True,False
1,15,Addition Permit,Under Review,E,5.0,3.260071,R,Construction,5.525045,Q,3.049218,11.0,3.557146,63.0,True,True,True
2,49,Alteration and Repair Permit,Approved,B,2.0,0.000000,C,Construction,,U,4.141983,81.0,4.092089,33.0,False,False,False
3,89,Addition Alteration Repair Permit,Approved,C,5.0,3.147367,C,Construction,5.755875,U,3.225309,11.0,3.473487,63.0,True,True,True
4,125,New Building Permit,Under Review,A,4.0,4.440909,C,Construction,,U,3.834421,81.0,4.017033,53.0,True,True,True
5,141,Addition Alteration Repair Permit,Approved,TPR,1.0,4.238508,C,Construction,5.705008,U,3.164353,11.0,3.217747,63.0,True,True,True
6,153,Addition Permit,Approved,E,2.0,2.389928,R,Construction,6.172507,U,3.300161,11.0,3.452247,41.0,False,False,True
7,165,Addition Alteration Repair Permit,Approved,E,6.0,2.883661,R,Construction,5.934498,Q,3.258158,11.0,3.260548,50.0,False,False,False
8,179,Sheeting and Shoring Permit,Approved,E,6.0,,C,Construction,,,,,,,False,True,True
9,188,Addition Alteration Repair Permit,Approved,C,6.0,3.304491,C,Construction,5.563481,U,3.079181,13.0,3.209515,63.0,True,True,True


In [335]:
MAX_Review_Cycle = X.groupby('pdox_b1_id')['ReviewCycle'].max()
MAX_Review_Cycle = MAX_Review_Cycle.reset_index()
MAX_Review_Cycle = MAX_Review_Cycle.drop(['pdox_b1_id'], axis=1)

In [336]:
D = pd.concat([D, MAX_Review_Cycle], axis=1)
D

Unnamed: 0,index,alias,Project_Status,job_class,Ward,green_floor_area,use_type,permit_type,PRICE,QUALIFIED,LIVING_GBA,USECODE,LANDAREA,EYFC,use_change,num_units_change,gfa_change,ReviewCycle
0,0,Addition Alteration Repair Permit,Approved,B,6.0,0.000000,R,Construction,6.209515,U,3.399328,24.0,3.379124,50.0,False,True,False,2.0
1,15,Addition Permit,Under Review,E,5.0,3.260071,R,Construction,5.525045,Q,3.049218,11.0,3.557146,63.0,True,True,True,1.0
2,49,Alteration and Repair Permit,Approved,B,2.0,0.000000,C,Construction,,U,4.141983,81.0,4.092089,33.0,False,False,False,1.0
3,89,Addition Alteration Repair Permit,Approved,C,5.0,3.147367,C,Construction,5.755875,U,3.225309,11.0,3.473487,63.0,True,True,True,1.0
4,125,New Building Permit,Under Review,A,4.0,4.440909,C,Construction,,U,3.834421,81.0,4.017033,53.0,True,True,True,1.0
5,141,Addition Alteration Repair Permit,Approved,TPR,1.0,4.238508,C,Construction,5.705008,U,3.164353,11.0,3.217747,63.0,True,True,True,1.0
6,153,Addition Permit,Approved,E,2.0,2.389928,R,Construction,6.172507,U,3.300161,11.0,3.452247,41.0,False,False,True,2.0
7,165,Addition Alteration Repair Permit,Approved,E,6.0,2.883661,R,Construction,5.934498,Q,3.258158,11.0,3.260548,50.0,False,False,False,1.0
8,179,Sheeting and Shoring Permit,Approved,E,6.0,,C,Construction,,,,,,,False,True,True,1.0
9,188,Addition Alteration Repair Permit,Approved,C,6.0,3.304491,C,Construction,5.563481,U,3.079181,13.0,3.209515,63.0,True,True,True,1.0


Change *agencies* into continous variables

In [337]:
X['CFA_Review'] = X['elapsed_workdays']*agencies['CFA Review']
X['Chinatown_Review'] = X['elapsed_workdays']*agencies['Chinatown Review']
X['DC_Water_Review'] = X['elapsed_workdays']*agencies['DC Water Review']
X['DDOE_Review'] = X['elapsed_workdays']*agencies['DDOE Review']
X['DDOT_Review'] = X['elapsed_workdays']*agencies['DDOT Review']
X['DOEE_Review'] = X['elapsed_workdays']*agencies['DOEE Review']
X['DOH_Review'] = X['elapsed_workdays']*agencies['DOH Review']
X['EISF_Review'] = X['elapsed_workdays']*agencies['EISF Review']
X['Electrical_Review'] = X['elapsed_workdays']*agencies['Electrical Review']
X['Elevator_Review'] = X['elapsed_workdays']*agencies['Elevator Review']
X['Energy_Review'] = X['elapsed_workdays']*agencies['Energy Review']
X['File_Room'] = X['elapsed_workdays']*agencies['File Room']
X['Fire_Review'] = X['elapsed_workdays']*agencies['Fire Review']

In [338]:
X['Green_Review'] = X['elapsed_workdays']*agencies['Green Review']
X['HPRB_Review'] = X['elapsed_workdays']*agencies['HPRB Review']
X['Mechanical_Review'] = X['elapsed_workdays']*agencies['Mechanical Review']
X['NCPC_Review'] = X['elapsed_workdays']*agencies['NCPC Review']
X['PRC'] = X['elapsed_workdays']*agencies['PRC']
X['Plumbing_Review'] = X['elapsed_workdays']*agencies['Plumbing Review']
X['QA_Review'] = X['elapsed_workdays']*agencies['QA Review']
X['Structural_Review'] = X['elapsed_workdays']*agencies['Structural Review']
X['WASA_Review'] = X['elapsed_workdays']*agencies['WASA Review']
X['WMATA_Review'] = X['elapsed_workdays']*agencies['WMATA Review']
X['White_House_Review'] = X['elapsed_workdays']*agencies['White House Review']
X['Zoning_Review'] = X['elapsed_workdays']*agencies['Zoning Review']

In [339]:
AGENCY = X.groupby('pdox_b1_id')['CFA_Review', 'Chinatown_Review', 'DC_Water_Review', 'DDOE_Review', 
                                'DDOT_Review', 'DOEE_Review', 'DOH_Review', 'EISF_Review',
                                'Electrical_Review', 'Elevator_Review', 'Energy_Review', 'File_Room',
                                'Fire_Review', 'Green_Review', 'HPRB_Review', 'Mechanical_Review',
                                'NCPC_Review', 'PRC', 'Plumbing_Review', 'QA_Review',
                                'Structural_Review', 'WASA_Review', 'WMATA_Review', 'White_House_Review',
                                'Zoning_Review'].aggregate(np.sum)
AGENCY = AGENCY.reset_index()
AGENCY = AGENCY.drop(['pdox_b1_id'], axis=1)

In [340]:
D = pd.concat([D, AGENCY], axis=1)
D

Unnamed: 0,index,alias,Project_Status,job_class,Ward,green_floor_area,use_type,permit_type,PRICE,QUALIFIED,...,Mechanical_Review,NCPC_Review,PRC,Plumbing_Review,QA_Review,Structural_Review,WASA_Review,WMATA_Review,White_House_Review,Zoning_Review
0,0,Addition Alteration Repair Permit,Approved,B,6.0,0.000000,R,Construction,6.209515,U,...,0.0,0.0,0.0,0.0,0.0,22.0,0.0,0.0,0.0,23.0
1,15,Addition Permit,Under Review,E,5.0,3.260071,R,Construction,5.525045,Q,...,0.0,0.0,9.0,0.0,7.0,26.0,0.0,0.0,0.0,14.0
2,49,Alteration and Repair Permit,Approved,B,2.0,0.000000,C,Construction,,U,...,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0
3,89,Addition Alteration Repair Permit,Approved,C,5.0,3.147367,C,Construction,5.755875,U,...,0.0,0.0,5.0,0.0,8.0,18.0,0.0,0.0,0.0,3.0
4,125,New Building Permit,Under Review,A,4.0,4.440909,C,Construction,,U,...,0.0,0.0,3.0,0.0,4.0,26.0,0.0,0.0,0.0,17.0
5,141,Addition Alteration Repair Permit,Approved,TPR,1.0,4.238508,C,Construction,5.705008,U,...,0.0,0.0,5.0,0.0,11.0,32.0,0.0,0.0,0.0,26.0
6,153,Addition Permit,Approved,E,2.0,2.389928,R,Construction,6.172507,U,...,0.0,0.0,1.0,0.0,6.0,18.0,0.0,0.0,0.0,21.0
7,165,Addition Alteration Repair Permit,Approved,E,6.0,2.883661,R,Construction,5.934498,Q,...,0.0,0.0,1.0,0.0,0.0,18.0,0.0,0.0,0.0,5.0
8,179,Sheeting and Shoring Permit,Approved,E,6.0,,C,Construction,,,...,0.0,0.0,5.0,0.0,3.0,18.0,0.0,0.0,0.0,17.0
9,188,Addition Alteration Repair Permit,Approved,C,6.0,3.304491,C,Construction,5.563481,U,...,0.0,0.0,5.0,0.0,0.0,18.0,0.0,0.0,0.0,30.0


### Build Target Matrix

In [341]:
T = X.groupby("pdox_b1_id")["elapsed_workdays"].aggregate(np.sum)
T = T.reset_index()

In [342]:
D = pd.concat([T, D], axis=1)
D

Unnamed: 0,pdox_b1_id,elapsed_workdays,index,alias,Project_Status,job_class,Ward,green_floor_area,use_type,permit_type,...,Mechanical_Review,NCPC_Review,PRC,Plumbing_Review,QA_Review,Structural_Review,WASA_Review,WMATA_Review,White_House_Review,Zoning_Review
0,AN1700124,62.0,0,Addition Alteration Repair Permit,Approved,B,6.0,0.000000,R,Construction,...,0.0,0.0,0.0,0.0,0.0,22.0,0.0,0.0,0.0,23.0
1,AN1700215,100.0,15,Addition Permit,Under Review,E,5.0,3.260071,R,Construction,...,0.0,0.0,9.0,0.0,7.0,26.0,0.0,0.0,0.0,14.0
2,AN1700216,4.0,49,Alteration and Repair Permit,Approved,B,2.0,0.000000,C,Construction,...,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0
3,AN1700235,65.0,89,Addition Alteration Repair Permit,Approved,C,5.0,3.147367,C,Construction,...,0.0,0.0,5.0,0.0,8.0,18.0,0.0,0.0,0.0,3.0
4,AN1700236,83.0,125,New Building Permit,Under Review,A,4.0,4.440909,C,Construction,...,0.0,0.0,3.0,0.0,4.0,26.0,0.0,0.0,0.0,17.0
5,AN1700242,91.0,141,Addition Alteration Repair Permit,Approved,TPR,1.0,4.238508,C,Construction,...,0.0,0.0,5.0,0.0,11.0,32.0,0.0,0.0,0.0,26.0
6,AN1700244,57.0,153,Addition Permit,Approved,E,2.0,2.389928,R,Construction,...,0.0,0.0,1.0,0.0,6.0,18.0,0.0,0.0,0.0,21.0
7,AN1700245,35.0,165,Addition Alteration Repair Permit,Approved,E,6.0,2.883661,R,Construction,...,0.0,0.0,1.0,0.0,0.0,18.0,0.0,0.0,0.0,5.0
8,AN1700250,61.0,179,Sheeting and Shoring Permit,Approved,E,6.0,,C,Construction,...,0.0,0.0,5.0,0.0,3.0,18.0,0.0,0.0,0.0,17.0
9,AN1700255,84.0,188,Addition Alteration Repair Permit,Approved,C,6.0,3.304491,C,Construction,...,0.0,0.0,5.0,0.0,0.0,18.0,0.0,0.0,0.0,30.0


### Drop NA

In [343]:
D.shape

(8552, 45)

In [344]:
D.isnull().sum()

pdox_b1_id               0
elapsed_workdays         0
index                    0
alias                    0
Project_Status           3
job_class               15
Ward                    28
green_floor_area      4890
use_type              3169
permit_type              0
PRICE                 4624
QUALIFIED             1873
LIVING_GBA            1873
USECODE               1873
LANDAREA              1873
EYFC                  1902
use_change               0
num_units_change         0
gfa_change               0
ReviewCycle              0
CFA_Review               0
Chinatown_Review         0
DC_Water_Review          0
DDOE_Review              0
DDOT_Review              0
DOEE_Review              0
DOH_Review               0
EISF_Review              0
Electrical_Review        0
Elevator_Review          0
Energy_Review            0
File_Room                0
Fire_Review              0
Green_Review             0
HPRB_Review              0
Mechanical_Review        0
NCPC_Review              0
P

In [345]:
D = D.dropna()
D.shape

(1984, 45)

#### Create Dummy Target

In [346]:
D['elapsed_workdays'].mean()

143.36592741935485

In [347]:
np.percentile(D.elapsed_workdays, [50, 63.8])

array([ 91., 150.])

Create dummy target based on the mean and the median

In [348]:
D['over_90'] = D['elapsed_workdays'].apply(lambda x: 1 if x > 90 else 0)
D.groupby(by='over_90').size()

over_90
0    986
1    998
dtype: int64

In [349]:
D['over_150'] = D['elapsed_workdays'].apply(lambda x: 1 if x > 150 else 0)
D.groupby(by='over_150').size()

over_150
0    1268
1     716
dtype: int64

In [350]:
D.columns

Index(['pdox_b1_id', 'elapsed_workdays', 'index', 'alias', 'Project_Status',
       'job_class', 'Ward', 'green_floor_area', 'use_type', 'permit_type',
       'PRICE', 'QUALIFIED', 'LIVING_GBA', 'USECODE', 'LANDAREA', 'EYFC',
       'use_change', 'num_units_change', 'gfa_change', 'ReviewCycle',
       'CFA_Review', 'Chinatown_Review', 'DC_Water_Review', 'DDOE_Review',
       'DDOT_Review', 'DOEE_Review', 'DOH_Review', 'EISF_Review',
       'Electrical_Review', 'Elevator_Review', 'Energy_Review', 'File_Room',
       'Fire_Review', 'Green_Review', 'HPRB_Review', 'Mechanical_Review',
       'NCPC_Review', 'PRC', 'Plumbing_Review', 'QA_Review',
       'Structural_Review', 'WASA_Review', 'WMATA_Review',
       'White_House_Review', 'Zoning_Review', 'over_90', 'over_150'],
      dtype='object')

In [351]:
D = D.drop(['index'], axis = 1)

In [357]:
# Save
D.to_csv("../../data/final_permit_uncode.csv", index=False)

#### Encode categorical variables

In [353]:
D

Unnamed: 0,pdox_b1_id,elapsed_workdays,alias,Project_Status,job_class,Ward,green_floor_area,use_type,permit_type,PRICE,...,PRC,Plumbing_Review,QA_Review,Structural_Review,WASA_Review,WMATA_Review,White_House_Review,Zoning_Review,over_90,over_150
0,AN1700124,62.0,Addition Alteration Repair Permit,Approved,B,6.0,0.000000,R,Construction,6.209515,...,0.0,0.0,0.0,22.0,0.0,0.0,0.0,23.0,0,0
1,AN1700215,100.0,Addition Permit,Under Review,E,5.0,3.260071,R,Construction,5.525045,...,9.0,0.0,7.0,26.0,0.0,0.0,0.0,14.0,1,0
3,AN1700235,65.0,Addition Alteration Repair Permit,Approved,C,5.0,3.147367,C,Construction,5.755875,...,5.0,0.0,8.0,18.0,0.0,0.0,0.0,3.0,0,0
5,AN1700242,91.0,Addition Alteration Repair Permit,Approved,TPR,1.0,4.238508,C,Construction,5.705008,...,5.0,0.0,11.0,32.0,0.0,0.0,0.0,26.0,1,0
6,AN1700244,57.0,Addition Permit,Approved,E,2.0,2.389928,R,Construction,6.172507,...,1.0,0.0,6.0,18.0,0.0,0.0,0.0,21.0,0,0
7,AN1700245,35.0,Addition Alteration Repair Permit,Approved,E,6.0,2.883661,R,Construction,5.934498,...,1.0,0.0,0.0,18.0,0.0,0.0,0.0,5.0,0,0
9,AN1700255,84.0,Addition Alteration Repair Permit,Approved,C,6.0,3.304491,C,Construction,5.563481,...,5.0,0.0,0.0,18.0,0.0,0.0,0.0,30.0,0,0
13,AN1700271,102.0,Alteration and Repair Permit,Final Processing,B,1.0,0.000000,C,Construction,6.020775,...,1.0,0.0,1.0,22.0,0.0,0.0,0.0,23.0,1,0
22,AN1700305,61.0,Alteration and Repair Permit,HFC,B,7.0,0.000000,C,Construction,6.332438,...,7.0,0.0,1.0,0.0,0.0,0.0,0.0,24.0,0,0
23,AN1700307,50.0,Alteration and Repair Permit,HFC,B,7.0,0.000000,C,Construction,6.352183,...,2.0,0.0,1.0,12.0,0.0,0.0,0.0,20.0,0,0


In [354]:
from sklearn.preprocessing import LabelEncoder
lb = LabelEncoder()
CV = ['alias', 'Project_Status', 'job_class', 'use_type', 'permit_type', 'QUALIFIED',
      'use_change', 'num_units_change', 'gfa_change']
for v in CV:
    D[v] = lb.fit_transform(D[v])

In [358]:
# Save
D.to_csv("../../data/final_permit_encode.csv", index=False)