In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [54]:
# Run Ziqi's code on "merged_dcra_cama.csv" first then import it
df = pd.read_csv("data/merged.csv", encoding='iso-8859-1', low_memory=False)

In [55]:
df.columns

Index(['X1', 'As_Of_Date', 'alias', 'permit_cap_status', 'pdox_b1_id',
       'Project_Status', 'Project_Description', 'Project_CreateDate',
       'ReviewCycle', 'TaskName', 'TaskStatus', 'GroupName',
       'elapsed_workdays', 'RC_ReviewStatus', 'FlowTask_CreatedDate',
       'FlowTask_DateUpdated', 'FlowTask_DateAccepted', 'RC_HowAssigned',
       'RC_ReviewComments_confidential', 'Fee_Assessed', 'job_class',
       'over_30', 'AGENCY', 'pdox', 'est_worktime', 'QA_Date', 'QA_Date_rc1',
       'Ward', 'DESC_OF_WORK', 'proposed_gross_floor_area', 'green_floor_area',
       'issued', 'Begin_review_dd', 'Resubmit_rec_dd', 'rc_1_time', 'rc_time',
       'FlowInstanceID', 'review_group', 'use_type', 'permit_type',
       'building_construction_type', 'existing_use_of_building',
       'proposed_use_of_building', 'proposed_number_stories',
       'proposed_number_units', 'existing_number_units', 'present_gfa',
       'create_review', 'ssl', 'perm_id', 'RC_assignedBy_anon',
       'UpdatedB

In [56]:
# Drop variables
X = df.drop(['X1', 'As_Of_Date', 'permit_cap_status','Project_Description','Project_CreateDate', 'TaskStatus',
             'FlowTask_CreatedDate','FlowTask_DateUpdated', 'FlowTask_DateAccepted', 'RC_ReviewStatus', 
             'RC_ReviewComments_confidential', 'Fee_Assessed','over_30','pdox','est_worktime', 'QA_Date',
             'QA_Date_rc1', 'DESC_OF_WORK', 'issued', 'Begin_review_dd', 'Resubmit_rec_dd', 'rc_1_time', 'rc_time',
             'FlowInstanceID', 'create_review', 'ssl', 'RC_assignedBy_anon', 'UpdatedByEmail_anon', 'applicant_anon',
             'owner_name_anon', 'NAME_anon', 'OBJECTID', 'SALEDATE', 'AYB', 'SALE_NUM', 'GIS_LAST_MOD_DTTM'], axis=1)

In [57]:
X.columns

Index(['alias', 'pdox_b1_id', 'Project_Status', 'ReviewCycle', 'TaskName',
       'GroupName', 'elapsed_workdays', 'RC_HowAssigned', 'job_class',
       'AGENCY', 'Ward', 'proposed_gross_floor_area', 'green_floor_area',
       'review_group', 'use_type', 'permit_type', 'building_construction_type',
       'existing_use_of_building', 'proposed_use_of_building',
       'proposed_number_stories', 'proposed_number_units',
       'existing_number_units', 'present_gfa', 'perm_id', 'PRICE', 'QUALIFIED',
       'EYB', 'LIVING_GBA', 'USECODE', 'LANDAREA'],
      dtype='object')

In [58]:
# Clean GroupName variable
# Suppress chaining warning
pd.options.mode.chained_assignment = None  # default='warn'

In [59]:
X.GroupName[X.GroupName == 'DC Water Review Supervisor'] = "DC Water Review"
X.GroupName[X.GroupName == 'DDOT Review Supervisor'] = "DDOT Review"
X.GroupName[X.GroupName == 'DOH Review Supervisor'] = "DOH Review"
X.GroupName[X.GroupName == 'Electrical Review Supervisor'] = "Electrical Review"
X.GroupName[X.GroupName == 'Elevator Review Supervisor'] = "Elevator Review"
X.GroupName[X.GroupName == 'Energy Review Supervisor'] = "Energy Review"
X.GroupName[X.GroupName == 'Fire Review Supervisor'] = "Fire Review"
X.GroupName[X.GroupName == 'Green Review Supervisor'] = "Green Review"
X.GroupName[X.GroupName == 'Mechanical Review Supervisor'] = "Mechanical Review"
X.GroupName[X.GroupName == 'Plumbing Review Supervisor'] = "Plumbing Review"
X.GroupName[X.GroupName == 'Structural Review Supervisor'] = "Structural Review"
X.GroupName[X.GroupName == 'WASA Review Supervisor'] = "WASA Review"
X.GroupName[X.GroupName == 'Zoning Review Supervisor'] = "Zoning Review"

In [60]:
X.GroupName[X.GroupName == 'DDOE Review Supervisor'] = "DDOE Review"
X.GroupName[X.GroupName == 'DDOE SE-SW Review'] = "DDOE Review"
X.GroupName[X.GroupName == 'DDOE WSP Review'] = "DDOE Review"
X.GroupName[X.GroupName == 'DOEE FP Review'] = "DOEE Review"
X.GroupName[X.GroupName == 'DOEE GAR Review'] = "DOEE Review"
X.GroupName[X.GroupName == 'DOEE Review Supervisor'] = "DOEE Review"
X.GroupName[X.GroupName == 'DOEE SE-SW Review'] = "DOEE Review"

In [61]:
# Aggregate 'building_construction_type' variable 
X.building_construction_type[X.building_construction_type == 'Type I - Fire-Resistive Non-combustible'] = "Type I - Fire-Resistive Non-Combustible"
X.building_construction_type[X.building_construction_type == 'Type I-A - Fire-Resistive Non-Combustible'] = "Type I - Fire-Resistive Non-Combustible"
X.building_construction_type[X.building_construction_type == 'Type I-B - Fire-Resistive Non-Combustible'] = "Type I - Fire-Resistive Non-Combustible"
X.building_construction_type[X.building_construction_type == 'TYPE II-A - Non-Combustible Construction'] = "TYPE II - Non-Combustible Construction"
X.building_construction_type[X.building_construction_type == 'TYPE II-B - Non-Combustible Construction'] = "TYPE II - Non-Combustible Construction"
X.building_construction_type[X.building_construction_type == 'TYPE III-A - Non-Combustible Exterior Walls'] = "TYPE III - Non-Combustible Exterior Walls"
X.building_construction_type[X.building_construction_type == 'TYPE III-B - Non-Combustible Exterior Walls'] = "TYPE III - Non-Combustible Exterior Walls"
X.building_construction_type[X.building_construction_type == 'TYPE V-A - Any Materials Permitted'] = "TYPE V - Any Materials Permitted"
X.building_construction_type[X.building_construction_type == 'TYPE V-B - Any Materials Permitted'] = "TYPE V - Any Materials Permitted"

Evaluate the variables 'proposed_number_units' and 'existing_number_units' 
Convert to a dummy to indicate whether the num of units changing in the permit application or not: 1= change/ 0=no change

In [62]:
num_units = X['proposed_number_units'] != X['existing_number_units']
num_units = pd.DataFrame(num_units, columns=["num_units_change"])
num_units.groupby(by="num_units_change").size()

num_units_change
False    74863
True     87355
dtype: int64

Evaluate the variables 'existing_use_of_building' and 'proposed_use_of_building' 
Convert to a dummy to indicate whether the existing use changing in the permit application or not: 1= change/ 0=no change

In [63]:
use = X['existing_use_of_building'] != X['proposed_use_of_building']
use = pd.DataFrame(use, columns=["use_change"])
use.groupby(by="use_change").size()

use_change
False    76783
True     85435
dtype: int64

Evaluate the variables'present_gfa', 'proposed_gross_floor_area' 
Convert to a dummy to indicate whether the gfa changing in the permit application or not: 1= change/ 0=no change

In [64]:
gfa = X['proposed_gross_floor_area'] != X['present_gfa']
gfa = pd.DataFrame(gfa, columns=["gfa_change"])
gfa.groupby(by="gfa_change").size()

gfa_change
False     61745
True     100473
dtype: int64

In [65]:
temp = pd.concat([use, num_units, gfa], axis=1)

Create dummy variables for all "GroupName"

In [66]:
agencies = pd.get_dummies(X.GroupName)
agencies.columns

Index(['CFA Review', 'Chinatown Review', 'DC Water Review', 'DDOE Review',
       'DDOT Review', 'DOEE Review', 'DOH Review', 'EISF Review',
       'Electrical Review', 'Elevator Review', 'Energy Review', 'File Room',
       'Fire Review', 'Green Review', 'HPRB Review', 'Mechanical Review',
       'NCPC Review', 'PRC', 'Plumbing Review', 'QA Review',
       'Structural Review', 'WASA Review', 'WMATA Review',
       'White House Review', 'Zoning Review'],
      dtype='object')

In [67]:
temp = pd.concat([temp, agencies], axis=1)
X = pd.concat([X, temp], axis=1)

In [68]:
# Drop variables
X = X.drop(['existing_use_of_building', 'proposed_use_of_building','proposed_number_stories',
            'proposed_number_units','existing_number_units', 
            'proposed_gross_floor_area','present_gfa' ,'GroupName'], axis=1)

In [69]:
# Drop N.A.
X.isnull().sum()

alias                             0
pdox_b1_id                        0
Project_Status                   66
ReviewCycle                    1444
TaskName                          0
elapsed_workdays                  9
RC_HowAssigned                68610
job_class                       478
AGENCY                            0
Ward                            351
green_floor_area              56304
review_group                      0
use_type                      29771
permit_type                       0
building_construction_type    68345
perm_id                           0
PRICE                         49177
QUALIFIED                     34693
EYB                           34761
LIVING_GBA                    34693
USECODE                       34693
LANDAREA                      34693
use_change                        0
num_units_change                  0
gfa_change                        0
CFA Review                        0
Chinatown Review                  0
DC Water Review             

In [70]:
X = X.dropna()
X.iloc[:,0].size

37726

In [71]:
# Save
X.to_csv("~/Google/19Spring/ANLY501ASL/GroupProject/data/cleaned_uncode.csv", index=False)

#### Log variables have big numbers: *green_floor_area*, *PRICE*, *LIVING_GBA*, *LANDAREA*

In [72]:
X.groupby(by="green_floor_area").size()

green_floor_area
0.0000         24698
0.5600            20
1.0000            22
2.0000            81
3.0000            79
5.0000            26
6.0000            11
13.5000           10
21.6000           21
24.0000           23
29.7000           27
30.0000           13
30.2600           14
52.0000           26
52.8000           44
54.0000           14
60.0000           26
65.0000           17
70.0000           18
72.0000           11
97.5000           14
98.0000           26
105.0000          37
108.0000          15
112.0000          16
115.3400          18
118.9036          19
119.0000          45
120.0000          18
126.0000          17
               ...  
16644.8000        21
17220.0000        26
18218.7500        27
19635.0000        26
23125.0000        39
25600.0000        26
26145.0000        21
26400.0000        16
26700.0000        31
27456.0000        26
33793.8000        23
45900.0000        23
46312.5000        27
52704.0000        25
57600.0000        35
65415.0000       

In [73]:
import math
X['green_floor_area'] = X['green_floor_area'].apply(lambda x: math.log(x,10) if x > 0 else x)

In [None]:
X.groupby(by="green_floor_area").size()

In [74]:
X['PRICE'] = X['PRICE'].apply(lambda x: math.log(x,10) if x > 0 else x)

In [None]:
X.groupby(by="PRICE").size()

In [75]:
X['LIVING_GBA'] = X['LIVING_GBA'].apply(lambda x: math.log(x,10) if x > 0 else x)

In [None]:
X.groupby(by="LIVING_GBA").size()

In [76]:
X['LANDAREA'] = X['LANDAREA'].apply(lambda x: math.log(x,10) if x > 0 else x)

In [None]:
X.groupby(by="LANDAREA").size()

##### Change EYB to elapsed years from construction

In [77]:
X['EYB'] = X['EYB'].apply(lambda x: None if x == 0 else x)

In [78]:
X['EYFC'] = 2018 - X['EYB']
X.groupby(by="EYFC").size()

EYFC
0.0       379
1.0       343
2.0       241
3.0       144
4.0       432
5.0       208
6.0       246
7.0       225
8.0       347
9.0       347
10.0      382
11.0      390
12.0      233
13.0      359
14.0      275
15.0      195
16.0      419
17.0      276
18.0      325
19.0      365
20.0      182
21.0      539
22.0      141
23.0     1012
24.0      479
25.0      432
26.0      795
27.0      399
28.0     1069
29.0      435
         ... 
63.0     2630
64.0       64
65.0       14
66.0       43
67.0      422
68.0      121
69.0       66
70.0      312
71.0       25
72.0       46
73.0       22
74.0      951
76.0       23
78.0       27
79.0       17
81.0       46
82.0       63
83.0       74
84.0       27
86.0       22
88.0      143
90.0       10
91.0       64
96.0       26
97.0       31
98.0      215
101.0       7
110.0      40
118.0     209
128.0      23
Length: 93, dtype: int64

In [79]:
X.isnull().sum()
X = X.dropna()
X.isnull().sum()
X = X.drop(['EYB'], axis = 1)

#### Encode categorical variables

In [80]:
from sklearn.preprocessing import LabelEncoder
lb = LabelEncoder()
CV = ['alias', 'Project_Status', 'TaskName', 'RC_HowAssigned', 'job_class', 'AGENCY',
      'review_group', 'use_type', 'permit_type', 'building_construction_type', 'QUALIFIED',
      'use_change', 'num_units_change', 'gfa_change']
for v in CV:
    X[v] = lb.fit_transform(X[v])

#### As discussed today, I'm going to create dummy target for 7,14,21,28 days.

In [81]:
X['elapsed_workdays_7'] = X['elapsed_workdays'].apply(lambda x: 1 if x > 7 else 0)
X.groupby(by='elapsed_workdays_7').size()

elapsed_workdays_7
0    20166
1    17299
dtype: int64

In [82]:
X['elapsed_workdays_14'] = X['elapsed_workdays'].apply(lambda x: 1 if x > 14 else 0)
X.groupby(by='elapsed_workdays_14').size()

elapsed_workdays_14
0    27177
1    10288
dtype: int64

In [83]:
X['elapsed_workdays_21'] = X['elapsed_workdays'].apply(lambda x: 1 if x > 20 else 0)
X.groupby(by='elapsed_workdays_21').size()

elapsed_workdays_21
0    30786
1     6679
dtype: int64

In [84]:
X['elapsed_workdays_28'] = X['elapsed_workdays'].apply(lambda x: 1 if x > 28 else 0)
X.groupby(by='elapsed_workdays_28').size()

elapsed_workdays_28
0    34060
1     3405
dtype: int64

In [85]:
# Save
X.to_csv("~/Google/19Spring/ANLY501ASL/GroupProject/data/cleaned_encode2.csv", index=False)