In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [None]:
# Prerequisite (per the original note): run Ziqi's code on "merged_dcra_cama.csv"
# first, then import its output here.
df = pd.read_csv(
    "data/merged.csv",
    encoding='iso-8859-1',
    low_memory=False,
)

In [None]:
# Display the column labels of the freshly loaded frame.
df.columns

In [None]:
# Build the working table X from df by removing the columns listed below.
cols_to_drop = [
    'X1', 'As_Of_Date', 'permit_cap_status', 'Project_Description', 'Project_CreateDate',
    'TaskStatus', 'FlowTask_CreatedDate', 'FlowTask_DateUpdated', 'FlowTask_DateAccepted',
    'RC_ReviewStatus', 'RC_ReviewComments_confidential', 'Fee_Assessed', 'over_30', 'pdox',
    'est_worktime', 'QA_Date', 'QA_Date_rc1', 'DESC_OF_WORK', 'issued', 'Begin_review_dd',
    'Resubmit_rec_dd', 'rc_1_time', 'rc_time', 'FlowInstanceID', 'create_review', 'ssl',
    'RC_assignedBy_anon', 'UpdatedByEmail_anon', 'applicant_anon', 'owner_name_anon',
    'NAME_anon', 'OBJECTID', 'SALEDATE', 'AYB', 'SALE_NUM', 'GIS_LAST_MOD_DTTM',
]
X = df.drop(columns=cols_to_drop)

In [None]:
# Display the columns that remain in X after the drop.
X.columns

In [None]:
# Clean GroupName variable
# Turn off pandas' chained-assignment warning for the whole session
# (the option's default value is 'warn').
pd.set_option("mode.chained_assignment", None)

In [None]:
# Fold every "<group> Supervisor" label into its base "<group>" label.
#
# The column is reassigned directly on X instead of using the chained form
# `X.GroupName[mask] = value`. The chained form writes into an intermediate Series,
# which is not guaranteed to propagate back to X (under pandas Copy-on-Write it
# never does), so the cleanup could silently do nothing.
supervised_groups = [
    "DC Water Review",
    "DDOT Review",
    "DOH Review",
    "Electrical Review",
    "Elevator Review",
    "Energy Review",
    "Fire Review",
    "Green Review",
    "Mechanical Review",
    "Plumbing Review",
    "Structural Review",
    "WASA Review",
    "Zoning Review",
]
# Exact-match mapping, e.g. "Fire Review Supervisor" -> "Fire Review".
supervisor_to_group = {group + " Supervisor": group for group in supervised_groups}
X["GroupName"] = X["GroupName"].replace(supervisor_to_group)

In [None]:
# Collapse the DDOE and DOEE sub-groups (and their supervisor labels) into one label
# per agency spelling.
#
# The column is reassigned directly on X instead of using the chained form
# `X.GroupName[mask] = value`, which writes into an intermediate Series and is not
# guaranteed to reach X (under pandas Copy-on-Write it never does).
environment_group_map = {
    "DDOE Review Supervisor": "DDOE Review",
    "DDOE SE-SW Review": "DDOE Review",
    "DDOE WSP Review": "DDOE Review",
    "DOEE FP Review": "DOEE Review",
    "DOEE GAR Review": "DOEE Review",
    "DOEE Review Supervisor": "DOEE Review",
    "DOEE SE-SW Review": "DOEE Review",
}
X["GroupName"] = X["GroupName"].replace(environment_group_map)

In [None]:
# Aggregate 'building_construction_type' variable: merge the A/B sub-types (and one
# differently-cased spelling of Type I) into their parent construction type.
#
# The column is reassigned directly on X instead of using the chained form
# `X.building_construction_type[mask] = value`, which writes into an intermediate
# Series and is not guaranteed to reach X (under pandas Copy-on-Write it never does).
construction_type_map = {
    "Type I - Fire-Resistive Non-combustible": "Type I - Fire-Resistive Non-Combustible",
    "Type I-A - Fire-Resistive Non-Combustible": "Type I - Fire-Resistive Non-Combustible",
    "Type I-B - Fire-Resistive Non-Combustible": "Type I - Fire-Resistive Non-Combustible",
    "TYPE II-A - Non-Combustible Construction": "TYPE II - Non-Combustible Construction",
    "TYPE II-B - Non-Combustible Construction": "TYPE II - Non-Combustible Construction",
    "TYPE III-A - Non-Combustible Exterior Walls": "TYPE III - Non-Combustible Exterior Walls",
    "TYPE III-B - Non-Combustible Exterior Walls": "TYPE III - Non-Combustible Exterior Walls",
    "TYPE V-A - Any Materials Permitted": "TYPE V - Any Materials Permitted",
    "TYPE V-B - Any Materials Permitted": "TYPE V - Any Materials Permitted",
}
X["building_construction_type"] = X["building_construction_type"].replace(construction_type_map)

Evaluate the variables 'proposed_number_units' and 'existing_number_units'.
Convert them to a dummy variable indicating whether the number of units changes in the permit application: 1 = change / 0 = no change

In [None]:
# Flag rows whose proposed unit count differs from the existing unit count.
# A bare `!=` is True when both sides are missing (NaN != NaN), which would mark rows
# with no unit information at all as "changed"; those rows are treated as "no change".
# Rows where only one side is missing are still flagged as changed.
proposed_units = X['proposed_number_units']
existing_units = X['existing_number_units']
units_both_missing = proposed_units.isna() & existing_units.isna()
num_units = ((proposed_units != existing_units) & ~units_both_missing).to_frame("num_units_change")
# Show how many rows fall into each flag value.
num_units.groupby(by="num_units_change").size()

Evaluate the variables 'existing_use_of_building' and 'proposed_use_of_building'.
Convert them to a dummy variable indicating whether the use of the building changes in the permit application: 1 = change / 0 = no change

In [None]:
# Flag rows whose proposed building use differs from the existing building use.
# A bare `!=` is True when both sides are missing (NaN != NaN), which would mark rows
# with no use information at all as "changed"; those rows are treated as "no change".
# Rows where only one side is missing are still flagged as changed.
existing_use = X['existing_use_of_building']
proposed_use = X['proposed_use_of_building']
use_both_missing = existing_use.isna() & proposed_use.isna()
use = ((existing_use != proposed_use) & ~use_both_missing).to_frame("use_change")
# Show how many rows fall into each flag value.
use.groupby(by="use_change").size()

Evaluate the variables 'present_gfa' and 'proposed_gross_floor_area'.
Convert them to a dummy variable indicating whether the gross floor area (GFA) changes in the permit application: 1 = change / 0 = no change

In [None]:
# Flag rows whose proposed gross floor area differs from the present one.
# A bare `!=` is True when both sides are missing (NaN != NaN), which would mark rows
# with no floor-area information at all as "changed"; those rows are treated as
# "no change". Rows where only one side is missing are still flagged as changed.
proposed_gfa = X['proposed_gross_floor_area']
present_gfa = X['present_gfa']
gfa_both_missing = proposed_gfa.isna() & present_gfa.isna()
gfa = ((proposed_gfa != present_gfa) & ~gfa_both_missing).to_frame("gfa_change")
# Show how many rows fall into each flag value.
gfa.groupby(by="gfa_change").size()

In [None]:
# Put the three change flags side by side in one frame.
change_flags = [use, num_units, gfa]
temp = pd.concat(change_flags, axis="columns")

Create dummy variables for every value of "GroupName"

In [None]:
# One indicator column per distinct GroupName value; display the resulting labels.
agencies = pd.get_dummies(X["GroupName"])
agencies.columns

In [None]:
# Append the agency indicators to the change flags, then attach everything to X.
# Note: X and temp are reassigned in place, so re-running this cell appends the
# same columns a second time.
temp = pd.concat([temp, agencies], axis="columns")
X = pd.concat([X, temp], axis="columns")

In [None]:
# Remove the raw columns listed below from X.
raw_cols = [
    'existing_use_of_building',
    'proposed_use_of_building',
    'proposed_number_stories',
    'proposed_number_units',
    'existing_number_units',
    'proposed_gross_floor_area',
    'present_gfa',
    'GroupName',
]
X = X.drop(columns=raw_cols)

In [None]:
# Count the missing values in each column (nothing is dropped in this cell).
X.isna().sum()

In [None]:
# Keep only rows with no missing value in any column, then show the row count.
X = X.dropna(axis=0, how="any")
X.shape[0]

In [None]:
# Save the cleaned table before the log transforms and label encoding are applied.
uncoded_csv = "~/Google/19Spring/ANLY501ASL/GroupProject/data/cleaned_uncode.csv"
X.to_csv(uncoded_csv, index=False)

#### Log-transform the variables with large values: *green_floor_area*, *PRICE*, *LIVING_GBA*, *LANDAREA*

In [None]:
# Row count per distinct green_floor_area value, before the log transform.
X.groupby("green_floor_area").size()

In [None]:
import math  # kept so that any other cell still calling math.log keeps working

# Replace strictly positive green_floor_area values with their base-10 logarithm;
# zero, negative, and missing values are left as they are.
# np.log10 is used instead of math.log(x, 10): the two-argument form divides two
# natural logs and can be slightly off (math.log(1000, 10) == 2.9999999999999996),
# and np.log10 works on the whole column at once instead of row by row.
green_area = X['green_floor_area'].astype(float)
green_is_positive = green_area > 0
# .where() blanks the non-positive entries so log10 is only evaluated on valid input.
X['green_floor_area'] = green_area.mask(green_is_positive, np.log10(green_area.where(green_is_positive)))

In [None]:
# Row count per distinct green_floor_area value, after the log transform.
X.groupby("green_floor_area").size()

In [None]:
# Replace strictly positive PRICE values with their base-10 logarithm; zero,
# negative, and missing values are left as they are.
# np.log10 is used instead of math.log(x, 10): the two-argument form divides two
# natural logs and can be slightly off (math.log(1000, 10) == 2.9999999999999996),
# and np.log10 works on the whole column at once instead of row by row.
price = X['PRICE'].astype(float)
price_is_positive = price > 0
# .where() blanks the non-positive entries so log10 is only evaluated on valid input.
X['PRICE'] = price.mask(price_is_positive, np.log10(price.where(price_is_positive)))
# Row count per distinct transformed value.
X.groupby(by="PRICE").size()

In [None]:
# Replace strictly positive LIVING_GBA values with their base-10 logarithm; zero,
# negative, and missing values are left as they are.
# np.log10 is used instead of math.log(x, 10): the two-argument form divides two
# natural logs and can be slightly off (math.log(1000, 10) == 2.9999999999999996),
# and np.log10 works on the whole column at once instead of row by row.
living_gba = X['LIVING_GBA'].astype(float)
living_gba_is_positive = living_gba > 0
# .where() blanks the non-positive entries so log10 is only evaluated on valid input.
X['LIVING_GBA'] = living_gba.mask(living_gba_is_positive, np.log10(living_gba.where(living_gba_is_positive)))
# Row count per distinct transformed value.
X.groupby(by="LIVING_GBA").size()

In [None]:
# Replace strictly positive LANDAREA values with their base-10 logarithm; zero,
# negative, and missing values are left as they are.
# np.log10 is used instead of math.log(x, 10): the two-argument form divides two
# natural logs and can be slightly off (math.log(1000, 10) == 2.9999999999999996),
# and np.log10 works on the whole column at once instead of row by row.
land_area = X['LANDAREA'].astype(float)
land_area_is_positive = land_area > 0
# .where() blanks the non-positive entries so log10 is only evaluated on valid input.
X['LANDAREA'] = land_area.mask(land_area_is_positive, np.log10(land_area.where(land_area_is_positive)))
# Row count per distinct transformed value.
X.groupby(by="LANDAREA").size()

##### Standardize EYB

In [None]:
from sklearn import preprocessing

# Standardize EYB with scikit-learn's scale() and write the result back to X.
eyb_scaled = preprocessing.scale(X['EYB'])
X['EYB'] = eyb_scaled
# Row count per distinct standardized value.
X.groupby("EYB").size()

#### Encode categorical variables

In [None]:
from sklearn.preprocessing import LabelEncoder

# Columns to be replaced by integer codes.
CV = [
    'alias',
    'Project_Status',
    'TaskName',
    'RC_HowAssigned',
    'job_class',
    'AGENCY',
    'review_group',
    'use_type',
    'permit_type',
    'building_construction_type',
    'QUALIFIED',
    'use_change',
    'num_units_change',
    'gfa_change',
]

# A single encoder is refit on each column in turn, so after the loop `lb` only
# holds the fit for the last column in CV.
lb = LabelEncoder()
for column in CV:
    encoded = lb.fit_transform(X[column])
    X[column] = encoded

#### I think over30 may not be a good threshold. Based on the distribution of *elapsed_workdays*, I chose over20 (around the 80th percentile). The threshold can be changed later.

In [None]:
# Row count per distinct elapsed_workdays value.
X.groupby("elapsed_workdays").size()

In [None]:
# 25th, 50th and 81st percentiles of elapsed_workdays.
pct_points = [25, 50, 81]
np.percentile(X["elapsed_workdays"], pct_points)

In [None]:
# Binary flag: 1 when elapsed_workdays is strictly greater than 20, otherwise 0.
over_20 = X['elapsed_workdays'] > 20
X['elapsed_workdays_20'] = over_20.astype('int64')
# Row count for each flag value.
X.groupby('elapsed_workdays_20').size()

In [None]:
# Save the table after the transforms and label encoding.
encoded_csv = "~/Google/19Spring/ANLY501ASL/GroupProject/data/cleaned_encode.csv"
X.to_csv(encoded_csv, index=False)