In [16]:
import pandas as pd
import numpy as np
import datetime as dt

from sklearn.preprocessing import LabelEncoder
from sklearn_pandas import CategoricalImputer
from sklearn.preprocessing import OneHotEncoder

In [17]:
# Load the source CSV, then discard three columns from it
columns_to_drop = ["current-energy-rating", "current-energy-efficiency", "address"]
df = pd.read_csv("/content/drive/MyDrive/WMCA/chaid_data.csv").drop(columns=columns_to_drop)

In [18]:
# Convert the date columns to proleptic-Gregorian ordinals (integer day
# numbers) so they can be used as numeric inputs for regression.
# pd.to_datetime is used rather than .astype('datetime64') because
# pandas >= 2.0 raises a TypeError when asked to cast to the unit-less
# 'datetime64' dtype.
for date_col in ['inspection-date', 'lodgement-datetime']:
  df[date_col] = pd.to_datetime(df[date_col]).map(dt.datetime.toordinal)

In [19]:
# Keep missing values as a separate category
bool_var = ['solar-water-heating-flag', 'photo-supply-binary', 'mains-gas-flag']
df[bool_var] = df[bool_var].replace(np.nan, 'MISSING')
df[bool_var] = df[bool_var].astype(str)

# One-hot encode the flag columns.
# A one-hot encoding has one indicator column per category, so it cannot be
# stored back into the single source column (doing so either raises
# "Expected a 1D array" or keeps only part of the encoding, depending on the
# pandas version). get_dummies replaces each flag column with indicator
# columns named '<column>_<category>'; dtype=float keeps the 0.0/1.0 values
# that the dense OneHotEncoder output would have had.
df = pd.get_dummies(df, columns=bool_var, dtype=float)

Impute with iterator (merge with other)

In [20]:
# Names of all object-dtype (string/categorical) columns
cat_var = list(df.select_dtypes(include=['object']).columns)

# Percentage of missing entries per categorical column, largest first
missing_counts = df[cat_var].isnull().sum()
percent_missing = (missing_counts * 100 / len(df)).sort_values(ascending=False)
print(percent_missing)

floor-level                 80.628631
roof-energy-eff             16.481990
roof-env-eff                16.481990
glazed-type                 11.092910
glazed-area                  8.437256
mechanical-ventilation       8.437121
construction-age-band        8.374794
tenure                       7.603029
energy-tariff                6.602416
built-form                   2.260964
main-fuel                    0.890186
hot-water-env-eff            0.179799
mainheat-energy-eff          0.179663
mainheat-env-eff             0.179392
hot-water-energy-eff         0.172211
mainheatc-env-eff            0.140370
mainheatc-energy-eff         0.140370
lighting-energy-eff          0.096335
walls-energy-eff             0.073437
walls-env-eff                0.073437
lighting-env-eff             0.070592
windows-energy-eff           0.069101
windows-env-eff              0.069101
transaction-type             0.048235
windows-description          0.026421
roof-description             0.004200
walls-descri

In [21]:
# 'floor-level' is missing in roughly 80% of rows (see the percentages
# printed above), so the column is dropped rather than imputed.
# errors='ignore' and the filtered list make this cell safe to re-run:
# the in-place drop / list.remove versions raise KeyError / ValueError the
# second time the cell is executed.
df = df.drop(columns=['floor-level'], errors='ignore')
cat_var = [col for col in cat_var if col != 'floor-level']

In [22]:
# Columns with under 1% missing values are filled by a CategoricalImputer
# fitted on that column alone
low_missing_cols = percent_missing[percent_missing < 1].index
for col_name in low_missing_cols:
  df[col_name] = CategoricalImputer().fit_transform(df[col_name])

# Whatever is still missing in the categorical columns becomes its own
# explicit 'MISSING' category
still_categorical = df[cat_var]
df[cat_var] = still_categorical.replace(np.nan, 'MISSING')

In [24]:
# Columns treated as ordinal: every column whose name contains '-eff',
# plus two explicitly named ones
ordinal_var = [name for name in df.columns if '-eff' in name]
ordinal_var.extend(['construction-age-band', 'glazed-area'])

# Replace the categories of each ordinal column with integer codes
# NOTE(review): LabelEncoder numbers classes in sorted order, which may not
# match the intended low-to-high ranking of these categories -- confirm.
for var in ordinal_var:
  df[var] = LabelEncoder().fit_transform(df[var])

In [25]:
# Integer-code the remaining (nominal) categorical columns.
# Only object-dtype columns are selected. Taking every column outside
# ordinal_var would also label-encode numeric columns -- including the date
# ordinals computed earlier -- and replace their values with rank codes.
non_ordinal_var = [col for col in df.select_dtypes(include=['object']).columns
                   if col not in ordinal_var]

# The original author's note says a dense one-hot encoding crashed here, so
# label encoding is used for these columns instead.
for var in non_ordinal_var:
  label_encoder = LabelEncoder()
  df[var] = label_encoder.fit_transform(df[var])

In [26]:
# Write the encoded frame to CSV, leaving out the index column
output_path = "/content/drive/MyDrive/WMCA/encoding_categorical.csv"
df.to_csv(output_path, index=False)