In [1]:
import pandas as pd
import pickle
import numpy as np

!pip install CHAID
from CHAID import Tree

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting CHAID
  Downloading CHAID-5.3.0-py3-none-any.whl (17 kB)
Collecting treelib
  Downloading treelib-1.6.1.tar.gz (24 kB)
Collecting savReaderWriter
  Downloading savReaderWriter-3.4.2.tar.gz (50.9 MB)
[K     |████████████████████████████████| 50.9 MB 205 kB/s 
Building wheels for collected packages: savReaderWriter, treelib
  Building wheel for savReaderWriter (setup.py) ... [?25l[?25hdone
  Created wheel for savReaderWriter: filename=savReaderWriter-3.4.2-py3-none-any.whl size=51142023 sha256=2c98d32ee8c344ee9d5d7957a60b483fd38fcd5bf5f681105fc19464f7928834
  Stored in directory: /root/.cache/pip/wheels/cb/e1/62/6632325e02256c9a369d61b28ce216694a353cd831feea54b6
  Building wheel for treelib (setup.py) ... [?25l[?25hdone
  Created wheel for treelib: filename=treelib-1.6.1-py3-none-any.whl size=18385 sha256=1c8f4d6018e79aa767c1c29f7271e1a96c594d33fde8fc5833b81b037bf252b3
  Sto

In [2]:
# Load the cleaned categorical dataset from the mounted Drive folder.
df = pd.read_csv(filepath_or_buffer="/content/drive/MyDrive/WMCA/cleaning_categorical_data.csv")

In [3]:
# Categorical columns that are each fed to CHAID as a single predictor.
cat_var = [
    'mainheatcont-description',
    'walls-description',
    'hotwater-description',
    'mainheat-description',
    'floor-description',
    'windows-description',
    'roof-description',
    'secondheat-description',
    'main-fuel',
    'transaction-type',
    'energy-tariff',
]

# CHAID
[Overview](https://select-statistics.co.uk/blog/chaid-chi-square-automatic-interaction-detector/) of how CHAID works. 

In [4]:
# Fit one single-predictor CHAID tree per categorical variable and record, for
# every non-root node, the list of original levels that ended up in that node.
# Resulting layout: chaid_dict[variable]['node<i>'] -> node.choices
target_column = 'current-energy-efficiency'  # dependent variable handed to CHAID
chaid_dict = {}
for var in cat_var:
    # Predictors are passed as {column: type}. Every feature is assumed to be
    # nominal; change the type here to 'ordinal' for ordered variables.
    predictors = {var: 'nominal'}
    node_levels = chaid_dict[var] = {}
    tree = Tree.from_pandas_df(df, i_variables=predictors, d_variable=target_column, alpha_merge=0.0)

    # Report the statistics of the root split for this variable.
    print('\n\n\nVariable: %s' % var)
    root_split = tree.tree_store[0].split
    print('p-value: %f' % root_split.p)
    print('Chi2: %f' % root_split.score)

    # Walk the remaining nodes (index 0 is the root, reported above).
    nodes = tree.tree_store
    for idx, node in enumerate(nodes[1:], start=1):
        members = node.members
        # NOTE(review): only the entries under keys 0 and 1 are read here, which
        # looks like a binary-target recipe -- confirm this is the intended
        # count/rate for 'current-energy-efficiency'.
        count = members[0] + members[1]
        if count != 0:
            rate = members[1] / count
            print('\nNode %i:\n\tCount = %i\tRate = %f' % (idx, count, rate))
            print('\t%s' % node.choices)
        # The node's levels are stored even when nothing was printed for it.
        node_levels['node' + str(idx)] = node.choices




Variable: mainheatcont-description
p-value: 0.000000
Chi2: 670869.797198

Node 1:
	Count = 1	Rate = 1.000000
	['2207 time and temperature zone control', 'time and temperature zone control', 'charging system linked to use of community heating, programmer and at least two room thermostatss', 'charging system linked to use of community heating, trvs']

Node 2:
	Count = 34	Rate = 1.000000
	['appliance thermostat', 'temperature zone control']

Node 3:
	Count = 30	Rate = 1.000000
	['automatic charge control', 'flat rate charging, no thermostat control', 'control for high heat retention storage heaters', 'flat rate charging, programmer, no room thermostat', 'manual charge control']

Node 5:
	Count = 16	Rate = 1.000000
	['celect-type control', 'programmer and appliance thermostat', 'flat rate charging, programmer and at least two room thermostats', 'charging system linked to use of community heating, room thermostat only']

Node 7:
	Count = 676	Rate = 1.000000
	['no thermostat control', 'no

In [5]:
# Save the per-variable node -> levels mapping so it can be reloaded later.
with open('/content/drive/MyDrive/WMCA/chaid_dict.pkl', mode='wb') as pkl_file:
    pickle.dump(chaid_dict, pkl_file, protocol=pickle.HIGHEST_PROTOCOL)

# Replace values
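The CHAID node assignments above are used as a naive way to group category levels together: each level is replaced by the name of the node it fell into. The grouping will be refined in the next iteration.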

In [6]:
# 'transaction-type' is excluded from the regrouping below: grouping its levels
# doesn't make sense and the variable is already small enough to keep as is.
# The list is rebuilt by filtering (instead of calling list.remove) so this
# cell can be re-run without raising ValueError once the entry is gone.
cat_var = [name for name in cat_var if name != 'transaction-type']

In [7]:
# Recode every remaining categorical column: each original level is replaced by
# the name of the CHAID node ('node1', 'node2', ...) it was grouped into.
for var in cat_var:
    # Invert {node name: [levels]} into {level: node name}. If a level appears
    # in more than one node, the later node wins.
    level_to_node = {
        level: node_name
        for node_name, levels in chaid_dict[var].items()
        for level in levels
    }
    # Assign the result back to the column. Calling
    # df[var].replace(..., inplace=True) operates on a column selection, which
    # pandas warns about and which does not update df under Copy-on-Write.
    df[var] = df[var].replace(level_to_node)

In [8]:
# Collapse the raw 'floor-level' values into a few coarse groups.
floor_level_groups = {
    'low floors': ['1', '2', '3', '4'],
    # This group previously had no label, so '-1' and 'Ground' were mapped to
    # None. NOTE(review): confirm 'ground floors' is the label you want here.
    'ground floors': ['-1', 'Ground'],
    'mid floors': ['mid floor', '5', '6', '7', '8', '9', '10', '11'],
    'upper floors': ['top floor', '12', '13', '14', '15', '16', '17', '18', '19', '20',
                     '21st or above', '20+'],
}
# Flatten to {raw value: group label}, the form Series.replace expects.
floor_level_dict = {
    raw_value: group
    for group, raw_values in floor_level_groups.items()
    for raw_value in raw_values
}
# Assign back instead of using a chained inplace replace, which does not
# reliably update df (and not at all under pandas Copy-on-Write).
df['floor-level'] = df['floor-level'].replace(floor_level_dict)

In [9]:
# Collapse the raw 'glazed-type' values into a few coarse groups.
glazed_groups = {
    'old double glazing': ['double glazing installed before 2002',
                           'double glazing, unknown install date'],
    'triple glazing': ['triple, known data', 'triple glazing'],
    'old glazing': ['secondary glazing', 'not defined', 'single glazing'],
    'double glazing': ['double, known data',
                       'double glazing installed during or after 2002'],
}
# Flatten to {raw value: group label}, the form Series.replace expects.
glazed_dict = {
    raw_value: group
    for group, raw_values in glazed_groups.items()
    for raw_value in raw_values
}
# Apply glazed_dict: the floor-level mapping was passed here before, so the
# glazing grouping was never applied. The result is assigned back rather than
# using a chained inplace replace, which does not reliably update df.
df['glazed-type'] = df['glazed-type'].replace(glazed_dict)

In [10]:
# Write the recoded frame back to Drive, leaving out the index column.
df.to_csv(path_or_buf="/content/drive/MyDrive/WMCA/chaid_data.csv", index=False)