In [1]:
import os
import sys

path_to_repo = os.path.dirname(os.getcwd())
sys.path.append(path_to_repo)

import sklearn.datasets
import pandas as pd
import numpy as np

import sklearn.model_selection
from sklearn.metrics import r2_score, accuracy_score
#from imodels import FIGSRegressor, FIGSClassifier
#from imodels.importance import RandomForestPlusRegressor

import openml
from ucimlrepo import fetch_ucirepo 

from figs_d import FIGSRegressor
from fourierDistill import *
from binary_mapper import *

In [2]:
miami_housing = openml.datasets.get_dataset(43093)
X, y, _, _ = miami_housing.get_data(target=miami_housing.default_target_attribute, dataset_format="dataframe")

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=0)

In [4]:
fbm = FIGSBinaryMapper(figs=FIGSRegressor(max_rules=30, max_trees=5, round_deg = 3))
X_train_new = fbm.fit_transform(X_train, y_train, train=True)
X_val_new = fbm.transform(X_val)
X_test_new = fbm.transform(X_test)

In [5]:
X_train_new.shape, fbm.num_interactions, fbm.max_interaction_size, len(fbm.figs.trees_)

((8358, 60), 35, 12, 5)

In [31]:
[len(i[0]) for i in fbm.figs_rules[0]]

[2, 5, 5, 4, 3, 3, 6, 6, 5, 4, 3, 3]

In [33]:
[i for i in fbm.figs_rules[0]]

[([(TOT_LVG_AREA <= 3459.500 (Tree #0 root), 'flip'),
   (CNTR_DIST <= 63545.699 (split), 'flip')],
  641563.1066755188),
 ([(TOT_LVG_AREA <= 3459.500 (Tree #0 root), 'flip'),
   (CNTR_DIST <= 63545.699 (split), 'original'),
   (CNTR_DIST <= 33606.201 (split), 'flip'),
   (age <= 21.500 (split), 'flip'),
   (PARCELNO <= 2050130182144.000 (split), 'flip')],
  668145.2314475699),
 ([(TOT_LVG_AREA <= 3459.500 (Tree #0 root), 'flip'),
   (CNTR_DIST <= 63545.699 (split), 'original'),
   (CNTR_DIST <= 33606.201 (split), 'flip'),
   (age <= 21.500 (split), 'flip'),
   (PARCELNO <= 2050130182144.000 (split), 'original')],
  994562.7593892454),
 ([(TOT_LVG_AREA <= 3459.500 (Tree #0 root), 'flip'),
   (CNTR_DIST <= 63545.699 (split), 'original'),
   (CNTR_DIST <= 33606.201 (split), 'flip'),
   (age <= 21.500 (split), 'original')],
  1222548.9867905304),
 ([(TOT_LVG_AREA <= 3459.500 (Tree #0 root), 'flip'),
   (CNTR_DIST <= 63545.699 (split), 'original'),
   (CNTR_DIST <= 33606.201 (split), 'orig

In [28]:
[len(t) for t in fbm.figs_rules]

[12, 12, 4, 3, 4]

In [37]:
max([len(p) for p, w in fbm.interactions])

6

In [24]:
sorted(fbm.interactions, key = lambda x: -len(x[0]))

[[['TOT_LVG_AREA_<=_3459.5',
   'TOT_LVG_AREA_>_2366.5',
   'CNTR_DIST_<=_62929.6',
   'TOT_LVG_AREA_>_2673.5',
   'PARCELNO_>_341290352640.0',
   'age_>_22.5'],
  536492.4181499392],
 [['TOT_LVG_AREA_<=_3459.5',
   'TOT_LVG_AREA_>_2366.5',
   'CNTR_DIST_<=_62929.6',
   'TOT_LVG_AREA_>_2673.5',
   'PARCELNO_>_341290352640.0',
   'age_<=_22.5'],
  720698.9818004618],
 [['TOT_LVG_AREA_>_3459.5',
   'CNTR_DIST_<=_63545.699',
   'CNTR_DIST_>_33606.201',
   'age_>_21.5',
   'PARCELNO_>_2050130182144.0'],
  668145.2314475699],
 [['TOT_LVG_AREA_>_3459.5',
   'CNTR_DIST_<=_63545.699',
   'CNTR_DIST_>_33606.201',
   'age_>_21.5',
   'PARCELNO_<=_2050130182144.0'],
  994562.7593892454],
 [['TOT_LVG_AREA_<=_3459.5',
   'TOT_LVG_AREA_>_2366.5',
   'CNTR_DIST_<=_62929.6',
   'TOT_LVG_AREA_>_2673.5',
   'PARCELNO_<=_341290352640.0'],
  795166.4718347429],
 [['OCEAN_DIST_<=_12560.1',
   'structure_quality_<=_4.5',
   'LATITUDE_>_25.618',
   'OCEAN_DIST_<=_1636.8',
   'WATER_DIST_>_118.65'],
  492484.

In [16]:
fbm.figs

In [6]:
ftd = FTDistillRegressorCV(pre_max_features=0.5, post_max_features=fbm.num_interactions, size_interactions = min(fbm.max_interaction_size, 3), re_fit_alpha = 10**np.linspace(-3, 3, 100))
ftd.fit(X_train_new, y_train)

<fourierDistill.FTDistillRegressorCV at 0x7fe04c8eb8e0>

In [7]:
r2_score(y_val, ftd.predict(X_val_new)), r2_score(y_val, fbm.predict(X_val))

(0.8099188368235776, 0.8138987483988642)

In [8]:
len(ftd.post_sparsity_model.coef_)

34

In [9]:
ftd.post_interaction_features

Index([                                                          ('TOT_LVG_AREA_>_1741.5',),
                                                                ('TOT_LVG_AREA_<=_4531.0',),
                                                                  ('LONGITUDE_<=_-80.275',),
                                        ('SUBCNTR_DI_<=_24754.25', 'TOT_LVG_AREA_>_1741.5'),
                                           ('LONGITUDE_>_-80.189', 'TOT_LVG_AREA_>_2673.5'),
                                        ('OCEAN_DIST_<=_12560.1', 'SUBCNTR_DI_<=_24754.25'),
                                            ('LONGITUDE_>_-80.189', 'OCEAN_DIST_<=_1636.8'),
                                         ('SUBCNTR_DI_<=_10521.45', 'WATER_DIST_<=_118.65'),
                                      ('SPEC_FEAT_VAL_>_44493.5', 'SUBCNTR_DI_<=_10521.45'),
                                      ('SUBCNTR_DI_<=_24754.25', 'SPEC_FEAT_VAL_>_27111.5'),
                                     ('TOT_LVG_AREA_<=_4531.0', 'struc

In [10]:
interactions = []
figs_weight_dict = {}
for i, tree in enumerate(fbm.figs_rules):
        for interaction_weight in tree:
            interaction, weight = interaction_weight
            cur_interaction = []
            for rule, sign in interaction:
                if sign == 'flip':
                    cur_interaction.append(f'{rule.feature_names[rule.feature]}_>_{str(round(rule.threshold, 3))}')
                elif sign == 'original':
                    cur_interaction.append(f'{rule.feature_names[rule.feature]}_<=_{str(round(rule.threshold, 3))}')
                else:
                    print('?')
            figs_weight_dict[tuple(cur_interaction)] = weight
            interactions.append([cur_interaction, weight])

In [11]:
def make_figs_lm_df(X, interactions):
    df = pd.DataFrame()
    for interaction, weigth in interactions:
        cur_val = 1
        for inter in interaction:
            cur_val *= X[inter]
        df[tuple(interaction)] = cur_val.values
    return df

In [12]:
def figs_lm_predict(X, figs_weight_dict):
    pred_sum = 0 
    for interaction in figs_weight_dict.keys():
        pred_sum += figs_weight_dict[interaction] * X[interaction]
    return pred_sum


In [13]:
[tuple(inter) for inter, weight in interactions]

[('TOT_LVG_AREA_>_3459.5', 'CNTR_DIST_>_63545.699'),
 ('TOT_LVG_AREA_>_3459.5',
  'CNTR_DIST_<=_63545.699',
  'CNTR_DIST_>_33606.201',
  'age_>_21.5',
  'PARCELNO_>_2050130182144.0'),
 ('TOT_LVG_AREA_>_3459.5',
  'CNTR_DIST_<=_63545.699',
  'CNTR_DIST_>_33606.201',
  'age_>_21.5',
  'PARCELNO_<=_2050130182144.0'),
 ('TOT_LVG_AREA_>_3459.5',
  'CNTR_DIST_<=_63545.699',
  'CNTR_DIST_>_33606.201',
  'age_<=_21.5'),
 ('TOT_LVG_AREA_>_3459.5', 'CNTR_DIST_<=_63545.699', 'CNTR_DIST_<=_33606.201'),
 ('TOT_LVG_AREA_<=_3459.5', 'TOT_LVG_AREA_>_2366.5', 'CNTR_DIST_>_62929.6'),
 ('TOT_LVG_AREA_<=_3459.5',
  'TOT_LVG_AREA_>_2366.5',
  'CNTR_DIST_<=_62929.6',
  'TOT_LVG_AREA_>_2673.5',
  'PARCELNO_>_341290352640.0',
  'age_>_22.5'),
 ('TOT_LVG_AREA_<=_3459.5',
  'TOT_LVG_AREA_>_2366.5',
  'CNTR_DIST_<=_62929.6',
  'TOT_LVG_AREA_>_2673.5',
  'PARCELNO_>_341290352640.0',
  'age_<=_22.5'),
 ('TOT_LVG_AREA_<=_3459.5',
  'TOT_LVG_AREA_>_2366.5',
  'CNTR_DIST_<=_62929.6',
  'TOT_LVG_AREA_>_2673.5',
  'PAR

In [14]:
def figs_lm_get_coef(X):
    coefs = []
    for col in X.columns:
        coefs.append(figs_weight_dict[col])
    return np.array(coefs)


In [15]:
figs_lm_coefs = figs_lm_get_coef(figs_lm_train)

NameError: name 'figs_lm_train' is not defined

In [None]:
def solve_lambda(X, y, beta):
    A = X.T @ X
    b = X.T @ y
    #lambda_vec = (b - A @ beta) / beta
    lambdas = []
    for i, lam in enumerate(b- A@beta):
        lambdas.append(lam/beta[i])
    return np.array(lambdas)

lambda_vec = solve_lambda(figs_lm_train.values, y_train.values, figs_lm_coefs)

In [None]:
np.round(lambda_vec, 3)

In [None]:
figs_lm_train = make_figs_lm_df(X_train_new, interactions)

In [None]:
len(pd.Series(fbm.predict(X_train), name = 'figs_preds').value_counts())

In [None]:
np.where(figs_lm_predict(figs_lm_train, figs_weight_dict).to_numpy() != fbm.predict(X_train))

In [None]:
figs_lm_predict(figs_lm_train, figs_weight_dict).to_numpy()[7803], fbm.predict(X_train)[7803]

In [None]:
np.mean(abs(figs_lm_predict(figs_lm_train, figs_weight_dict).to_numpy() - fbm.predict(X_train))< 1)

In [None]:
fbm.predict(X_train)

In [None]:
[tuple([f'{xi[0].feature_names[xi[0].feature]}_<=_{str(xi[0].threshold)}' if xi[1] == 'original' else f'{xi[0].feature_names[xi[0].feature]}_>_{str(xi[0].threshold)}'  for xi in x]) for x in t2]

In [None]:
t2 = [tuple(xi[0]) for x in t for xi in x]

df = pd.DataFrame(columns = sorted([tuple([f'{xi[0].feature_names[xi[0].feature]}_<=_{str(xi[0].threshold)}' if xi[1] == 'original' else f'{xi[0].feature_names[xi[0].feature]}_>_{str(xi[0].threshold)}'  for xi in x]) for x in t2], key = len))
df.columns

In [None]:
figs_paths = sorted([tuple([f'{xi[0].feature_names[xi[0].feature]}_<=_{str(xi[0].threshold)}' if xi[1] == 'original' else f'{xi[0].feature_names[xi[0].feature]}_>_{str(xi[0].threshold)}'  for xi in x]) for x in t2], key = len)
figs_sorted = [sorted(fp) for fp in figs_paths]

In [None]:
ftd_paths = ftd.post_interaction_features
ftd_sorted = [sorted(fp) for fp in ftd_paths]

In [None]:
len(figs_sorted), len(ftd_sorted)

In [None]:
np.sum([o == t for o in figs_sorted for t in ftd_sorted])

In [None]:
t2 = [tuple(xi[0]) for x in fbm.figs_rules for xi in x]

figs_paths = sorted([tuple([f'{xi[0].feature_names[xi[0].feature]}_<=_{str(xi[0].threshold)}' if xi[1] == 'original' else f'{xi[0].feature_names[xi[0].feature]}_>_{str(xi[0].threshold)}'  for xi in x]) for x in t2], key=len)
ftd_paths = ftd.post_interaction_features
                     
list1 = [set(fp) for fp in figs_paths]
list2 = [set(fp) for fp in ftd_paths]
frozensets_list1 = set(frozenset(s) for s in list1)
frozensets_list2 = set(frozenset(s) for s in list2)

# Step 2: Find the intersection of the two sets
common_frozensets = frozensets_list1.intersection(frozensets_list2)

# Step 3: Count the number of elements in the intersection
common_count = len(common_frozensets)

print(f"Number of common sets: {common_count}")


In [None]:
[set(fp) for fp in ftd_paths]

In [None]:
if ftd.pre_interaction_model is not None:
    X_cur = X_train_new[ftd.pre_interaction_features]

poly_features = list(map(lambda s: set(s.split()), ftd.poly.get_feature_names_out(X_cur.columns)))
        
Chi = pd.DataFrame(ftd.poly.transform(X_cur), columns=list(map(lambda f: tuple(f), poly_features))).loc[:, ftd.features]

Chi.drop(columns = [('1',)], inplace=True)

if ftd.re_fit_alpha is not None:
    Chi[('1',)] = 1
    Chi = Chi[np.array([('1',)]+list(ftd.post_interaction_features), dtype=object)]

In [None]:
new_coefs = []
for i in Chi.columns:
    new_coefs.append(np.sum(Chi[i] * y_train)/np.sum(Chi[i]))
new_coefs = np.array(new_coefs)

In [None]:
new_coefs, ftd.post_sparsity_model.coef_

In [None]:
Chi @new_coefs, y_train

In [None]:
r2_score(y_train, Chi @new_coefs)

In [None]:
r2_score(y_train, fbm.figs.predict(X_train)), r2_score(y_train, ftd.predict(X_train_new))

In [None]:
r2_score(y_val, fbm.figs.predict(X_val)), r2_score(y_val, ftd.predict(X_val_new))

In [None]:
r2_score(y_test, fbm.figs.predict(X_test)), r2_score(y_test, ftd.predict(X_test_new))

In [None]:
def process_figs_splits(figs):
    figs_rc = [traverse_paths(t) for t in figs.trees_]
    print(figs_rc)
    #figs_r = [r for r, c in figs_rc]
    figs_rules = [x for xs in figs_rc for x in xs]
    return figs_rules

In [None]:
t = process_figs_splits(figs)
t

In [None]:
ftd = FTDistillRegressorCV(pre_interaction=None, post_max_features=10)
ftd.fit(X_train_new, y_train)

In [None]:
figs = FIGSRegressor(max_rules = 17, max_trees = 10)
figs.fit(X_train_new, y_train, feature_names = list(X_train_new.columns))

In [None]:
len(figs.trees_)

In [None]:
figs_rules = [traverse_paths(t) for t in figs.trees_]

In [None]:
figs_rules

In [None]:
figs_rules = [
    x
    for xs in figs_rules
    for x in xs
]

In [None]:
len(figs_rules)

In [None]:
figs_rules

In [None]:
paths = []
weights = []
for path, weight in figs_rules:
    cur_path = []
    for stop in path:
        cur_path.append(stop.feature_names[stop.feature])
    paths.append(tuple(cur_path))
    weights.append(weight)

In [None]:
'AJKD'.lower()

In [None]:
figs_int = sorted(paths, key=lambda x: (len(x), x[0].lower()))

In [None]:
ftd_int = sorted(list(ftd.post_interaction_features), key=lambda x: (len(x), x[0].lower()))

In [None]:
[sorted(f) for f in figs_int]

In [None]:
[sorted(f) for f in ftd_int]

In [None]:
sum([x == y for x in figs_int for y in ftd_int])

In [None]:
flat_figs_int = [s for t in figs_int for s in t]
flat_ftd_int = [s for t in ftd_int for s in t]
print(f'FIGS unique: {sorted(list(set(flat_figs_int)))}')
print(f'FT Distill unique: {sorted(list(set(flat_ftd_int)))}')

In [None]:
from sklearn.metrics import r2_score

In [None]:
print(r2_score(y_val, ftd.predict(X_val_new)))
print(r2_score(y_val, figs.predict(X_val_new)))

In [None]:
print(r2_score(y_train, ftd.predict(X_train_new)))
print(r2_score(y_train, figs.predict(X_train_new)))

In [None]:
len(sorted(list(set(flat_figs_int))))

In [None]:
len(sorted(list(set(flat_ftd_int))))

In [None]:
len(list(set(flat_figs_int + flat_ftd_int)))

In [None]:
set(flat_figs_int) - set.intersection(set(flat_figs_int), set(flat_ftd_int))

In [None]:
'CNTR_DIST_region2',
 'SPEC_FEAT_VAL_region3',
 'SPEC_FEAT_VAL_region4',
 'SUBCNTR_DI_region2',
 'WATER_DIST_region1',
 'avno60plus_0',
 'structure_quality_4'

In [None]:
len(set(flat_figs_int))

In [None]:
set.intersection(set(flat_figs_int), set(flat_ftd_int))

In [None]:
import seaborn as sns

corr = X_train_new.corr()

# plot the heatmap
sns.heatmap(corr)

plt.title('Correlation Matrix of GMM BM miami_housing Dataset')
plt.savefig('figs/corr.png', bbox_inches='tight')

In [None]:
sns.heatmap(corr > 0.75)
plt.title('Heatmap Correlation Matrix > 0.75 of GMM BM miami_housing Dataset')
plt.savefig('figs/corr_0.75.png', bbox_inches='tight')

In [None]:
corr = [(X_train_new.columns[t[0]], X_train_new.columns[t[1]]) for t in zip(np.where(corr > 0.5)[0], np.where(corr > 0.5)[1]) if t[0] != t[1]]

In [None]:
corr = sorted(corr, key= lambda x: x[0])
corr

In [None]:
np.where(abs(corr) > 0.6)[0]

In [None]:
import pyreadr

result = pyreadr.read_r('/home/mattyshen/interpretableDistillation/interpretDistill/data/enhancer.Rdata') # also works for Rds

# done! let's see what we got

In [None]:
t = + 0
for col in result['X']:
    uv = result['X'][col].unique()
    t += len(uv)
    

In [None]:
result.keys()

In [None]:
t