In [37]:
import yaml
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from slmvp import SLMVPTrain, SLMVP_transform
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb
from datasets import coil2000

data = coil2000()

X_train, y_train = data.train[:, :-1], data.train[:, -1]
# Standardize
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# y_train = scaler.fit_transform(y_train.reshape(-1, 1))

"""
rank: 5, 10, 50, 100, 200
typeK: radial, linear, polynomial
    if typeK == radial
        gamma: 0.01, 0.1, 1, 10, 1/(X_train.shape[0] * X_train.var())
    if typeK == linear
        polyValue: 5
"""
rank = [x for x in [5, 10, 50, 100, 200] if x <= X_train.shape[0]]
typeK = ['radial', 'linear', 'polynomial']  # polynomial
gamma = [0.01, 0.1, 1, 10, 1/(X_train.shape[0] * X_train.var())]
poly_order = 5

kernel_sett = [*[(x, y) for x in ['radial']
                 for y in gamma], *[(x, None) for x in typeK[1:]]]
settings = [(x, *y) for x in rank for y in kernel_sett]


settings = [(3, 'polynomial', None)]
for sett in settings:
    print(sett)
    BAux, Sx = SLMVPTrain(X=X_train.T, Y=y_train,
                          rank=sett[0],
                          typeK=sett[1],
                          gammaX=sett[2],
                          gammaY=sett[2],
                          polyValue=poly_order)

(3, 'polynomial', None)


## Explainability

In [53]:
from lol import LOL

lmao = LOL(n_components=4, svd_solver='full')
lmao.fit(X_train, y_train)
P_data = lmao.transform(X_train).T

# Load the data into a Pandas df
df = pd.DataFrame(X_train, columns=data.col_names[:-1])

# Add the principal components as columns
for i in range(len(P_data)):
    df['prin'+str(i+1)] = P_data[i]

# Correlations between the original data and each principal component
df_corr_lol = df.corr().iloc[:len(X_train[0]), len(X_train[0]):]
#df_corr.to_csv('corr_lol.csv')

In [51]:
df_corr

Unnamed: 0,prin1,prin2,prin3
"Subtype: High Income, expensive child",0.257302,0.276555,0.137127
Subtype: Very Important Provincials,0.084625,0.096432,0.081903
Subtype: High status seniors,0.104406,0.101591,0.121569
Subtype: Affluent senior apartments,0.034910,0.029378,0.085628
Subtype: Mixed seniors,-0.016149,-0.013272,0.047547
...,...,...,...
Number of surfboard policies,0.020596,0.026239,0.000061
Number of boat policies,0.033813,0.022800,-0.001129
Number of bicycle policies,0.028787,0.040841,0.011462
Number of property insurance policies,0.029670,0.020944,0.000792


In [54]:
from sklearn.decomposition import PCA

pca_model = PCA(3)
P_data = pca_model.fit_transform(data.train).T

# Load the data
df = pd.DataFrame(data.train, columns=data.col_names)

# Add the principal components as columns
for i in range(len(P_data)):
    df['prin'+str(i+1)] = P_data[i]

# Correlations between the original data and each principal component
df_corr_pca = df.corr().iloc[:len(data.train[0]), len(data.train[0]):]
#df_corr.to_csv('corr_pca.csv')

In [35]:
df_corr

Unnamed: 0,prin1,prin2,prin3
"Subtype: High Income, expensive child",-0.276510,0.137197,0.012826
Subtype: Very Important Provincials,-0.096329,0.081930,-0.010750
Subtype: High status seniors,-0.101735,0.121589,0.059316
Subtype: Affluent senior apartments,-0.029495,0.085631,0.012487
Subtype: Mixed seniors,0.013345,0.047548,0.030715
...,...,...,...
Number of boat policies,-0.023034,-0.001133,-0.003799
Number of bicycle policies,-0.040669,0.011480,0.011576
Number of property insurance policies,-0.021120,0.000790,-0.020828
Number of social security insurance policies,-0.061990,0.002911,-0.066642


In [55]:
# Get the principal components
P_data = SLMVP_transform(BAux.T, X_train.T)

# Load the data into a Pandas df
df = pd.DataFrame(X_train, columns=data.col_names[:-1])

# Add the principal components as columns
for i in range(len(P_data)):
    df['prin'+str(i+1)] = P_data[i]

# Correlations between the original data and each principal component
df_corr_slmvp = df.corr().iloc[:len(X_train[0]), len(X_train[0]):]
# df_corr.to_csv('corr_slmvp.csv')

# def highlight_cells(val):
#     condition = abs(val) >= 0.4
#     color = 'yellow' if condition else ''
#     font_color = 'black' if condition else ''
#     return 'background-color: {}; color: {}'.format(color, font_color)

# df_corr.style.applymap(highlight_cells)

In [60]:
df_corr_tchnq = pd.DataFrame()
df_corr_tchnq['prin1_lol'] = df_corr_lol['prin1']
df_corr_tchnq['prin1_slmvp'] = df_corr_slmvp['prin1']
df_corr_tchnq['prin1_pca'] = df_corr_pca['prin1']
df_corr_tchnq.corr()

Unnamed: 0,prin1_lol,prin1_slmvp,prin1_pca
prin1_lol,1.0,0.376467,-0.983807
prin1_slmvp,0.376467,1.0,-0.24842
prin1_pca,-0.983807,-0.24842,1.0
