In [1]:
import glob, os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from functools import partial
from tqdm import tqdm

from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score

from ChemTDA import VariancePersist

Read the structure files into `VariancePersist` to generate persistence images (PIs), then try to predict the energies from the PIs with regression models:

In [2]:
# Persistence-image settings, collected in one place and forwarded unchanged
# to every VariancePersist call.
# NOTE(review): pixelx/pixely presumably set the image resolution and myspread
# the kernel spread; confirm against the ChemTDA documentation.
PI_hyp = dict(
    pixelx=50,
    pixely=50,
    myspread=2,
    showplot=False,
    max_dim=2,
)

# Shorthand: MakePI(path) is VariancePersist(path, **PI_hyp).
MakePI = partial(VariancePersist, **PI_hyp)

In [3]:
# Locate every structure file and load the table of target energies.
all_files = glob.glob(os.path.join('..', 'data', 'initial_data', 'STRUCTS', '*'))
energies = pd.read_excel(os.path.join('..', 'data', 'initial_data', 'energies.xlsx'), index_col = 0)

# Targets: first data column of the spreadsheet, kept in spreadsheet row order.
# NOTE(review): y is never reordered, whereas X is sorted by file number below,
# so this presumably relies on the spreadsheet rows already being in ascending
# structure-number order -- confirm against energies.xlsx.
y = energies.iloc[:,0].to_numpy()

X = []
keys = []

# One persistence image per structure file (this is the slow step).
for f in tqdm(all_files):
    # Characters [7:-4] of the file name are parsed as the structure number
    # (assumes a 7-character prefix and a 4-character extension -- TODO confirm).
    keys.append(int(os.path.basename(f)[7:-4]))
    X.append(MakePI(f))

# glob returns paths in arbitrary order; sort by structure number to keep
# the labeling consistent:
sorting_args = np.argsort(keys)
X = [X[i] for i in sorting_args]
keys = [keys[i] for i in sorting_args]

# Fail fast here rather than inside a model fit if features and targets
# cannot be paired one-to-one.
if len(X) != len(y):
    raise ValueError(
        f"Found {len(X)} structure files but {len(y)} energies; "
        "features and targets cannot be paired."
    )

100%|██████████| 1042/1042 [00:29<00:00, 35.50it/s]


In [4]:
# Sanity check: shape of the first structure's feature vector.
print(np.shape(X[0]))

(2500,)


In [5]:
# Baseline 1: support vector regression with scikit-learn's default settings.
svm = SVR()
# Cross-validated R^2 using cross_val_score's default splitting.
cross_val_score(estimator=svm, X=X, y=y, scoring='r2')

array([-0.15556635, -0.2518944 , -0.2000001 , -0.25118516, -0.09170254])

In [6]:
# Baseline 2: gradient-boosted trees with default settings.
# NOTE: despite the alias, this is scikit-learn's GradientBoostingRegressor,
# not the separate `xgboost` package.
from sklearn.ensemble import GradientBoostingRegressor as XGBoost
xgb = XGBoost()
# Cross-validated R^2 using cross_val_score's default splitting.
cross_val_score(estimator=xgb, X=X, y=y, scoring='r2')

array([-0.05020406, -0.10528115, -0.10594959, -0.04302611,  0.01078493])

In [9]:
# Baseline 3: kernel ridge regression with default settings.
from sklearn.kernel_ridge import KernelRidge
krr = KernelRidge()
# Cross-validated R^2 using cross_val_score's default splitting.
cross_val_score(estimator=krr, X=X, y=y, scoring='r2')

array([-0.12802338, -0.96834224, -0.86229737, -0.94113888, -0.508422  ])

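Gradient boosting (scikit-learn's `GradientBoostingRegressor`, aliased here as `XGBoost`), SVR, and kernel ridge regression all do fairly terribly on the PIs: almost every cross-validated $R^2$ is negative, i.e. worse than a constant prediction, with kernel ridge the worst of the three.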