In [7]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Oct 31 20:44:46 2023

@author: michal
"""

import numpy as np
import pandas as pd
import umap
import matplotlib.pyplot as plt
from scipy.spatial import ConvexHull, Delaunay
from pycaret.regression import setup, compare_models
from sklearn import datasets
from scipy.stats.qmc import PoissonDisk

iris = datasets.load_iris()
data = iris.data, iris.feature_names
df = pd.DataFrame(iris.data, columns=iris.feature_names)


mapper = umap.UMAP(n_components=3, metric="minkowski", n_neighbors=20, min_dist=1)
mapping = mapper.fit_transform(df)



hull = ConvexHull(mapping)

for simplex in hull.simplices:
    plt.plot(mapping[simplex, 0], mapping[simplex, 1], 'k-')

def poisson_disc_sampling(points, radius=0.1):
    # Generate the Poisson disk samples within the convex hull
    tri = Delaunay(points)
    lb, ub = tri.min_bound, tri.max_bound
    size = np.abs(ub - lb)
    poisson = PoissonDisk(d=3, radius=radius)
    unit_samples = poisson.fill_space()
    samples = lb + size * unit_samples

    # Select the points that are within the convex hull
    simplex = tri.find_simplex(samples)
    samples = samples[simplex >= 0]
    return samples


# Generate random points in 3D space
points = mapping

# Compute the convex hull
hull = ConvexHull(points)

# Compute the Delaunay triangulation of the hull
tri = Delaunay(points)

# generate poisson disk samples within the convex hull
samples = poisson_disc_sampling(points, radius=0.1)

inverse_samples = mapper.inverse_transform(samples)
data_new = pd.concat([df, pd.DataFrame(inverse_samples, columns=df.columns)]).reset_index(drop=True)

print(data_new.shape, df.shape)

setup(df,
             target='sepal width (cm)',
             preprocess=True,
             feature_selection=False,
             feature_selection_method="sequential",
             n_features_to_select=8,
             normalize=False,
             normalize_method="robust",
             remove_multicollinearity=True,
             multicollinearity_threshold=0.95,
             low_variance_threshold=0.1,
             pca=False,
             pca_method="linear",
             pca_components=6,
             profile=False)

best = compare_models()


(233, 4) (150, 4)


Unnamed: 0,Description,Value
0,Session id,2417
1,Target,sepal width (cm)
2,Target type,Regression
3,Original data shape,"(150, 4)"
4,Transformed data shape,"(150, 3)"
5,Transformed train set shape,"(105, 3)"
6,Transformed test set shape,"(45, 3)"
7,Numeric features,3
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
knn,K Neighbors Regressor,0.2265,0.0835,0.2856,0.4261,0.0712,0.0754,0.006
ada,AdaBoost Regressor,0.2311,0.0846,0.2877,0.4171,0.0711,0.0767,0.01
rf,Random Forest Regressor,0.2393,0.0902,0.2958,0.373,0.0727,0.0785,0.024
et,Extra Trees Regressor,0.247,0.0994,0.3103,0.3235,0.0759,0.081,0.02
xgboost,Extreme Gradient Boosting,0.2442,0.0998,0.3082,0.3161,0.0756,0.0802,0.008
gbr,Gradient Boosting Regressor,0.2461,0.0992,0.3105,0.3043,0.0764,0.0813,0.007
catboost,CatBoost Regressor,0.2513,0.1039,0.3164,0.2741,0.0778,0.0829,0.033
lightgbm,Light Gradient Boosting Machine,0.2597,0.111,0.3265,0.2447,0.0807,0.0867,0.006
ridge,Ridge Regression,0.2902,0.1328,0.3607,0.0705,0.0899,0.0976,0.005
huber,Huber Regressor,0.2903,0.1328,0.3604,0.0696,0.09,0.0978,0.005


Processing:   0%|          | 0/85 [00:00<?, ?it/s]

In [9]:
setup(data_new,
             target='sepal width (cm)',
             preprocess=True,
             feature_selection=False,
             feature_selection_method="sequential",
             n_features_to_select=20,
             normalize=False,
             normalize_method="robust",
             remove_multicollinearity=True,
             multicollinearity_threshold=0.95,
             low_variance_threshold=0.1,
             pca=False,
             pca_method="linear",
             pca_components=6,
             profile=False)

best = compare_models()


Unnamed: 0,Description,Value
0,Session id,649
1,Target,sepal width (cm)
2,Target type,Regression
3,Original data shape,"(233, 4)"
4,Transformed data shape,"(233, 3)"
5,Transformed train set shape,"(163, 3)"
6,Transformed test set shape,"(70, 3)"
7,Numeric features,3
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
knn,K Neighbors Regressor,0.2063,0.0728,0.2606,0.5587,0.0656,0.0704,0.006
ada,AdaBoost Regressor,0.2245,0.0789,0.2738,0.522,0.0685,0.0762,0.008
rf,Random Forest Regressor,0.2352,0.0893,0.2934,0.4362,0.0733,0.0798,0.026
lightgbm,Light Gradient Boosting Machine,0.2298,0.0833,0.283,0.4312,0.0711,0.0778,0.006
catboost,CatBoost Regressor,0.2443,0.0941,0.2997,0.416,0.0746,0.0824,0.051
et,Extra Trees Regressor,0.2422,0.0945,0.302,0.4086,0.0761,0.0825,0.022
gbr,Gradient Boosting Regressor,0.2388,0.0922,0.2965,0.4084,0.0739,0.0809,0.008
xgboost,Extreme Gradient Boosting,0.2718,0.1217,0.3411,0.2121,0.0857,0.0918,0.007
huber,Huber Regressor,0.2916,0.137,0.3614,0.2108,0.092,0.1017,0.006
ridge,Ridge Regression,0.2919,0.1375,0.3621,0.2075,0.0921,0.1016,0.005


Processing:   0%|          | 0/85 [00:00<?, ?it/s]

In [3]:

df.columns

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')