In [2]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Oct 31 20:44:46 2023

@author: michal
"""

import numpy as np
import pandas as pd
import umap
import matplotlib.pyplot as plt
from scipy.spatial import ConvexHull, Delaunay
from pycaret.regression import setup, compare_models
from sklearn import datasets
from scipy.stats.qmc import PoissonDisk


iris = datasets.load_iris()
data = iris.data, iris.feature_names
df = pd.DataFrame(iris.data, columns=iris.feature_names)



mapper = umap.UMAP(n_components=3, metric="minkowski", n_neighbors=20, min_dist=1.0)
mapping = mapper.fit_transform(df)



hull = ConvexHull(mapping)

for simplex in hull.simplices:
    plt.plot(mapping[simplex, 0], mapping[simplex, 1], 'k-')

def poisson_disc_sampling(points, radius=0.1):
    # Generate the Poisson disk samples within the convex hull
    tri = Delaunay(points)
    lb, ub = tri.min_bound, tri.max_bound
    size = np.abs(ub - lb)
    poisson = PoissonDisk(d=3, radius=radius)
    unit_samples = poisson.fill_space()
    samples = lb + size * unit_samples

    # Select the points that are within the convex hull
    simplex = tri.find_simplex(samples)
    samples = samples[simplex >= 0]
    return samples


# Generate random points in 3D space
points = mapping

# Compute the convex hull
hull = ConvexHull(points)

# Compute the Delaunay triangulation of the hull
tri = Delaunay(points)

# generate poisson disk samples within the convex hull
samples = poisson_disc_sampling(points, radius=0.1)

inverse_samples = mapper.inverse_transform(samples)
data_new = pd.concat([df, pd.DataFrame(inverse_samples, columns=df.columns)]).reset_index(drop=True)

print(data_new.shape, df.shape)

setup(df,
             target="petal width (cm)",
             preprocess=True,
             feature_selection=False,
             feature_selection_method="sequential",
             n_features_to_select=20,
             normalize=False,
             normalize_method="robust",
             remove_multicollinearity=True,
             multicollinearity_threshold=0.95,
             low_variance_threshold=0.1,
             pca=False,
             pca_method="linear",
             pca_components=6,
             profile=False)

best = compare_models()


(207, 4) (150, 4)


Unnamed: 0,Description,Value
0,Session id,1767
1,Target,petal width (cm)
2,Target type,Regression
3,Original data shape,"(150, 4)"
4,Transformed data shape,"(150, 4)"
5,Transformed train set shape,"(105, 4)"
6,Transformed test set shape,"(45, 4)"
7,Numeric features,3
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
knn,K Neighbors Regressor,0.15,0.0351,0.1837,0.9376,0.0808,0.2165,0.005
br,Bayesian Ridge,0.1442,0.0359,0.1828,0.9314,0.0794,0.2037,0.005
lr,Linear Regression,0.144,0.0359,0.1827,0.9313,0.0801,0.2056,0.26
ridge,Ridge Regression,0.1453,0.0362,0.1837,0.9313,0.0784,0.2009,0.005
lar,Least Angle Regression,0.144,0.0359,0.1827,0.9313,0.0801,0.2056,0.005
huber,Huber Regressor,0.1456,0.0363,0.1845,0.9307,0.0812,0.2059,0.006
lightgbm,Light Gradient Boosting Machine,0.1468,0.0372,0.1859,0.9302,0.0786,0.1902,0.054
et,Extra Trees Regressor,0.154,0.0393,0.1914,0.9293,0.0807,0.2041,0.02
rf,Random Forest Regressor,0.1567,0.041,0.1968,0.9264,0.0825,0.2102,0.024
ada,AdaBoost Regressor,0.1546,0.041,0.1981,0.9236,0.0826,0.1976,0.007


Processing:   0%|          | 0/85 [00:00<?, ?it/s]

In [3]:
setup(data_new,
             target="petal width (cm)",
             preprocess=True,
             feature_selection=False,
             feature_selection_method="sequential",
             n_features_to_select=20,
             normalize=False,
             normalize_method="robust",
             remove_multicollinearity=True,
             multicollinearity_threshold=0.95,
             low_variance_threshold=0.1,
             pca=False,
             pca_method="linear",
             pca_components=6,
             profile=False)

best = compare_models()


Unnamed: 0,Description,Value
0,Session id,3497
1,Target,petal width (cm)
2,Target type,Regression
3,Original data shape,"(207, 4)"
4,Transformed data shape,"(207, 4)"
5,Transformed train set shape,"(144, 4)"
6,Transformed test set shape,"(63, 4)"
7,Numeric features,3
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
ada,AdaBoost Regressor,0.1273,0.0297,0.1665,0.9383,0.0753,0.2077,0.009
knn,K Neighbors Regressor,0.1331,0.0314,0.1732,0.9339,0.0783,0.2109,0.005
et,Extra Trees Regressor,0.134,0.0329,0.1799,0.9333,0.0809,0.2134,0.02
rf,Random Forest Regressor,0.1376,0.0334,0.1807,0.9325,0.0819,0.2222,0.025
lightgbm,Light Gradient Boosting Machine,0.1344,0.0342,0.1817,0.9293,0.0807,0.1999,0.006
huber,Huber Regressor,0.1364,0.0363,0.1868,0.9253,0.0849,0.2144,0.006
gbr,Gradient Boosting Regressor,0.1423,0.0374,0.191,0.9248,0.0845,0.2207,0.007
ridge,Ridge Regression,0.1379,0.0371,0.1885,0.9235,0.0841,0.2181,0.005
catboost,CatBoost Regressor,0.1473,0.0382,0.1928,0.9234,0.09,0.2557,0.064
br,Bayesian Ridge,0.1373,0.037,0.1882,0.9234,0.0853,0.2205,0.005


Processing:   0%|          | 0/85 [00:00<?, ?it/s]