In [12]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Oct 31 20:44:46 2023

@author: michal
"""

import numpy as np
import pandas as pd
import umap
import matplotlib.pyplot as plt
from scipy.spatial import ConvexHull, Delaunay
from pycaret.regression import setup, compare_models
from sklearn import datasets
from scipy.stats.qmc import PoissonDisk

# Load the Iris dataset bundled with scikit-learn and wrap the feature matrix
# in a DataFrame whose columns carry the feature names.
iris = datasets.load_iris()
# (feature matrix, feature names) pair.
# NOTE(review): `data` is not referenced anywhere else in the visible cells.
data = (iris.data, iris.feature_names)
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)


class DataMapper:
    """Augment a numeric DataFrame with synthetic rows sampled in a UMAP embedding.

    The data are embedded with UMAP, Poisson-disk samples are drawn inside the
    convex hull of the embedded points, and those samples are mapped back to
    the original feature space with the fitted mapper's ``inverse_transform``.

    Parameters
    ----------
    n_components : int
        Dimensionality of the UMAP embedding.
    metric : str
        Distance metric passed to ``umap.UMAP``.
    n_neighbors : int
        ``n_neighbors`` passed to ``umap.UMAP``.
    min_dist : float
        ``min_dist`` passed to ``umap.UMAP``.
    radius : float
        Poisson-disk radius. It is applied in the unit hypercube, i.e. before
        the samples are scaled to the bounding box of the embedding.
    random_state : int or None
        Optional seed forwarded to both UMAP and the Poisson-disk sampler.
        ``None`` (the default) leaves both unseeded, as before.
    """

    def __init__(self, n_components=3, metric="minkowski", n_neighbors=20, min_dist=0.5, radius=0.1,
                 random_state=None):
        self.mapper = umap.UMAP(
            n_components=n_components,
            metric=metric,
            n_neighbors=n_neighbors,
            min_dist=min_dist,
            random_state=random_state,
        )
        self.radius = radius
        self.random_state = random_state

    def process_data(self, df):
        """Return ``df`` with synthetic rows appended.

        Parameters
        ----------
        df : pandas.DataFrame
            Data to fit the mapper on; its columns are reused for the
            synthetic rows.

        Returns
        -------
        pandas.DataFrame
            The original rows followed by the inverse-transformed samples,
            with a fresh 0..n-1 index.
        """
        embedding = self.mapper.fit_transform(df)

        # Poisson-disk samples restricted to the convex hull of the embedding.
        samples = self.poisson_disc_sampling(embedding)

        # Nothing landed inside the hull (e.g. radius too large): there is
        # nothing to invert, so hand back the original rows only.
        if len(samples) == 0:
            return df.reset_index(drop=True)

        # Map the embedded samples back to the original feature space.
        inverse_samples = self.mapper.inverse_transform(samples)
        synthetic = pd.DataFrame(inverse_samples, columns=df.columns)

        return pd.concat([df, synthetic]).reset_index(drop=True)

    def poisson_disc_sampling(self, points):
        """Draw Poisson-disk samples inside the convex hull of ``points``.

        Parameters
        ----------
        points : array-like of shape (n_points, n_dims)
            Points whose convex hull bounds the sampling region.

        Returns
        -------
        numpy.ndarray of shape (n_samples, n_dims)
            Samples that fall inside the Delaunay triangulation of ``points``.
        """
        points = np.asarray(points)
        tri = Delaunay(points)
        lb, ub = tri.min_bound, tri.max_bound

        # The sampler dimension must follow the embedding, not a fixed 3.
        poisson = PoissonDisk(d=points.shape[1], radius=self.radius, seed=self.random_state)
        unit_samples = poisson.fill_space()

        # Scale the unit-hypercube samples to the bounding box of the points.
        samples = lb + (ub - lb) * unit_samples

        # find_simplex returns -1 for points outside the triangulation, which
        # covers exactly the convex hull of the input points.
        inside = tri.find_simplex(samples) >= 0
        return samples[inside]

# Example usage:
# Create an instance of the DataMapper class
mapper = DataMapper()

# Provide your DataFrame to the process_data method
data_new = mapper.process_data(df)

print(data_new.shape, df.shape)

# Baseline experiment: compare regressors on the original Iris data.
# session_id pins PyCaret's random seed; without it every run draws a new
# train/test split and the leaderboard cannot be reproduced. The value is the
# session id of the recorded run shown in this cell's output.
# Note: feature_selection, normalize and pca are switched off, so their
# companion *_method / n_features_to_select / pca_components values are inert.
setup(
    df,
    target='sepal width (cm)',
    preprocess=True,
    feature_selection=False,
    feature_selection_method="sequential",
    n_features_to_select=8,
    normalize=False,
    normalize_method="robust",
    remove_multicollinearity=True,
    multicollinearity_threshold=0.95,
    low_variance_threshold=0.1,
    pca=False,
    pca_method="linear",
    pca_components=6,
    profile=False,
    session_id=2790,
)

best = compare_models()


(233, 4) (150, 4)


Unnamed: 0,Description,Value
0,Session id,2790
1,Target,sepal width (cm)
2,Target type,Regression
3,Original data shape,"(150, 4)"
4,Transformed data shape,"(150, 3)"
5,Transformed train set shape,"(105, 3)"
6,Transformed test set shape,"(45, 3)"
7,Numeric features,3
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
knn,K Neighbors Regressor,0.2262,0.0849,0.284,0.4922,0.0712,0.0767,0.005
rf,Random Forest Regressor,0.2237,0.0861,0.2861,0.47,0.0713,0.0751,0.024
ada,AdaBoost Regressor,0.239,0.0929,0.2941,0.4455,0.0738,0.0804,0.01
gbr,Gradient Boosting Regressor,0.221,0.0891,0.289,0.4415,0.0721,0.0746,0.007
lightgbm,Light Gradient Boosting Machine,0.2428,0.0976,0.3011,0.4192,0.0747,0.0821,0.006
catboost,CatBoost Regressor,0.2371,0.0985,0.3046,0.4022,0.0758,0.0794,0.033
et,Extra Trees Regressor,0.2411,0.1057,0.3161,0.346,0.0788,0.0811,0.019
xgboost,Extreme Gradient Boosting,0.2606,0.1189,0.3353,0.2674,0.084,0.088,0.007
dt,Decision Tree Regressor,0.2728,0.1266,0.3448,0.1664,0.0853,0.0912,0.005
ridge,Ridge Regression,0.3081,0.155,0.3851,0.1085,0.0968,0.1048,0.005


Processing:   0%|          | 0/85 [00:00<?, ?it/s]

In [13]:
# Augmented experiment: same model comparison, run on the original rows plus
# the synthetic rows produced by DataMapper.
# session_id pins PyCaret's random seed so the split and leaderboard are
# reproducible for a given `data_new`; the value is the session id of the
# recorded run shown in this cell's output.
# NOTE(review): the hold-out set is drawn from `data_new`, so it presumably
# contains synthetic rows too; confirm before comparing these metrics with
# the baseline run on `df`.
setup(
    data_new,
    target='sepal width (cm)',
    preprocess=True,
    feature_selection=False,
    feature_selection_method="sequential",
    n_features_to_select=20,
    normalize=False,
    normalize_method="robust",
    remove_multicollinearity=True,
    multicollinearity_threshold=0.95,
    low_variance_threshold=0.1,
    pca=False,
    pca_method="linear",
    pca_components=6,
    profile=False,
    session_id=5110,
)

best = compare_models()


Unnamed: 0,Description,Value
0,Session id,5110
1,Target,sepal width (cm)
2,Target type,Regression
3,Original data shape,"(233, 4)"
4,Transformed data shape,"(233, 3)"
5,Transformed train set shape,"(163, 3)"
6,Transformed test set shape,"(70, 3)"
7,Numeric features,3
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
knn,K Neighbors Regressor,0.1901,0.0592,0.2392,0.4885,0.0602,0.0643,0.006
lightgbm,Light Gradient Boosting Machine,0.1959,0.0661,0.2523,0.4303,0.0633,0.0664,0.007
gbr,Gradient Boosting Regressor,0.194,0.0641,0.2497,0.4176,0.0627,0.0652,0.007
ada,AdaBoost Regressor,0.1987,0.0645,0.2517,0.414,0.063,0.0666,0.008
catboost,CatBoost Regressor,0.2006,0.0678,0.2558,0.3922,0.0638,0.0672,0.055
rf,Random Forest Regressor,0.2013,0.0672,0.2557,0.3858,0.0643,0.0677,0.026
et,Extra Trees Regressor,0.21,0.075,0.2709,0.3128,0.0678,0.0705,0.023
xgboost,Extreme Gradient Boosting,0.2197,0.0825,0.2836,0.2335,0.0711,0.0734,0.008
huber,Huber Regressor,0.2544,0.1155,0.3334,0.0507,0.0824,0.0848,0.005
ridge,Ridge Regression,0.255,0.1154,0.333,0.0461,0.0825,0.0854,0.005


Processing:   0%|          | 0/85 [00:00<?, ?it/s]

In [3]:

# Display the column labels of the Iris frame; the `target=` passed to setup()
# must be one of these names.
df.columns

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')