In [118]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Oct 31 20:44:46 2023

@author: michal
"""

import numpy as np
import pandas as pd
import umap
import matplotlib.pyplot as plt
from scipy.spatial import ConvexHull, Delaunay
from pycaret.regression import setup, compare_models
from sklearn import datasets
from scipy.stats.qmc import PoissonDisk
from sklearn.preprocessing import StandardScaler


df = pd.read_excel("/Users/michal/Library/CloudStorage/OneDrive-UniversityofGdansk/OneDrive - University of Gdansk (for Students)/Jakub Rudzinski/dane_do_modelowania/_RDKit_2D.xlsx")
y = pd.read_excel("/Users/michal/Library/CloudStorage/OneDrive-UniversityofGdansk/OneDrive - University of Gdansk (for Students)/Jakub Rudzinski/dane_do_modelowania/lumo.xlsx")
df.drop(['smiles'], axis=1, inplace=True)
y.drop(y.columns[[0,1]], axis=1, inplace=True)
df = pd.concat([y, df], axis=1)



class DataMapper:
    def __init__(self, n_components=3, metric="minkowski", n_neighbors=5, min_dist=0.5, radius=0.1):
        self.mapper = umap.UMAP(n_components=n_components, metric=metric, n_neighbors=n_neighbors, min_dist=min_dist)
        self.radius = radius

    def process_data(self, df):
        mapping = self.mapper.fit_transform(df)
        hull = ConvexHull(mapping)
        
        # Generate random points in 3D space
        points = mapping

        # Compute the Delaunay triangulation of the hull
        tri = Delaunay(points)

        # Generate Poisson disk samples within the convex hull
        samples = self.poisson_disc_sampling(points)

        # Inverse transform the samples
        inverse_samples = self.mapper.inverse_transform(samples)

        # Concatenate the original DataFrame with the inverse samples
        data_new = pd.concat([df, pd.DataFrame(inverse_samples, columns=df.columns)]).reset_index(drop=True)

        return data_new

    def poisson_disc_sampling(self, points):
        # Generate the Poisson disk samples within the convex hull
        tri = Delaunay(points)
        lb, ub = tri.min_bound, tri.max_bound
        size = np.abs(ub - lb)
        poisson = PoissonDisk(d=3, radius=self.radius)
        unit_samples = poisson.fill_space()
        samples = lb + size * unit_samples

        # Select the points that are within the convex hull
        simplex = tri.find_simplex(samples)
        samples = samples[simplex >= 0]
        return samples

# Example usage:
# Create an instance of the DataMapper class
mapper = DataMapper(n_components=3, metric="minkowski", n_neighbors=5, min_dist=0.5, radius=0.1)

# Provide your DataFrame to the process_data method
data_new = mapper.process_data(df)


print(data_new.shape, df.shape)

setup(df,
             target='lumo_energy [eV]',
             preprocess=True,
             feature_selection=False,
             feature_selection_method="sequential",
             n_features_to_select=20,
             normalize=False,
             normalize_method="robust",
             remove_multicollinearity=True,
             multicollinearity_threshold=0.95,
             low_variance_threshold=0.1,
             pca=False,
             pca_method="linear",
             pca_components=6,
             profile=False)

best = compare_models()


(423, 197) (270, 197)


Unnamed: 0,Description,Value
0,Session id,7485
1,Target,lumo_energy [eV]
2,Target type,Regression
3,Original data shape,"(270, 197)"
4,Transformed data shape,"(270, 77)"
5,Transformed train set shape,"(189, 77)"
6,Transformed test set shape,"(81, 77)"
7,Numeric features,196
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,0.2688,0.2438,0.4464,0.8202,0.1808,0.3161,0.039
catboost,CatBoost Regressor,0.2855,0.2549,0.463,0.8107,0.1858,0.3124,0.419
lightgbm,Light Gradient Boosting Machine,0.301,0.2596,0.4647,0.8065,0.1896,0.3294,0.016
gbr,Gradient Boosting Regressor,0.2974,0.2587,0.4645,0.7915,0.1743,0.3729,0.033
rf,Random Forest Regressor,0.3107,0.2852,0.4855,0.7837,0.1996,0.3434,0.048
xgboost,Extreme Gradient Boosting,0.3017,0.2852,0.4967,0.7706,0.1807,0.3497,0.035
ada,AdaBoost Regressor,0.3823,0.3056,0.518,0.7566,0.2031,0.4303,0.028
omp,Orthogonal Matching Pursuit,0.4525,0.4419,0.6262,0.6714,0.2499,0.5243,0.012
dt,Decision Tree Regressor,0.3971,0.4973,0.6432,0.5981,0.2224,0.5252,0.013
dummy,Dummy Regressor,0.8992,1.3889,1.1625,-0.0622,0.3307,1.1667,0.011


Processing:   0%|          | 0/85 [00:00<?, ?it/s]

In [121]:
setup(data_new,
             target='lumo_energy [eV]',
             preprocess=True,
             feature_selection=False,
             feature_selection_method="sequential",
             n_features_to_select=20,
             normalize=False,
             normalize_method="robust",
             remove_multicollinearity=True,
             multicollinearity_threshold=0.95,
             low_variance_threshold=0.1,
             pca=False,
             pca_method="linear",
             pca_components=6,
             profile=False)

best = compare_models()


Unnamed: 0,Description,Value
0,Session id,8442
1,Target,lumo_energy [eV]
2,Target type,Regression
3,Original data shape,"(321, 197)"
4,Transformed data shape,"(321, 81)"
5,Transformed train set shape,"(224, 81)"
6,Transformed test set shape,"(97, 81)"
7,Numeric features,196
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,0.3263,0.3188,0.5107,0.7765,0.192,0.669,0.809
gbr,Gradient Boosting Regressor,0.3277,0.3124,0.5108,0.7716,0.1958,0.5235,0.041
et,Extra Trees Regressor,0.3258,0.3379,0.5263,0.7609,0.2041,1.2198,0.049
rf,Random Forest Regressor,0.3445,0.3456,0.5318,0.7551,0.1978,0.9127,0.064
ada,AdaBoost Regressor,0.3941,0.3742,0.5603,0.7293,0.2113,0.7538,0.029
lightgbm,Light Gradient Boosting Machine,0.353,0.392,0.5668,0.7182,0.2188,0.9634,0.017
xgboost,Extreme Gradient Boosting,0.3699,0.4195,0.5857,0.6996,0.2066,1.0651,0.041
omp,Orthogonal Matching Pursuit,0.4825,0.5209,0.698,0.6205,0.2315,0.6049,0.012
dt,Decision Tree Regressor,0.4071,0.5182,0.6801,0.6086,0.2301,2.0251,0.012
lasso,Lasso Regression,0.5027,0.5643,0.7169,0.602,0.2673,1.1952,0.012


Processing:   0%|          | 0/85 [00:00<?, ?it/s]

In [120]:
# Example usage:
# Create an instance of the DataMapper class
mapper = DataMapper(n_components=3, metric="minkowski", n_neighbors=3, min_dist=0.5, radius=0.15)

# Provide your DataFrame to the process_data method
data_new = mapper.process_data(df)
data_new


Unnamed: 0,lumo_energy [eV],MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,0.061770,10.300428,-0.200509,10.300428,0.200509,0.421378,146.210000,130.082000,146.117555,60.000000,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0
1,-0.219868,10.388615,-4.269583,10.388615,0.177685,0.457566,388.554000,351.258000,388.220103,146.000000,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0
2,-1.148593,9.337963,-1.435185,9.337963,1.134259,0.564686,252.314000,232.154000,252.147392,100.000000,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0
3,1.015802,9.337963,-1.435185,9.337963,1.134259,0.396484,190.243000,172.099000,190.131742,78.000000,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0
4,0.947501,10.222096,-0.903061,10.222096,0.234381,0.049514,652.190000,562.478000,651.689331,278.000000,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,25.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
316,-1.336009,9.308488,-5.855314,9.364276,1.088153,0.419697,248.825485,228.505936,248.473495,84.706841,...,0.0,0.004584,0.0,0.0,0.0,0.0,-0.007670,0.0,-0.004509,0.0
317,-2.569245,8.847991,-6.465350,8.693183,1.296118,0.562771,254.197418,237.664429,253.920792,89.594917,...,0.0,-0.096542,0.0,0.0,0.0,0.0,0.095496,0.0,-0.040509,0.0
318,0.014813,9.972083,-2.040946,10.192616,0.640065,0.055545,561.782471,493.830109,561.371582,231.319916,...,0.0,0.013878,0.0,0.0,0.0,0.0,0.010326,0.0,16.028860,0.0
319,-2.930494,4.973080,1.117602,5.055158,1.171420,0.489329,177.177032,166.147415,177.023315,67.849037,...,0.0,-0.022627,0.0,0.0,0.0,0.0,0.881989,0.0,-0.030734,0.0


In [88]:
print(data_new[data_new.columns[0]])


0       2.687000
1       5.413000
2      13.050000
3       9.272000
4       9.786000
         ...    
310    13.515699
311    12.604997
312    11.079408
313    13.049479
314    13.009744
Name: dipole moment [Debye], Length: 315, dtype: float64
