### Update the file/folder paths below to match your configuration

# Download Datasets

In [None]:
from sklearn import datasets
import pandas as pd
import os
import numpy as np

# Create a directory to save all datasets. exist_ok=True turns the
# check-then-create pair into one call, so an already-existing directory
# (including one created concurrently) is not an error.
os.makedirs('datasets', exist_ok=True)

# Classification datasets bundled with sklearn
classification_datasets = {
    'wine': datasets.load_wine(),
    'breast_cancer': datasets.load_breast_cancer(),
    'iris': datasets.load_iris(),
    'digits': datasets.load_digits(),
    'linnerud': datasets.load_linnerud(),
}
# Additional datasets from openml, fetched by name in this order
classification_datasets.update(
    (openml_name, datasets.fetch_openml(name=openml_name))
    for openml_name in ('adult', 'australian', 'bank-marketing', 'car', 'tic-tac-toe')
)

# Write each dataset to datasets/<name>.csv: feature columns first, then the
# target column(s).
for dataset_name, dataset in classification_datasets.items():
    df = pd.DataFrame(data=dataset.data)

    if len(dataset.target.shape) <= 1:
        # Single target vector -> one 'target' column
        df['target'] = dataset.target
    else:
        # Two-dimensional target -> one 'target_<i>' column per target
        for i in range(dataset.target.shape[1]):
            df[f'target_{i}'] = dataset.target[:, i]

    df.to_csv(f'datasets/{dataset_name}.csv', index=False)

# Regression datasets.
#
# NOTE: the original dict listed 'boston' and 'diabetes' twice. In a dict
# literal the later value silently replaces the earlier one, so the sklearn
# copies were loaded and then thrown away. On top of that,
# `datasets.load_boston()` was removed in scikit-learn 1.2, so merely building
# the dict crashed on current releases. Here every key is unique:
#   * 'boston' and 'diabetes' keep pointing at the OpenML datasets (exactly
#     what the original ended up saving under those names),
#   * sklearn's bundled diabetes data is kept under its own key instead of
#     being discarded,
#   * the removed `load_boston()` call is dropped.
regression_datasets = {
    # Bundled with / fetched by sklearn
    'diabetes_sklearn': datasets.load_diabetes(),
    'linnerud': datasets.load_linnerud(),
    'california_housing': datasets.fetch_california_housing(),
    # Additional datasets from openml by name
    'boston': datasets.fetch_openml(name='boston'),
    'diabetes': datasets.fetch_openml(name='diabetes'),
    'house_8L': datasets.fetch_openml(name='house_8L'),
    'cpu': datasets.fetch_openml(name='cpu'),
    'qsar-biodeg': datasets.fetch_openml(name='qsar-biodeg'),
    'bike_sharing': datasets.fetch_openml(name='Bike_Sharing_Demand'),
}

# Write each dataset to datasets/<name>_regression.csv: feature columns first,
# then the target column(s).
for dataset_name, dataset in regression_datasets.items():
    df = pd.DataFrame(data=dataset.data)

    if len(dataset.target.shape) > 1:
        # Two-dimensional target (e.g. linnerud) -> one column per target
        for i in range(dataset.target.shape[1]):
            df[f'target_{i}'] = dataset.target[:, i]
    else:
        df['target'] = dataset.target

    df.to_csv(f'datasets/{dataset_name}_regression.csv', index=False)

# Experiment - AutoFlex

In [None]:
import pandas as pd
import sys
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import *
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression, chi2
from sklearn.decomposition import PCA
from sklearn.exceptions import ConvergenceWarning
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import *
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.metrics import r2_score, accuracy_score
from tqdm.auto import tqdm
from joblib import Parallel, delayed
import time
import warnings
import logging

# Silence convergence warnings for the whole session.
warnings.simplefilter("ignore", ConvergenceWarning)

# Module-level logger, reporting at INFO and above.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def load_dataset(file_path):
    """
    Read a CSV file into a pandas DataFrame.

    Logs one INFO message before reading and one after.

    Parameters:
        file_path: anything accepted by ``pandas.read_csv``.

    Returns:
        The DataFrame produced by ``pandas.read_csv(file_path)``.
    """
    logger.info(f"Loading dataset from {file_path}...")
    frame = pd.read_csv(file_path)
    logger.info("Dataset loaded.")
    return frame

def handle_missing_values(data):
    """
    Fill missing values in the dataset.

    * int64/float64 columns: missing values are replaced by the column mean.
    * object/category columns: missing values are replaced by "Unknown".

    The frame is modified in place and also returned.

    Parameters:
        data: pandas DataFrame to clean.

    Returns:
        The same DataFrame object, with missing values filled.
    """
    logger.info("Handling missing values...")
    numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns
    categorical_cols = data.select_dtypes(include=['object', 'category']).columns

    # Fill missing values with the mean for numerical columns
    data[numerical_cols] = data[numerical_cols].fillna(data[numerical_cols].mean())

    # Fill missing values with "Unknown" for categorical columns.
    # A pandas Categorical refuses a fill value that is not one of its
    # categories, so "Unknown" is registered as a category first. Columns
    # without missing values are left untouched so their dtype is unchanged.
    for col in categorical_cols:
        column = data[col]
        if not column.isna().any():
            continue
        if isinstance(column.dtype, pd.CategoricalDtype) and 'Unknown' not in column.cat.categories:
            column = column.cat.add_categories(['Unknown'])
        data[col] = column.fillna('Unknown')

    logger.info("Missing values handled.")
    return data

def split_data(data, target_variable):
    """
    Separate a DataFrame into a feature frame and a target column.

    Parameters:
        data: pandas DataFrame containing the target column.
        target_variable: name of the target column.

    Returns:
        (features, target): ``data`` without the target column, and the
        target column itself.
    """
    logger.info("Splitting the data into features and target variable...")
    features = data.drop(columns=[target_variable])
    target = data[target_variable]
    logger.info("Data split complete.")
    return features, target

def preprocess_numerical_col(col, preprocessing_methods, X_train, y_train, models):
    """
    Find the best-scoring preprocessing method for one numerical column.

    Every (preprocessing method, model) combination is fitted on the training
    data, with the method applied to ``col`` only and all other columns passed
    through unchanged, and is then scored on that same training data. The
    method belonging to the highest score wins.

    A combination that raises ValueError/TypeError is skipped; it no longer
    discards the results of the combinations that did work.

    Parameters:
        col: name of the column to preprocess.
        preprocessing_methods: dict mapping method name -> transformer.
        X_train, y_train: training features and target.
        models: dict mapping model name -> (estimator, param grid); only the
            estimator is used here.

    Returns:
        Always a 2-tuple ``(col, best_method_name)``. ``best_method_name`` is
        None when no combination could be fitted. (The original returned a
        3-tuple on error, which broke callers that unpack two values.)
    """
    best_preprocess = None
    best_score = None
    last_error = None

    for preprocess_name, preprocess_method in preprocessing_methods.items():
        preprocessor = ColumnTransformer([(preprocess_name, preprocess_method, [col])], remainder='passthrough')
        for model_name, (model, _model_params) in models.items():
            pipeline = Pipeline([
                ('preprocessor', preprocessor),
                ('model', model)
            ])
            try:
                pipeline.fit(X_train, y_train)
                score = pipeline.score(X_train, y_train)
            except (ValueError, TypeError) as e:
                # Not every method/model pair is compatible with every
                # column; remember the reason and try the next pair.
                last_error = e
                logger.debug("Skipping %s + %s for column %r: %s", preprocess_name, model_name, col, e)
                continue
            if best_score is None or score > best_score:
                best_score = score
                best_preprocess = preprocess_name

    if best_preprocess is None and last_error is not None:
        logger.warning("No preprocessing method could be fitted for column %r: %s", col, last_error)
    return col, best_preprocess


def preprocess_categorical_col(col, categorical_preprocessing_methods, feature_selection_methods, X_train, y_train, models):
    """
    Find the best encoder and feature selector for one categorical column.

    Every (encoder, feature selector, model) combination is fitted on the
    training data and scored on that same training data. Only ``col`` is fed
    to the pipeline (``remainder='drop'``): it is encoded, the selector runs
    on the encoded output, and the model is fitted on what remains.

    Fixes relative to the original:
      * the selector is applied directly to the encoder output. It used to be
        wrapped in a second ColumnTransformer that looked up ``col`` by name,
        which cannot work because the encoder output is an array without
        column names, so every combination failed;
      * a failing combination is skipped instead of aborting the search;
      * the return value always has three elements.

    Parameters:
        col: name of the column to preprocess.
        categorical_preprocessing_methods: dict mapping name -> encoder.
        feature_selection_methods: dict mapping name -> feature selector.
        X_train, y_train: training features and target.
        models: dict mapping model name -> (estimator, param grid); only the
            estimator is used here.

    Returns:
        ``(col, best_encoder_name, best_selector_name)``; both names are None
        when no combination could be fitted.
    """
    best_preprocess = None
    best_feature_selection = None
    best_score = None
    last_error = None

    for preprocess_name, preprocess_method in categorical_preprocessing_methods.items():
        preprocessor = ColumnTransformer([(preprocess_name, preprocess_method, [col])], remainder='drop')
        for feature_selection_name, feature_selection_method in feature_selection_methods.items():
            for model_name, (model, _model_params) in models.items():
                pipeline = Pipeline([
                    ('preprocessor', preprocessor),
                    ('feature_selection', feature_selection_method),
                    ('model', model)
                ])
                try:
                    pipeline.fit(X_train, y_train)
                    score = pipeline.score(X_train, y_train)
                except (ValueError, TypeError) as e:
                    last_error = e
                    logger.debug("Skipping %s + %s + %s for column %r: %s",
                                 preprocess_name, feature_selection_name, model_name, col, e)
                    continue
                if best_score is None or score > best_score:
                    best_score = score
                    best_preprocess = preprocess_name
                    best_feature_selection = feature_selection_name

    if best_preprocess is None and last_error is not None:
        logger.warning("No encoder/selector combination could be fitted for column %r: %s", col, last_error)
    return col, best_preprocess, best_feature_selection

def process_numerical_cols(X_train, numerical_cols, preprocessing_methods, models, y_train):
    """
    Run ``preprocess_numerical_col`` for every numerical column in parallel.

    Parameters:
        X_train, y_train: training features and target.
        numerical_cols: iterable of column names to process.
        preprocessing_methods: dict mapping method name -> transformer.
        models: dict mapping model name -> (estimator, param grid).

    Returns:
        A list with one ``preprocess_numerical_col`` result per column, in
        column order.
    """
    logger.info("Processing numerical columns...")
    with Parallel(n_jobs=-1) as pool:
        jobs = (
            delayed(preprocess_numerical_col)(column, preprocessing_methods, X_train, y_train, models)
            for column in tqdm(numerical_cols, desc="Numerical Columns Preprocessing")
        )
        best_per_column = pool(jobs)
    logger.info("Numerical columns processing complete.")
    return best_per_column

def process_categorical_cols(X_train, categorical_cols, preprocessing_methods, feature_selection_methods, models, y_train):
    """
    Run ``preprocess_categorical_col`` for every categorical column in parallel.

    The encoders come from the ``preprocessing_methods`` argument. The
    original body ignored that argument and read a global named
    ``categorical_preprocessing_methods`` instead, which raises NameError
    whenever that global is not defined.

    Parameters:
        X_train, y_train: training features and target.
        categorical_cols: iterable of column names to process.
        preprocessing_methods: dict mapping encoder name -> encoder.
        feature_selection_methods: dict mapping selector name -> selector.
        models: dict mapping model name -> (estimator, param grid).

    Returns:
        A list with one ``preprocess_categorical_col`` result per column, in
        column order.
    """
    logger.info("Processing categorical columns...")
    with Parallel(n_jobs=-1) as parallel:
        categorical_cols_processed = parallel(delayed(preprocess_categorical_col)(
            col, preprocessing_methods, feature_selection_methods, X_train, y_train, models
        ) for col in tqdm(categorical_cols, desc="Categorical Columns Preprocessing"))
    logger.info("Categorical columns processing complete.")
    return categorical_cols_processed



def _format_preprocessing_steps(cols):
    """Render column entries as "column: method" strings.

    ``cols`` may be a list of ``(column, method, ...)`` entries or one such
    entry on its own (which is what the parallel driver passes).
    """
    entries = cols if isinstance(cols[0], (tuple, list)) else [cols]
    return [f"{entry[0]}: {entry[1] if len(entry) > 1 else None}" for entry in entries]


def perform_grid_search(model_name, model, model_params, cols, X_train, y_train, X_test, y_test, task_type,
                        preprocessing_techniques):
    """
    Perform grid search with cross-validation to find the best hyperparameters.

    The model is wrapped in a pipeline whose first step is a ColumnTransformer
    built from ``preprocessing_techniques`` (unlisted columns pass through).
    The grid in ``model_params`` is searched with 5-fold CV on the training
    data; the winning pipeline is then cross-validated once more and scored on
    both the training and the test set.

    Parameters:
        model_name: label stored in the result.
        model: estimator to tune.
        model_params: dict mapping parameter name -> candidate values.
        cols: column entry/entries this run is reported for; when empty no
            search is performed.
        X_train, y_train, X_test, y_test: train/test split.
        task_type: "Regression" selects r2_score for the training metric,
            anything else selects accuracy_score.
        preprocessing_techniques: list of ColumnTransformer transformer tuples.

    Returns:
        ``(result, pipeline_count, preprocessing_steps, preprocessing_techniques)``
        where ``result`` is a dict of scores (empty if ``cols`` is empty) and
        ``pipeline_count`` is 1 when a search ran, else 0.

    Raises:
        RuntimeError: if building or fitting the grid search fails; the
            original exception is chained as the cause.
    """
    if len(cols) == 0:
        return {}, 0, [], preprocessing_techniques

    preprocessor = ColumnTransformer(preprocessing_techniques, remainder='passthrough')
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    param_grid = {f'model__{param_name}': param_range for param_name, param_range in model_params.items()}

    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=ConvergenceWarning)
        start_time = time.time()

        try:
            grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=KFold(n_splits=5))
            grid_search.fit(X_train, y_train)
        except Exception as e:
            raise RuntimeError(f"Grid search failed for {model_name}: {e}") from e

        end_time = time.time()

    pipeline_score = cross_val_score(grid_search.best_estimator_, X_train, y_train, cv=KFold(n_splits=5)).mean()

    # The training metric is stored under 'Accuracy' for both task types so
    # the result columns stay the same; for regression it holds R^2.
    train_metric = r2_score if task_type == "Regression" else accuracy_score
    y_pred = grid_search.predict(X_train)

    result = {
        'Model': model_name,
        'Preprocessing': cols,
        'Best Parameters': grid_search.best_params_,
        'Best Score': grid_search.best_score_,
        'Pipeline Score': pipeline_score,
        'Accuracy': train_metric(y_train, y_pred),
        'Test Score': grid_search.score(X_test, y_test),
        'Execution Time (s)': end_time - start_time
    }

    # The original iterated over the characters of the column name when
    # `cols` was a single (column, method) tuple, producing garbage labels or
    # an IndexError/TypeError; the helper accepts both shapes.
    preprocessing_steps = _format_preprocessing_steps(cols)

    return result, 1, preprocessing_steps, preprocessing_techniques

def perform_grid_search_parallel(models, cols, X_train, y_train, X_test, y_test, task_type, preprocessing_techniques):
    """
    Run ``perform_grid_search`` for every model and every column entry.

    Models are processed one after another; for each model the entries of
    ``cols`` are dispatched to joblib workers.

    Parameters:
        models: dict mapping model name -> (estimator, param grid).
        cols: list of column entries; each entry is passed to
            ``perform_grid_search`` as its ``cols`` argument.
        X_train, y_train, X_test, y_test: train/test split.
        task_type: forwarded to ``perform_grid_search``.
        preprocessing_techniques: list of ColumnTransformer transformer tuples.

    Returns:
        ``(results_df_sorted, pipeline_counts)``: a DataFrame of results
        sorted by 'Best Score' (descending) with an extra
        'Preprocessing Techniques' column, and a list of
        ``(count, steps, techniques)`` tuples, one per grid-search call.
        When no search produced a result the DataFrame has no rows and is
        returned unsorted.

    Raises:
        RuntimeError: if the parallel run for a model fails; the original
            exception is chained as the cause.
    """
    pipeline_counts = []
    results = []
    total_pipelines_count = len(cols) * len(models)

    with Parallel(n_jobs=-1) as parallel:
        # Using the bar as a context manager guarantees it is closed, also
        # when a model fails.
        with tqdm(total=total_pipelines_count, desc='Optimization Progress', leave=False) as pipeline_bar:
            for model_name, (model, model_params) in models.items():
                try:
                    # `col_entry` deliberately does not reuse the name `cols`,
                    # which the original shadowed inside this generator.
                    processed_results = parallel(delayed(perform_grid_search)(
                        model_name, model, model_params, col_entry, X_train, y_train, X_test, y_test, task_type,
                        preprocessing_techniques
                    ) for col_entry in tqdm(cols, desc=f"Model: {model_name}", total=len(cols), file=sys.stdout))
                except Exception as e:
                    raise RuntimeError(f"Grid search parallel failed for {model_name}: {e}") from e

                for result, count, steps, techniques in processed_results:
                    # An empty dict means "no search was run"; it would only
                    # add an all-NaN row to the results table.
                    if result:
                        results.append(result)
                    pipeline_counts.append((count, steps, techniques))

                pipeline_bar.update(len(cols))
                pipeline_bar.set_postfix({'Pipelines Generated': f"{sum(count for count, _, _ in pipeline_counts)}/{total_pipelines_count}"})

    preprocessing_techniques_str = '\n'.join([f"{step[0]}: {step[1]}" for step in preprocessing_techniques])

    results_df = pd.DataFrame(results)
    results_df['Preprocessing Techniques'] = preprocessing_techniques_str

    # Without any result there is no 'Best Score' column to sort by.
    if 'Best Score' in results_df.columns:
        results_df_sorted = results_df.sort_values(by='Best Score', ascending=False)
    else:
        results_df_sorted = results_df

    return results_df_sorted, pipeline_counts

if __name__ == '__main__':
    # End-to-end experiment: load a CSV, pick a task type from the target,
    # choose a preprocessing method per column, then grid-search every model
    # and write the ranked results to an Excel file.

    # Set up logging
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    # Load your dataset
    data = load_dataset('d.csv')

    # Handle missing values
    data = handle_missing_values(data)

    # Assume the target variable column name is 'target'
    target_variable = 'target'

    # Split the data into features (X) and target variable (y)
    X, y = split_data(data, target_variable)

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Heuristic task detection: the fewer distinct target values relative to
    # the number of rows, the more likely the target is a class label.
    unique_percentage = y.nunique() / y.shape[0]

    if unique_percentage <= 0.05:
        target_variable_class = "Binary"
    elif unique_percentage <= 0.1:
        target_variable_class = "Multi-class"
    else:
        target_variable_class = "Regression"

    logger.info("Target Variable Class: %s", target_variable_class)

    # Define task type and candidate models based on the target variable class
    if target_variable_class == "Binary":
        # Binary classification task
        task_type = "Binary Classification"
        models = {
            'Decision Tree Classifier': (DecisionTreeClassifier(), {'max_depth': [None, 3, 5, 10]}),
            'Gradient Boosting Classifier': (GradientBoostingClassifier(), {'n_estimators': [50, 100, 200]}),
            'Random Forest Classifier': (RandomForestClassifier(), {'n_estimators': [50, 100, 200]}),
            'Neural Network Classifier': (MLPClassifier(max_iter=1000), {'hidden_layer_sizes': [(50,), (100,), (100, 50)]}),
            'Logistic Regression': (LogisticRegression(max_iter=10000), {'penalty': ['l2'], 'C': [0.1, 1, 10]}),
            'KNN Classifier': (KNeighborsClassifier(), {'n_neighbors': [3, 5, 7]}),
        }
    elif target_variable_class == "Multi-class":
        # Multi-class classification task
        task_type = "Multi-class Classification"
        models = {
            'Decision Tree Classifier': (DecisionTreeClassifier(), {'max_depth': [None, 3, 5, 10]}),
            'Gradient Boosting Classifier': (GradientBoostingClassifier(), {'n_estimators': [50, 100, 200]}),
            'Random Forest Classifier': (RandomForestClassifier(), {'n_estimators': [50, 100, 200]}),
            'Neural Network Classifier': (MLPClassifier(max_iter=10000), {'hidden_layer_sizes': [(50,), (100,), (100, 50)]}),
            'Logistic Regression': (LogisticRegression(max_iter=10000), {'penalty': ['l2'], 'C': [0.1, 1, 10]}),
            'KNN Classifier': (KNeighborsClassifier(), {'n_neighbors': [3, 5, 7]}),
        }
    else:
        # Regression task
        task_type = "Regression"
        models = {
            'Decision Tree Regressor': (DecisionTreeRegressor(), {'max_depth': [None, 3, 5, 10]}),
            'Gradient Boosting Regressor': (GradientBoostingRegressor(), {'n_estimators': [50, 100, 200]}),
            'Random Forest Regressor': (RandomForestRegressor(), {'n_estimators': [50, 100, 200]}),
            'Neural Network Regressor': (MLPRegressor(), {'hidden_layer_sizes': [(50,), (100,), (100, 50)]}),
            # `normalize` was removed from LinearRegression in scikit-learn
            # 1.2, which made the grid search reject this entry;
            # `fit_intercept` is tunable on every release.
            'Linear Regression': (LinearRegression(), {'fit_intercept': [True, False]}),
            'KNN Regressor': (KNeighborsRegressor(), {'n_neighbors': [3, 5, 7]}),
        }

    # Define your preprocessing techniques
    numerical_preprocessing_methods = {
        'SimpleImputer': SimpleImputer(),
        'StandardScaler': StandardScaler(),
        'RobustScaler': RobustScaler(),
        'MinMaxScaler': MinMaxScaler(),
        'PolynomialFeatures': PolynomialFeatures(),
        'PCA': PCA()
    }

    categorical_preprocessing_methods = {
        'OneHotEncoder': OneHotEncoder(handle_unknown='ignore'),
        # LabelEncoder encodes targets (its fit takes y only) and cannot be
        # used inside a ColumnTransformer; OrdinalEncoder is the feature-side
        # equivalent. Categories unseen during fit are encoded as -1.
        'OrdinalEncoder': OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
    }

    feature_selection_methods = {
        'SelectKBest_f_regression': SelectKBest(f_regression),
        'SelectKBest_chi2': SelectKBest(chi2)
    }

    # Handle numerical columns
    numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
    numerical_cols_processed = process_numerical_cols(X_train, numerical_cols, numerical_preprocessing_methods, models, y_train)

    # Handle categorical columns
    categorical_cols = X_train.select_dtypes(include=['object', 'category']).columns
    categorical_cols_processed = process_categorical_cols(X_train, categorical_cols, categorical_preprocessing_methods, feature_selection_methods, models, y_train)

    # Concatenate the processed numerical and categorical columns
    cols = list(numerical_cols_processed or []) + list(categorical_cols_processed or [])

    # Build the transformer list for the ColumnTransformer used inside every
    # grid-search pipeline. Entries are read by position because a helper may
    # append an error message to its result tuple.
    preprocessing_techniques = []
    for entry in numerical_cols_processed:
        col, preprocess_name = entry[0], entry[1]
        if preprocess_name is not None:
            preprocessing_techniques.append((f'num_preprocess_{col}', numerical_preprocessing_methods[preprocess_name], [col]))

    for entry in categorical_cols_processed:
        col, preprocess_name, feature_selection_name = entry[:3]
        if preprocess_name is None:
            # Raw strings must never reach an estimator through
            # remainder='passthrough', so fall back to one-hot encoding.
            logger.warning("No encoder selected for column %r; falling back to OneHotEncoder.", col)
            preprocess_name = 'OneHotEncoder'
        transformer = categorical_preprocessing_methods[preprocess_name]
        if feature_selection_name is not None:
            # The selector has to see the encoded values, so it is chained
            # after the encoder instead of being applied to the raw column.
            transformer = Pipeline([
                ('encode', transformer),
                ('select', feature_selection_methods[feature_selection_name]),
            ])
        preprocessing_techniques.append((f'cat_preprocess_{col}', transformer, [col]))

    # NOTE: X_train / X_test are deliberately NOT transformed here. Every
    # grid-search pipeline starts with a ColumnTransformer built from
    # `preprocessing_techniques`, fitted on the training folds only. The
    # original additionally pre-transformed both frames by hand, which applied
    # each method twice, failed for transformers that change the number of
    # columns, and called transform() on unfitted transformers for the test set.

    results_df_sorted, pipeline_counts = perform_grid_search_parallel(models, cols, X_train, y_train, X_test, y_test, task_type, preprocessing_techniques)

    excel_file_path = 'results_sorted.xlsx'
    results_df_sorted.to_excel(excel_file_path, index=False)

    total_pipelines = sum(count for count, _, _ in pipeline_counts)
    total_preprocessing_steps = sum(len(steps) for _, steps, _ in pipeline_counts)

    logger.info(f"Total Models: {len(models)}")
    logger.info(f"Total Pipelines: {total_pipelines}")
    logger.info(f"Total Preprocessing Methods: {len(preprocessing_techniques)}")
    logger.info(f"Total Preprocessing Steps: {total_preprocessing_steps}")

    if results_df_sorted.empty:
        # iloc[0] on an empty frame would raise IndexError
        logger.warning("No grid search produced a result.")
    else:
        best_train_result = results_df_sorted.iloc[0]
        logger.info("Best Train Result:")
        logger.info(best_train_result)

        best_test_result = results_df_sorted.sort_values(by='Test Score', ascending=False).iloc[0]
        logger.info("Best Test Result:")
        logger.info(best_test_result)

    logger.info(f"Sorted Results saved to: {excel_file_path}")

In [None]:
import pandas as pd
import sys
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import *
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression, chi2
from sklearn.decomposition import PCA
from sklearn.exceptions import ConvergenceWarning
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import *
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.metrics import r2_score, accuracy_score
from tqdm.auto import tqdm
from joblib import Parallel, delayed
import time
import warnings
import logging
import os

# Silence convergence warnings for the whole session.
warnings.simplefilter("ignore", ConvergenceWarning)

# Module-level logger, reporting at INFO and above.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def load_dataset(file_path):
    """
    Read a CSV file into a pandas DataFrame.

    Logs one INFO message before reading and one after.

    Parameters:
        file_path: anything accepted by ``pandas.read_csv``.

    Returns:
        The DataFrame produced by ``pandas.read_csv(file_path)``.
    """
    logger.info(f"Loading dataset from {file_path}...")
    frame = pd.read_csv(file_path)
    logger.info("Dataset loaded.")
    return frame

def handle_missing_values(data):
    """
    Fill missing values in the dataset.

    * int64/float64 columns: missing values are replaced by the column mean.
    * object/category columns: missing values are replaced by "Unknown".

    The frame is modified in place and also returned.

    Parameters:
        data: pandas DataFrame to clean.

    Returns:
        The same DataFrame object, with missing values filled.
    """
    logger.info("Handling missing values...")
    numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns
    categorical_cols = data.select_dtypes(include=['object', 'category']).columns

    # Fill missing values with the mean for numerical columns
    data[numerical_cols] = data[numerical_cols].fillna(data[numerical_cols].mean())

    # Fill missing values with "Unknown" for categorical columns.
    # A pandas Categorical refuses a fill value that is not one of its
    # categories, so "Unknown" is registered as a category first. Columns
    # without missing values are left untouched so their dtype is unchanged.
    for col in categorical_cols:
        column = data[col]
        if not column.isna().any():
            continue
        if isinstance(column.dtype, pd.CategoricalDtype) and 'Unknown' not in column.cat.categories:
            column = column.cat.add_categories(['Unknown'])
        data[col] = column.fillna('Unknown')

    logger.info("Missing values handled.")
    return data

def split_data(data, target_variable):
    """
    Separate a DataFrame into a feature frame and a target column.

    Parameters:
        data: pandas DataFrame containing the target column.
        target_variable: name of the target column.

    Returns:
        (features, target): ``data`` without the target column, and the
        target column itself.
    """
    logger.info("Splitting the data into features and target variable...")
    features = data.drop(columns=[target_variable])
    target = data[target_variable]
    logger.info("Data split complete.")
    return features, target

def preprocess_numerical_col(col, preprocessing_methods, X_train, y_train, models):
    """
    Find the best-scoring preprocessing method for one numerical column.

    Every (preprocessing method, model) combination is fitted on the training
    data, with the method applied to ``col`` only and all other columns passed
    through unchanged, and is then scored on that same training data. The
    method belonging to the highest score wins.

    A combination that raises ValueError/TypeError is skipped; it no longer
    discards the results of the combinations that did work.

    Parameters:
        col: name of the column to preprocess.
        preprocessing_methods: dict mapping method name -> transformer.
        X_train, y_train: training features and target.
        models: dict mapping model name -> (estimator, param grid); only the
            estimator is used here.

    Returns:
        Always a 2-tuple ``(col, best_method_name)``. ``best_method_name`` is
        None when no combination could be fitted. (The original returned a
        3-tuple on error, which broke callers that unpack two values.)
    """
    best_preprocess = None
    best_score = None
    last_error = None

    for preprocess_name, preprocess_method in preprocessing_methods.items():
        preprocessor = ColumnTransformer([(preprocess_name, preprocess_method, [col])], remainder='passthrough')
        for model_name, (model, _model_params) in models.items():
            pipeline = Pipeline([
                ('preprocessor', preprocessor),
                ('model', model)
            ])
            try:
                pipeline.fit(X_train, y_train)
                score = pipeline.score(X_train, y_train)
            except (ValueError, TypeError) as e:
                # Not every method/model pair is compatible with every
                # column; remember the reason and try the next pair.
                last_error = e
                logger.debug("Skipping %s + %s for column %r: %s", preprocess_name, model_name, col, e)
                continue
            if best_score is None or score > best_score:
                best_score = score
                best_preprocess = preprocess_name

    if best_preprocess is None and last_error is not None:
        logger.warning("No preprocessing method could be fitted for column %r: %s", col, last_error)
    return col, best_preprocess


def preprocess_categorical_col(col, categorical_preprocessing_methods, feature_selection_methods, X_train, y_train, models):
    """
    Find the best encoder and feature selector for one categorical column.

    Every (encoder, feature selector, model) combination is fitted on the
    training data and scored on that same training data. Only ``col`` is fed
    to the pipeline (``remainder='drop'``): it is encoded, the selector runs
    on the encoded output, and the model is fitted on what remains.

    Fixes relative to the original:
      * the selector is applied directly to the encoder output. It used to be
        wrapped in a second ColumnTransformer that looked up ``col`` by name,
        which cannot work because the encoder output is an array without
        column names, so every combination failed;
      * a failing combination is skipped instead of aborting the search;
      * the return value always has three elements.

    Parameters:
        col: name of the column to preprocess.
        categorical_preprocessing_methods: dict mapping name -> encoder.
        feature_selection_methods: dict mapping name -> feature selector.
        X_train, y_train: training features and target.
        models: dict mapping model name -> (estimator, param grid); only the
            estimator is used here.

    Returns:
        ``(col, best_encoder_name, best_selector_name)``; both names are None
        when no combination could be fitted.
    """
    best_preprocess = None
    best_feature_selection = None
    best_score = None
    last_error = None

    for preprocess_name, preprocess_method in categorical_preprocessing_methods.items():
        preprocessor = ColumnTransformer([(preprocess_name, preprocess_method, [col])], remainder='drop')
        for feature_selection_name, feature_selection_method in feature_selection_methods.items():
            for model_name, (model, _model_params) in models.items():
                pipeline = Pipeline([
                    ('preprocessor', preprocessor),
                    ('feature_selection', feature_selection_method),
                    ('model', model)
                ])
                try:
                    pipeline.fit(X_train, y_train)
                    score = pipeline.score(X_train, y_train)
                except (ValueError, TypeError) as e:
                    last_error = e
                    logger.debug("Skipping %s + %s + %s for column %r: %s",
                                 preprocess_name, feature_selection_name, model_name, col, e)
                    continue
                if best_score is None or score > best_score:
                    best_score = score
                    best_preprocess = preprocess_name
                    best_feature_selection = feature_selection_name

    if best_preprocess is None and last_error is not None:
        logger.warning("No encoder/selector combination could be fitted for column %r: %s", col, last_error)
    return col, best_preprocess, best_feature_selection

def process_numerical_cols(X_train, numerical_cols, preprocessing_methods, models, y_train):
    """
    Run ``preprocess_numerical_col`` for every numerical column in parallel.

    Parameters:
        X_train, y_train: training features and target.
        numerical_cols: iterable of column names to process.
        preprocessing_methods: dict mapping method name -> transformer.
        models: dict mapping model name -> (estimator, param grid).

    Returns:
        A list with one ``preprocess_numerical_col`` result per column, in
        column order.
    """
    logger.info("Processing numerical columns...")
    with Parallel(n_jobs=-1) as pool:
        jobs = (
            delayed(preprocess_numerical_col)(column, preprocessing_methods, X_train, y_train, models)
            for column in tqdm(numerical_cols, desc="Numerical Columns Preprocessing")
        )
        best_per_column = pool(jobs)
    logger.info("Numerical columns processing complete.")
    return best_per_column

def process_categorical_cols(X_train, categorical_cols, preprocessing_methods, feature_selection_methods, models, y_train):
    """
    Run ``preprocess_categorical_col`` for every categorical column in parallel.

    The encoders come from the ``preprocessing_methods`` argument. The
    original body ignored that argument and read a global named
    ``categorical_preprocessing_methods`` instead, which raises NameError
    whenever that global is not defined.

    Parameters:
        X_train, y_train: training features and target.
        categorical_cols: iterable of column names to process.
        preprocessing_methods: dict mapping encoder name -> encoder.
        feature_selection_methods: dict mapping selector name -> selector.
        models: dict mapping model name -> (estimator, param grid).

    Returns:
        A list with one ``preprocess_categorical_col`` result per column, in
        column order.
    """
    logger.info("Processing categorical columns...")
    with Parallel(n_jobs=-1) as parallel:
        categorical_cols_processed = parallel(delayed(preprocess_categorical_col)(
            col, preprocessing_methods, feature_selection_methods, X_train, y_train, models
        ) for col in tqdm(categorical_cols, desc="Categorical Columns Preprocessing"))
    logger.info("Categorical columns processing complete.")
    return categorical_cols_processed



def _format_preprocessing_steps(cols):
    """Render column entries as "column: method" strings.

    ``cols`` may be a list of ``(column, method, ...)`` entries or one such
    entry on its own (which is what the parallel driver passes).
    """
    entries = cols if isinstance(cols[0], (tuple, list)) else [cols]
    return [f"{entry[0]}: {entry[1] if len(entry) > 1 else None}" for entry in entries]


def perform_grid_search(model_name, model, model_params, cols, X_train, y_train, X_test, y_test, task_type,
                        preprocessing_techniques):
    """
    Perform grid search with cross-validation to find the best hyperparameters.

    The model is wrapped in a pipeline whose first step is a ColumnTransformer
    built from ``preprocessing_techniques`` (unlisted columns pass through).
    The grid in ``model_params`` is searched with 5-fold CV on the training
    data; the winning pipeline is then cross-validated once more and scored on
    both the training and the test set.

    Parameters:
        model_name: label stored in the result and used in error messages.
        model: estimator to tune.
        model_params: dict mapping parameter name -> candidate values.
        cols: column entry/entries this run is reported for; when empty no
            search is performed.
        X_train, y_train, X_test, y_test: train/test split.
        task_type: "Regression" selects r2_score for the training metric,
            anything else selects accuracy_score.
        preprocessing_techniques: list of ColumnTransformer transformer tuples.

    Returns:
        ``(result, pipeline_count, preprocessing_steps, preprocessing_techniques)``
        where ``result`` is a dict of scores (empty if ``cols`` is empty) and
        ``pipeline_count`` is 1 when a search ran, else 0.

    Raises:
        RuntimeError: if the grid search fails, or if a ValueError, TypeError
            or IndexError occurs anywhere in the function (these are logged
            first); the original exception is chained as the cause.
    """
    try:
        if len(cols) == 0:
            return {}, 0, [], preprocessing_techniques

        preprocessor = ColumnTransformer(preprocessing_techniques, remainder='passthrough')
        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('model', model)
        ])
        param_grid = {f'model__{param_name}': param_range for param_name, param_range in model_params.items()}

        with warnings.catch_warnings():
            warnings.filterwarnings('ignore', category=ConvergenceWarning)
            start_time = time.time()

            try:
                grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=KFold(n_splits=5))
                grid_search.fit(X_train, y_train)
            except Exception as e:
                raise RuntimeError(f"Grid search failed for {model_name}: {e}") from e

            end_time = time.time()

        pipeline_score = cross_val_score(grid_search.best_estimator_, X_train, y_train, cv=KFold(n_splits=5)).mean()

        # The training metric is stored under 'Accuracy' for both task types
        # so the result columns stay the same; for regression it holds R^2.
        train_metric = r2_score if task_type == "Regression" else accuracy_score
        y_pred = grid_search.predict(X_train)

        result = {
            'Model': model_name,
            'Preprocessing': cols,
            'Best Parameters': grid_search.best_params_,
            'Best Score': grid_search.best_score_,
            'Pipeline Score': pipeline_score,
            'Accuracy': train_metric(y_train, y_pred),
            'Test Score': grid_search.score(X_test, y_test),
            'Execution Time (s)': end_time - start_time
        }

        # The original iterated over the characters of the column name when
        # `cols` was a single (column, method) tuple, producing garbage labels
        # or an IndexError/TypeError; the helper accepts both shapes.
        preprocessing_steps = _format_preprocessing_steps(cols)

        return result, 1, preprocessing_steps, preprocessing_techniques

    except (ValueError, TypeError, IndexError) as e:
        # Log the error and raise a more informative exception
        logger.error(f"Error occurred during grid search for {model_name}: {e}")
        logger.error(f"Cols: {cols}")
        logger.error(f"Preprocessing Techniques: {preprocessing_techniques}")
        raise RuntimeError(f"Grid search failed for {model_name}: {e}") from e

def perform_grid_search_parallel(models, cols, X_train, y_train, X_test, y_test, task_type, preprocessing_techniques):
    """
    Perform grid search with cross-validation in parallel for each model and column combination.

    For every entry of ``models`` one ``perform_grid_search`` job is dispatched per
    element of ``cols`` through a single, reused joblib ``Parallel`` pool.

    Parameters
    ----------
    models : dict
        Maps a model name to an ``(estimator, param_grid)`` tuple.
    cols : sequence
        Column/preprocessing combinations. Each element is handed to
        ``perform_grid_search`` as its ``cols`` argument.
    X_train, y_train, X_test, y_test
        Train/test data, forwarded unchanged to ``perform_grid_search``.
    task_type : str
        Forwarded unchanged to ``perform_grid_search``.
    preprocessing_techniques : iterable
        Items indexable at positions 0 and 1; they are rendered as
        ``"<item[0]>: <item[1]>"`` lines in the result table.

    Returns
    -------
    tuple
        ``(results_df_sorted, pipeline_counts)`` where ``results_df_sorted`` is the
        result table sorted by ``'Best Score'`` (descending) and ``pipeline_counts``
        is a list of ``(count, steps, techniques)`` tuples, one per executed job.

    Raises
    ------
    Exception
        If any job of a model fails; the original error is chained as the cause.
    """
    pipeline_counts = []
    results = []
    total_pipelines_count = len(cols) * len(models)

    # Both the worker pool and the overall progress bar are context-managed so the
    # bar is closed even when a job raises.
    with Parallel(n_jobs=-1) as parallel, tqdm(
        total=total_pipelines_count, desc='Optimization Progress', leave=False
    ) as pipeline_bar:
        for model_name, (model, model_params) in models.items():
            try:
                # A distinct loop name avoids shadowing the ``cols`` parameter.
                processed_results = parallel(
                    delayed(perform_grid_search)(
                        model_name, model, model_params, col_combo, X_train, y_train, X_test, y_test, task_type,
                        preprocessing_techniques
                    )
                    for col_combo in tqdm(cols, desc=f"Model: {model_name}", total=len(cols), file=sys.stdout)
                )
            except Exception as e:
                # Keep the original traceback attached to the re-raised error.
                raise Exception(f"Grid search parallel failed for {model_name}: {e}") from e

            results.extend([result for result, _, _, _ in processed_results])

            pipeline_counts.extend([(count, steps, techniques) for _, count, steps, techniques in processed_results])

            pipeline_bar.update(len(cols))

            time.sleep(0.1)

            pipeline_bar.set_postfix({'Pipelines Generated': f"{sum(count for count, _, _ in pipeline_counts)}/{total_pipelines_count}"})

    preprocessing_techniques_str = '\n'.join([f"{step[0]}: {step[1]}" for step in preprocessing_techniques])

    results_df = pd.DataFrame(results)

    # The same description string is broadcast to every result row.
    results_df['Preprocessing Techniques'] = preprocessing_techniques_str
    results_df_sorted = results_df.sort_values(by='Best Score', ascending=False)

    return results_df_sorted, pipeline_counts

# Fractions of each dataset's records that are sampled for an experiment run
percentages = [0.25, 0.5, 1.0]

# File-name notation for every fraction, e.g. 0.25 -> "25p"
percent_notations = {fraction: f"{int(fraction * 100)}p" for fraction in percentages}

# The target variable is assumed to live in a column named 'target'
target_variable = 'target'

if __name__ == '__main__':
    # Experiment driver: for every CSV in ``folder_path`` and every sampling fraction
    # in ``percentages``, pick a task type from the target column, build the candidate
    # models and preprocessing combinations, run the parallel grid search and write the
    # sorted results to ``results/<dataset>_<notation>_results_sorted.xlsx``.

    # Set up logging
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    folder_path = 'datasets/INT'
    csv_files = [file for file in os.listdir(folder_path) if file.endswith('.csv')]

    for csv_file in csv_files:
        # Construct the full file path
        file_path = os.path.join(folder_path, csv_file)

        # Load the dataset
        data = load_dataset(file_path)

        # Handle missing values
        data = handle_missing_values(data)

        for percentage in percentages:
            # Determine the number of records based on the percentage
            num_records = int(data.shape[0] * percentage)

            # Sample the specified number of records (fixed seed for reproducibility)
            sampled_data = data.sample(n=num_records, random_state=42)

            # Handle missing values
            sampled_data = handle_missing_values(sampled_data)

            # Split the data into features (X) and target variable (y)
            X, y = split_data(sampled_data, target_variable)

            # Split the data into training and testing sets
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

            # Calculate the unique percentage of the target variable
            unique_percentage = y.nunique() / y.shape[0]

            # Determine the target variable class based on the unique percentage.
            # NOTE(review): the labels are derived from the ratio of distinct values to
            # rows, not from the number of classes, so "Binary" here does not guarantee
            # exactly two classes — confirm the thresholds are intended.
            if unique_percentage <= 0.05:
                target_variable_class = "Binary"
            elif unique_percentage <= 0.1:
                target_variable_class = "Multi-class"
            else:
                target_variable_class = "Regression"

            # Print the target variable class
            logger.info("Target Variable Class: %s", target_variable_class)

            # Define task type based on the target variable class
            if target_variable_class == "Binary":
                # Binary classification task
                task_type = "Binary Classification"
                models = {
                    'Decision Tree Classifier': (DecisionTreeClassifier(), {'max_depth': [None, 3, 5, 10]}),
                    'Gradient Boosting Classifier': (GradientBoostingClassifier(), {'n_estimators': [50, 100, 200]}),
                    'Random Forest Classifier': (RandomForestClassifier(), {'n_estimators': [50, 100, 200]}),
                    'Neural Network Classifier': (MLPClassifier(max_iter=1000), {'hidden_layer_sizes': [(50,), (100,), (100, 50)]}),
                    'Logistic Regression': (LogisticRegression(max_iter=10000), {'penalty': ['l2'], 'C': [0.1, 1, 10]}),
                    'KNN Classifier': (KNeighborsClassifier(), {'n_neighbors': [3, 5, 7]}),
                }
            elif target_variable_class == "Multi-class":
                # Multi-class classification task
                task_type = "Multi-class Classification"
                models = {
                    'Decision Tree Classifier': (DecisionTreeClassifier(), {'max_depth': [None, 3, 5, 10]}),
                    'Gradient Boosting Classifier': (GradientBoostingClassifier(), {'n_estimators': [50, 100, 200]}),
                    'Random Forest Classifier': (RandomForestClassifier(), {'n_estimators': [50, 100, 200]}),
                    'Neural Network Classifier': (MLPClassifier(max_iter=10000), {'hidden_layer_sizes': [(50,), (100,), (100, 50)]}),
                    'Logistic Regression': (LogisticRegression(max_iter=10000), {'penalty': ['l2'], 'C': [0.1, 1, 10]}),
                    'KNN Classifier': (KNeighborsClassifier(), {'n_neighbors': [3, 5, 7]}),
                }
            else:
                # Regression task
                task_type = "Regression"
                models = {
                    'Decision Tree Regressor': (DecisionTreeRegressor(), {'max_depth': [None, 3, 5, 10]}),
                    'Gradient Boosting Regressor': (GradientBoostingRegressor(), {'n_estimators': [50, 100, 200]}),
                    'Random Forest Regressor': (RandomForestRegressor(), {'n_estimators': [50, 100, 200]}),
                    'Neural Network Regressor': (MLPRegressor(), {'hidden_layer_sizes': [(50,), (100,), (100, 50)]}),
                    'Linear Regression': (LinearRegression(), {'copy_X': [True, False]}),
                    'KNN Regressor': (KNeighborsRegressor(), {'n_neighbors': [3, 5, 7]}),
                }

            # Define your preprocessing techniques
            numerical_preprocessing_methods = {
                #'SimpleImputer': SimpleImputer(),
                'StandardScaler': StandardScaler(),
                'RobustScaler': RobustScaler(),
                'MinMaxScaler': MinMaxScaler(),
                'PolynomialFeatures': PolynomialFeatures(),
                'PCA': PCA()
            }

            # NOTE(review): LabelEncoder's fit/transform take a single array of labels,
            # so it is presumably not usable as a ColumnTransformer step — confirm.
            categorical_preprocessing_methods = {
                'OneHotEncoder': OneHotEncoder(handle_unknown='ignore'),
                'LabelEncoder': LabelEncoder(),
            }

            feature_selection_methods = {
                'SelectKBest_f_regression': SelectKBest(f_regression),
                'SelectKBest_chi2': SelectKBest(chi2)
            }

            # Handle numerical columns. A falsy result is normalised to an empty list
            # once, so every later loop over it is safe.
            numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
            numerical_cols_processed = process_numerical_cols(X_train, numerical_cols, numerical_preprocessing_methods, models, y_train) or []

            # Handle categorical columns (same normalisation as above)
            categorical_cols = X_train.select_dtypes(include=['object', 'category']).columns
            categorical_cols_processed = process_categorical_cols(X_train, categorical_cols, categorical_preprocessing_methods, feature_selection_methods, models, y_train) or []

            # Concatenate the processed numerical and categorical columns
            cols = numerical_cols_processed + categorical_cols_processed

            # Describe every selected (name, transformer, columns) step for reporting
            preprocessing_techniques = []
            for col, preprocess_name in numerical_cols_processed:
                if preprocess_name is not None:
                    preprocessing_techniques.append((f'num_preprocess_{col}', numerical_preprocessing_methods[preprocess_name], [col]))

            for col, preprocess_name, feature_selection_name in categorical_cols_processed:
                if preprocess_name is not None:
                    preprocessing_techniques.append((f'cat_preprocess_{col}', categorical_preprocessing_methods[preprocess_name], [col]))
                    if feature_selection_name is not None:
                        preprocessing_techniques.append((f'feature_selection_{col}', feature_selection_methods[feature_selection_name], [col]))

            # Convert X_train and X_test to pandas DataFrames if they are not already in that format
            X_train = pd.DataFrame(X_train)
            X_test = pd.DataFrame(X_test)

            # Convert column names to strings
            X_train.columns = X_train.columns.astype(str)
            X_test.columns = X_test.columns.astype(str)

            # Fit every transformer on the training split only and remember the fitted
            # objects in application order. The test split is then transformed with
            # exactly those objects: re-fitting on the already-transformed training
            # column (or calling transform on an unfitted transformer) would give the
            # test data different, or no, parameters.
            fitted_numeric = []
            for col, preprocess_name in numerical_cols_processed:
                if preprocess_name is not None:
                    preprocessor = ColumnTransformer([(preprocess_name, numerical_preprocessing_methods[preprocess_name], [col])], remainder='passthrough')
                    X_train[[col]] = preprocessor.fit_transform(X_train[[col]], y_train)
                    fitted_numeric.append((col, preprocessor))

            # NOTE(review): remainder='drop' keeps only the transformed column, so every
            # other feature is discarded after the first categorical step — confirm this
            # is intended before relying on the categorical path.
            fitted_categorical = []
            for col, preprocess_name, feature_selection_name in categorical_cols_processed:
                if preprocess_name is not None:
                    preprocessor = ColumnTransformer([(preprocess_name, categorical_preprocessing_methods[preprocess_name], [col])], remainder='drop')
                    X_train = preprocessor.fit_transform(X_train, y_train)
                    X_train = pd.DataFrame(X_train)  # Convert back to pandas DataFrame
                    fitted_categorical.append(preprocessor)
                    if feature_selection_name is not None:
                        feature_selector = ColumnTransformer([(feature_selection_name, feature_selection_methods[feature_selection_name], [col])])
                        X_train = feature_selector.fit_transform(X_train, y_train)
                        X_train = pd.DataFrame(X_train)  # Convert back to pandas DataFrame
                        fitted_categorical.append(feature_selector)

            # Apply the training-fitted transformers to the testing data, in the same order
            for col, preprocessor in fitted_numeric:
                X_test[[col]] = preprocessor.transform(X_test[[col]])

            for fitted_step in fitted_categorical:
                X_test = pd.DataFrame(fitted_step.transform(X_test))  # Convert back to pandas DataFrame

            results_df_sorted, pipeline_counts = perform_grid_search_parallel(models, cols, X_train, y_train, X_test, y_test, task_type, preprocessing_techniques)

            excel_file_name = f"results/{csv_file.split('.')[0]}_{percent_notations[percentage]}_results_sorted.xlsx"
            excel_file_path = excel_file_name

            # Add the "File Name" column to the DataFrame and move it to the first position
            results_df_sorted.insert(0, "File Name", csv_file.split('.')[0])

            # Create the output directory if it doesn't exist
            os.makedirs("results", exist_ok=True)

            # Replace any result file left over from a previous run
            if os.path.exists(excel_file_path):
                os.remove(excel_file_path)
            results_df_sorted.to_excel(excel_file_path, index=False, sheet_name='Results')

            total_pipelines = sum(count for count, _, _ in pipeline_counts)
            total_preprocessing_steps = sum(len(steps) for _, steps, _ in pipeline_counts)

            logger.info(f"Total Models: {len(models)}")
            logger.info(f"Total Pipelines: {total_pipelines}")
            logger.info(f"Total Preprocessing Methods: {len(preprocessing_techniques)}")
            logger.info(f"Total Preprocessing Steps: {total_preprocessing_steps}")

            # The table is already sorted by 'Best Score', so row 0 is the best CV result
            best_train_result = results_df_sorted.iloc[0]
            logger.info("Best Train Result:")
            logger.info(best_train_result)

            best_test_result = results_df_sorted.sort_values(by='Test Score', ascending=False).iloc[0]
            logger.info("Best Test Result:")
            logger.info(best_test_result)

            # Print the path of the saved Excel file
            logger.info(f"Sorted Results saved to: {excel_file_path}")

# TPOT

In [None]:
import os
import pandas as pd
from tpot import TPOTClassifier, TPOTRegressor
from sklearn.model_selection import train_test_split
import time

# Define the folder path containing the datasets
folder_path = 'datasets/INT'

# Get the list of CSV files in the folder
csv_files = [file for file in os.listdir(folder_path) if file.endswith('.csv')]

# Column layout of the results table. Only the TPOT columns are filled by this cell;
# the remaining columns stay empty.
result_columns = ['Dataset', 'Task Type', 'Observation Percentage', 'TPOT Score', 'Auto-sklearn Score', 'H2O AutoML Score',
                  'TPOT Execution Time', 'Auto-sklearn Execution Time', 'H2O AutoML Execution Time',
                  'TPOT Best Model', 'Auto-sklearn Best Model', 'H2O AutoML Best Model',
                  'TPOT Pipeline Scores', 'Auto-sklearn Pipeline Scores', 'H2O AutoML Pipeline Scores']

# Rows are collected in a plain list and converted to a DataFrame once at the end:
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, and it
# copied the whole frame on every call.
result_rows = []

# Iterate over each CSV file in the folder
for csv_file in csv_files:
    # Construct the full file path
    file_path = os.path.join(folder_path, csv_file)

    # Load the dataset
    data = pd.read_csv(file_path)

    print(file_path)

    # Assume the target variable column name is 'target'
    target_variable = 'target'

    # Split the data into features (X) and target variable (y)
    X = data.drop(columns=[target_variable])
    y = data[target_variable]

    # Define the task type from the share of distinct target values in the full dataset
    unique_percentage = y.nunique() / y.shape[0]
    if unique_percentage <= 0.05:
        task_type = "classification"
    elif unique_percentage <= 0.1:
        task_type = "multiclassification"
    else:
        task_type = "regression"

    print(task_type)

    # Iterate over different observation percentages
    for percentage in [0.25, 0.5, 1.0]:
        # Determine the number of observations based on the percentage
        num_observations = int(len(X) * percentage)

        # Take the first ``num_observations`` rows as the subset
        X_subset = X.iloc[:num_observations]
        y_subset = y.iloc[:num_observations]

        # Split the subset data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X_subset, y_subset, test_size=0.2, random_state=42)

        # TPOT: both classification flavours use the classifier, everything else the regressor
        tpot_start_time = time.time()
        if task_type == "classification" or task_type == "multiclassification":
            tpot = TPOTClassifier(generations=5, population_size=50, random_state=42, verbosity=2)
        else:
            tpot = TPOTRegressor(generations=5, population_size=50, random_state=42, verbosity=2)

        tpot.fit(X_train, y_train)
        # Only the fit is timed; scoring happens after the clock is stopped
        tpot_execution_time = time.time() - tpot_start_time
        tpot_score = tpot.score(X_test, y_test)
        tpot_best_model = tpot.fitted_pipeline_
        tpot_pipeline_scores = tpot.evaluated_individuals_

        # Record the results of this run.
        # NOTE(review): the best model and the pipeline scores are stored as Python
        # objects — confirm the Excel writer in use can serialise them.
        result_rows.append({'Dataset': csv_file, 'Task Type': task_type, 'Observation Percentage': percentage,
                            'TPOT Score': tpot_score,
                            'TPOT Execution Time': tpot_execution_time,
                            'TPOT Best Model': tpot_best_model,
                            'TPOT Pipeline Scores': tpot_pipeline_scores})

# Build the results table in the declared column order
results_df = pd.DataFrame(result_rows, columns=result_columns)

# Save the results to an Excel file
results_file = 'tpot_results.xlsx'
results_df.to_excel(results_file, index=False)
print("Results saved to:", results_file)

# Datasets Split to Subsets

In [None]:
import csv
import os

# Folder path containing the CSV files
folder_path = "datasets/INT"

# Folder that receives the generated subset files
new_folder_path = "new_folder_with_csvs"

# Iterate over each file in the folder
for file_name in os.listdir(folder_path):
    # Only CSV files are split
    if not file_name.endswith(".csv"):
        continue

    # Construct the full path to the CSV file
    file_path = os.path.join(folder_path, file_name)

    # Read the CSV file; newline='' lets the csv module handle line breaks itself
    with open(file_path, mode='r', newline='') as file:
        data = list(csv.reader(file))

    if not data:
        # An empty file has neither a header nor observations to split
        continue

    # The first row is the header. It is kept out of the observation count so that
    # a subset holds int(rows * percentage) data rows, matching the row count used
    # when the same datasets are subset with pandas.
    header, rows = data[0], data[1:]

    # Create the output folder once per file instead of once per percentage
    os.makedirs(new_folder_path, exist_ok=True)

    # Iterate over different observation percentages
    for percentage in [0.25, 0.5, 1.0]:
        # Determine the number of observations based on the percentage
        num_observations = int(len(rows) * percentage)

        # Generate the new CSV file name, e.g. "25p_<original name>"
        new_file_name = f"{int(percentage * 100)}p_{file_name}"
        new_file_path = os.path.join(new_folder_path, new_file_name)

        # Write the header followed by the first ``num_observations`` data rows
        with open(new_file_path, mode='w', newline='') as new_file:
            writer = csv.writer(new_file)
            writer.writerow(header)
            writer.writerows(rows[:num_observations])

        print(f"New CSV file {new_file_name} created in the folder {new_folder_path}.")


# H2O

In [None]:
import os
import pandas as pd
import h2o
from h2o.automl import H2OAutoML
import time

def run_h2o_automl(csv_file):
    """
    Train H2O AutoML on one CSV file and return its leaderboard and best model.

    The function calls ``h2o.init()`` itself but does not shut H2O down afterwards;
    that is left to the caller.

    Parameters
    ----------
    csv_file : str
        Path of the CSV file to import; it must contain a ``'target'`` column.

    Returns
    -------
    tuple
        ``(leaderboard_df, best_model, pipeline_scores)``: the AutoML leaderboard as
        returned by ``as_data_frame()``, the leader model, and the leader's scoring
        history wrapped in a pandas DataFrame.
    """
    h2o.init()

    # Import the dataset as an H2O frame
    frame = h2o.import_file(csv_file)

    # Budget: 300 s for the whole AutoML run, at most 60 s per individual model
    automl = H2OAutoML(max_runtime_secs=300, max_runtime_secs_per_model=60)
    automl.train(y='target', training_frame=frame)

    # Leaderboard of all trained models
    ranking = automl.leaderboard.as_data_frame()

    # Best model and its scoring history
    top_model = automl.leader
    history = pd.DataFrame(top_model.scoring_history())

    return ranking, top_model, history

# Column layout of the results table. The leaderboard column uses the same name as
# the key written for every run, so the data lands in the declared column instead
# of leaving one empty column and creating an extra one.
result_columns = ['Dataset', 'H2O leaderboard_df', 'H2O AutoML Execution Time', 'H2O AutoML Best Model', 'H2O AutoML Pipeline Scores']

# Rows are collected in a list and converted once at the end; DataFrame.append was
# deprecated in pandas 1.4 and removed in pandas 2.0.
result_rows = []

folder_path = 'new_folder_with_csvs'

for filename in os.listdir(folder_path):
    if filename.endswith('.csv'):
        csv_file = os.path.join(folder_path, filename)
        print(csv_file)

        # Time the complete AutoML run, including H2O start-up and data import
        start_time = time.time()

        leaderboard_df, best_model, pipeline_scores = run_h2o_automl(csv_file)

        end_time = time.time()
        execution_time = end_time - start_time

        # Record the results of this run.
        # NOTE(review): the leaderboard and scoring history are stored as DataFrame
        # objects inside single cells — confirm the Excel writer can serialise them.
        result_rows.append({
            'Dataset': csv_file,
            'H2O leaderboard_df': leaderboard_df,
            'H2O AutoML Execution Time': execution_time,
            'H2O AutoML Best Model': best_model.model_id,
            'H2O AutoML Pipeline Scores': pipeline_scores
        })

        # Stop H2O after each dataset and pause before the next run starts it again
        h2o.shutdown()
        time.sleep(5)


# Build the results table in the declared column order and save it
results_df = pd.DataFrame(result_rows, columns=result_columns)

output_file = 'h2o_excel_file2.xlsx'
results_df.to_excel(output_file, index=False)

## Correlation between Execution Time and Model Accuracy Scores

In [None]:
import pandas as pd

def _time_score_correlation(frame, tool):
    """Return the correlation between the '<tool> time' and '<tool> score' columns."""
    return frame[f'{tool} time'].corr(frame[f'{tool} score'])


# Recorded execution times and scores per subset for the three tools;
# None marks entries with no recorded H2O value.
data = {
    'Subset ID': list(range(1, 31)),
    'TPOT time': [
        11.31737, 18.6722, 104.2202, 55.56861, 136.1373, 18.39068,
        62.11533, 78.67906, 77.77921, 112.801, 74.58719, 142.5261,
        83.22899, 113.8769, 146.8045, 177.662, 98.19801, 90.08522,
        110.0856, 65.36148, 120.7857, 316.6925, 23.63281, 367.1847,
        82.60517, 1118.463, 700.4246, 1394.041, 858.111, 3765.884,
    ],
    'TPOT score': [
        1, 1, 1, 1, 1, 1,
        0, 0, 0, 0, 0, 0.965517,
        0.95614, 0.894737, 0.855072, 0.828571, 0.884058, 0.772727,
        0.74359, 0.753247, 0.877358, 1, 0.862559, 0.988889,
        0.977778, 0.983333, 0, 0, 0, 0,
    ],
    'AutoFlex time': [
        0.234, 1.6476, 0.2208, 2.5347, 2.4179, 1.916,
        1.5536, 1.5288, 0.8817, 2.3265, 1.7511, 2.9757,
        3.1438, 1.5556, 4.1173, 3.8609, 3.6914, 2.9864,
        3.5765, 0.5166, 0.5116, 17.08, 5.0032, 35.623,
        10.107, 20.344, 7.1832, 31.097, 15.37, 61.619,
    ],
    'AutoFlex score': [
        1, 1, 0.95, 1, 0.9857, 1,
        0.7199, 0.9427, 0.9382, 0.7736, 0.704, 0.8706,
        0.9909, 0.9802, 0.9868, 0.8913, 0.9053, 0.8877,
        0.7688, 0.7843, 0.8081, 0.8788, 0.9048, 0.8874,
        0.9526, 0.9805, 0.9791, 0.8041, 0.7803, 0.8052,
    ],
    'H2O score': [
        0, None, 0.306, 0.52, 0.121, None,
        0.391759, 0.22964, 0.277712, 0.47598, 0.32591, 0.73521,
        0.33, 0.186, 0.281, 0.885, 0.914, 0.904,
        0.1554, 0.163, 0.1757, None, None, None,
        0.46, 0.2731, 0.424, 0.1604, 0.1459, 0.1809,
    ],
    'H2O time': [
        299.4589, None, 300.4525, 307.3777, 299.2099, None,
        295.9526, 306.3068, 299.7177, 298.5576, 299.0544, 306.3237,
        299.0565, 301.3412, 299.1535, 299.331, 299.4442, 298.9338,
        294.89, 298.8632, 298.4761, None, None, None,
        300.2729, 305.1297, 301.2625, 307.6791, 307.1827, 307.7052,
    ],
}

df = pd.DataFrame(data)

# Correlation between execution time and score, one coefficient per tool
correlation_tpot = _time_score_correlation(df, 'TPOT')
correlation_autoflex = _time_score_correlation(df, 'AutoFlex')
correlation_h2o = _time_score_correlation(df, 'H2O')

print("Correlation coefficient for TPOT: ", correlation_tpot)
print("Correlation coefficient for AutoFlex: ", correlation_autoflex)
print("Correlation coefficient for H2O AutoML: ", correlation_h2o)

## Datasets Shape

In [2]:
import os
import pandas as pd

# Folder whose CSV files are inspected
folder_path = "datasets/finished"

# Collect the names of all CSV files in the folder
csv_files = []
for entry in os.listdir(folder_path):
    if entry.endswith(".csv"):
        csv_files.append(entry)

# Load each CSV file and report its (rows, columns) shape
for file in csv_files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_csv(file_path)
    shape = df.shape
    print(f"{file}: {shape}")