# In [None]:  (cell marker left over from the Jupyter notebook export)
import pandas as pd
import sys
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import *
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression, chi2
from sklearn.decomposition import PCA
from sklearn.exceptions import ConvergenceWarning
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import *
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.metrics import r2_score, accuracy_score
from tqdm.auto import tqdm
from joblib import Parallel, delayed
import time
import warnings
import logging

# Suppress scikit-learn ConvergenceWarning messages for the whole process.
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Set up logging: configure the root logger at INFO level and create the
# module-level logger that the functions below write to.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def load_dataset(file_path):
    """
    Read a CSV source into a pandas DataFrame.

    `file_path` is handed to `pandas.read_csv` unchanged; an INFO message is
    logged before the read starts and after it finishes.
    Returns the parsed DataFrame.
    """
    logger.info(f"Loading dataset from {file_path}...")
    frame = pd.read_csv(file_path)
    logger.info("Dataset loaded.")
    return frame

def handle_missing_values(data):
    """
    Fill missing values in the dataset.

    Columns of dtype int64/float64 are filled with their own column mean.
    Columns of dtype object/category are filled with the literal "Unknown".

    The DataFrame is modified in place and the same object is returned, so
    callers may use either the return value or the frame they passed in.
    """
    logger.info("Handling missing values...")
    numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns
    categorical_cols = data.select_dtypes(include=['object', 'category']).columns

    # Fill missing values with the mean for numerical columns
    if len(numerical_cols) > 0:
        data[numerical_cols] = data[numerical_cols].fillna(data[numerical_cols].mean())

    # Fill missing values with "Unknown" for categorical columns
    for col in categorical_cols:
        column = data[col]
        if not column.isna().any():
            continue
        # A pandas Categorical only accepts fill values that are declared
        # categories, so "Unknown" has to be registered before filling.
        if isinstance(column.dtype, pd.CategoricalDtype) and 'Unknown' not in column.cat.categories:
            column = column.cat.add_categories('Unknown')
        data[col] = column.fillna('Unknown')

    logger.info("Missing values handled.")
    return data

def split_data(data, target_variable):
    """
    Separate a DataFrame into its feature columns and its target column.

    Returns a `(X, y)` pair: `X` is `data` without the `target_variable`
    column, `y` is the `target_variable` column itself.
    """
    logger.info("Splitting the data into features and target variable...")
    features_and_target = (
        data.drop(columns=[target_variable]),
        data[target_variable],
    )
    logger.info("Data split complete.")
    return features_and_target

def preprocess_numerical_col(col, preprocessing_methods, X_train, y_train, models):
    """
    Find the best preprocessing technique for one numerical column.

    Every (technique, model) combination is fitted on `X_train`/`y_train` with
    the technique applied to `col` only (all other columns are passed through)
    and scored with `pipeline.score` on that same training data. The technique
    belonging to the highest score wins.

    Parameters:
        col: name of the column to preprocess.
        preprocessing_methods: dict mapping technique name -> transformer.
        X_train, y_train: training features and target.
        models: dict mapping model name -> (estimator, param_grid); only the
            estimator is used here.

    Returns:
        Always a 2-tuple `(col, best_preprocess)`. `best_preprocess` is None
        when no combination could be fitted.
    """
    best_preprocess = None
    best_score = None
    for preprocess_name, preprocess_method in preprocessing_methods.items():
        preprocessor = ColumnTransformer([(preprocess_name, preprocess_method, [col])], remainder='passthrough')
        for model_name, (model, _model_params) in models.items():
            pipeline = Pipeline([
                ('preprocessor', preprocessor),
                ('model', model)
            ])
            try:
                pipeline.fit(X_train, y_train)
                score = pipeline.score(X_train, y_train)
            except (ValueError, TypeError) as e:
                # One unusable combination must not discard the others, and the
                # result keeps the same 2-tuple shape the callers unpack.
                logger.warning("Skipping %s with %s for column %s: %s",
                               preprocess_name, model_name, col, e)
                continue
            if best_score is None or score > best_score:
                best_score = score
                best_preprocess = preprocess_name
    return col, best_preprocess


def preprocess_categorical_col(col, categorical_preprocessing_methods, feature_selection_methods, X_train, y_train, models):
    """
    Find the best encoder / feature-selection pair for one categorical column.

    Every (encoder, feature selector, model) combination is fitted on
    `X_train`/`y_train` using only `col` (all other columns are dropped) and
    scored with `pipeline.score` on that same training data.

    Parameters:
        col: name of the column to preprocess.
        categorical_preprocessing_methods: dict mapping encoder name -> transformer.
        feature_selection_methods: dict mapping selector name -> transformer.
        X_train, y_train: training features and target.
        models: dict mapping model name -> (estimator, param_grid); only the
            estimator is used here.

    Returns:
        Always a 3-tuple `(col, best_preprocess, best_feature_selection)`.
        Both names are None when no combination could be fitted.
    """
    best_preprocess = None
    best_score = None
    best_feature_selection = None  # Store the best feature selection method
    for preprocess_name, preprocess_method in categorical_preprocessing_methods.items():
        preprocessor = ColumnTransformer([(preprocess_name, preprocess_method, [col])], remainder='drop')
        for feature_selection_name, feature_selection_method in feature_selection_methods.items():
            for model_name, (model, _model_params) in models.items():
                # The selector is a plain pipeline step: it receives the encoder's
                # array output, which has no column names, so it cannot be wrapped
                # in a ColumnTransformer that looks `col` up by name.
                pipeline = Pipeline([
                    ('preprocessor', preprocessor),
                    ('feature_selection', feature_selection_method),
                    ('model', model)
                ])
                try:
                    pipeline.fit(X_train, y_train)
                    score = pipeline.score(X_train, y_train)
                except (ValueError, TypeError) as e:
                    # Skip only the failing combination and keep evaluating the rest.
                    logger.warning("Skipping %s + %s with %s for column %s: %s",
                                   preprocess_name, feature_selection_name, model_name, col, e)
                    continue
                if best_score is None or score > best_score:
                    best_score = score
                    best_preprocess = preprocess_name
                    best_feature_selection = feature_selection_name
    return col, best_preprocess, best_feature_selection

def process_numerical_cols(X_train, numerical_cols, preprocessing_methods, models, y_train):
    """
    Run `preprocess_numerical_col` for every numerical column in parallel.

    One joblib task is created per entry of `numerical_cols` (all available
    cores, `n_jobs=-1`) while a tqdm bar tracks task submission.
    Returns the list of per-column results in the order produced by joblib.
    """
    logger.info("Processing numerical columns...")
    with Parallel(n_jobs=-1) as parallel:
        column_progress = tqdm(numerical_cols, desc="Numerical Columns Preprocessing")
        tasks = (
            delayed(preprocess_numerical_col)(column, preprocessing_methods, X_train, y_train, models)
            for column in column_progress
        )
        numerical_cols_processed = parallel(tasks)
    logger.info("Numerical columns processing complete.")
    return numerical_cols_processed

def process_categorical_cols(X_train, categorical_cols, preprocessing_methods, feature_selection_methods, models, y_train):
    """
    Run `preprocess_categorical_col` for every categorical column in parallel.

    Parameters:
        X_train, y_train: training features and target.
        categorical_cols: iterable of column names to evaluate.
        preprocessing_methods: dict mapping encoder name -> transformer.
        feature_selection_methods: dict mapping selector name -> transformer.
        models: dict mapping model name -> (estimator, param_grid).

    Returns:
        List with one `preprocess_categorical_col` result per column.
    """
    logger.info("Processing categorical columns...")
    with Parallel(n_jobs=-1) as parallel:
        # Use the `preprocessing_methods` argument, not a module-level global,
        # so the function also works when called from outside the script.
        categorical_cols_processed = parallel(delayed(preprocess_categorical_col)(
            col, preprocessing_methods, feature_selection_methods, X_train, y_train, models
        ) for col in tqdm(categorical_cols, desc="Categorical Columns Preprocessing"))

    logger.info("Categorical columns processing complete.")
    return list(categorical_cols_processed)



def perform_grid_search(model_name, model, model_params, cols, X_train, y_train, X_test, y_test, task_type,
                        preprocessing_techniques):
    """
    Perform grid search with cross-validation to find the best hyperparameters for one model.

    The model is wrapped in a pipeline whose first step is a ColumnTransformer
    built from `preprocessing_techniques` (untouched columns are passed
    through), and `model_params` is searched with 5-fold cross-validation.

    Parameters:
        model_name: label used in the result and in error messages.
        model: estimator to tune.
        model_params: dict of parameter name -> candidate values.
        cols: description of the column preprocessing being evaluated. Either a
            single `(column, technique, ...)` tuple or a sequence of such
            tuples. An empty value skips the search.
        X_train, y_train, X_test, y_test: training and hold-out data.
        task_type: "Regression" selects R^2 as the training metric; any other
            value selects accuracy.
        preprocessing_techniques: list of ColumnTransformer transformer tuples.

    Returns:
        `(result, pipeline_count, preprocessing_steps, preprocessing_techniques)`.
        `result` is a dict of scores ({} when the search was skipped),
        `pipeline_count` is 1 when a search ran and 0 otherwise, and
        `preprocessing_steps` is a list of "column: technique" strings.

    Raises:
        Exception: when the grid search fails; the original error is chained.
    """
    pipeline_count = 0
    preprocessing_steps = []

    if len(cols) == 0:
        return {}, pipeline_count, preprocessing_steps, preprocessing_techniques

    pipeline_count += 1
    preprocessor = ColumnTransformer(preprocessing_techniques, remainder='passthrough')
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    param_grid = {f'model__{param_name}': param_range for param_name, param_range in model_params.items()}

    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=ConvergenceWarning)
        start_time = time.time()

        try:
            grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=KFold(n_splits=5))
            grid_search.fit(X_train, y_train)
        except Exception as e:
            # Chain the cause so the original traceback is not lost.
            raise Exception(f"Grid search failed for {model_name}: {e}") from e

        end_time = time.time()

    best_score = grid_search.best_score_
    best_params = grid_search.best_params_
    pipeline_score = cross_val_score(grid_search.best_estimator_, X_train, y_train, cv=KFold(n_splits=5)).mean()

    # The two task types differ only in the metric computed on the training
    # predictions; it is stored under 'Accuracy' in both cases.
    train_metric = r2_score if task_type == "Regression" else accuracy_score
    y_pred = grid_search.predict(X_train)
    result = {
        'Model': model_name,
        'Preprocessing': cols,
        'Best Parameters': best_params,
        'Best Score': best_score,
        'Pipeline Score': pipeline_score,
        'Accuracy': train_metric(y_train, y_pred),
        'Test Score': grid_search.score(X_test, y_test),
        'Execution Time (s)': end_time - start_time
    }

    # `cols` may be one (column, technique, ...) tuple rather than a sequence of
    # them; iterating such a tuple directly would index into the column name's
    # characters (or into None), so wrap it first.
    is_single_entry = isinstance(cols, tuple) and not isinstance(cols[0], (tuple, list))
    entries = [cols] if is_single_entry else cols
    preprocessing_steps = [f"{entry[0]}: {entry[1]}" for entry in entries]

    return result, pipeline_count, preprocessing_steps, preprocessing_techniques

def perform_grid_search_parallel(models, cols, X_train, y_train, X_test, y_test, task_type, preprocessing_techniques):
    """
    Perform grid search with cross-validation in parallel for each model and column combination.

    For every model, one `perform_grid_search` task is submitted per entry of
    `cols`; the tasks of a model run in parallel on all available cores.

    Parameters:
        models: dict mapping model name -> (estimator, param_grid).
        cols: sequence of column-preprocessing entries; each entry is passed to
            `perform_grid_search` as its `cols` argument.
        X_train, y_train, X_test, y_test: training and hold-out data.
        task_type: forwarded to `perform_grid_search`.
        preprocessing_techniques: list of ColumnTransformer transformer tuples.

    Returns:
        `(results_df_sorted, pipeline_counts)`: a DataFrame of results sorted by
        'Best Score' (descending; empty when no search produced a result) and a
        list of `(count, steps, techniques)` tuples, one per submitted task.

    Raises:
        Exception: when the parallel run for a model fails; the original error
        is chained.
    """
    pipeline_counts = []
    results = []
    total_pipelines_count = len(cols) * len(models)

    with Parallel(n_jobs=-1) as parallel:
        # The context manager closes the progress bar even if a search raises.
        with tqdm(total=total_pipelines_count, desc='Optimization Progress', leave=False) as pipeline_bar:
            for model_name, (model, model_params) in models.items():
                try:
                    processed_results = parallel(delayed(perform_grid_search)(
                        model_name, model, model_params, col_entry, X_train, y_train, X_test, y_test, task_type,
                        preprocessing_techniques
                    ) for col_entry in tqdm(cols, desc=f"Model: {model_name}", total=len(cols), file=sys.stdout))
                except Exception as e:
                    raise Exception(f"Grid search parallel failed for {model_name}: {e}") from e

                # Skipped searches return an empty dict; keep them out of the table.
                results.extend(result for result, _, _, _ in processed_results if result)

                pipeline_counts.extend((count, steps, techniques) for _, count, steps, techniques in processed_results)

                pipeline_bar.update(len(cols))

                pipeline_bar.set_postfix({'Pipelines Generated': f"{sum(count for count, _, _ in pipeline_counts)}/{total_pipelines_count}"})

    preprocessing_techniques_str = '\n'.join(f"{step[0]}: {step[1]}" for step in preprocessing_techniques)

    results_df = pd.DataFrame(results)

    # Without any result there is no 'Best Score' column to sort on.
    if results_df.empty or 'Best Score' not in results_df.columns:
        logger.warning("No grid search results were produced.")
        return results_df, pipeline_counts

    results_df['Preprocessing Techniques'] = preprocessing_techniques_str

    results_df_sorted = results_df.sort_values(by='Best Score', ascending=False)

    return results_df_sorted, pipeline_counts

if __name__ == '__main__':
    # End-to-end script: load the data, pick a preprocessing technique per
    # column, then grid-search every model on top of that preprocessing.

    # Load your dataset
    data = load_dataset('d.csv')

    # Handle missing values
    data = handle_missing_values(data)

    # Assume the target variable column name is 'target'
    target_variable = 'target'

    # Split the data into features (X) and target variable (y)
    X, y = split_data(data, target_variable)

    # Column names are used as ColumnTransformer selectors below, so convert
    # them to strings once, before any column list is derived from them.
    X.columns = X.columns.astype(str)

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Classify the target: exactly two distinct values is binary; a
    # non-numeric target, or one with few distinct values relative to the
    # number of rows, is multi-class; everything else is regression.
    unique_count = y.nunique()
    unique_percentage = unique_count / y.shape[0]
    if unique_count == 2:
        target_variable_class = "Binary"
    elif unique_percentage <= 0.1 or not pd.api.types.is_numeric_dtype(y):
        target_variable_class = "Multi-class"
    else:
        target_variable_class = "Regression"

    # Print the target variable class
    logger.info("Target Variable Class: %s", target_variable_class)

    # Define task type and candidate models based on the target variable class
    if target_variable_class == "Regression":
        task_type = "Regression"
        models = {
            'Decision Tree Regressor': (DecisionTreeRegressor(), {'max_depth': [None, 3, 5, 10]}),
            'Gradient Boosting Regressor': (GradientBoostingRegressor(), {'n_estimators': [50, 100, 200]}),
            'Random Forest Regressor': (RandomForestRegressor(), {'n_estimators': [50, 100, 200]}),
            'Neural Network Regressor': (MLPRegressor(), {'hidden_layer_sizes': [(50,), (100,), (100, 50)]}),
            # `normalize` was removed from LinearRegression in scikit-learn 1.2;
            # `fit_intercept` is a parameter that exists in every release.
            'Linear Regression': (LinearRegression(), {'fit_intercept': [True, False]}),
            'KNN Regressor': (KNeighborsRegressor(), {'n_neighbors': [3, 5, 7]}),
        }
    else:
        # Binary and multi-class tasks share the same candidates; only the
        # iteration budget of the neural network differs.
        if target_variable_class == "Binary":
            task_type = "Binary Classification"
            mlp_max_iter = 1000
        else:
            task_type = "Multi-class Classification"
            mlp_max_iter = 10000
        models = {
            'Decision Tree Classifier': (DecisionTreeClassifier(), {'max_depth': [None, 3, 5, 10]}),
            'Gradient Boosting Classifier': (GradientBoostingClassifier(), {'n_estimators': [50, 100, 200]}),
            'Random Forest Classifier': (RandomForestClassifier(), {'n_estimators': [50, 100, 200]}),
            'Neural Network Classifier': (MLPClassifier(max_iter=mlp_max_iter), {'hidden_layer_sizes': [(50,), (100,), (100, 50)]}),
            'Logistic Regression': (LogisticRegression(max_iter=10000), {'penalty': ['l2'], 'C': [0.1, 1, 10]}),
            'KNN Classifier': (KNeighborsClassifier(), {'n_neighbors': [3, 5, 7]}),
        }

    # Define your preprocessing techniques
    numerical_preprocessing_methods = {
        'SimpleImputer': SimpleImputer(),
        'StandardScaler': StandardScaler(),
        'RobustScaler': RobustScaler(),
        'MinMaxScaler': MinMaxScaler(),
        'PolynomialFeatures': PolynomialFeatures(),
        'PCA': PCA()
    }

    # LabelEncoder is meant for target labels and cannot be used as a feature
    # transformer inside a ColumnTransformer; OrdinalEncoder is the feature
    # equivalent. Categories unseen during fit are encoded as -1.
    categorical_preprocessing_methods = {
        'OneHotEncoder': OneHotEncoder(handle_unknown='ignore'),
        'OrdinalEncoder': OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
    }

    feature_selection_methods = {
        'SelectKBest_f_regression': SelectKBest(f_regression),
        'SelectKBest_chi2': SelectKBest(chi2)
    }

    # Handle numerical columns. Only the numerical columns are handed to the
    # search: it passes every other column through to the model untouched, and
    # raw string columns would make each candidate fit fail.
    numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
    numerical_cols_processed = process_numerical_cols(X_train[numerical_cols], numerical_cols, numerical_preprocessing_methods, models, y_train)

    # Handle categorical columns
    categorical_cols = X_train.select_dtypes(include=['object', 'category']).columns
    categorical_cols_processed = process_categorical_cols(X_train, categorical_cols, categorical_preprocessing_methods, feature_selection_methods, models, y_train)

    # Build the ColumnTransformer specification and a matching description of
    # the choice made for every column. Entries are read by index so that
    # result tuples carrying an extra trailing error message are accepted too.
    fallback_categorical_method = 'OneHotEncoder'
    preprocessing_techniques = []
    column_config = []

    for entry in (numerical_cols_processed or []):
        col, preprocess_name = entry[0], entry[1]
        column_config.append((col, preprocess_name))
        if preprocess_name is not None:
            preprocessing_techniques.append((f'num_preprocess_{col}', numerical_preprocessing_methods[preprocess_name], [col]))

    for entry in (categorical_cols_processed or []):
        col, preprocess_name = entry[0], entry[1]
        feature_selection_name = entry[2] if len(entry) > 2 else None
        if preprocess_name is None:
            # A categorical column must always be encoded, otherwise the model
            # would receive raw strings through the passthrough remainder.
            logger.warning("No preprocessing could be selected for column %s; falling back to %s.",
                           col, fallback_categorical_method)
            preprocess_name = fallback_categorical_method
            feature_selection_name = None
        transformer = categorical_preprocessing_methods[preprocess_name]
        if feature_selection_name is not None:
            # Feature selection has to run on the encoded values, so it is
            # chained after the encoder for this column.
            transformer = Pipeline([
                ('encode', transformer),
                ('select', feature_selection_methods[feature_selection_name])
            ])
        preprocessing_techniques.append((f'cat_preprocess_{col}', transformer, [col]))
        column_config.append((col, preprocess_name, feature_selection_name))

    # One preprocessing configuration is evaluated per model. The grid-search
    # pipeline fits the preprocessing on the training data itself, so X_train
    # and X_test are passed in raw rather than being transformed here first.
    cols = [column_config]

    results_df_sorted, pipeline_counts = perform_grid_search_parallel(models, cols, X_train, y_train, X_test, y_test, task_type, preprocessing_techniques)

    total_pipelines = sum(count for count, _, _ in pipeline_counts)
    total_preprocessing_steps = sum(len(steps) for _, steps, _ in pipeline_counts)

    logger.info(f"Total Models: {len(models)}")
    logger.info(f"Total Pipelines: {total_pipelines}")
    logger.info(f"Total Preprocessing Methods: {len(preprocessing_techniques)}")
    logger.info(f"Total Preprocessing Steps: {total_preprocessing_steps}")

    if results_df_sorted.empty:
        logger.warning("No results were produced; nothing to export.")
    else:
        excel_file_path = 'results_sorted.xlsx'
        # Spreadsheet cells hold scalars, so list/dict columns are exported as text.
        export_df = results_df_sorted.copy()
        for column in ('Preprocessing', 'Best Parameters'):
            if column in export_df.columns:
                export_df[column] = export_df[column].astype(str)
        export_df.to_excel(excel_file_path, index=False)

        best_train_result = results_df_sorted.iloc[0]
        logger.info("Best Train Result:")
        logger.info(best_train_result)

        best_test_result = results_df_sorted.sort_values(by='Test Score', ascending=False).iloc[0]
        logger.info("Best Test Result:")
        logger.info(best_test_result)

        logger.info(f"Sorted Results saved to: {excel_file_path}")