# Functions María

### Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import pearsonr, f_oneway, mannwhitneyu

In [2]:
# Load the datasets
titanic_df = pd.read_csv('../data/titanic.csv')
cities_df = pd.read_csv('../data/california_cities.csv')
inmo_df = pd.read_csv('../data/ejemplo_housing.csv')
flights_df = pd.read_csv('../data/dataset_viajes_jun23.csv')
customers_df = pd.read_csv('../data/Marketing-Customer-Analysis.csv')

In [3]:
cities_df

Unnamed: 0.1,Unnamed: 0,city,latd,longd,elevation_m,elevation_ft,population_total,area_total_sq_mi,area_land_sq_mi,area_water_sq_mi,area_total_km2,area_land_km2,area_water_km2,area_water_percent
0,0,Adelanto,34.576111,-117.432778,875.0,2871.0,31765,56.027,56.009,0.018,145.107,145.062,0.046,0.03
1,1,AgouraHills,34.153333,-118.761667,281.0,922.0,20330,7.822,7.793,0.029,20.260,20.184,0.076,0.37
2,2,Alameda,37.756111,-122.274444,,33.0,75467,22.960,10.611,12.349,59.465,27.482,31.983,53.79
3,3,Albany,37.886944,-122.297778,,43.0,18969,5.465,1.788,3.677,14.155,4.632,9.524,67.28
4,4,Alhambra,34.081944,-118.135000,150.0,492.0,83089,7.632,7.631,0.001,19.766,19.763,0.003,0.01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
477,477,Yountville,38.403056,-122.362222,30.0,98.0,2933,1.531,1.531,0.000,3.966,3.966,0.000,0.00
478,478,Yreka,41.726667,-122.637500,787.0,2582.0,7765,10.053,9.980,0.073,26.036,25.847,0.188,0.72
479,479,YubaCity,39.134722,-121.626111,18.0,59.0,64925,14.656,14.578,0.078,37.959,37.758,0.201,0.53
480,480,Yucaipa,34.030278,-117.048611,798.0,2618.0,51367,27.893,27.888,0.005,72.244,72.231,0.013,0.02


### Reviewed: describe_df()

In [4]:
def describe_df(df): 
    """
    Generates a summary DataFrame describing the input DataFrame's data types, percentage of missing values, number of unique values and cardinality (percentage of unique values).
    
    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame to be described.
    
    Returns
    -------
    df_summary: pd.DataFrame
        A DataFrame with a summary of data types, missing values, unique values and cardinality for each column of the input DataFrame.
    
    Raises
    ------
    TypeError
        If the input is not a pandas DataFrame.
    
    ValueError
        If the DataFrame is empty.
    """

    # Validate input type
    if not isinstance(df, pd.DataFrame):
        raise TypeError(f'Input must be a pandas DataFrame, but received {type(df).__name__}.')
    
    # Calculate the length of the DataFrame once
    num_rows = len(df)
    
    # Validate DataFrame length to prevent dividing by 0 later on
    if num_rows == 0:
        raise ValueError('The DataFrame is empty.')
    
    # Calculate data types, missing values percentage, unique values and cardinality
    data_type = df.dtypes
    missings = round(df.isna().sum() / num_rows * 100, 2)
    unique_values = df.nunique()
    cardin = round(unique_values / num_rows * 100, 2)
    
    # Construct the summary DataFrame
    df_summary = pd.DataFrame({
        'DATA_TYPE': data_type,
        'MISSINGS (%)': missings,
        'UNIQUE_VALUES': unique_values,
        'CARDIN (%)': cardin
    }).T

    return df_summary


In [5]:
def describe_df_extra(df, count = False): 
    """
    Generates a summary DataFrame describing the input DataFrame's data types, percentage of missing values, number of unique values, cardinality (percentage of unique values), and optionally, the count of non-null values.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame to be described.
    
    count : bool, optional
        If True, includes the count of non-null values in each column (default is False).
    
    Returns
    -------
    df_summary: pd.DataFrame
        A DataFrame with a summary of data types, missing values, unique values, cardinality, and optionally, the count of non-null values for each column.
    
    Raises
    ------
    TypeError
        If the input is not a pandas DataFrame.
    
    ValueError
        If the DataFrame is empty.
    """

    # Validate input type
    if not isinstance(df, pd.DataFrame):
        raise TypeError(f'Input must be a pandas DataFrame, but received {type(df).__name__}.')
    
    # Calculate the length of the DataFrame once
    num_rows = len(df)
    
    # Validate DataFrame length to prevent dividing by 0 later on
    if num_rows == 0:
        raise ValueError('The DataFrame is empty.')
    
    # Calculate data types, missing values percentage, unique values and cardinality
    data_type = df.dtypes
    missings = round(df.isna().sum() / num_rows * 100, 2)
    unique_values = df.nunique()
    cardin = round(unique_values / num_rows * 100, 2)
    
    # Construct the summary DataFrame
    df_summary = pd.DataFrame({
        'DATA_TYPE': data_type,
        'MISSINGS (%)': missings,
        'UNIQUE_VALUES': unique_values,
        'CARDIN (%)': cardin
    })
    
    # Optionally add the count of non-null values and rearrange the columns
    if count:
        not_null_count = df.notna().sum()
        df_summary.insert(1, 'NOT-NULL COUNT', not_null_count)

    return df_summary.T


In [6]:
describe_df(titanic_df)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
DATA_TYPE,int64,int64,object,float64,int64,int64,float64,object,object,object,bool,object,object,object,bool
MISSINGS (%),0.0,0.0,0.0,19.87,0.0,0.0,0.0,0.22,0.0,0.0,0.0,77.22,0.22,0.0,0.0
UNIQUE_VALUES,2,3,2,88,7,7,248,3,3,3,2,7,3,2,2
CARDIN (%),0.22,0.34,0.22,9.88,0.79,0.79,27.83,0.34,0.34,0.34,0.22,0.79,0.34,0.22,0.22


In [7]:
describe_df_extra(titanic_df, True)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
DATA_TYPE,int64,int64,object,float64,int64,int64,float64,object,object,object,bool,object,object,object,bool
NOT-NULL COUNT,891,891,891,714,891,891,891,889,891,891,891,203,889,891,891
MISSINGS (%),0.0,0.0,0.0,19.87,0.0,0.0,0.0,0.22,0.0,0.0,0.0,77.22,0.22,0.0,0.0
UNIQUE_VALUES,2,3,2,88,7,7,248,3,3,3,2,7,3,2,2
CARDIN (%),0.22,0.34,0.22,9.88,0.79,0.79,27.83,0.34,0.34,0.34,0.22,0.79,0.34,0.22,0.22


### Reviewed: tipifica_variables()

In [8]:
def tipifica_variables(df, umbral_categoria, umbral_continua):
    """
    Classifies the columns of a DataFrame based on their cardinality and percentage cardinality.
    
    Parameters
    ----------
    df : pandas.DataFrame
        The input DataFrame whose columns will be classified.
    umbral_categoria : int
        The threshold for categorical variables. Columns with unique values less than or equal to this threshold will be classified as 'Categorica'.
    umbral_continua : float
        The threshold for continuous numerical variables, based on the percentage of unique values in the column. 
        If the percentage of unique values is greater than or equal to this threshold, the column is classified as 'Numerica Continua'.

    Returns
    -------
    df_type : pandas.DataFrame
        A DataFrame with columns 'nombre_variable' (variable names) and 'tipo_sugerido' (suggested type based on cardinality and percentage).
        It provides the column names and their suggested type classification based on cardinality thresholds.
    
    Raises
    ------
    TypeError
        If the input `df` is not a pandas DataFrame, or if `umbral_categoria` is not an integer, or `umbral_continua` is not a float.
    """
    
    # Validate input types
    if not isinstance(df, pd.DataFrame):
        raise TypeError(f'Parameter df must be a pandas DataFrame, but received {type(df).__name__}.')
    if not isinstance(umbral_categoria, int):
        raise TypeError(f'Parameter umbral_categoria must be int, but received {type(umbral_categoria).__name__}.')
    if not isinstance(umbral_continua, float):
        raise TypeError(f'Parameter umbral_continua must be float, but received {type(umbral_continua).__name__}.')
    
    # Get the number of rows in the DataFrame
    num_rows = len(df) 
    
    # Lists to store column names and their suggested types
    col_name = []
    suggested_type = []

    # Loop through each column in the DataFrame
    for col in df.columns:
        # Calculate cardinality and percentage cardinality
        cardinality = df[col].nunique()
        percentage_cardinality = cardinality / num_rows * 100
        
        # Classify the variable based on cardinality and percentage cardinality
        if cardinality == 2:
            type_classification = 'Binaria'
        elif cardinality < umbral_categoria:
            type_classification = 'Categorica'
        else:
            type_classification = 'Numerica Continua' if percentage_cardinality >= umbral_continua else 'Numerica Discreta'
        
        # Add the column name and its classification to the respective lists
        col_name.append(col)
        suggested_type.append(type_classification)
    
    # Create a DataFrame with the column names and their suggested types
    df_type = pd.DataFrame({'nombre_variable': col_name, 'tipo_sugerido': suggested_type})
        
    # Return the final DataFrame with classifications
    return df_type


In [9]:
def tipifica_variables_extra(df, umbral_categoria, umbral_continua, *, show_cardinality=False, show_percentage=False):
    """
    Classifies the columns of a DataFrame based on their cardinality and percentage cardinality.
    
    Parameters
    ----------
    df : pandas.DataFrame
        The input DataFrame whose columns will be classified.
    umbral_categoria : int
        The threshold for categorical variables. Columns with unique values less than or equal to this threshold will be classified as 'Categorica'.
    umbral_continua : float
        The threshold for continuous numerical variables, based on the percentage of unique values in the column. 
        If the percentage of unique values is greater than or equal to this threshold, the column is classified as 'Numerica Continua'.
    show_cardinality : bool, optional (default=False)
        If True, includes the cardinality (number of unique values) of each column in the output DataFrame.
    show_percentage : bool, optional (default=False)
        If True, includes the percentage of unique values (cardinality relative to the total number of rows) of each column in the output DataFrame.

    Returns
    -------
    df_type : pandas.DataFrame
        A DataFrame with columns 'nombre_variable', 'tipo_sugerido', and optionally 'cardinalidad' and '%_cardinalidad'based on the input flags (show_cardinality and show_percentage).
        The DataFrame provides the column names and their suggested type classification.
    
    Raises
    ------
    TypeError
        If the input `df` is not a pandas DataFrame, or if `umbral_categoria` is not an integer, or `umbral_continua` is not a float.
    """
    
    # Validate input types
    if not isinstance(df, pd.DataFrame):
        raise TypeError(f'Parameter df must be a pandas DataFrame, but received {type(df).__name__}.')
    if not isinstance(umbral_categoria, int):
        raise TypeError(f'Parameter umbral_categoria must be int, but received {type(umbral_categoria).__name__}.')
    if not isinstance(umbral_continua, float):
        raise TypeError(f'Parameter umbral_continua must be float, but received {type(umbral_continua).__name__}.')

    # Get the number of rows in the DataFrame
    num_rows = len(df) 
    
    # Lists to store column names and their suggested type
    col_name = []
    suggested_type = []
    
    # Lists to store cardinality and percentage, if required
    if show_cardinality:
        cardinality_list = []
    if show_percentage:
        percentage_list = []

    # Loop through each column in the DataFrame
    for col in df.columns:
        # Calculate cardinality and percentage cardinality
        cardinality = df[col].nunique()
        percentage_cardinality = cardinality / num_rows * 100
        
        # Classify the variable based on cardinality and percentage cardinality
        if cardinality == 2:
            type_classification = 'Binaria'
        elif cardinality < umbral_categoria:
            type_classification = 'Categorica'
        else:
            type_classification = 'Numerica Continua' if percentage_cardinality >= umbral_continua else 'Numerica Discreta'
        
        # Add column name and its classification to their respective lists
        col_name.append(col)
        suggested_type.append(type_classification)
        
        # If show_cardinality is True, store the cardinality value
        if show_cardinality:
            cardinality_list.append(cardinality)
        # If show_percentage is True, store the percentage cardinality, rounded to 2 decimal places
        if show_percentage:
            percentage_list.append(round(percentage_cardinality, 2))
    
    # Create a DataFrame with column names and their suggested types
    df_type = pd.DataFrame({'nombre_variable': col_name, 'tipo_sugerido': suggested_type})
    
    # Insert additional columns based on the flags: show_cardinality and show_percentage
    if show_cardinality and show_percentage:
        df_type.insert(1, 'cardinalidad', cardinality_list)
        df_type.insert(2, '%_cardinalidad', percentage_list)
    elif show_cardinality:
        df_type.insert(1, 'cardinalidad', cardinality_list)
    elif show_percentage:
        df_type.insert(1, '%_cardinalidad', percentage_list)

    # Return the final DataFrame with the classifications
    return df_type


In [10]:
tipifica_variables_extra(titanic_df, 3, 9.6, show_cardinality = True, show_percentage = True)

Unnamed: 0,nombre_variable,cardinalidad,%_cardinalidad,tipo_sugerido
0,survived,2,0.22,Binaria
1,pclass,3,0.34,Numerica Discreta
2,sex,2,0.22,Binaria
3,age,88,9.88,Numerica Continua
4,sibsp,7,0.79,Numerica Discreta
5,parch,7,0.79,Numerica Discreta
6,fare,248,27.83,Numerica Continua
7,embarked,3,0.34,Numerica Discreta
8,class,3,0.34,Numerica Discreta
9,who,3,0.34,Numerica Discreta


### Reviewed: get_features_num_regression()

In [11]:
def get_features_num_regression(df, target_col, umbral_corr, *, pvalue = None, card = 20):
    """
    Identifies numeric columns in a DataFrame whose correlation with the 'target_col' exceeds a specified
    correlation threshold and, optionally, passes a statistical significance test based on the p-value.

    Parameters
    ----------
    df: pandas.DataFrame
        DataFrame containing the data.
    target_col: str
        Target column to correlate with other numeric columns.
    umbral_corr: float 
        Correlation threshold for filtering columns (absolute value between 0 and 1).
    pvalue : float, optional
        Significance level to filter statistically significant correlations (between 0 and 1).
    card: int, float
        Minimum cardinality required for 'target_col' to be considered continuous.

    Returns
    -------
    features_num: list
        A list of numeric column names whose correlation with 'target_col' exceeds the threshold.
    """
    
    # Validate the DataFrame
    if not isinstance(df, pd.DataFrame):
        print('The "df" parameter must be a pandas DataFrame.')
        return None
    
    # Validate target_col exists in the DataFrame
    if target_col not in df.columns:
        print(f'The column "{target_col}" is not present in the DataFrame.')
        return None
    
    # Validate target_col and card are numeric
    if not pd.api.types.is_numeric_dtype(df[target_col]):
        print(f'The column "{target_col}" must be numeric.')
        return None
    
    if not isinstance(card, (int, float)):
        print('The "card" parameter must be a number (int or float).')
        return None
    
    # Validate target_col has high cardinality
    percentage_card = df[target_col].nunique() * 100
    if percentage_card <= card:
        print(f'The column "{target_col}" does not have sufficient cardinality. More than {card}% of unique values are required.')
        return None
    
    # Validate umbral_corr is a float between 0 and 1
    if not isinstance(umbral_corr, (int, float)) or not (0 <= umbral_corr <= 1):
        print('The "umbral_corr" value must be a number between 0 and 1.')
        return None
    
    # Validate pvalue is a float between 0 and 1 if provided
    if pvalue is not None:
        if not isinstance(pvalue, (int, float)) or not (0 <= pvalue <= 1):
            print('The "pvalue" must be "None" or a number (float) between 0 and 1.')
            return None
    
    # Select numeric columns excluding the target column
    numeric_cols = df.select_dtypes(include = [int, float]).columns.difference([target_col])
    
    # Initialize the list to store selected features
    features_num = []
    
    # Calculate correlations and filter by threshold
    for col in numeric_cols:
        corr, p_val = pearsonr(df[col], df[target_col])
        if abs(corr) > umbral_corr:
            if pvalue is None or p_val <= pvalue:
                features_num.append(col)

    # Return the list of selected numeric features
    return features_num


In [37]:
def get_features_num_regression_extra(df, target_col, umbral_corr, *, pvalue = None, card = 20, return_values = False):
    """
    Identifies numeric columns in a DataFrame whose correlation with 'target_col' exceeds a specified
    correlation threshold (absolute value) and, optionally, passes a statistical significance test based on the p-value.
    Optionally, returns detailed information about the correlations and p-values of the filtered features.

    Parameters
    ----------
    df : pandas.DataFrame
        The DataFrame containing the data.
    target_col : str
        The target column name to calculate correlation with other numeric columns.
    umbral_corr : float
        The correlation threshold to filter columns (absolute value between 0 and 1).
    pvalue : float, optional
        The significance level to filter statistically significant correlations (between 0 and 1). Default is None.
    card : int, float, optional
        The minimum cardinality required for 'target_col' to be considered continuous. Default is 20.
    return_values : bool, optional
        If True, returns a DataFrame with correlations and p-values for each filtered column. Default is False.

    Returns
    -------
    features_num : list
        A list of column names whose correlation with 'target_col' exceeds the 'umbral_corr' threshold.
    all_values : pandas.DataFrame, optional
        If `return_values=True`, returns a DataFrame containing the correlation and p-value for each selected feature, 
        sorted by the correlation in descending order. Columns are named 'corr' and 'p_value'.
    """
    
    # Validate the DataFrame
    if not isinstance(df, pd.DataFrame):
        print('The "df" parameter must be a pandas DataFrame.')
        return None
    
    # Validate target_col exists in the DataFrame
    if target_col not in df.columns:
        print(f'The column "{target_col}" is not present in the DataFrame.')
        return None
    
    # Validate target_col and card are numeric
    if not pd.api.types.is_numeric_dtype(df[target_col]):
        print(f'The column "{target_col}" must be numeric.')
        return None
    
    if not isinstance(card, (int, float)):
        print('The "card" parameter must be a number (int or float).')
        return None
    
    # Validate target_col has high cardinality
    percentage_card = df[target_col].nunique() * 100
    if percentage_card <= card:
        print(f'The column "{target_col}" does not have sufficient cardinality. More than {card}% of unique values are required.')
        return None
    
    # Validate umbral_corr is a float between 0 and 1
    if not isinstance(umbral_corr, (int, float)) or not (0 <= umbral_corr <= 1):
        print('The "umbral_corr" value must be a number between 0 and 1.')
        return None
    
    # Validate pvalue is a float between 0 and 1 if provided
    if pvalue is not None:
        if not isinstance(pvalue, (int, float)) or not (0 <= pvalue <= 1):
            print('The "pvalue" must be "None" or a number (float) between 0 and 1.')
            return None
    
    # Select numeric columns excluding the target column
    numeric_cols = df.select_dtypes(include = [int, float]).columns.difference([target_col])
    
    # Initialize the list to store selected features
    features_num = []
    
    # Initialize dictionary to store all correlations and p-values if return_values is True
    if return_values:
        all_values = {}
    
    # Calculate correlations and filter by threshold
    for col in numeric_cols:
        corr, p_val = pearsonr(df[col], df[target_col])
        if abs(corr) > umbral_corr:
            if pvalue is None or p_val <= pvalue:
                features_num.append(col)
                if return_values:
                    all_values[col] = {'corr': corr, 'p_value': p_val}
    

    # Return features_num and, if requested, a DataFrame with correlations and p-values
    if return_values:
        return features_num, pd.DataFrame(all_values).T.sort_values('corr', ascending = False)
    else:
        return features_num


In [40]:
get_features_num_regression_extra(titanic_df, 'fare', 0, return_values = False)

['parch', 'pclass', 'sibsp', 'survived']

### Review: plot_features_num_regression()

In [11]:
def plot_features_num_regression(df, target_col="", card=20, columns=[], umbral_corr=0, pvalue=None):

    """
    Generates pair plots for numeric columns in a DataFrame based on their correlation with a specified target column.
    Pair plots are generated in maximum 5x5 grids.
    If specific numeric columns are not specified the function will filter the numeric columns in the DataFrame based on
    a specified correlation threshold ('umbral_corr') and optionally a p-value significance level.
    Checks the threshold conditions of specified columns and offers options to remove if columns are not valid or continue
    anyway.
    
    Parameters:
    df (pd.DataFrame): Dataframe containing data.
    target_col (str): The target column to correlate with other numeric columns. Must be numeric continuous variable with high cardinality.
    card (int): Cardinality threshold checks for sufficient unique values in 'target_col'
    umbral_corr (float): Correlation threshold (between 0 and 1) for correlation testing if numeric columns are not specified.
    pvalue (float, optional, Defaul=None): Signifance level (between 0 and 1) for the correlation testing if numeric columns are not specified.

    Returns:
    list ('columns'): List of columns used for generating the pair plots
    """

    # First carry out checks to prevent errors
    #1. Check df is a dataframe
    if not isinstance(df, pd.DataFrame):
        print('First arguement must be a Pandas DataFrame.')
        return None

    #2. Check target_col is in DataFrame, and is numeric and continuous (high cardinality)
    if target_col not in df.columns or not (pd.api.types.is_numeric_dtype(df[target_col]) and df[target_col].nunique() > card):
        print(f"The target column ('{target_col}') must be a numeric continuous variable with high cardinality.\nCheck 'card' value")
        return None
    
    #3. Check pvalue is float between 0 and 1 (and not (0 <= pvalue => 1)
    if pvalue is not None:
        if not isinstance(pvalue, (int, float)) or not (0 <= pvalue <= 1):
            print("'pvalue' must be 'None' or a number between 0 and 1.")
            return None

    # If no numeric columns are specified, get columns using function 'get_features_num_regression()' based on 'umbral_corr' and 'pvalue'
    if not columns:
        columns = get_features_num_regression(df=df, target_col=target_col, umbral_corr=umbral_corr, pvalue=pvalue)
    else:
        valid_cols = [] # Create empty list to store columns that meet threshold conditions
        for col in columns: # Loop through columns in columns list
            if col == target_col:
                continue # Skip the target column itself as already been checked for validity

            # Calculate pearsonr corr stat and p_value between column and target column
            corr, p_val = pearsonr(df[col], df[target_col])

            # Check corr stat and p-value meet specified thresholds
            if abs(corr) > umbral_corr:
                if pvalue is None or p_val <= pvalue:
                    valid_cols.append(col) # add column to valid_cols list if it meets both thresholds
                else:
                    # Warn that column does not meet the required p-value significance level
                    print(f"'{col}' did not meet the p-value signifcance level")
                    # Ask if you want to remove the column or continue anyway
                    question = input(f"Do you want to remove '{col}' from the columns list or continue anyway? Type 'remove' or 'continue'").strip().lower()

                    if question == 'continue': 
                        valid_cols.append(col) # adds column to valid_cols list if user types continue
                    else:
                        print(f"'{col}' was removed from columns list")
                        continue
            
            else:
                # Warn that column does not meet the required correlation threshold
                print(f"'{col}' did not meet the correlation threshold of {umbral_corr}.")
                # Ask if you want to remove the column or continue anyway
                question = input(f"Do you want to remove '{col}' from the columns list or continue anyway? Type 'remove' or 'continue'").strip().lower()
                if question == 'continue':
                    valid_cols.append(col) # adds column to valid_cols list if user types continue
                else:
                    print(f"'{col}' was removed from columns list")
                    continue
        
        if valid_cols: # Check there are still valid columns left in valid_cols
            columns = valid_cols # Sets columns to valid_columns after checks and warnings
        else:
            columns = get_features_num_regression(df=df, target_col=target_col, umbral_corr=umbral_corr, pvalue=pvalue)

    columns = [col for col in columns if col != target_col] # Make sure target is not in columns list to plot
    print(f"columns selected for pair plot analysis were: {columns}")
    
    # Generate pair plots in max 5x5 grids
    for i in range(0, len(columns), 4):
        sns.pairplot(df, vars=[target_col] + columns[i:i + 4])
        plt.show()

    # Return the selected numeric columns list 'columns'
    return columns

### Review: get_features_cat_regression()

In [12]:
# from scipy.stats import f_oneway, mannwhitneyu
# possibile addition? --> see comment marked ???
def get_features_cat_regression(df, target_col, pvalue=0.05, card=20):
    """
    Identifies and evaluates the significance of relationship between categorical columns and a specified numeric target column in a DataFrame.
    Uses ANOVA for multi-cats or Mann_whitney U for binary-cats
    Stores and returns a list of columns that have show a significant relationship with target column based on spcifed (optionally) pvalue.
    
    Parameters:
    df (pd.DataFrame): DataFrame containing the data
    target_col (str): Numeric target column for testing relationship with categorical columns
    pvalue (float, optional, Default=0.05): Significance level (between 0 and 1) for statistical test evaluation.
    card (int): Cardinality threshold (based on unique values) to determine if a column should be considered categorical.

    Returns:
    list ('categorical_features'): A list of categorical columns that have a significant relationship with target column based on pvalue arguement.
    """
    # Carry out input data checks
    #1. Check df is a dataframe
    if not isinstance(df, pd.DataFrame):
        print('First arguement must be a Pandas DataFrame.')
        return None
    
    #2. Check target column in DataFrame
    if not target_col in df.columns:
        print(f"The target column ('{target_col}') must be in the DataFrame.")
        return None
    
    # Check target column is numeric and has sufficiently high cardinality
    if not (pd.api.types.is_numeric_dtype(df[target_col]) and df[target_col].nunique() > card):
        print(f"The target column ('{target_col}') must be a numeric continuous variable with high cardinality.\nCheck 'card' value")

    # Check pvalue is float between 0 and 1 (and not (0 <= pvalue => 1)
    if pvalue is not None:
        if not isinstance(pvalue, (int, float)) or not (0 <= pvalue <= 1):
            print("'pvalue' must be a 'None' or a number between 0 and 1.")
            return None
    
    # Create empty list to store columns considered to have statistically significant relationship with target column
    categorical_features = []

    # Loop through each column in the DataFrame
    for col in df.columns:
        if col == target_col: # Skip target column itself
            continue
        
        # Check the cardinality of column to decide if categorical or not
        if len(df[col].unique()) <= card: #??? Could we add if 'df[col].dtype == 'object' to this if?
            # If categorical and binary perform Mann-Whitney U test
            if df[col].nunique() == 2:
                groupA = df[df[col] == df[col].unique()[0]][target_col]
                groupB = df[df[col] == df[col].unique()[1]][target_col]

                p_val = mannwhitneyu(groupA, groupB).pvalue
            
            else:
                # If categorical with more than 2 groups, perform ANOVA test
                groups = df[col].unique()
                target_by_groups = [df[df[col] == group][target_col] for group in groups]

                p_val = f_oneway(*target_by_groups).pvalue

            # Check p-val against pvalue arguement to see if significance threshold is met
            if p_val <= pvalue:
                categorical_features.append(col) # Add to categorical_features list if deemed significant

    # Return list of categorical features
    return categorical_features

### Review: plot_features_cat_regression()

Esta función recibe un dataframe, una argumento "target_col" con valor por defecto "", una lista de strings ("columns") cuyo valor por defecto es la lista vacía, un argumento ("pvalue") con valor 0.05 por defecto y un argumento "with_individual_plot" a False.

Si la lista no está vacía, la función pintará los histogramas agrupados de la variable "target_col" para cada uno de los valores de las variables categóricas incluidas en columns que cumplan que su test de relación con "target_col" es significatio para el nivel 1-pvalue de significación estadística. La función devolverá los valores de "columns" que cumplan con las condiciones anteriores. 

Si la lista está vacía, entonces la función igualará "columns" a las variables numéricas del dataframe y se comportará como se describe en el párrafo anterior.

De igual manera que en la función descrita anteriormente deberá hacer un check de los valores de entrada y comportarse como se describe en el último párrafo de la función `get_features_cat_regression`.