In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

In [169]:
def variable_type(df, discrete_threshold = 9, continuous_threshold = 15, sort_ascending = None, sugg_type = None, index = None):
    '''
    Calculates cardinality and suggests a variable type for each column of a dataframe. It also suggests variables to use as index.

    Rules are applied in order and a later rule overrides an earlier one:
    every column starts as 'Categorical', columns with exactly two distinct values become 'Binary',
    then the two % cardinality thresholds are applied. As a consequence, a two-valued column in a very
    short dataframe can end up labelled as numeric. Columns with a single distinct value get their
    % cardinality forced to 0 so they always stay 'Categorical'.

    A column is flagged as possible index when its % cardinality is exactly 100 and its name contains 'id' (case-insensitive).

    Args:
        df (DataFrame): dataframe to analyze
        discrete_threshold (int | float): minimum % cardinality (unique values / rows * 100) to consider the variable as a numeric discrete type.
        continuous_threshold (int | float): minimum % cardinality (unique values / rows * 100) to consider the variable as a numeric continuous type.
        sort_ascending (None | bool): sorts by % cardinality, useful if suggested index is not correct. No sorting when it is not a bool.
        sugg_type (string | None): filters dataframe by suggested type, case-insensitive. Text that appears literally in one of the
            labels (e.g. 'numeric', 'Numeric (discrete)') is matched as plain text; anything else is treated as a regular
            expression (e.g. 'Binary|Categorical').
        index (None | bool): True keeps only possible index columns, False keeps only the rest. No filtering when it is not a bool.

    Returns:
        DataFrame: one row per column of df with the columns 'cardinality', '%_cardinality', 'type', 'suggested_type' and 'possible_index'.
    '''
    # Dataframe creation: nunique() is computed once and each column keeps its own dtype
    cardinality = df.nunique()
    df_temp = pd.DataFrame(
        {
            'cardinality': cardinality.to_numpy(),
            '%_cardinality': (cardinality / len(df) * 100).to_numpy(),
            'type': df.dtypes.to_numpy(),
        },
        index = df.columns,
    )

    # Suggested type based on calculated cardinality
    df_temp['suggested_type'] = 'Categorical'
    df_temp.loc[df_temp['cardinality'] == 1, '%_cardinality'] = 0.00
    df_temp.loc[df_temp['cardinality'] == 2, 'suggested_type'] = 'Binary'
    df_temp.loc[df_temp['%_cardinality'] >= discrete_threshold, 'suggested_type'] = 'Numeric (discrete)'
    df_temp.loc[df_temp['%_cardinality'] >= continuous_threshold, 'suggested_type'] = 'Numeric (continuous)'

    # Index suggestion. Column names are cast to str so that non-string names (e.g. integers) do not break the .str accessor
    df_temp['possible_index'] = False
    name_has_id = df_temp.index.astype(str).str.contains('id', case = False, regex = False)
    index_cond = (df_temp['%_cardinality'] == 100) & name_has_id
    df_temp.loc[index_cond, 'possible_index'] = True

    # Returns dataframe sorted by % cardinality, useful if suggested index is not correct
    if isinstance(sort_ascending, bool):
        df_temp = df_temp.sort_values(by = '%_cardinality', ascending = sort_ascending)

    # Returns dataframe that only includes specified suggested types
    if sugg_type:
        # The labels contain parentheses, which a regex would read as a group and fail to match literally.
        # Text found verbatim inside a label is therefore matched as plain text; the rest is still a regex.
        known_labels = ('Categorical', 'Binary', 'Numeric (discrete)', 'Numeric (continuous)')
        is_literal = any(sugg_type.lower() in label.lower() for label in known_labels)
        type_mask = df_temp['suggested_type'].str.contains(sugg_type, case = False, regex = not is_literal)
        df_temp = df_temp.loc[type_mask]

    # Returns dataframe with possible index. Can also be set to exclude possible index suggestions
    if isinstance(index, bool):
        df_temp = df_temp.loc[df_temp['possible_index'] == index]

    return df_temp

In [None]:
def bar_donut_chart(df, variable):
    '''
    Plots the value counts of a column as a horizontal bar chart next to a donut chart and shows the figure.

    Args:
        df (DataFrame): dataframe that contains the column to plot
        variable (string): name of the column whose value counts are plotted

    Returns:
        None
    '''
    colors = ['#6CB4EE', '#318CE7', '#6495ED', '#87CEFA']

    # Counted once and shared by both charts
    category_counts = df[variable].value_counts()

    fig, (bar_ax, donut_ax) = plt.subplots(1, 2, figsize = (14, 4))

    # Left: horizontal bars, one per category
    bars = bar_ax.barh(category_counts.index, category_counts.values, color = colors)
    bar_ax.set_title(f'{variable} Distribution', fontsize = 16)
    bar_ax.set_xlabel('Count')
    bar_ax.set_ylabel(f'{variable} Types')

    # Count label at the end of each bar, vertically centered on it
    for bar in bars:
        width = bar.get_width()
        bar_ax.text(width, bar.get_y() + bar.get_height() / 2, f'{int(width)}',
                    ha = 'left', va = 'center', color = 'black', fontsize = 10)

    # Right: pie chart with percentage labels
    donut_ax.pie(category_counts, labels = category_counts.index, colors = colors, autopct = '%1.1f%%', startangle = 140)

    # A white circle over the center turns the pie into a donut. It is added to the pie axes
    # explicitly instead of relying on whichever axes happens to be the current one.
    centre_circle = plt.Circle((0, 0), 0.70, fc = 'white')
    donut_ax.add_artist(centre_circle)
    donut_ax.axis('equal')

    plt.tight_layout()
    plt.show()