In [None]:
import pandas as pd 
import numpy as np
# Show every column of wide DataFrames: None removes the cap on displayed columns.
pd.options.display.max_columns = None
from IPython.display import display, Markdown, HTML

In [None]:

#Function to print dataframe side by side in jupyter
#https://stackoverflow.com/questions/38783027/jupyter-notebook-display-two-pandas-tables-side-by-side
def display_side_by_side(dfs:list, captions:list, tablespacing=5):
    """Render several tables next to each other in a single HTML output.

    Input:
        dfs: list of pandas.DataFrame
        captions: list of table captions, paired with dfs by position
                  (pairing stops at the shorter of the two lists)
        tablespacing: number of non-breaking spaces emitted after each table
    """
    fragments = []
    for title, frame in zip(captions, dfs):
        # An inline-styled table lets the next table flow onto the same row.
        styler = frame.style.set_table_attributes("style='display:inline'")
        styler = styler.set_caption(title)
        fragments.append(styler._repr_html_())
        fragments.append(tablespacing * "\xa0")
    display(HTML("".join(fragments)))

In [None]:
#Function to print a description of each cluster
#df: dataframe with cluster labels as a column
#drop_cols: list of columns to drop (e.g. ID, BAN, IMSI, etc.). If there are none, leave it blank
#cluster_label_string: Name of the column with the cluster labels
#print_by_column: True - will print the cluster description by column. False - will print the cluster description by cluster

def cluster_description(df, drop_cols=[] , cluster_label_string='', print_by_column=True):
    """Print summary statistics for every cluster found in ``df``.

    Text-like columns (object, string and categorical dtypes) are summarised
    with normalised value counts; every other column is summarised with
    ``describe()``. The cluster-label column itself is never described.

    Parameters
    ----------
    df : pandas.DataFrame
        Data that contains one column of cluster labels. It is not modified.
    drop_cols : list, optional
        Columns to leave out of the description (e.g. identifiers).
        The default list is only read, never mutated.
    cluster_label_string : str
        Name of the column holding the cluster labels.
    print_by_column : bool, default True
        True  - one table per column, with one table column per cluster.
        False - one section per cluster.

    Returns
    -------
    None
        Everything is printed / displayed. Nothing is shown when ``df`` has
        no rows.

    Raises
    ------
    KeyError
        If ``cluster_label_string`` is not a column of ``df`` once
        ``drop_cols`` have been removed.
    """
    temp_df = df.drop(columns=drop_cols)
    if cluster_label_string not in temp_df.columns:
        raise KeyError(
            f"cluster label column {cluster_label_string!r} not found; "
            f"available columns: {list(temp_df.columns)}"
        )

    labels = temp_df[cluster_label_string]
    # Dropping the label here keeps both print modes consistent: neither one
    # describes the (constant within a cluster) label column.
    features = temp_df.drop(columns=cluster_label_string)

    clusters = np.sort(labels.unique())
    if len(clusters) == 0:
        return

    # Slice each cluster once instead of re-filtering for every column.
    cluster_frames = [(cluster, features.loc[labels == cluster]) for cluster in clusters]

    # Lists (not sets) so the output follows the DataFrame's column order.
    categorical_cols = [col for col, dtype in features.dtypes.items() if _is_text_like_dtype(dtype)]
    numeric_cols = [col for col, dtype in features.dtypes.items() if not _is_text_like_dtype(dtype)]

    if print_by_column:
        _print_description_by_column(cluster_frames, list(features.columns), categorical_cols)
    else:
        _print_description_by_cluster(cluster_frames, numeric_cols, categorical_cols)


def _is_text_like_dtype(dtype):
    """Return True for dtypes summarised with value counts (object, string, categorical)."""
    return (
        pd.api.types.is_object_dtype(dtype)
        or pd.api.types.is_string_dtype(dtype)
        or isinstance(dtype, pd.CategoricalDtype)
    )


def _print_description_by_column(cluster_frames, columns, categorical_cols):
    """Display one table per column, holding one summary column per cluster."""
    categorical = set(categorical_cols)
    for col in columns:
        print(f"------------------------- Column: {col} ------------------------- \n")

        summaries = []
        for cluster, frame in cluster_frames:
            if col in categorical:
                summary = frame[col].value_counts(normalize=True).sort_index()
            else:
                summary = frame[col].describe()
            # Name the Series directly: the name pandas gives a value_counts
            # result differs between versions, so renaming by the source
            # column name is not reliable.
            summaries.append(summary.rename('Cluster: ' + str(cluster)))

        display(pd.concat(summaries, axis=1).rename_axis(index=None))
        print("\n")


def _print_description_by_cluster(cluster_frames, numeric_cols, categorical_cols):
    """Display one section per cluster: numeric summary, then value counts side by side."""
    for cluster, frame in cluster_frames:
        print(f"------------------------- Cluster: {cluster} ------------------------- \n")

        # describe() raises on a frame without columns, so skip it in that case.
        if numeric_cols:
            display(frame[numeric_cols].describe())

        print("\n")

        tables = [
            frame[col].value_counts(normalize=True).sort_index().to_frame()
            for col in categorical_cols
        ]
        captions = [f"{col}_value_counts" for col in categorical_cols]
        if tables:
            display_side_by_side(tables, captions)
    
            
    