In [2]:
import itertools
import numpy as np
import pandas as pd
import upsetplot
import seaborn as sns
from statsmodels.graphics.mosaicplot import mosaic

In [3]:
# Remove a previously registered "datex" accessor (the name registered by the
# class below) so the registration cell can be re-run from a clean state.
# The original statement targeted ``pd.Dataframe.datanull``: ``pd.Dataframe``
# does not exist, so it always raised AttributeError and never removed anything.
try:
    del pd.DataFrame.datex
except AttributeError:
    # Accessor not registered yet (e.g. first run): nothing to remove.
    pass

In [24]:
@pd.api.extensions.register_dataframe_accessor("datex")
class DontMissMe:
    """Missing-value exploration helpers, registered as ``DataFrame.datex``.

    The accessor wraps the DataFrame it is accessed from (``self._obj``) and
    offers tabular summaries of null values plus a set of plotting helpers.
    """

    def __init__(self, pandas_obj):
        # The DataFrame this accessor instance is bound to.
        self._obj = pandas_obj

    # ------------------------------------------------------------------
    # Tabular summaries
    # ------------------------------------------------------------------

    def nnull(self) -> int:
        """Return the total number of null cells in the DataFrame."""
        return self._obj.isna().sum().sum()

    def data_no_null(self) -> int:
        """Return the number of non-null cells (``size`` minus null cells)."""
        return self._obj.size - self.nnull()

    def percentage_null(self) -> float:
        """Return the percentage (0-100) of cells that are null."""
        return self.nnull() / self._obj.size * 100

    def percentage_null_variable(self) -> pd.DataFrame:
        """Summarise nulls per column.

        Returns:
            One row per column with ``variable`` (column label), ``null``
            (null count) and ``percentage`` (null count / row count * 100).
        """
        null_mask = self._obj.isnull()
        return (
            null_mask.sum()
            .reset_index(name="null")
            .rename(columns={"index": "variable"})
            .assign(percentage=lambda x: x["null"] / len(null_mask) * 100)
        )

    def percentage_null_case(self) -> pd.DataFrame:
        """Summarise nulls per row.

        Returns:
            A frame indexed like the original with ``case`` (the index
            value), ``n_null`` (null count in the row) and ``percentage``
            (``n_null`` / number of columns * 100).
        """
        # Count on the original frame only: computing the percentage inside a
        # chained ``assign`` would see the helper columns added before it and
        # inflate the column count used as denominator.
        n_null = self._obj.isna().sum(axis=1)
        n_columns = self._obj.shape[1]
        return pd.DataFrame(index=self._obj.index).assign(
            case=lambda x: x.index,
            n_null=n_null,
            percentage=n_null / n_columns * 100,
        )

    def maxi_null_case(self) -> int:
        """Return the largest number of nulls found in a single row."""
        largest = self.percentage_null_case()["n_null"].max()
        return int(largest)

    def table_data_exp(self) -> pd.DataFrame:
        """Return a two-column (``Data``, ``value``) overview table.

        Rows: total cells, total nulls, total non-nulls, null percentage and
        the maximum number of nulls in one row.
        """
        return pd.DataFrame.from_dict(
            {
                "Data": [
                    "total_data",
                    "total_nulls",
                    "total_no_null",
                    "%_null",
                    "max_null/case",
                ],
                "value": [
                    self._obj.size,
                    self.nnull(),
                    self.data_no_null(),
                    self.percentage_null(),
                    self.maxi_null_case(),
                ],
            }
        )

    def null_for_period(self, variable: str, period: int) -> pd.DataFrame:
        """Summarise nulls of ``variable`` over consecutive blocks of rows.

        Args:
            variable: Column to inspect.
            period: Number of consecutive rows per block (must be >= 1).

        Returns:
            One row per block with ``period_counter``, ``n_null``,
            ``n_complete``, ``pct_null`` and ``pct_complete``.

        Raises:
            ValueError: If ``period`` is smaller than 1.
        """
        if period < 1:
            raise ValueError("period must be a positive integer")
        # Row i belongs to block i // period (0, 0, ..., 1, 1, ...).
        block_ids = np.arange(self._obj.shape[0]) // period
        return (
            self._obj.assign(period_counter=block_ids)
            .groupby("period_counter")
            .agg(
                x=(variable, "size"),
                n_null=(variable, lambda s: s.isnull().sum()),
            )
            .assign(
                n_complete=lambda df: df.x - df.n_null,
                pct_null=lambda df: df.n_null / df.x * 100,
                pct_complete=lambda df: 100 - df.pct_null,
            )
            .drop(columns=["x"])
            .reset_index()
        )

    def sort_variables_by_null(self, ascending: bool = False) -> pd.DataFrame:
        """Return the DataFrame with columns reordered by their null count.

        Args:
            ascending: Sort direction of the null counts (default: columns
                with the most nulls first).
        """
        ordered_columns = (
            self._obj.isna().sum().sort_values(ascending=ascending).index
        )
        return self._obj[ordered_columns]

    def continuous_data_period(self, variable: str) -> pd.DataFrame:
        """Return the run lengths of consecutive data / null values.

        Args:
            variable: Column to inspect.

        Returns:
            One row per run with ``period_data`` (run length), ``data/null``
            (``"data"`` or ``"null"``) and ``case`` (run position).
        """
        # Label each run while building it; a frame-wide replace of
        # True/False would also be applied to the integer run-length column.
        runs = [
            [len(list(group)), "null" if is_null else "data"]
            for is_null, group in itertools.groupby(self._obj[variable].isnull())
        ]
        return pd.DataFrame(
            data=runs,
            columns=["period_data", "data/null"],
        ).assign(case=lambda x: x.index)

    def create_shadow_matrix(
        self,
        true_string: str = "null",
        false_string: str = "data",
        only_null: bool = False,
        suffix: str = "_NA",
    ) -> pd.DataFrame:
        """Build a shadow matrix marking each cell as null or data.

        Args:
            true_string: Label written where the cell is null.
            false_string: Label written where the cell holds data.
            only_null: Keep only the columns that contain at least one null.
            suffix: Suffix appended to every column label.
        """
        return (
            self._obj.isna()
            .pipe(lambda df: df[df.columns[df.any()]] if only_null else df)
            .replace({False: false_string, True: true_string})
            .add_suffix(suffix)
        )

    def shadow_matrix(
        self,
        true_string: str = "null",
        false_string: str = "data",
        only_null: bool = False,
        suffix: str = "_NA",
    ) -> pd.DataFrame:
        """Return the DataFrame with its shadow matrix appended as columns.

        Arguments are forwarded unchanged to ``create_shadow_matrix``.
        """
        shadow = self.create_shadow_matrix(
            true_string=true_string,
            false_string=false_string,
            only_null=only_null,
            suffix=suffix,
        )
        return pd.concat(objs=[self._obj, shadow], axis="columns")

    def range_per_variable(self) -> pd.Series:
        """Return ``max - min`` for every numeric column."""
        return self._obj.max(numeric_only=True) - self._obj.min(numeric_only=True)

    def quantile(self) -> pd.DataFrame:
        """Return the quartiles and interquartile range of the numeric columns.

        Returns:
            One row per numeric column with ``variable``, the ``0.25``,
            ``0.5`` and ``0.75`` quantiles, and ``iqr`` (0.75 - 0.25).
        """
        return (
            self._obj
            # Restrict to numeric columns (as range_per_variable does) so that
            # text columns do not make the quantile computation fail.
            .quantile(q=[.25, .5, .75], numeric_only=True)
            .transpose()
            .rename_axis("variable")
            .reset_index()
            .assign(iqr=lambda df: df[.75] - df[.25])
        )

    def category_per_variable(self, variable: str) -> pd.DataFrame:
        """Return the distinct values of ``variable`` sorted ascending.

        Args:
            variable: Column to inspect.

        Returns:
            A single-column frame (``value``); nulls are placed first.
        """
        # Read from the bound DataFrame rather than a notebook-level global.
        return (
            pd.DataFrame(self._obj[variable].unique())
            .rename(columns={0: "value"})
            .sort_values(["value"], ascending=True, na_position="first")
        )

    def categorical_ordinal(self) -> pd.DataFrame:
        """Return a copy with object/category columns ordinal-encoded.

        Remaining columns are passed through. The transformer prefixes are
        stripped from the resulting column names.
        """
        # NOTE(review): relies on ``sklearn.compose`` and
        # ``sklearn.preprocessing`` being imported elsewhere; they are not in
        # the visible import cell - confirm.
        categorical_columns = self._obj.select_dtypes(
            include=[object, "category"]
        ).columns

        categorical_transformer = sklearn.compose.make_column_transformer(
            (sklearn.preprocessing.OrdinalEncoder(), categorical_columns),
            remainder="passthrough",
        )
        return pd.DataFrame(
            categorical_transformer.fit_transform(self._obj),
            columns=categorical_transformer.get_feature_names_out(),
            index=self._obj.index,
        ).rename(
            # Plain pandas rename: no extra DataFrame extension is required.
            columns=lambda name: (
                name.removeprefix("ordinalencoder__").removeprefix("remainder__")
            )
        )

    # ------------------------------------------------------------------
    # Plot
    # ------------------------------------------------------------------

    def null_for_period_plot(
        self, variable: str, period: int, color: str = "cividis_r"
    ):
        """Draw a bar plot of the null percentage per block of ``period`` rows.

        Args:
            variable: Column to inspect.
            period: Number of consecutive rows per block.
            color: Palette passed to seaborn.
        """
        (
            sns.barplot(
                data=self.null_for_period(variable=variable, period=period),
                x="period_counter",
                y="pct_null",
                palette=color,
            ).set(
                ylim=(0, 100),
                title=f"Percentage values null for period of {period} data",
            )
        )

    def continuous_data_period_plot(self, variable: str, color: str = "gray_r"):
        """Draw a line plot of the data / null run lengths of ``variable``.

        Args:
            variable: Column to inspect.
            color: Palette passed to seaborn.
        """
        (
            sns.lineplot(
                data=self.continuous_data_period(variable=variable),
                x="case",
                y="period_data",
                color="gray",
                palette=color,
            ).set(
                title=f"Continuous data/null period of variable {variable}(d={self._obj.shape[0]})"
            )
        )

    def proportion_null_data(self, color: str = "cividis_r"):
        """Draw, per column, the filled proportion of null vs. non-null cells.

        Returns whatever ``sns.displot`` returns.
        """
        long_mask = self._obj.isnull().melt()
        return sns.displot(
            data=long_mask,
            y="variable",
            hue="value",
            multiple='fill',
            aspect=2,
            palette=color,
        )

    def null_per_variable_plot(self):
        """Draw a heatmap of the null mask (columns sorted by null count).

        The mask is transposed, so variables are on the heatmap rows.
        """
        return (
            self.sort_variables_by_null()
            .isnull()
            .transpose()
            .pipe(lambda df: sns.heatmap(data=df))
        )

    def sort_null_per_variable_plot(self, variable: str):
        """Draw a heatmap of the null mask with rows sorted by ``variable``.

        Args:
            variable: Column of the null mask used to sort the rows.
        """
        return (
            self.sort_variables_by_null()
            .isnull()
            .sort_values(by=variable)
            .pipe(lambda df: sns.heatmap(data=df))
        )

    def null_upsetplot(self, variables: list[str] = None, **kwargs):
        """Draw an UpSet plot of the null-value combinations.

        Args:
            variables: Columns to include; all columns when ``None``.
            **kwargs: Forwarded to ``upsetplot.plot``.
        """
        if variables is None:
            variables = self._obj.columns.tolist()

        return (
            self._obj.isna()
            .value_counts(variables)
            .pipe(lambda df: upsetplot.plot(df, **kwargs))
        )

    def null_variable_plot(self):
        """Draw a lollipop chart of the number of nulls per variable."""
        # NOTE(review): relies on ``plt`` (presumably matplotlib.pyplot) being
        # imported elsewhere; it is not in the visible import cell - confirm.
        df = self.percentage_null_variable().sort_values("null")

        plot_range = range(1, len(df.index) + 1)

        # The summary table stores the counts in the "null" column.
        plt.hlines(y=plot_range, xmin=0, xmax=df["null"], color="black")

        plt.plot(df["null"], plot_range, "o", color="black")

        plt.yticks(plot_range, df.variable)

        plt.grid(axis="y")

        plt.xlabel("Number null")
        plt.ylabel("Variable")

    def null_data_variable_box(self, x: str, y: str, color: str = "cividis_r"):
        """Draw a boxen plot over the DataFrame joined with its shadow matrix.

        Args:
            x: Column for the x axis (may be a shadow column such as
                ``"<name>_NA"``).
            y: Column for the y axis.
            color: Palette passed to seaborn.
        """
        with_shadow = self.shadow_matrix(only_null=True)
        return sns.boxenplot(
            data=with_shadow,
            x=x,
            y=y,
            palette=color,
        )

    def null_mosaic_plot(
        self,
        target: str,
        x_categorical: str,
        y_categorical: str,
        size: tuple[int, int] = (10, 10),
    ):
        """Draw a mosaic plot of ``target`` missingness across two categoricals.

        Args:
            target: Column whose null / non-null status is displayed.
            x_categorical: First categorical column.
            y_categorical: Second categorical column.
            size: Figure size passed to ``plt.subplots``.
        """
        # NOTE(review): relies on ``plt`` (presumably matplotlib.pyplot) being
        # imported elsewhere; it is not in the visible import cell - confirm.
        _fig, ax = plt.subplots(figsize=size)
        return (
            # Plain column selection: no extra DataFrame extension required.
            self._obj[[target, x_categorical, y_categorical]]
            .assign(
                **{
                    target: lambda df: df[target]
                    .isna()
                    .replace([True, False], ["NA", "!NA"])
                }
            )
            .groupby(
                [y_categorical, x_categorical, target],
                dropna=False,
                as_index=True,
            )
            .size()
            .pipe(
                lambda df: mosaic(
                    data=df,
                    # Tiles whose key contains "NA" (missing target) are red.
                    properties=lambda key: {"color": "r" if "NA" in key else "y"},
                    horizontal=True,
                    axes_label=True,
                    title="",
                    labelizer=lambda key: "",
                    ax=ax,
                )
            )
        )

  class DontMissMe:
