Data cleaning functions TEMP

In [None]:
def na_abs(df, ascend=False):
    """Count missing values per column, keeping only columns that have any.

    Args:
        df: DataFrame to inspect.
        ascend: Sort the counts in ascending order when True; the default
            (False) puts the column with the most missing values first.

    Returns:
        Series indexed by column name holding the number of NA cells,
        restricted to columns whose count is non-zero and sorted by count.
    """
    na_counts = df.isna().sum()
    has_na = na_counts != 0
    return na_counts.loc[has_na].sort_values(ascending=ascend)

def na_perc(df, ascend=False):
    """Compute the percentage of missing values per column.

    Args:
        df: DataFrame to inspect.
        ascend: Sort the percentages in ascending order when True; the
            default (False) puts the column with the highest share first.

    Returns:
        Series indexed by column name holding the percentage (0-100) of NA
        cells, restricted to columns with a non-zero percentage and sorted
        by that percentage.
    """
    na_share = df.isna().mean() * 100
    has_na = na_share != 0
    return na_share.loc[has_na].sort_values(ascending=ascend)

def na_absperc(df):
    """Summarise missing values per column as both a count and a percentage.

    Combines the results of ``na_abs`` and ``na_perc`` side by side.

    Args:
        df: DataFrame to inspect.

    Returns:
        DataFrame indexed by column name with two columns: ``abs_NA`` (number
        of NA cells) and ``perc_NA`` (percentage of NA cells). Only columns
        that contain at least one NA value appear.
    """
    # The helpers defined in this file are na_abs / na_perc; the previous
    # na_cols_abs / na_cols_perc names are not defined here.
    return pd.concat(
        [na_abs(df), na_perc(df)],
        axis=1,
        keys=["abs_NA", "perc_NA"],
    )


def categ_summ(df):
    """Build an extended ``describe`` summary for the object columns of *df*.

    Starts from ``df.describe(include="object")`` (transposed so every source
    column becomes a row) and adds three columns relating the number of
    distinct levels to their frequency:

    * ``unicount_ratio``: ``unique / count``.
    * ``resto_abs``: ``count - freq``, i.e. the rows that do NOT belong to
      the most frequent level.
    * ``resto_per``: ``resto_abs`` as a percentage of ``count``.

    The result is sorted ascending by ``resto_per`` and then ``unique``, so
    ``resto_per`` may help pinpoint potential misspellings:

    * Top rows: ONE LEVEL has an EXCESSIVE FREQUENCY.
    * Bottom rows: there are MANY LEVELS with very LOW FREQUENCY.

    Args:
        df: DataFrame containing at least one object-dtype column.

    Returns:
        The sorted summary DataFrame, one row per object column.
    """
    sumdf = df.describe(include="object").T
    sumdf["unicount_ratio"] = sumdf["unique"] / sumdf["count"]

    sumdf["resto_abs"] = sumdf["count"] - sumdf["freq"]
    sumdf["resto_per"] = (sumdf["resto_abs"] * 100) / sumdf["count"]

    # sort_values returns a new frame; it must be returned (the sorted
    # result was previously discarded, leaving the summary unsorted).
    return sumdf.sort_values(["resto_per", "unique"])



def variance_check(df, perc_a, perc_b):
    """Build an extended ``describe`` summary for the numeric columns of *df*.

    Adds two columns holding the user-defined quantiles A and B. Each column
    is named ``P<nn>``, where ``<nn>`` is the quantile expressed as a
    percentage rounded to the nearest integer (e.g. ``0.1`` -> ``P10``).

    NOTE: only NUMERIC COLUMNS are selected and evaluated.

    Args:
        df: DataFrame to summarise.
        perc_a: First quantile, as a fraction in the range 0-1.
        perc_b: Second quantile, as a fraction in the range 0-1.

    Returns:
        Transposed ``describe`` output (one row per numeric column) plus the
        two quantile columns, sorted by ``std`` in descending order.
    """
    subdf = df.select_dtypes(include="number")
    sumdf = subdf.describe(include="number").T

    for perc in (perc_a, perc_b):
        # round() rather than int(): 0.29 * 100 evaluates to 28.99999...,
        # which int() would truncate to the misleading label "P28".
        sumdf["P" + str(round(perc * 100))] = subdf.quantile(perc)

    return sumdf.sort_values("std", ascending=False)


def outliers(df):
    """Collect the rows of *df* that are IQR outliers in any described column.

    For every column reported by ``df.describe()`` the bounds are
    ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``; rows strictly outside those
    bounds are flagged.

    Args:
        df: DataFrame to scan.

    Returns:
        DataFrame with the columns of *df* plus an ``Outlier`` column naming
        the column in which the row was flagged. Original index labels are
        kept, and a row flagged in several columns appears once per column.
        When nothing is flagged, an empty DataFrame with those same columns
        is returned.
    """
    stats = df.describe().transpose()
    stats["IQR"] = stats["75%"] - stats["25%"]

    flagged = []
    for col in stats.index:
        cutoff = stats.at[col, "IQR"] * 1.5
        lower = stats.at[col, "25%"] - cutoff
        upper = stats.at[col, "75%"] + cutoff
        results = df[(df[col] < lower) | (df[col] > upper)].copy()
        if results.empty:
            continue
        results["Outlier"] = col
        flagged.append(results)

    if not flagged:
        return pd.DataFrame(columns=[*df.columns, "Outlier"])
    # DataFrame.append was removed in pandas 2.0; concatenate once instead
    # of growing a frame inside the loop.
    return pd.concat(flagged)



def reduc_mem(df):
    """Reduce the memory usage of *df* by shrinking column dtypes.

    1) integer columns are downcast to the smallest integer type that fits,
    2) float columns are downcast to the smallest float type that fits,
    3) object columns are converted to ``category`` (factor levels).

    Columns of any other dtype are left untouched.

    Args:
        df: DataFrame to shrink. It is MODIFIED IN PLACE.

    Returns:
        The same DataFrame object that was passed in, with converted columns.
    """
    # Iterate (label, dtype) pairs rather than indexing the dtypes Series by
    # position: integer indexing on a label-indexed Series is label-based
    # (or deprecated), which can pair a column with another column's dtype.
    for col, dtype in df.dtypes.items():
        # The is_*_dtype predicates match every width (int32, float32, ...),
        # not only the platform default that == "int" / == "float" match.
        if pd.api.types.is_integer_dtype(dtype):
            df[col] = pd.to_numeric(df[col], downcast="integer")
        elif pd.api.types.is_float_dtype(dtype):
            df[col] = pd.to_numeric(df[col], downcast="float")
        elif pd.api.types.is_object_dtype(dtype):
            df[col] = df[col].astype("category")
    return df

def check_nan(df: pd.DataFrame) -> None:
    """
    Show the percentage of null values per column and plot where they occur.

    Displays how many columns contain nulls, then the percentage of nulls
    for each of those columns, and finally draws a heatmap of ``df.isna()``.
    """
    # percentage of nulls in each column, restricted to columns with any null
    null_pct = df.isna().mean() * 100
    with_nulls = null_pct[null_pct > 0]
    n_with_nulls = len(with_nulls)

    display(f'N nan cols: {n_with_nulls}')
    display(with_nulls)

    # start a new 10 x 6 figure for the null-value heatmap
    plt.figure(figsize=(10, 6))

    heatmap_options = {
        'yticklabels': False,  # hide the y-axis tick labels
        'cmap': 'viridis',     # colour map
        'cbar': False,         # no colour bar
    }
    sns.heatmap(df.isna(), **heatmap_options)

    plt.show()