In [None]:
import pandas as pd
import numpy as np

In [None]:
def count_duplicatives(df, col_name=None):
    """Count the rows of ``df`` that repeat an earlier row.

    Args:
        df: DataFrame to inspect.
        col_name: Column label or list of labels forwarded as ``subset`` to
            ``DataFrame.duplicated``. ``None`` compares all columns.

    Returns:
        The number of rows flagged as duplicates (first occurrences are not
        counted).
    """
    is_repeat = df.duplicated(subset=col_name)
    return is_repeat.sum()


def remove_duplicatives(df, col_name=None):
    """Return a copy of ``df`` with repeated rows removed.

    Args:
        df: DataFrame to de-duplicate; it is not modified.
        col_name: Column label or list of labels used to decide whether two
            rows are duplicates. ``None`` compares all columns.

    Returns:
        A new DataFrame keeping the first occurrence of each row, with the
        original index labels preserved.
    """
    is_repeat = df.duplicated(subset=col_name)
    first_occurrences = df[~is_repeat]
    return first_occurrences.copy()


def remove_corrupt_rows(df, num_max_missing_cols):
    """Drop rows that have more than ``num_max_missing_cols`` missing values.

    Args:
        df: DataFrame to filter; it is not modified.
        num_max_missing_cols: Largest number of missing cells a row may have
            and still be kept.

    Returns:
        A new DataFrame containing only the rows that pass the threshold.
    """
    # dropna's `thresh` is the minimum number of NON-missing cells required.
    min_present = len(df.columns) - num_max_missing_cols
    surviving = df.dropna(thresh=min_present)
    return surviving.copy()


In [None]:
def treat_numeric(df):
    """Fill missing values in numeric columns with each column's median.

    Non-numeric columns are left as they are.

    Args:
        df: DataFrame to process; it is not modified.

    Returns:
        A new DataFrame with the numeric gaps filled.
    """
    numeric_part = df.select_dtypes(include='number')
    medians = {name: numeric_part[name].median() for name in numeric_part.columns}
    return df.fillna(medians).copy()
        
def treat_categorical(df, col_to_def_val_dict):
    """Fill missing values in non-numeric columns with the most frequent value.

    Columns that appear as keys in ``col_to_def_val_dict`` are left untouched
    by this function. A column with no non-missing values has no mode and is
    also left untouched instead of raising.

    Args:
        df: DataFrame to process; it is not modified.
        col_to_def_val_dict: Mapping whose keys are column names to skip.
            ``None`` is treated as an empty mapping.

    Returns:
        A new DataFrame with the non-numeric gaps filled.
    """
    predefined = col_to_def_val_dict or {}
    categoricals = df.select_dtypes(exclude='number')
    values = {}
    for col in categoricals.columns:
        if col in predefined:
            continue
        modes = categoricals[col].mode()
        # mode() ignores NaN, so an all-missing column yields an empty Series;
        # indexing it would raise, so there is simply nothing to fill with.
        if modes.empty:
            continue
        values[col] = modes.iloc[0]
    return df.fillna(values).copy()

def treat_predefined(df, col_to_def_val_dict):
    """Fill missing values using per-column defaults.

    Args:
        df: DataFrame to process; it is not modified.
        col_to_def_val_dict: Mapping of column name to the value that should
            replace missing entries in that column.

    Returns:
        A new DataFrame with the listed columns' gaps filled.
    """
    with_defaults = df.fillna(value=col_to_def_val_dict)
    return with_defaults.copy()

def replace_missing_values(df, col_to_def_val_dict):
    """Fill every missing value in ``df`` using three strategies.

    Order of application:
      1. Columns listed in ``col_to_def_val_dict`` get their predefined default.
      2. Remaining numeric gaps are filled by ``treat_numeric``.
      3. Remaining non-numeric gaps are filled by ``treat_categorical``.

    Predefined defaults are applied first so that they also take effect on
    numeric columns; otherwise the numeric fill would already have replaced
    those gaps and the default would never be used.

    Args:
        df: DataFrame to process; it is not modified.
        col_to_def_val_dict: Mapping of column name to default fill value.

    Returns:
        A new DataFrame with missing values replaced.
    """
    df = treat_predefined(df, col_to_def_val_dict)
    df = treat_numeric(df)
    df = treat_categorical(df, col_to_def_val_dict)
    return df.copy()

In [None]:
def outlier_detection_iqr(df, iqr_multiplier=1.5):
    """Replace IQR-rule outliers in numeric columns with NaN.

    For each numeric column, a value is an outlier when it lies below
    ``Q1 - iqr_multiplier * IQR`` or above ``Q3 + iqr_multiplier * IQR``.
    Quartiles are computed ignoring missing values, so a column that already
    contains NaN is still checked.

    Args:
        df: DataFrame to process; it is not modified.
        iqr_multiplier: Width of the fences in units of IQR (default 1.5).

    Returns:
        A new DataFrame in which outliers are set to NaN.
    """
    result = df.copy()
    for col in result.select_dtypes(include='number').columns:
        series = result[col]
        # Series.quantile skips NaN; np.percentile would return NaN for the
        # whole column and make every comparison below False.
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        spread = q3 - q1
        lower_range = q1 - iqr_multiplier * spread
        upper_range = q3 + iqr_multiplier * spread
        is_outlier = (series < lower_range) | (series > upper_range)
        result[col] = np.where(is_outlier, np.nan, series)
    return result

In [None]:
def outlier_detection_zscore_dist(df, threshold=3):
    """Replace z-score outliers in numeric columns with NaN.

    For each numeric column the z-score is ``(value - mean) / std`` using the
    column's own mean and standard deviation as computed by pandas. Values
    whose absolute z-score exceeds ``threshold`` are set to NaN.

    Args:
        df: DataFrame to process; it is not modified.
        threshold: Absolute z-score above which a value counts as an outlier
            (default 3).

    Returns:
        A new DataFrame in which outliers are set to NaN.
    """
    result = df.copy()
    for col in result.select_dtypes(include='number').columns:
        series = result[col]
        z_score = (series - series.mean()) / series.std()
        # Missing values give a NaN z-score, which never exceeds the
        # threshold, so they simply stay missing.
        result[col] = np.where(z_score.abs() > threshold, np.nan, series)
    return result


In [None]:
def transfer_to_categorical(df, numeric_to_bin_value_dict, categorical_col_names):
    """Bin numeric columns and one-hot encode categorical columns.

    For every ``col`` in ``numeric_to_bin_value_dict`` a new column named
    ``f"{col}_categotial"`` is added, holding the 1-based bin number produced
    by ``pd.cut``. The suffix spelling is kept as-is because existing callers
    may reference that column name. Every column in ``categorical_col_names``
    is replaced by its dummy columns, with the first category dropped.

    Args:
        df: DataFrame to process; it is not modified.
        numeric_to_bin_value_dict: Mapping of numeric column name to a bin
            specification accepted by ``pd.cut`` (a bin count or a sequence
            of bin edges).
        categorical_col_names: Iterable of column names to one-hot encode.

    Returns:
        A new DataFrame with the added bin columns and dummy columns.
    """
    df = df.copy()
    for col, bins in numeric_to_bin_value_dict.items():
        # The number of labels must match the number of bins, so derive it
        # from the bin specification instead of assuming exactly five bins.
        if np.isscalar(bins):
            n_bins = int(bins)
        elif isinstance(bins, pd.IntervalIndex):
            n_bins = len(bins)
        else:
            n_bins = len(bins) - 1
        labels = list(range(1, n_bins + 1))
        df[f"{col}_categotial"] = pd.cut(df[col], bins, labels=labels)

    for col in categorical_col_names:
        df[col] = df[col].astype('category')
        df = pd.get_dummies(df, columns=[col], prefix=[col], drop_first=True)
    return df