<h2> Class for data preparation <br>

A general data-preparation pipeline to apply before building a predictive model.

Our variable treatment follows the pipeline below:

pipeline = [drop_nan_col, drop_zero_var_col, drop_zero_car_col,drop_high_levels, 
            replace_missing, encode_target, transform, create_dummies]

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb



In [1]:
class data_preparation:
    """
    Column-level cleaning steps to run on a pandas DataFrame before modelling.

    Every method except ``create_dummies`` modifies the DataFrame it is given
    in place and also returns that same (full) DataFrame, so a step can be
    called for its side effect or chained into the next one.
    """

    def drop_nan_col(self, df, threshold):
        """
        Objective: Drops columns whose share of missing rows exceeds a threshold

        Inputs:
        1. Dataframe df: Pandas dataframe (modified in place)
        2. threshold: Fraction of missing rows above which a column is dropped.
                      If threshold is .9, columns with more than 90% missing
                      values are dropped.

        Outputs:
        1. The same dataframe df, without the dropped columns
           (unchanged if no column exceeds the threshold)
        """
        to_drop = []
        for c in df.columns:
            # mean() of the null mask is the missing share; on a zero-row frame
            # it is NaN, which never exceeds the threshold (no division by zero).
            if df[c].isnull().mean() > threshold:
                to_drop.append(c)
                print(f"{c} has more than {threshold*100}% missing values hence it will be removed")
            else:
                print(f"{c} does not have more than {threshold*100}  % missing values")
        df.drop(columns=to_drop, inplace=True)
        return df

    def drop_zero_var_col(self, df):
        """
        Objective: Drops numerical columns with zero variance

        A column counts as zero-variance when it has no missing values and
        every row holds the same value. Columns containing missing values are
        kept.

        Inputs:
        1. Dataframe df: Pandas dataframe (modified in place)

        Outputs:
        1. The same dataframe df, without the dropped columns
           (unchanged if no column qualifies)
        """
        to_drop = []
        for c in df.select_dtypes(include=['number']).columns:
            col = df[c]
            # Count distinct values instead of testing std() == 0: floating
            # point rounding makes the std of a constant column such as
            # [0.1, 0.1, 0.1] a tiny non-zero number.
            if col.notna().all() and col.nunique() == 1:
                to_drop.append(c)
                print(f"{c} has 0 variance - REMOVE IT!")
            else:
                print(f"{c} does not have 0 variance - KEEP IT")
        df.drop(columns=to_drop, inplace=True)
        return df

    def drop_zero_car_col(self, df):
        """
        Objective: Drops categorical (object) columns that hold a single level,
                   such as a column with all 'yes' values

        Inputs:
        1. Dataframe df: Pandas dataframe (modified in place)

        Outputs:
        1. The same dataframe df, without the dropped columns
           (unchanged if no column qualifies)
        """
        single_level = [
            c for c in df.select_dtypes(include=['object']).columns
            if len(df[c].unique()) == 1
        ]
        df.drop(columns=single_level, inplace=True)
        return df

    def drop_high_levels(self, df, threshold):
        """
        Objective: Drops categorical (object) columns that have too many levels

        Inputs:
        1. Dataframe df: Pandas dataframe (modified in place)
        2. threshold: Maximum number of distinct levels a column may have;
                      columns with more levels than this are dropped

        Outputs:
        1. The same dataframe df, without the dropped columns
        """
        to_drop = []
        for c in df.select_dtypes(include=['object']).columns:
            if len(df[c].unique()) > threshold:
                to_drop.append(c)
                print(f"Column {c} has levels more than {threshold} levels hence it has been removed")
        df.drop(columns=to_drop, inplace=True)
        return df

    def replace_missing(self, df, num_val):
        """
        Objective: Replaces missing values with given values
        Note: missing values in categorical (object) columns are replaced with
              the string 'unknown'

        Inputs:
        1. Dataframe df: Pandas dataframe (modified in place)
        2. num_val: How to fill missing numerical values: 'median', 'mean' or
                    'mode' use that statistic of the column; any other value
                    fills with zero.

        Outputs:
        1. The same dataframe df with imputed missing values. A numerical
           column that is entirely missing has no median/mean/mode and is left
           as it is for those options.
        """
        # Assign the filled column back instead of calling fillna(inplace=True)
        # on df[c]: that form acts on a temporary object and does not update df
        # when pandas Copy-on-Write is active.
        for c in df.select_dtypes(include=['object']).columns:
            df[c] = df[c].fillna('unknown')

        for v in df.select_dtypes(include=['number']).columns:
            col = df[v]
            if not col.isnull().any():
                continue
            if num_val == 'median':
                fill = col.median()
            elif num_val == 'mean':
                fill = col.mean()
            elif num_val == 'mode':
                # mode() returns a Series (there may be ties); passing it to
                # fillna would align on index and fill row 0 only, so take the
                # first mode as a scalar.
                modes = col.mode()
                if modes.empty:
                    continue
                fill = modes.iloc[0]
            else:
                fill = 0
            df[v] = col.fillna(fill)

        return df

    def create_dummies(self, df, label_name, avoid_trap):
        """
        Objective: Creates dummy variables for categorical variables
        (one binary column per level of each categorical column)

        Inputs:
        1. Dataframe df: Pandas dataframe (not modified)
        2. label_name: Name of the target column; it is not encoded
        3. avoid_trap: True to drop the first level of every categorical
                       column and so avoid the dummy variable trap

        Outputs:
        1. A new dataframe with dummy variables, with the target column placed last
        """
        target = df[label_name]
        encoded = pd.get_dummies(df.drop(label_name, axis=1), drop_first=avoid_trap)
        encoded[label_name] = target
        return encoded

    def encode_target(self, df, target_name):
        """
        Objective: Encodes the class label as integers if the class column is
                   not numerical. A numerical class column is returned untouched.
                   Any number of levels is supported; levels are numbered
                   0, 1, 2, ... in order of first appearance.

        Inputs:
        1. Dataframe df: Pandas dataframe (modified in place)
        2. target_name: Name of the class label column

        Outputs:
        1. The same dataframe df with integer-encoded class labels.
           Missing labels are encoded as -1.
        """
        if pd.api.types.is_numeric_dtype(df[target_name]):
            return df

        # factorize maps all levels in one pass, so a code written for one
        # level can never be mistaken for another level not yet encoded.
        codes, _levels = pd.factorize(df[target_name])
        df[target_name] = codes
        return df
        