# Transformers for Exploratory Data Analysis (EDA)

In [None]:
# transformer must be defined as a class implementing two methods: fit and transform
class RemoveColumnsTransformer(BaseEstimator, TransformerMixin):
    '''Pipeline step that drops a fixed set of columns from a DataFrame.'''

    def __init__(self, columns_to_remove: list):
        # labels of the columns that transform() will drop
        self.columns_to_remove = columns_to_remove

    def fit(self, X, y=None):
        '''Nothing is learned here; self is returned so the step can be chained.'''
        return self

    def transform(self, X, y=None):
        '''Return X without the columns listed in `columns_to_remove`.'''
        unwanted = self.columns_to_remove
        return X.drop(columns=unwanted)

In [None]:
class ImputeColsTransformer(BaseEstimator, TransformerMixin):
    '''Impute missing values by carrying the last valid observation forward.'''

    def __init__(self):
        pass

    def fit(self, X, y=None):
        '''Nothing is learned here; self is returned so the step can be chained.'''
        return self

    def transform(self, X, y=None):
        '''Return a forward-filled copy of X.

        Every column is filled independently along the row axis. Values that
        precede the first valid observation of a column remain missing.
        '''
        # DataFrame.ffill() builds a new frame, so the caller's data is left
        # untouched (the column-by-column assignment used previously
        # overwrote the input in place).
        return X.ffill()

In [None]:
class MakeTSTransformer(BaseEstimator, TransformerMixin):
    '''Turn a DataFrame into a time series by promoting a column to a datetime index.'''

    def __init__(self, index_column='date_time'):
        # name of the column holding the timestamps
        self.index_column = index_column

    def fit(self, X, y=None):
        '''Nothing is learned here; self is returned so the step can be chained.'''
        return self

    def transform(self, X, y=None):
        '''Return a new frame indexed by the parsed values of `index_column`.'''
        # set_index() hands back a new frame, so only that result is modified
        indexed = X.set_index(self.index_column)
        timestamps = pd.to_datetime(indexed.index)
        indexed.index = timestamps
        return indexed

In [None]:
class CutDateTransformer(BaseEstimator, TransformerMixin):
    '''Keep only the rows whose index lies between two bounds.'''

    def __init__(self, start_date, end_date):
        # lower and upper bound of the label slice applied in transform()
        self.start_date = start_date
        self.end_date = end_date

    def fit(self, X, y=None):
        '''Nothing is learned here; self is returned so the step can be chained.'''
        return self

    def transform(self, X, y=None):
        '''Sort X by its index and return the start_date..end_date label slice.'''
        window = slice(self.start_date, self.end_date)
        # sorting first makes the label slice well defined on unordered input
        chronological = X.sort_index()
        return chronological.loc[window]

In [None]:
class SelectColumnsByTypeTransformer(BaseEstimator, TransformerMixin):
    '''Keep only the columns whose dtype matches a given selector.'''

    def __init__(self, columns_type='number'):
        # dtype selector forwarded to DataFrame.select_dtypes(include=...)
        self.columns_type = columns_type

    def fit(self, X, y=None):
        '''Nothing is learned here; self is returned so the step can be chained.'''
        return self

    def transform(self, X, y=None):
        '''Return the subset of X's columns matching `columns_type`.'''
        wanted = self.columns_type
        return X.select_dtypes(include=wanted)

In [None]:
class HourlyResampleTransformer(BaseEstimator, TransformerMixin):
    '''Resample a time-indexed DataFrame onto a regular one-hour grid.'''

    def __init__(self):
        pass

    def fit(self, X, y=None):
        '''Nothing is learned here; self is returned so the step can be chained.'''
        return self

    def transform(self, X, y=None):
        '''Return a new frame with one row per hour.

        Observations falling into the same hour are averaged; hours without
        any observation are filled by interpolating between their neighbours
        (pandas' default interpolation method).

        X must carry a datetime-like index, as required by
        DataFrame.resample().
        '''
        # Resample the frame as a whole. Assigning each resampled column back
        # into X (as was done before) realigns it to X's original index, which
        # discards the hourly grid, turns off-the-hour rows into NaN and
        # mutates the caller's data.
        # '1h' is the lowercase spelling of the hourly alias; the uppercase
        # 'H' form is deprecated in recent pandas releases.
        return X.resample('1h').mean().interpolate()

In [None]:
class RemoveDuplicatesTransformer(BaseEstimator, TransformerMixin):
    '''Drop rows whose index label has already been seen.'''

    def __init__(self):
        pass

    def fit(self, X, y=None):
        '''Nothing is learned here; self is returned so the step can be chained.'''
        return self

    def transform(self, X, y=None):
        '''Return X restricted to the first row of every index label.'''
        # duplicated() flags every occurrence of a label except the first one
        repeated = X.index.duplicated()
        return X.loc[~repeated]

In [None]:
class SelectColumnsTransformer(BaseEstimator, TransformerMixin):
    '''Pipeline step that keeps an explicit list of columns.'''

    def __init__(self, columns_to_select: list):
        # labels of the columns that transform() will keep
        self.columns_to_select = columns_to_select

    def fit(self, X, y=None):
        '''Nothing is learned here; self is returned so the step can be chained.'''
        return self

    def transform(self, X, y=None):
        '''Return X indexed by `columns_to_select`.'''
        wanted = self.columns_to_select
        return X[wanted]

In [None]:
class FeatureEngineeringTransformer(BaseEstimator, TransformerMixin):
    '''Build three-step sliding windows of temperature and humidity with a snow label.

    Reads the 'tempC', 'humidity' and 'totalSnow_cm' columns of the input and
    produces one output row per window of three consecutive input rows.
    '''

    def __init__(self, min_snow=MIN_SNOW_CM, melting_temp=MELTING_TEMPERATURE):
        # a row is labelled "Yes" only when snow > min_snow and temperature < melting_temp
        self.min_snow = min_snow
        self.melting_temp = melting_temp

    def fit(self, X, y=None):
        '''Nothing is learned here; self is returned so the step can be chained.'''
        return self

    def transform(self, X, y=None):
        '''Return a frame with columns Temp0..Temp2, Humi0..Humi2 and Snow.

        Output row i holds the readings of input rows i, i+1 and i+2; its
        "Snow" label ("Yes"/"No") is the one computed for input row i+2.
        The result has a fresh default index and two rows fewer than X.
        '''
        # pull the three raw series out as plain float lists
        temps = X['tempC'].astype(float).to_list()
        humidity = X['humidity'].astype(float).to_list()
        snow_cm = X['totalSnow_cm'].astype(float).to_list()

        # one label per input row
        labels = [
            "Yes" if (cm > self.min_snow and deg < self.melting_temp) else "No"
            for cm, deg in zip(snow_cm, temps)
        ]

        # shifted slices line up rows i, i+1, i+2 of every window
        windows = list(zip(
            temps[:-2], temps[1:-1], temps[2:],
            humidity[:-2], humidity[1:-1], humidity[2:],
            labels[2:],
        ))
        column_names = ["Temp0", "Temp1", "Temp2", "Humi0", "Humi1", "Humi2", "Snow"]

        return pd.DataFrame(windows, columns=column_names)

In [None]:
class RemoveOutliersTransformer(BaseEstimator, TransformerMixin):
    '''Drop rows that contain a Z-score outlier in any int/float column.'''

    def __init__(self):
        pass

    def fit(self, X, y=None):
        '''Nothing is learned here; self is returned so the step can be chained.'''
        return self

    def transform(self, X, y=None):
        '''Return the rows of X with no outlier in any int/float column.

        Z-score criterion: a value is an outlier when |Z| >= 3, with Z computed
        per column over the frame passed in. Columns of other dtypes are
        ignored for detection but are kept in the result.
        '''
        numeric = X.select_dtypes([int, float])
        abs_z = np.abs(numeric.apply(zscore))

        # Flag only values with a real |Z| >= 3. A NaN score (e.g. a constant
        # column, whose standard deviation is zero) compares False and is
        # therefore not an outlier; testing "|Z| < 3 in every column" instead
        # would reject every row as soon as one column has NaN scores.
        has_outlier = (abs_z >= 3).any(axis=1)

        return X[~has_outlier]

In [None]:
class BalanceDatasetTransformer(BaseEstimator, TransformerMixin):
    '''Balance the dataset by undersampling the majority class.

    Parameters
    ----------
    label_column : name of the column holding the "Yes"/"No" labels.
    random_state : seed (or random state object) forwarded to
        DataFrame.sample(); the default None keeps the draw non-deterministic,
        pass a value to make the undersampling reproducible.
    '''

    def __init__(self, label_column='Snow', random_state=None):
        self.label_column = label_column
        self.random_state = random_state

    def fit(self, X, y=None):
        '''Nothing is learned here; self is returned so the step can be chained.'''
        return self

    def transform(self, X, y=None):
        '''Randomly subsample the majority class to guarantee a 50% split.

        Rows whose label is neither "Yes" nor "No" are not part of the result.
        The returned frame lists the sampled majority rows first, then the
        minority rows, under a fresh default index.
        '''
        negatives = X[X[self.label_column] == "No"]
        positives = X[X[self.label_column] == "Yes"]

        # on a tie the "Yes" rows are the ones passed through sample()
        if len(positives.index) < len(negatives.index):
            minority, majority = positives, negatives
        else:
            minority, majority = negatives, positives

        majority_sub = majority.sample(len(minority.index),
                                       random_state=self.random_state)

        return pd.concat([majority_sub, minority]).reset_index(drop=True)

In [None]:
class StandarizeColumnsTransformer(BaseEstimator, TransformerMixin):
    '''Scale the temperature and humidity features with a Z-score, independently.

    One mean/standard deviation pair is learned for the three temperature
    columns and another one for the three humidity columns.
    '''

    # columns sharing the temperature statistics / the humidity statistics
    _TEMP_COLUMNS = ('Temp0', 'Temp1', 'Temp2')
    _HUMI_COLUMNS = ('Humi0', 'Humi1', 'Humi2')

    def __init__(self):
        # statistics learned by fit(); zero until the transformer is fitted
        self.t_avg, self.t_std = 0, 0
        self.h_avg, self.h_std = 0, 0

    @staticmethod
    def _scale(values, avg, std):
        '''Z-score scaling: (values - avg) / std.'''
        return (values - avg) / std

    def fit(self, X, y=None):
        '''Learn mean and standard deviation of temperature and humidity.

        The first window column plus the last two values of the third window
        column are pooled, so that each underlying reading is counted once
        (assumes the rows of X are consecutive sliding windows — confirm
        against the upstream steps).
        '''
        # Get all values
        t_list = X['Temp0'].tolist() + X['Temp2'].tail(2).tolist()
        h_list = X['Humi0'].tolist() + X['Humi2'].tail(2).tolist()

        # Calculate mean and standard deviation.
        # NOTE(review): `mean` and `std` are expected to be module-level names
        # (presumably imported from numpy) — confirm against the file's imports.
        self.t_avg, self.t_std = mean(t_list), std(t_list)
        self.h_avg, self.h_std = mean(h_list), std(h_list)

        return self

    def transform(self, X, y=None):
        '''Return a copy of X with the six feature columns Z-score scaled.'''
        # work on a copy so the caller's frame is not overwritten
        scaled = X.copy()

        # The scaling helper lives on the class: a function defined locally
        # inside fit() is not visible from transform().
        for col in self._TEMP_COLUMNS:
            scaled[col] = self._scale(scaled[col], self.t_avg, self.t_std)

        for col in self._HUMI_COLUMNS:
            scaled[col] = self._scale(scaled[col], self.h_avg, self.h_std)

        return scaled