In [103]:
import csv
import datetime
import numpy as np
import pandas as pd

from numpy import mean, std
from scipy.stats import zscore

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline 
from sklearn.base import BaseEstimator, TransformerMixin

## Transformers for Exploratory Data Analysis (EDA)

In [104]:
# a pipeline-compatible transformer is a class that provides both `fit` and `transform`
class RemoveColumnsTransformer(BaseEstimator, TransformerMixin):
    """Drop the configured column(s) from a DataFrame.

    Parameters
    ----------
    columns_to_remove : list
        Column label(s) handed to ``DataFrame.drop``.
    """

    def __init__(self, columns_to_remove: list):
        self.columns_to_remove = columns_to_remove

    def fit(self, X, y=None):
        """Nothing is learned here; `self` is returned so the step can be chained."""
        return self

    def transform(self, X, y=None):
        """Return `X` without the columns listed in `columns_to_remove`."""
        unwanted = self.columns_to_remove
        return X.drop(columns=unwanted)

In [105]:
class ImputeColsTransformer(BaseEstimator, TransformerMixin):
    """Impute missing values by carrying the last valid observation forward.

    Every column is forward-filled independently. Leading missing values
    (those with no earlier valid observation) stay missing.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned here; `self` is returned so the step can be chained."""
        return self

    def transform(self, X, y=None):
        """Return a forward-filled copy of `X`.

        The input frame is deliberately left untouched: assigning the filled
        columns back into `X` would silently modify the caller's data.
        """
        return X.ffill()

In [106]:
class MakeTSTransformer(BaseEstimator, TransformerMixin):
    """Turn a DataFrame into a time series indexed by a datetime column.

    Parameters
    ----------
    index_column : str, default 'date_time'
        Name of the column that becomes the (datetime-parsed) index.
    """

    def __init__(self, index_column='date_time'):
        self.index_column = index_column

    def fit(self, X, y=None):
        """Nothing is learned here; `self` is returned so the step can be chained."""
        return self

    def transform(self, X, y=None):
        """Move `index_column` into the index and parse it with `pd.to_datetime`."""
        indexed = X.set_index(self.index_column)
        indexed.index = pd.to_datetime(indexed.index)
        return indexed

In [107]:
class CutDateTransformer(BaseEstimator, TransformerMixin):
    """Keep only the rows whose index lies between two bounds.

    Parameters
    ----------
    start_date, end_date
        Bounds of the label-based slice applied to the sorted index.
    """

    def __init__(self, start_date, end_date):
        self.start_date = start_date
        self.end_date = end_date

    def fit(self, X, y=None):
        """Nothing is learned here; `self` is returned so the step can be chained."""
        return self

    def transform(self, X, y=None):
        """Sort `X` by index, then slice it from `start_date` to `end_date`."""
        ordered = X.sort_index()
        window = slice(self.start_date, self.end_date)
        return ordered.loc[window]

In [108]:
class SelectColumnsByTypeTransformer(BaseEstimator, TransformerMixin):
    """Keep only the columns whose dtype matches `columns_type`.

    Parameters
    ----------
    columns_type : default 'number'
        Value forwarded as `include` to ``DataFrame.select_dtypes``.
    """

    def __init__(self, columns_type='number'):
        self.columns_type = columns_type

    def fit(self, X, y=None):
        """Nothing is learned here; `self` is returned so the step can be chained."""
        return self

    def transform(self, X, y=None):
        """Return the subset of `X`'s columns selected by dtype."""
        wanted_dtypes = self.columns_type
        return X.select_dtypes(include=wanted_dtypes)

In [109]:
# Possible refinement: instead of interpolating only between the neighbouring points,
# a wider context could be used, e.g. smoothing over 3-5 points before or after the gap.
class HourlyResampleTransformer(BaseEstimator, TransformerMixin):
    """Resample a datetime-indexed DataFrame onto a regular 1-hour grid.

    Rows falling into the same hour are averaged. Hours that are absent from
    the input are inserted and filled by interpolation (pandas' default
    method) between the surrounding values.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned here; `self` is returned so the step can be chained."""
        return self

    def transform(self, X, y=None):
        """Return a new, hourly-spaced frame; `X` itself is not modified.

        The whole frame is resampled at once. Assigning each resampled column
        back into `X` would re-align it to the original index, discarding the
        newly created hours and mutating the caller's data.
        """
        # A Timedelta rule avoids the string alias 'H', deprecated in recent pandas.
        hourly_means = X.resample(pd.Timedelta(hours=1)).mean()
        return hourly_means.interpolate()

In [110]:
class RemoveDuplicatesTransformer(BaseEstimator, TransformerMixin):
    """Drop rows whose index label has already appeared earlier in the frame."""

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned here; `self` is returned so the step can be chained."""
        return self

    def transform(self, X, y=None):
        """Keep the first row for every index label and discard the repeats."""
        is_repeat = X.index.duplicated()
        return X.loc[~is_repeat]

In [111]:
class SelectColumnsTransformer(BaseEstimator, TransformerMixin):
    """Keep only an explicit list of columns.

    Parameters
    ----------
    columns_to_select : list
        Column labels used to index the DataFrame.
    """

    def __init__(self, columns_to_select: list):
        self.columns_to_select = columns_to_select

    def fit(self, X, y=None):
        """Nothing is learned here; `self` is returned so the step can be chained."""
        return self

    def transform(self, X, y=None):
        """Return `X` restricted to `columns_to_select`."""
        wanted = self.columns_to_select
        return X[wanted]

In [112]:
class FeatureEngineeringTransformer(BaseEstimator, TransformerMixin):
    """Build 3-step sliding-window features and a snow label.

    For every run of three consecutive rows of the input, one output row is
    produced holding the three temperatures (Temp0..Temp2), the three humidity
    values (Humi0..Humi2) and the label of the third row (Snow).

    Parameters
    ----------
    min_snow : float, default 0.5
        A row is labelled "Yes" only if its `totalSnow_cm` is strictly above this.
    melting_temp : float, default 2
        A row is labelled "Yes" only if its `tempC` is strictly below this.

    The defaults are literals on purpose: default arguments are evaluated when
    the class is defined, so referring to notebook constants that are assigned
    in a later cell makes this cell fail on a fresh kernel.
    """

    def __init__(self, min_snow=0.5, melting_temp=2):
        self.min_snow = min_snow
        self.melting_temp = melting_temp

    def fit(self, X, y=None):
        """Nothing is learned here; `self` is returned so the step can be chained."""
        return self

    def transform(self, X, y=None):
        """Return a new frame with columns Temp0-2, Humi0-2 and Snow.

        The result has `len(X) - 2` rows (none if `X` has fewer than three)
        and a fresh integer index; the index of `X` is not carried over.
        """
        # extract temperature, humidity and snow precipitation
        temperature = X['tempC'].astype(float).to_numpy()
        humidity = X['humidity'].astype(float).to_numpy()
        snow = X['totalSnow_cm'].astype(float).to_numpy()

        # "Yes" requires enough snow AND a temperature below melting; NaN compares False -> "No"
        is_snow = (snow > self.min_snow) & (temperature < self.melting_temp)
        snow_labels = np.where(is_snow, "Yes", "No")

        # three shifted views of each series give the window (t, t+1, t+2);
        # the label is taken from the last row of the window
        return pd.DataFrame({
            "Temp0": temperature[:-2],
            "Temp1": temperature[1:-1],
            "Temp2": temperature[2:],
            "Humi0": humidity[:-2],
            "Humi1": humidity[1:-1],
            "Humi2": humidity[2:],
            "Snow": snow_labels[2:],
        })

In [113]:
class RemoveOutliersTransformer(BaseEstimator, TransformerMixin):
    """Drop the rows that contain an outlier in any int/float column.

    Z-score criterion: a value is an outlier when |Z| >= 3, with Z computed
    per column from the data passed to `transform`.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned here; `self` is returned so the step can be chained."""
        return self

    def transform(self, X, y=None):
        """Return the rows of `X` with no outlier in any int/float column.

        A Z-score that cannot be computed (constant column, missing value) is
        treated as "not an outlier". Requiring `Z < 3` instead would evaluate
        to False for NaN and silently discard every row of the frame.
        """
        numeric = X.select_dtypes([int, float])

        # population Z-score (ddof=0), the same definition scipy.stats.zscore uses by default
        zscores = (numeric - numeric.mean()) / numeric.std(ddof=0)
        is_outlier = zscores.abs() >= 3

        # keep only rows where none of the columns is flagged
        return X[~is_outlier.any(axis=1)]

In [114]:
class BalanceDatasetTransformer(BaseEstimator, TransformerMixin):
    '''Balance the dataset by undersampling the majority class.

    Parameters
    ----------
    label_column : str, default 'Snow'
        Column holding the class labels "Yes" / "No".
    random_state : optional, default None
        Forwarded to ``DataFrame.sample``. Pass a fixed value to get the same
        subsample on every run; the default keeps the draw unseeded.
    '''

    def __init__(self, label_column='Snow', random_state=None):
        self.label_column = label_column
        self.random_state = random_state

    def fit(self, X, y=None):
        '''Nothing is learned here; `self` is returned so the step can be chained.'''
        return self

    def transform(self, X, y=None):
        '''Randomly subsample the larger class so both classes have equal size.

        Rows whose label is neither "Yes" nor "No" are not part of the result.
        The output holds the subsampled majority rows followed by all minority
        rows, with a fresh integer index.
        '''
        negatives = X[X[self.label_column] == "No"]
        positives = X[X[self.label_column] == "Yes"]

        if len(positives.index) < len(negatives.index):
            minority, majority = positives, negatives
        else:
            # also covers the tie: sampling every row merely shuffles that class
            minority, majority = negatives, positives

        majority_sub = majority.sample(n=len(minority.index), random_state=self.random_state)

        return pd.concat([majority_sub, minority]).reset_index(drop=True)

In [115]:
class StandarizeColumnsTransformer(BaseEstimator, TransformerMixin):
    '''Scale the temperature and humidity feature columns with a Z-score.

    The three temperature columns share one mean / standard deviation and the
    three humidity columns share another, so the two quantities are scaled
    independently of each other while the lags of one quantity stay comparable.
    '''

    def __init__(self):
        pass

    def fit(self, X, y=None):
        '''Nothing is learned here; `self` is returned so the step can be chained.'''
        return self

    def transform(self, X, y=None):
        '''Return a scaled copy of `X`; the input frame is left untouched.

        Working on a copy keeps the step idempotent: scaling `X` in place would
        make a second call on the same frame rescale already-scaled values.
        '''
        scaled = X.copy()

        column_groups = (
            ['Temp0', 'Temp1', 'Temp2'],
            ['Humi0', 'Humi1', 'Humi2'],
        )

        for columns in column_groups:
            first, last = columns[0], columns[-1]

            # Statistics come from the whole first-lag column plus the final two values
            # of the last-lag column.
            # NOTE(review): presumably this rebuilds the original series from the
            # overlapping windows, which only holds while rows are in window order - confirm.
            values = X[first].tolist() + X[last].tail(2).tolist()
            avg, spread = mean(values), std(values)

            for col in columns:
                # vectorised Z-score: (value - mean) / standard deviation
                scaled[col] = (X[col] - avg) / spread

        return scaled

## Weather Dataset Transformations

In [116]:
# Labelling thresholds, passed to FeatureEngineeringTransformer as `melting_temp` and `min_snow`
# (compared against the `tempC` and `totalSnow_cm` columns respectively).
MELTING_TEMPERATURE = 2
MIN_SNOW_CM = 0.5

# LOCATION selects the input file name; OUTPUT_DATASET_FILE names the exported dataset.
LOCATION = 'Warsaw'
OUTPUT_DATASET_FILE = "dataset.csv"

# load the raw weather dataset from its CSV file and preview the first rows
raw_df = pd.read_csv(f'weather_{LOCATION.lower()}.csv')
raw_df.head()

Unnamed: 0,date_time,maxtempC,mintempC,totalSnow_cm,sunHour,uvIndex,moon_illumination,moonrise,moonset,sunrise,...,WindGustKmph,cloudcover,humidity,precipMM,pressure,tempC,visibility,winddirDegree,windspeedKmph,location
0,2008-07-01 00:00:00,24,13,0.0,16.8,5,4,01:47 AM,08:04 PM,04:19 AM,...,15,5,86,0.0,1020,15,10,353,5,warsaw
1,2008-07-01 01:00:00,24,13,0.0,16.8,5,4,01:47 AM,08:04 PM,04:19 AM,...,16,5,86,0.0,1020,14,10,335,5,warsaw
2,2008-07-01 02:00:00,24,13,0.0,16.8,5,4,01:47 AM,08:04 PM,04:19 AM,...,16,6,86,0.0,1020,13,10,317,6,warsaw
3,2008-07-01 03:00:00,24,13,0.0,16.8,5,4,01:47 AM,08:04 PM,04:19 AM,...,12,6,86,0.0,1021,13,10,299,6,warsaw
4,2008-07-01 04:00:00,24,13,0.0,16.8,5,4,01:47 AM,08:04 PM,04:19 AM,...,12,6,77,0.0,1021,14,10,301,7,warsaw


In [117]:
# Ordered (name, transformer) pairs; each step receives the output of the previous one.
steps = [
    # --- cleaning: drop the location column, forward-fill gaps, index by timestamp ---
    # NOTE(review): `columns_to_remove` is annotated as a list but receives a bare string here.
    ('location_remover', RemoveColumnsTransformer(columns_to_remove='location')),
    ('NA_imputer', ImputeColsTransformer()),
    ('timeseries_transformer', MakeTSTransformer(index_column='date_time')),
    ('cutdates_transformer', CutDateTransformer(start_date='2008-07-01', end_date='2023-08-31')),
    # --- keep numeric columns, put them on an hourly grid, drop repeated timestamps ---
    ('numeric_selector', SelectColumnsByTypeTransformer(columns_type='number')),
    ('hourly_resampler', HourlyResampleTransformer()),
    ('duplicates_remover', RemoveDuplicatesTransformer()),
    # --- column selection: three of the columns selected here are removed again by the next step ---
    ('cols_selector', SelectColumnsTransformer(
        columns_to_select=['totalSnow_cm', 'DewPointC', 'WindGustKmph', 'cloudcover', 'humidity', 'precipMM',
                           'pressure', 'tempC', 'winddirDegree', 'windspeedKmph'])
    ),
    ('cols_remover', RemoveColumnsTransformer(columns_to_remove=['DewPointC', 'WindGustKmph', 'winddirDegree'])),
    # --- build the windowed features and the Snow label, then clean, balance and scale them ---
    ('features_transformer', FeatureEngineeringTransformer(min_snow=MIN_SNOW_CM,
                                                           melting_temp=MELTING_TEMPERATURE)),
    ('outliers_remover', RemoveOutliersTransformer()),
    ('data_balancer', BalanceDatasetTransformer(label_column='Snow')),
    ('standarizer', StandarizeColumnsTransformer())
]

# Every transformer's `fit` in this notebook just returns self, so fitting learns no parameters.
pipeline = Pipeline(steps=steps)
pipeline.fit(raw_df)

In [118]:
# run the raw data through every pipeline step and display the resulting dataset
# (columns Temp0-2, Humi0-2 and the Snow label)
df_final = pipeline.transform(raw_df)
df_final

Unnamed: 0,Temp0,Temp1,Temp2,Humi0,Humi1,Humi2,Snow
0,-0.822222,-0.939322,-1.056422,0.937006,0.937006,0.937006,No
1,-0.119623,-0.119623,-0.119623,0.376222,0.313913,0.251604,No
2,-0.705122,-0.705122,-0.705122,0.937006,0.937006,0.937006,No
3,0.582976,0.465876,0.465876,-0.433798,-0.246870,-0.246870,No
4,-0.002523,-0.002523,0.114576,0.126985,0.064676,-0.059943,No
...,...,...,...,...,...,...,...
2731,-0.353823,-0.353823,-0.353823,0.999315,0.999315,0.937006,Yes
2732,-0.353823,-0.353823,-0.353823,0.999315,0.937006,0.937006,Yes
2733,-0.353823,-0.353823,-0.353823,0.937006,0.937006,0.937006,Yes
2734,-0.353823,-0.353823,-0.353823,0.937006,0.937006,0.874696,Yes


In [119]:
# export final dataset to CSV file
output_path = f'./outputs/{OUTPUT_DATASET_FILE}'
# to_csv does not create missing folders, so make sure the target directory exists first
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_final.to_csv(output_path, index=False)