# Feature Engineering Notebook
This notebook covers the feature engineering steps needed to transform the raw data into features that capture more information.


## Import Libraries and Dataset

In [14]:
import pandas as pd
pd.pandas.set_option('display.max_columns', None)

import numpy as np 

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin


In [15]:
# Load the raw dataset from disk.
# NOTE(review): `path` is an absolute, machine-specific location; the notebook
# will not run on another machine until it points at a local copy of the CSV.
path = '/Users/manueljohn/Training/github-projects/bike-demand-prediction/artifacts/raw_data/SeoulBikeData.csv'

# A non-default encoding is passed explicitly — presumably because the column
# headers contain non-ASCII symbols such as '°' (TODO confirm).
raw_df = pd.read_csv(path, encoding='unicode_escape')

# Report the (rows, columns) shape, then preview the first five rows.
print("Total Rows and features are: , ", raw_df.shape)
raw_df.head()

Total Rows and features are: ,  (8760, 14)


Unnamed: 0,Date,Rented Bike Count,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(°C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day
0,01/12/2017,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
1,01/12/2017,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
2,01/12/2017,173,2,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,Winter,No Holiday,Yes
3,01/12/2017,107,3,-6.2,40,0.9,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
4,01/12/2017,78,4,-6.0,36,2.3,2000,-18.6,0.0,0.0,0.0,Winter,No Holiday,Yes


## Feature Engineering Steps
This section includes all the feature engineering steps needed to transform the data before feeding it to the model.

First and foremost, we will divide the dataset into train, validation and test sets.

### 1. Train - Validation - Test Splitting

In [16]:
class DatasetSplitter:
    """Split a DataFrame into contiguous train / validation / test partitions.

    Rows are taken in their existing order (no shuffling): the first
    ``train_split`` fraction of rows forms the training set, the next
    ``val_split`` fraction the validation set, and the rows after that the
    test set.

    Args:
        train_split: Fraction of rows assigned to the training set.
        val_split: Fraction of rows assigned to the validation set.
        test_split: Fraction of rows assigned to the test set.

    Raises:
        ValueError: If a fraction is negative or the fractions sum to more
            than 1.
    """

    # Tolerance used when comparing the sum of the float fractions against 1.
    _TOLERANCE = 1e-9

    def __init__(self, train_split=0.8, val_split=0.1, test_split=0.1):
        fractions = (train_split, val_split, test_split)
        if any(fraction < 0 for fraction in fractions):
            raise ValueError("Split fractions must be non-negative.")
        if sum(fractions) > 1 + self._TOLERANCE:
            raise ValueError("Split fractions must not sum to more than 1.")

        # split percentages
        self.train_split = train_split
        self.val_split = val_split
        self.test_split = test_split

        # Row counts are unknown until `split_dataframe` has been called.
        self.train_split_cnt = None
        self.val_split_cnt = None
        self.test_split_cnt = None

    def split_dataframe(self, df):
        """Slice ``df`` by position into train, validation and test frames.

        When the three fractions sum to 1, the test set receives every row
        left after the train and validation sets, so rows lost to integer
        truncation are not silently dropped. Otherwise the test set gets
        ``int(len(df) * test_split)`` rows.

        Args:
            df: The DataFrame to split.

        Returns:
            A ``(train_df, val_df, test_df)`` tuple; each frame has a fresh
            0-based index.
        """
        n_rows = len(df)
        self.train_split_cnt = int(n_rows * self.train_split)
        self.val_split_cnt = int(n_rows * self.val_split)
        val_end = self.train_split_cnt + self.val_split_cnt

        total_fraction = self.train_split + self.val_split + self.test_split
        if abs(total_fraction - 1.0) <= self._TOLERANCE:
            # The splits are meant to cover the whole frame: hand the
            # truncation remainder to the test set.
            self.test_split_cnt = max(n_rows - val_end, 0)
        else:
            self.test_split_cnt = int(n_rows * self.test_split)
        test_end = val_end + self.test_split_cnt

        # Positional slicing; every split is re-indexed for consistency.
        train_df = df.iloc[:self.train_split_cnt].reset_index(drop=True)
        val_df = df.iloc[self.train_split_cnt:val_end].reset_index(drop=True)
        test_df = df.iloc[val_end:test_end].reset_index(drop=True)

        return train_df, val_df, test_df

    def get_split_counts(self):
        """Print the row count of each split produced by the last split.

        Raises:
            RuntimeError: If ``split_dataframe`` has not been called yet.
        """
        if self.train_split_cnt is None:
            raise RuntimeError("Call split_dataframe() before get_split_counts().")

        print(f"Train set has {self.train_split_cnt}")
        print(f"Validation set has {self.val_split_cnt}")
        print(f"Test set has {self.test_split_cnt}")

# Persist a DataFrame as a CSV file
def save_dataframe(df: pd.DataFrame, path: str) -> None:
    """Write ``df`` to ``path`` in CSV format, leaving out the index column.

    Args:
        df: The frame to persist.
        path: Destination file path.
    """
    df.to_csv(path_or_buf=path, index=False)


# Split the dataset into train / validation / test frames and report the sizes
DatasetSplitterObj = DatasetSplitter()
train_df, val_df, test_df = DatasetSplitterObj.split_dataframe(raw_df)
DatasetSplitterObj.get_split_counts()


# Save the datasets
# NOTE(review): these are absolute, machine-specific locations; adjust them
# before running the notebook on another machine.
train_path = "/Users/manueljohn/Training/github-projects/bike-demand-prediction/artifacts/raw_data/train_data.csv"
val_path = "/Users/manueljohn/Training/github-projects/bike-demand-prediction/artifacts/raw_data/validation_data.csv"
test_path = "/Users/manueljohn/Training/github-projects/bike-demand-prediction/artifacts/raw_data/test_data.csv"

# Write each split in a loop rather than as a trailing tuple expression of
# three calls, which would make the cell echo `(None, None, None)`.
for split_df, split_path in ((train_df, train_path), (val_df, val_path), (test_df, test_path)):
    save_dataframe(split_df, split_path)

Train set has 7008
Validation set has 876
Test set has 876


(None, None, None)

We will be trying out all the transformations and data processing using the train dataset.

### 2. Clean Column Names
In this section we will clean the column names by removing spaces and unwanted symbols, normalising the case, etc.

In [19]:
def clean_col_names(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with normalised column names and a parsed date.

    The following steps are applied, in order:

    1. Columns listed in the fixed mapper are renamed to drop their unit
       suffix (e.g. ``'Humidity(%)'`` becomes ``'Humidity'``).
    2. All column names are lower-cased.
    3. Every run of whitespace in a column name is replaced by ``'_'``.
    4. The ``date`` column is parsed to datetime, reading the day first.

    Failures are no longer caught and printed: previously an error made the
    function return ``None``, which only surfaced as a confusing failure in a
    later cell.

    Args:
        df: Frame with the raw column names; it is not modified.

    Returns:
        A new DataFrame with cleaned column names and a datetime ``date``.

    Raises:
        KeyError: If there is no ``date`` column after the renaming steps.
    """
    # name mapper: raw header -> header without the unit suffix
    column_name_mapper = {'Temperature(°C)': 'Temperature', 'Humidity(%)': 'Humidity',
                          'Wind speed (m/s)': 'Wind speed', 'Visibility (10m)': 'Visibility',
                          'Dew point temperature(°C)': 'Dew point temperature', 'Solar Radiation (MJ/m2)': 'Solar Radiation',
                          'Rainfall(mm)': 'Rainfall', 'Snowfall (cm)': 'Snowfall'
                          }

    # `rename` returns a new frame, so the caller's frame stays untouched.
    cleaned = df.rename(columns=column_name_mapper)

    # Raw string: '\s' in a plain string literal is an invalid escape sequence.
    cleaned.columns = cleaned.columns.str.lower().str.replace(r'\s+', '_', regex=True)

    cleaned['date'] = pd.to_datetime(cleaned['date'], dayfirst=True)  # convert to datetime
    return cleaned

# Clean the column names of the training split only; the validation and test
# frames are not touched in this cell.
transformed_df = clean_col_names(train_df)
# The bare trailing expression renders the frame as the cell output.
transformed_df

Unnamed: 0,date,rented_bike_count,hour,temperature,humidity,wind_speed,visibility,dew_point_temperature,solar_radiation,rainfall,snowfall,seasons,holiday,functioning_day
0,2017-12-01,254,0,-5.2,37,2.2,2000,-17.6,0.00,0.0,0.0,Winter,No Holiday,Yes
1,2017-12-01,204,1,-5.5,38,0.8,2000,-17.6,0.00,0.0,0.0,Winter,No Holiday,Yes
2,2017-12-01,173,2,-6.0,39,1.0,2000,-17.7,0.00,0.0,0.0,Winter,No Holiday,Yes
3,2017-12-01,107,3,-6.2,40,0.9,2000,-17.6,0.00,0.0,0.0,Winter,No Holiday,Yes
4,2017-12-01,78,4,-6.0,36,2.3,2000,-18.6,0.00,0.0,0.0,Winter,No Holiday,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7003,2018-09-18,0,19,23.2,57,2.2,2000,14.2,0.03,0.0,0.0,Autumn,No Holiday,No
7004,2018-09-18,0,20,22.6,58,1.4,2000,13.9,0.00,0.0,0.0,Autumn,No Holiday,No
7005,2018-09-18,0,21,22.1,61,1.5,2000,14.2,0.00,0.0,0.0,Autumn,No Holiday,No
7006,2018-09-18,0,22,21.8,65,0.3,2000,14.9,0.00,0.0,0.0,Autumn,No Holiday,No


### 2. Handling Null Values and Duplicates

This section contains the logic to handle null values and duplicate rows.

In [20]:
class NullValueImputer(BaseEstimator, TransformerMixin):
    """Drop duplicate rows and fill missing values with learned statistics.

    ``fit`` records the mean of each numeric column and the first mode of
    each object-dtype column; ``transform`` uses them to fill nulls.
    """

    def fit(self, X, y=None):
        """Learn the per-column fill values from ``X``.

        Args:
            X: DataFrame to learn the statistics from.
            y: Unused; accepted so the signature matches the other
                transformers in this notebook.

        Returns:
            The fitted imputer (``self``).
        """
        self.num_cols = X.select_dtypes(include=[np.number]).columns
        self.cat_cols = X.select_dtypes(include=[object]).columns

        self.num_means = X[self.num_cols].mean()    # Calculate mean

        # `.mode()` has no rows when there are no object columns (or they are
        # entirely null); `.iloc[0]` on it would raise IndexError.
        modes = X[self.cat_cols].mode()    # Calculate Mode
        self.cat_modes = modes.iloc[0] if len(modes) else pd.Series(dtype=object)

        return self

    def transform(self, X):
        """Return a de-duplicated copy of ``X`` with nulls filled.

        NOTE(review): duplicate rows are dropped here, so the output can have
        fewer rows than the input — confirm this is intended for every
        dataset the imputer is applied to.

        Args:
            X: DataFrame containing the columns seen during ``fit``.

        Returns:
            A new DataFrame with a fresh 0-based index; ``X`` is not modified.
        """
        # `drop_duplicates` already returns a new frame, so `X` stays intact.
        X = X.drop_duplicates().reset_index(drop=True)  # Drop Duplicates

        # Transform null rows; skip a column group that is empty.
        if len(self.num_cols):
            X[self.num_cols] = X[self.num_cols].fillna(self.num_means)
        if len(self.cat_cols):
            X[self.cat_cols] = X[self.cat_cols].fillna(self.cat_modes)

        return X

# Fit the imputer on the cleaned training frame and apply it to that same
# frame in one step; `transformed_df` is rebound to the result.
imputer = NullValueImputer()
transformed_df = imputer.fit_transform(transformed_df)

# The bare trailing expression renders the frame as the cell output.
transformed_df

Unnamed: 0,date,rented_bike_count,hour,temperature,humidity,wind_speed,visibility,dew_point_temperature,solar_radiation,rainfall,snowfall,seasons,holiday,functioning_day
0,2017-12-01,254,0,-5.2,37,2.2,2000,-17.6,0.00,0.0,0.0,Winter,No Holiday,Yes
1,2017-12-01,204,1,-5.5,38,0.8,2000,-17.6,0.00,0.0,0.0,Winter,No Holiday,Yes
2,2017-12-01,173,2,-6.0,39,1.0,2000,-17.7,0.00,0.0,0.0,Winter,No Holiday,Yes
3,2017-12-01,107,3,-6.2,40,0.9,2000,-17.6,0.00,0.0,0.0,Winter,No Holiday,Yes
4,2017-12-01,78,4,-6.0,36,2.3,2000,-18.6,0.00,0.0,0.0,Winter,No Holiday,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7003,2018-09-18,0,19,23.2,57,2.2,2000,14.2,0.03,0.0,0.0,Autumn,No Holiday,No
7004,2018-09-18,0,20,22.6,58,1.4,2000,13.9,0.00,0.0,0.0,Autumn,No Holiday,No
7005,2018-09-18,0,21,22.1,61,1.5,2000,14.2,0.00,0.0,0.0,Autumn,No Holiday,No
7006,2018-09-18,0,22,21.8,65,0.3,2000,14.9,0.00,0.0,0.0,Autumn,No Holiday,No


### 3. Feature Extraction
Add new features by binning, or new categories based on the existing features, to handle skew.

#### 3.1 Binning skewed Features 

In [21]:
# Categorizing skewed columns
class SkewDiscretizer:
    """Derive category columns from the skewed weather features.

    Every public method adds one ``*_class`` column to the frame it receives
    (the frame is modified in place) and returns that same frame. Values that
    match none of the conditions (for example NaN or negative numbers) are
    labelled ``'Unknown'``.
    """

    @staticmethod
    def _add_class_column(df, target, conditions, categories):
        """Store in ``df[target]`` the label of the first matching condition."""
        df[target] = np.select(conditions, categories, default='Unknown')
        return df

    @staticmethod
    def discrete_rainfall(df, col='rainfall'):
        """Add ``rainfall_class``: 'No' (== 0), 'Light' (0 to 3.5], 'Medium' (> 3.5).

        Args:
            df: Frame containing the column ``col``.
            col: Name of the column to bin.

        Returns:
            ``df`` with the ``rainfall_class`` column added.
        """
        values = df[col]
        conditions = [
            values == 0,
            (values > 0) & (values <= 3.5),
            values > 3.5,
        ]
        categories = ['No', 'Light', 'Medium']

        return SkewDiscretizer._add_class_column(df, 'rainfall_class', conditions, categories)

    @staticmethod
    def discrete_snowfall(df, col='snowfall'):
        """Add ``snowfall_class``: 'No' (== 0), 'Light' (0 to 0.5], 'Medium' (0.5 to 2.0], 'Heavy' (> 2.0).

        This class used to define ``discrete_snowfall`` twice; only the later
        four-label definition ever took effect, so it is the one kept here.

        Args:
            df: Frame containing the column ``col``.
            col: Name of the column to bin.

        Returns:
            ``df`` with the ``snowfall_class`` column added.
        """
        values = df[col]
        conditions = [
            values == 0,
            (values > 0) & (values <= 0.5),
            (values > 0.5) & (values <= 2.0),
            values > 2.0,
        ]
        categories = ['No', 'Light', 'Medium', 'Heavy']

        return SkewDiscretizer._add_class_column(df, 'snowfall_class', conditions, categories)

    @staticmethod
    def discrete_visibility(df, col='visibility'):
        """Add ``visibility_class``: 'Poor' (<= 5), 'Moderate' (5 to 10], 'Good' (> 10).

        The thresholds apply to the raw value multiplied by 10 and divided by
        1000. The raw header is 'Visibility (10m)', so this presumably
        converts the reading to kilometres — TODO confirm.

        Args:
            df: Frame containing the column ``col``.
            col: Name of the column to bin.

        Returns:
            ``df`` with the ``visibility_class`` column added.
        """
        # Keep the rescaled values in a local Series so that no temporary
        # '<col>_scaled' column is left behind in the caller's frame.
        scaled = (df[col] * 10) / 1000
        conditions = [
            scaled <= 5,
            (scaled > 5) & (scaled <= 10),
            scaled > 10,
        ]
        categories = ['Poor', 'Moderate', 'Good']

        return SkewDiscretizer._add_class_column(df, 'visibility_class', conditions, categories)

    @staticmethod
    def discrete_radiation(df, col='solar_radiation'):
        """Add ``solar_radiation_class`` with five labels.

        'Very Low' (<= 0.5), 'Low' (0.5 to 1], 'Moderate' (1 to 2.5],
        'High' (2.5 to 5], 'Extreme' (> 5).

        Args:
            df: Frame containing the column ``col``.
            col: Name of the column to bin.

        Returns:
            ``df`` with the ``solar_radiation_class`` column added.
        """
        values = df[col]
        conditions = [
            values <= 0.5,
            (values > 0.5) & (values <= 1),
            (values > 1) & (values <= 2.5),
            (values > 2.5) & (values <= 5),
            values > 5,
        ]
        categories = ['Very Low', 'Low', 'Moderate', 'High', 'Extreme']

        return SkewDiscretizer._add_class_column(df, 'solar_radiation_class', conditions, categories)


# Initialize the SkewDiscretizer
# (its methods are static, so the instance only serves as a namespace here)
skew_discretizer = SkewDiscretizer()


# Apply the discretization functions; each call adds one `*_class` column
# derived from the corresponding numeric column.
transformed_df = skew_discretizer.discrete_rainfall(transformed_df)
transformed_df = skew_discretizer.discrete_snowfall(transformed_df)
transformed_df = skew_discretizer.discrete_visibility(transformed_df)
transformed_df = skew_discretizer.discrete_radiation(transformed_df)

#### 3.2 Remove Redundant Features

In [40]:
# Remove highly correlated features
def remove_multicollinear_features(raw_df):
    """Return ``raw_df`` without the ``dew_point_temperature`` column.

    The input frame is left unchanged. A ``KeyError`` is raised when the
    column is not present.
    """
    redundant_columns = ['dew_point_temperature']
    return raw_df.drop(columns=redundant_columns)

# Apply the function
# NOTE: this cell is not idempotent — running it a second time raises a
# KeyError, because the column is already gone from `transformed_df`.
transformed_df = remove_multicollinear_features(transformed_df)
# Show the remaining column names.
transformed_df.columns

Index(['date', 'rented_bike_count', 'hour', 'temperature', 'humidity',
       'wind_speed', 'visibility', 'solar_radiation', 'rainfall', 'snowfall',
       'seasons', 'holiday', 'functioning_day', 'rainfall_class',
       'snowfall_class', 'visibility_class', 'solar_radiation_class'],
      dtype='object')

#### 3.3 Categorical Encoding

In [41]:
# Print the distinct values of every object-dtype column; these are the
# values the categorical encodings have to cover.
for col in transformed_df.select_dtypes(include=[object]).columns:
    print(col, transformed_df[col].unique())

In [42]:
class CategoricalEncoder(BaseEstimator, TransformerMixin):
    """Encode the categorical columns as integers using fixed lookup tables."""

    def fit(self, X, y=None):
        """Record which columns of ``X`` to encode and the value mappings.

        Args:
            X: DataFrame whose object-dtype columns will be encoded.
            y: Unused; accepted so the signature matches the other
                transformers in this notebook.

        Returns:
            The fitted encoder (``self``).
        """
        # Take the columns from the frame passed in, not from the
        # notebook-global `transformed_df`, so the encoder works on any frame.
        self.cat_cols = X.select_dtypes(include=[object]).columns
        self.cat_cols_mapping = {'seasons': {'Spring': 0, 'Summer': 1, 'Autumn': 2, 'Winter': 3},
                                 'holiday': {'No Holiday': 0, 'Holiday': 1},
                                 'functioning_day': {'No': 0, 'Yes': 1},
                                 'rainfall_class': {'No': 0, 'Light': 1, 'Medium': 2},
                                 'snowfall_class': {'No': 0, 'Light': 1, 'Medium': 2, 'Heavy': 3},
                                 'visibility_class': {'Poor': 0, 'Moderate': 1, 'Good': 2},
                                 'solar_radiation_class': {'Very Low': 0, 'Low': 1, 'Moderate': 2, 'High': 3, 'Extreme': 4}
        }

        return self

    def transform(self, X):
        """Return a copy of ``X`` with the fitted columns mapped to integers.

        Values that are missing from a column's lookup table become NaN.

        Args:
            X: DataFrame containing the columns recorded by ``fit``.

        Returns:
            A new DataFrame; ``X`` is not modified.

        Raises:
            KeyError: If a fitted column has no lookup table, or is missing
                from ``X``.
        """
        X = X.copy()

        for col in self.cat_cols:
            X[col] = X[col].map(self.cat_cols_mapping[col])

        return X
    
# Initialize the CategoricalEncoder
categorical_encoder = CategoricalEncoder()

# Fit and Transform the data (the encoder is fitted on, and applied to, the
# same training frame), then summarise the resulting dtypes and null counts.
transformed_df = categorical_encoder.fit_transform(transformed_df)
transformed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7008 entries, 0 to 7007
Data columns (total 17 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   date                   7008 non-null   datetime64[ns]
 1   rented_bike_count      7008 non-null   int64         
 2   hour                   7008 non-null   int64         
 3   temperature            7008 non-null   float64       
 4   humidity               7008 non-null   int64         
 5   wind_speed             7008 non-null   float64       
 6   visibility             7008 non-null   int64         
 7   solar_radiation        7008 non-null   float64       
 8   rainfall               7008 non-null   float64       
 9   snowfall               7008 non-null   float64       
 10  seasons                7008 non-null   int64         
 11  holiday                7008 non-null   int64         
 12  functioning_day        7008 non-null   int64         
 13  rai