# Feature Engineering Notebook
This notebook covers the feature engineering steps needed to transform the raw data into features that capture more information.


## Import Libraries and Dataset

In [35]:
import pandas as pd
pd.pandas.set_option('display.max_columns', None)

import numpy as np 

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split

In [36]:
# Load the dataset
# NOTE(review): absolute, machine-specific path -- this cell only runs on a machine where this exact file exists.
path = '/Users/manueljohn/Training/github-projects/bike-demand-prediction/artifacts/raw_data/SeoulBikeData.csv'

# The file is decoded with the 'unicode_escape' codec; presumably needed because the
# column headers contain non-ASCII characters such as the degree sign -- TODO confirm.
raw_df = pd.read_csv(path, encoding='unicode_escape')

# Print the (rows, columns) shape, then show the first rows as the cell's output.
print("Total Rows and features are: , ", raw_df.shape)
raw_df.head()

Total Rows and features are: ,  (8760, 14)


Unnamed: 0,Date,Rented Bike Count,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(°C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day
0,01/12/2017,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
1,01/12/2017,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
2,01/12/2017,173,2,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,Winter,No Holiday,Yes
3,01/12/2017,107,3,-6.2,40,0.9,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
4,01/12/2017,78,4,-6.0,36,2.3,2000,-18.6,0.0,0.0,0.0,Winter,No Holiday,Yes


## Feature Engineering Steps
This section includes all the feature engineering steps needed to transform the data before feeding it to the model.

First and foremost, we will divide the dataset into train, validation and test sets.

### 1. Train - Validation - Test Splitting

In [51]:
class DatasetSplitter:
    """Split a DataFrame positionally into train / validation / test parts.

    Rows are taken in their existing order (no shuffling): the first rows form
    the training set, the following rows the validation set and the rows after
    that the test set.

    Args:
        train_split: Fraction of rows assigned to the training set.
        val_split: Fraction of rows assigned to the validation set.
        test_split: Fraction of rows assigned to the test set.

    Raises:
        ValueError: If any fraction is negative or the fractions add up to
            more than 1.
    """

    # Tolerance for floating-point noise when comparing sums of fractions
    # (0.8 + 0.1 + 0.1 is not exactly 1.0 in binary floating point).
    _EPS = 1e-9

    def __init__(self, train_split=0.8, val_split=0.1, test_split=0.1):
        fractions = (train_split, val_split, test_split)
        if any(fraction < 0 for fraction in fractions):
            raise ValueError(f"Split fractions must be non-negative, got {fractions}")
        if sum(fractions) > 1 + self._EPS:
            raise ValueError(f"Split fractions must not add up to more than 1, got {fractions}")

        # split percentages
        self.train_split = train_split
        self.val_split = val_split
        self.test_split = test_split

        # Row counts of the most recent split; None until split_dataframe() runs.
        self.train_split_cnt = None
        self.val_split_cnt = None
        self.test_split_cnt = None

    @staticmethod
    def _row_count(n_rows, fraction):
        """Return floor(n_rows * fraction), tolerant of floating-point noise."""
        # A bare int() truncates products such as 100 * 0.29 == 28.999999999999996
        # down to 28; rounding to 9 decimals first recovers the intended 29.
        return int(round(n_rows * fraction, 9))

    def split_dataframe(self, df):
        """Split ``df`` into (train_df, val_df, test_df) by row position.

        When the three fractions add up to 1 (and ``test_split`` is positive),
        the test set receives every row left after the train and validation
        sets, so flooring the individual counts never drops rows. Otherwise
        each part gets ``floor(len(df) * fraction)`` rows.

        Args:
            df: DataFrame to split; it is not modified.

        Returns:
            Tuple ``(train_df, val_df, test_df)``, each with a fresh 0-based index.
        """
        n_rows = len(df)
        train_cnt = self._row_count(n_rows, self.train_split)
        val_cnt = self._row_count(n_rows, self.val_split)

        fraction_total = self.train_split + self.val_split + self.test_split
        if self.test_split > 0 and abs(fraction_total - 1.0) <= self._EPS:
            # Fractions cover the whole frame: hand the remainder to the test set.
            test_cnt = max(0, n_rows - train_cnt - val_cnt)
        else:
            test_cnt = self._row_count(n_rows, self.test_split)

        val_end = train_cnt + val_cnt
        test_end = val_end + test_cnt

        # .iloc makes the positional intent explicit; reset_index() is applied to
        # all three parts so they are independent frames with consistent indexes.
        train_df = df.iloc[:train_cnt].reset_index(drop=True)
        val_df = df.iloc[train_cnt:val_end].reset_index(drop=True)
        test_df = df.iloc[val_end:test_end].reset_index(drop=True)

        # Record the actual sizes so get_split_counts() always matches the output.
        self.train_split_cnt = len(train_df)
        self.val_split_cnt = len(val_df)
        self.test_split_cnt = len(test_df)

        return train_df, val_df, test_df

    def get_split_counts(self):
        """Print the row counts produced by the most recent split.

        Raises:
            RuntimeError: If split_dataframe() has not been called yet.
        """
        if self.train_split_cnt is None:
            raise RuntimeError("No split available yet: call split_dataframe() first.")

        print(f"Train set has {self.train_split_cnt}")
        print(f"Validation set has {self.val_split_cnt}")
        print(f"Test set has {self.test_split_cnt}")

# Function to save Dataframe
def save_dataframe(df: pd.DataFrame, path: str) -> None:
    """Write ``df`` to ``path`` as a CSV file, leaving out the index column.

    Args:
        df: DataFrame to persist.
        path: Destination file path.
    """
    df.to_csv(path_or_buf=path, index=False)

# Split the dataset
# DatasetSplitter() is built with its default fractions (0.8 / 0.1 / 0.1) and splits
# raw_df by row position, so the three parts keep the original row order.
DatasetSplitterObj = DatasetSplitter()
train_df, val_df, test_df = DatasetSplitterObj.split_dataframe(raw_df)  
DatasetSplitterObj.get_split_counts()


# Save the datasets
# NOTE(review): absolute, machine-specific output paths.
train_path = "/Users/manueljohn/Training/github-projects/bike-demand-prediction/artifacts/raw_data/train_data.csv"
val_path = "/Users/manueljohn/Training/github-projects/bike-demand-prediction/artifacts/raw_data/validation_data.csv"
test_path = "/Users/manueljohn/Training/github-projects/bike-demand-prediction/artifacts/raw_data/test_data.csv"

# The three calls are written as one tuple expression; as the last expression of the
# cell its value, (None, None, None), is what the notebook echoes as output.
save_dataframe(train_df, train_path), save_dataframe(val_df, val_path), save_dataframe(test_df, test_path)

Train set has 7008
Validation set has 876
Test set has 876


(None, None, None)

We will be trying out all the transformations and data processing using the train dataset.

### 2. Clean Column Names
In this section we will clean the column names by removing units and unwanted symbols, converting them to lower case, and replacing spaces with underscores.

In [50]:
def clean_col_names(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with normalised column names.

    Three steps are applied in order:
      1. unit suffixes are dropped from the known measurement columns
         (e.g. ``'Wind speed (m/s)'`` becomes ``'Wind speed'``),
      2. all names are lower-cased,
      3. every run of whitespace is replaced by a single ``'_'``.

    Args:
        df: Input frame; it is not modified.

    Returns:
        A new DataFrame holding the same data under the cleaned column names.

    Raises:
        Exception: Any error raised while renaming is printed in red and then
            re-raised, so the function never silently returns ``None``.
    """
    #name mapper
    column_name_mapper = {'Temperature(°C)': 'Temperature', 'Humidity(%)': 'Humidity', 
                      'Wind speed (m/s)': 'Wind speed', 'Visibility (10m)': 'Visibility', 
                      'Dew point temperature(°C)': 'Dew point temperature', 'Solar Radiation (MJ/m2)': 'Solar Radiation', 
                      'Rainfall(mm)': 'Rainfall', 'Snowfall (cm)': 'Snowfall'
                      }

    try:
        # rename() already returns a new frame, so no explicit copy is needed.
        cleaned = df.rename(columns=column_name_mapper)
        # Raw string: '\s' inside a normal string literal is an invalid escape sequence.
        cleaned.columns = cleaned.columns.str.lower().str.replace(r'\s+', '_', regex=True)
        return cleaned

    except Exception as E:
        print(f'\033[31m{type(E).__name__}: {E} !!!\033[0m')
        # Re-raise: returning None here would only move the failure to the caller.
        raise

# apply function
# Clean the column names of the training split; clean_col_names works on a copy,
# so train_df itself keeps its original column names.
transformed_df = clean_col_names(train_df)
# Last expression of the cell: displays the resulting frame.
transformed_df

Unnamed: 0,date,rented_bike_count,hour,temperature,humidity,wind_speed,visibility,dew_point_temperature,solar_radiation,rainfall,snowfall,seasons,holiday,functioning_day
0,01/12/2017,254,0,-5.2,37,2.2,2000,-17.6,0.00,0.0,0.0,Winter,No Holiday,Yes
1,01/12/2017,204,1,-5.5,38,0.8,2000,-17.6,0.00,0.0,0.0,Winter,No Holiday,Yes
2,01/12/2017,173,2,-6.0,39,1.0,2000,-17.7,0.00,0.0,0.0,Winter,No Holiday,Yes
3,01/12/2017,107,3,-6.2,40,0.9,2000,-17.6,0.00,0.0,0.0,Winter,No Holiday,Yes
4,01/12/2017,78,4,-6.0,36,2.3,2000,-18.6,0.00,0.0,0.0,Winter,No Holiday,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7003,18/09/2018,0,19,23.2,57,2.2,2000,14.2,0.03,0.0,0.0,Autumn,No Holiday,No
7004,18/09/2018,0,20,22.6,58,1.4,2000,13.9,0.00,0.0,0.0,Autumn,No Holiday,No
7005,18/09/2018,0,21,22.1,61,1.5,2000,14.2,0.00,0.0,0.0,Autumn,No Holiday,No
7006,18/09/2018,0,22,21.8,65,0.3,2000,14.9,0.00,0.0,0.0,Autumn,No Holiday,No


### 3. Handling Null Values

This section contains the logic for handling null values.

In [None]:
def fill_null_values(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with missing values imputed.

    Numeric columns are filled with their column mean and object-dtype columns
    with their most frequent value. A column that contains no non-null value
    is left unchanged, because no mean or mode can be derived from it.

    Args:
        df: Input frame; it is not modified.

    Returns:
        A new DataFrame with the imputed values.
    """
    filled = df.copy()

    # Fill numerical columns with mean.
    # The result is assigned back to the frame: calling fillna(inplace=True) on
    # ``filled[col]`` would act on a temporary Series (chained assignment) and
    # leaves the frame untouched when pandas Copy-on-Write is active.
    for col in filled.select_dtypes(include=[np.number]).columns:
        filled[col] = filled[col].fillna(filled[col].mean())

    # Fill categorical columns with mode.
    for col in filled.select_dtypes(include=[object]).columns:
        modes = filled[col].mode()
        # mode() is empty for an all-null column; indexing it would raise.
        if not modes.empty:
            filled[col] = filled[col].fillna(modes.iloc[0])

    return filled
