# Feature Engineering Notebook
This notebook covers the feature engineering steps needed to transform the raw data into features that capture more information.


## Import Libraries and Dataset

In [275]:
import pandas as pd
pd.pandas.set_option('display.max_columns', None)

import numpy as np 

import warnings
warnings.filterwarnings('ignore')

import joblib
import cloudpickle

from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer 

from sklearn.pipeline import Pipeline


In [276]:
# Load the dataset
# NOTE(review): absolute, machine-specific path; the cell only runs where this directory layout exists.
path = '/Users/manueljohn/Training/github-projects/bike-demand-prediction/artifacts/raw_data/SeoulBikeData.csv'

# Read with the 'unicode_escape' codec; presumably needed for the non-ASCII
# column headers such as 'Temperature(°C)' - TODO confirm the file's real encoding.
raw_df = pd.read_csv(path, encoding='unicode_escape')

# Report (rows, columns), then show the first five rows as the cell output.
print("Total Rows and features are: , ", raw_df.shape)
raw_df.head()

Total Rows and features are: ,  (8760, 14)


Unnamed: 0,Date,Rented Bike Count,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(°C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day
0,01/12/2017,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
1,01/12/2017,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
2,01/12/2017,173,2,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,Winter,No Holiday,Yes
3,01/12/2017,107,3,-6.2,40,0.9,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
4,01/12/2017,78,4,-6.0,36,2.3,2000,-18.6,0.0,0.0,0.0,Winter,No Holiday,Yes


## Feature Engineering Steps
This section includes all the feature engineering steps needed to transform the data before feeding it to the model.

First and foremost, we will divide the dataset into train, validation and test sets.

### 1. Train - Validation - Test Splitting

In [277]:
class DatasetSplitter:
    """Split a DataFrame into contiguous train / validation / test partitions.

    The split is positional and unshuffled: the first ``train_split`` fraction
    of rows forms the training set, the next ``val_split`` fraction the
    validation set and the following ``test_split`` fraction the test set.
    Row counts are truncated with ``int()``, so a few trailing rows can be left
    out when the fractions do not divide ``len(df)`` evenly.

    Args:
        train_split: Fraction of rows for the training set.
        val_split: Fraction of rows for the validation set.
        test_split: Fraction of rows for the test set.

    Raises:
        ValueError: If a fraction is negative or the fractions sum to more than 1.
    """

    def __init__(self, train_split=0.8, val_split=0.1, test_split=0.1):
        fractions = (train_split, val_split, test_split)
        if any(fraction < 0 for fraction in fractions):
            raise ValueError(f"Split fractions must be non-negative, got {fractions}")
        # Small tolerance so that e.g. 0.8 + 0.1 + 0.1 is not rejected by float rounding.
        if sum(fractions) > 1 + 1e-9:
            raise ValueError(f"Split fractions must sum to at most 1, got {fractions}")

        # split percentages
        self.train_split = train_split
        self.val_split = val_split
        self.test_split = test_split

        # Row counts are only known once split_dataframe() has run.
        self.train_split_cnt = None
        self.val_split_cnt = None
        self.test_split_cnt = None

    def split_dataframe(self, df):
        """Return ``(train_df, val_df, test_df)`` as consecutive slices of ``df``.

        All three frames get a fresh 0-based index, so they are independent of
        the index carried by ``df``.
        """
        # Splitting counts
        n_rows = len(df)
        self.train_split_cnt = int(n_rows * self.train_split)
        self.val_split_cnt = int(n_rows * self.val_split)
        self.test_split_cnt = int(n_rows * self.test_split)

        val_end = self.train_split_cnt + self.val_split_cnt
        test_end = val_end + self.test_split_cnt

        # Splitting Datasets (train is reset as well, consistent with val/test)
        train_df = df.iloc[:self.train_split_cnt].reset_index(drop=True)
        val_df = df.iloc[self.train_split_cnt:val_end].reset_index(drop=True)
        test_df = df.iloc[val_end:test_end].reset_index(drop=True)

        return train_df, val_df, test_df

    def get_split_counts(self):
        """Print the row counts computed by the most recent ``split_dataframe`` call.

        Raises:
            RuntimeError: If ``split_dataframe`` has not been called yet.
        """
        if self.train_split_cnt is None:
            raise RuntimeError("split_dataframe() must be called before get_split_counts()")

        print(f"Train set has {self.train_split_cnt}")
        print(f"Validation set has {self.val_split_cnt}")
        print(f"Test set has {self.test_split_cnt}")

# Function to save Dataframe
def save_dataframe(df: pd.DataFrame, path: str) -> None:
    """Write ``df`` to ``path`` as a CSV file without the index column.

    Args:
        df: Frame to persist.
        path: Destination file path; an existing file is overwritten.
    """
    df.to_csv(path_or_buf=path, index=False)


# Split the dataset
DatasetSplitterObj = DatasetSplitter()
train_df, val_df, test_df = DatasetSplitterObj.split_dataframe(raw_df)
DatasetSplitterObj.get_split_counts()

#save the datasets
# NOTE(review): absolute, machine-specific paths; the target directory must already exist.
train_path = "/Users/manueljohn/Training/github-projects/bike-demand-prediction/artifacts/raw_data/train_data.csv"
val_path = "/Users/manueljohn/Training/github-projects/bike-demand-prediction/artifacts/raw_data/validation_data.csv"
test_path = "/Users/manueljohn/Training/github-projects/bike-demand-prediction/artifacts/raw_data/test_data.csv"

# One statement per save: a tuple expression here would be echoed as
# `(None, None, None)` in the cell output.
save_dataframe(train_df, train_path)
save_dataframe(val_df, val_path)
save_dataframe(test_df, test_path)

Train set has 7008
Validation set has 876
Test set has 876


(None, None, None)

We will be trying out all the transformations and data processing using the train dataset.

### 2. Clean Column Names
In this section we will clean the column names by removing spaces and unwanted symbols, normalizing case, etc.

In [278]:
def clean_col_names(df: pd.DataFrame) -> pd.DataFrame:
    """Normalize column names and parse the date column.

    Steps, applied to a copy so the input frame is left untouched:
      1. strip unit suffixes such as ``(°C)`` or ``(mm)`` from known columns,
      2. lower-case every column name,
      3. replace each run of whitespace with a single underscore,
      4. convert the resulting ``date`` column to datetime (day-first format).

    Args:
        df: Raw frame; must contain a column that normalizes to ``date``.

    Returns:
        A new DataFrame with cleaned column names and a datetime ``date`` column.

    Raises:
        KeyError: If no ``date`` column exists after renaming.
        ValueError: If the date values cannot be parsed.
    """
    #name mapper
    column_name_mapper = {'Temperature(°C)': 'Temperature', 'Humidity(%)': 'Humidity',
                      'Wind speed (m/s)': 'Wind speed', 'Visibility (10m)': 'Visibility',
                      'Dew point temperature(°C)': 'Dew point temperature', 'Solar Radiation (MJ/m2)': 'Solar Radiation',
                      'Rainfall(mm)': 'Rainfall', 'Snowfall (cm)': 'Snowfall'
                      }

    # Errors propagate on purpose: printing them and returning None would only
    # surface later as an obscure failure in the next pipeline step.
    cleaned = df.rename(columns=column_name_mapper)  # rename (returns a new frame)
    cleaned.columns = (
        cleaned.columns
        .str.lower()                              # lower case
        .str.replace(r'\s+', '_', regex=True)     # replace space with '_'
    )

    cleaned['date'] = pd.to_datetime(cleaned['date'], dayfirst=True)  # convert to datetime
    return cleaned

# create transformer
# Wrapping the plain function in FunctionTransformer gives it the transform()
# interface so it can later be used as a Pipeline step.
clean_col_transformer = FunctionTransformer(func=clean_col_names)
# transform the dataset
# transform() is called directly, without a prior fit().
transformed_train_df = clean_col_transformer.transform(train_df)

# The trailing tuple is echoed as the cell output: first rows plus (rows, columns).
transformed_train_df.head(), transformed_train_df.shape

(        date  rented_bike_count  hour  temperature  humidity  wind_speed  \
 0 2017-12-01                254     0         -5.2        37         2.2   
 1 2017-12-01                204     1         -5.5        38         0.8   
 2 2017-12-01                173     2         -6.0        39         1.0   
 3 2017-12-01                107     3         -6.2        40         0.9   
 4 2017-12-01                 78     4         -6.0        36         2.3   
 
    visibility  dew_point_temperature  solar_radiation  rainfall  snowfall  \
 0        2000                  -17.6              0.0       0.0       0.0   
 1        2000                  -17.6              0.0       0.0       0.0   
 2        2000                  -17.7              0.0       0.0       0.0   
 3        2000                  -17.6              0.0       0.0       0.0   
 4        2000                  -18.6              0.0       0.0       0.0   
 
   seasons     holiday functioning_day  
 0  Winter  No Holiday   

#### 2.1 Save Column Cleaner Object

In [279]:
# Save the column cleaner transformer
# NOTE(review): absolute, machine-specific path; the target directory must already exist.
clean_col_transformer_path = "/Users/manueljohn/Training/github-projects/bike-demand-prediction/artifacts/transformer-components/clean_col_transformer.pkl"

# cloudpickle is used rather than plain pickle; presumably so the notebook-defined
# clean_col_names function is serialized together with the transformer - TODO confirm.
with open(clean_col_transformer_path, 'wb') as f:
    cloudpickle.dump(clean_col_transformer, f)

### 3. Handling Null Values and Duplicates

This section contains the logic to handle null values and duplicate rows.

In [251]:
class NullValueImputer(BaseEstimator, TransformerMixin):
    """Drop duplicate rows and fill nulls with statistics learned in ``fit``.

    Numeric columns are filled with their fitted mean, object-dtype columns
    with their fitted mode. Columns of other dtypes (for example datetimes)
    are left untouched.

    Note:
        ``transform`` removes duplicate rows, so the output can have fewer
        rows than the input.
    """

    def fit(self, X, y=None):
        """Learn per-column fill values from ``X``.

        Sets ``num_cols`` / ``cat_cols`` (the numeric and object column labels)
        and ``num_means`` / ``cat_modes`` (Series of fill values keyed by column).
        """
        self.num_cols = X.select_dtypes(include=[np.number]).columns
        self.cat_cols = X.select_dtypes(include=[object]).columns

        self.num_means = X[self.num_cols].mean()    # Calculate mean

        # Calculate Mode. mode() yields no rows when there are no object columns
        # or when they are entirely null; fall back to an empty Series instead
        # of failing on .iloc[0].
        if len(self.cat_cols) > 0:
            modes = X[self.cat_cols].mode()
        else:
            modes = pd.DataFrame()
        self.cat_modes = modes.iloc[0] if len(modes) > 0 else pd.Series(dtype=object)

        return self

    def transform(self, X):
        """Return a de-duplicated copy of ``X`` with nulls filled from the fitted values."""
        # drop_duplicates() returns a new frame, so the caller's X is never modified.
        X = X.drop_duplicates().reset_index(drop=True)  # Drop Duplicates

        # Transform null rows
        if len(self.num_cols) > 0:
            X[self.num_cols] = X[self.num_cols].fillna(self.num_means)
        if len(self.cat_cols) > 0:
            X[self.cat_cols] = X[self.cat_cols].fillna(self.cat_modes)

        return X

# apply function
imputer = NullValueImputer()
# Rebinds transformed_train_df to the imputed frame, so re-running this cell
# feeds the imputer its own previous output.
transformed_train_df = imputer.fit_transform(transformed_train_df)

# Bare expression: displays the frame as the cell output.
transformed_train_df

Unnamed: 0,date,rented_bike_count,hour,temperature,humidity,wind_speed,visibility,dew_point_temperature,solar_radiation,rainfall,snowfall,seasons,holiday,functioning_day
0,2017-12-01,254,0,-5.2,37,2.2,2000,-17.6,0.00,0.0,0.0,Winter,No Holiday,Yes
1,2017-12-01,204,1,-5.5,38,0.8,2000,-17.6,0.00,0.0,0.0,Winter,No Holiday,Yes
2,2017-12-01,173,2,-6.0,39,1.0,2000,-17.7,0.00,0.0,0.0,Winter,No Holiday,Yes
3,2017-12-01,107,3,-6.2,40,0.9,2000,-17.6,0.00,0.0,0.0,Winter,No Holiday,Yes
4,2017-12-01,78,4,-6.0,36,2.3,2000,-18.6,0.00,0.0,0.0,Winter,No Holiday,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7003,2018-09-18,0,19,23.2,57,2.2,2000,14.2,0.03,0.0,0.0,Autumn,No Holiday,No
7004,2018-09-18,0,20,22.6,58,1.4,2000,13.9,0.00,0.0,0.0,Autumn,No Holiday,No
7005,2018-09-18,0,21,22.1,61,1.5,2000,14.2,0.00,0.0,0.0,Autumn,No Holiday,No
7006,2018-09-18,0,22,21.8,65,0.3,2000,14.9,0.00,0.0,0.0,Autumn,No Holiday,No


#### 3.1 Save Null Imputer Object

In [280]:
# Save the imputer transformer
# NOTE(review): absolute, machine-specific path; the target directory must already exist.
imputer_path = "/Users/manueljohn/Training/github-projects/bike-demand-prediction/artifacts/transformer-components/imputer.pkl"


# Persists the fitted imputer object, including the means/modes set by fit().
with open(imputer_path, 'wb') as f:
    cloudpickle.dump(imputer, f)

### 4. Feature Extraction
Add new features by binning, or new categories based on the existing features, to handle skew.

#### 4.1 Binning skewed Features 

In [253]:
# Categorizing Skewed Columns
class SkewDiscretizer(BaseEstimator, TransformerMixin):
    """Replace skewed weather measurements with coarse categorical bins.

    Each ``discrete_*`` helper derives a ``*_class`` label column from one
    numeric column using fixed thresholds and removes the numeric column.
    Values matched by no threshold (for example NaN or negative readings) are
    labelled ``'Unknown'``.

    The transformer is stateless (``fit`` learns nothing), and none of its
    methods modify the frame they receive: each returns a new DataFrame.
    """

    def fit(self, X, y=None):
        """No-op; present so the class can be used as a pipeline step."""
        return self

    def transform(self, X):
        """Apply the rainfall, snowfall, visibility and radiation binning in turn."""
        X = self.discrete_rainfall(X)
        X = self.discrete_snowfall(X)
        X = self.discrete_visibility(X)
        X = self.discrete_radiation(X)
        return X

    @staticmethod
    def _swap_for_class(df, col, class_col, conditions, categories):
        """Return a new frame with ``col`` removed and ``class_col`` appended.

        ``conditions`` and ``categories`` are parallel lists handed to
        ``np.select``; rows matching no condition get ``'Unknown'``.
        """
        # drop() returns a new frame, so the caller's df is left untouched.
        binned = df.drop(columns=[col])
        binned[class_col] = np.select(conditions, categories, default='Unknown')
        return binned

    @staticmethod
    def discrete_rainfall(df, col='rainfall'):
        """Bin rainfall into 'No' (== 0), 'Light' (0 to 3.5] and 'Medium' (> 3.5)."""
        values = df[col]

        # Define conditions for rainfall categories
        conditions = [
            values == 0,
            (values > 0) & (values <= 3.5),
            (values > 3.5)
            ]

        # Define corresponding category labels
        categories = ['No', 'Light', 'Medium']

        return SkewDiscretizer._swap_for_class(df, col, 'rainfall_class', conditions, categories)

    @staticmethod
    def discrete_snowfall(df, col='snowfall'):
        """Bin snowfall into 'No', 'Light', 'Medium', 'Heavy' and 'Extreme'."""
        values = df[col]

        # Define conditions for snowfall categories
        conditions = [
            values == 0,
            (values > 0) & (values <= 0.5),
            (values > 0.5) & (values <= 2.0),
            (values > 2.0) & (values <= 4.0),
            values > 4.0
        ]

        # Define corresponding category labels
        categories = ['No', 'Light', 'Medium', 'Heavy', 'Extreme']

        return SkewDiscretizer._swap_for_class(df, col, 'snowfall_class', conditions, categories)

    @staticmethod
    def discrete_visibility(df, col='visibility'):
        """Bin visibility into 'Poor' (<= 5), 'Moderate' (5 to 10] and 'Good' (> 10).

        Thresholds apply to the value rescaled by 10 / 1000; presumably this
        converts the raw 10 m units of 'Visibility (10m)' to kilometres - TODO confirm.
        """
        # Kept as a local Series instead of a temporary column on the frame.
        scaled = (df[col] * 10) / 1000

        # Define conditions for visibility categories
        conditions = [
            (scaled <= 5),
            (scaled > 5) & (scaled <= 10),
            scaled > 10
        ]

        # Define corresponding category labels
        categories = ['Poor', 'Moderate', 'Good']

        return SkewDiscretizer._swap_for_class(df, col, 'visibility_class', conditions, categories)

    @staticmethod
    def discrete_radiation(df, col='solar_radiation'):
        """Bin solar radiation into 'Very Low', 'Low', 'Moderate', 'High' and 'Extreme'."""
        values = df[col]

        # Define conditions for solar radiation categories
        conditions = [
            values <= 0.5,
            (values > 0.5) & (values <= 1),
            (values > 1) & (values <= 2.5),
            (values > 2.5) & (values <= 5),
            values > 5
        ]

        # Define corresponding category labels
        categories = ['Very Low', 'Low', 'Moderate', 'High', 'Extreme']

        return SkewDiscretizer._swap_for_class(df, col, 'solar_radiation_class', conditions, categories)
    
# Initialize the SkewDiscretizer
skew_discretizer = SkewDiscretizer()

# Apply the discretization functions
# Rebinds transformed_train_df; a second run of this cell would fail because the
# numeric columns it bins have already been dropped.
transformed_train_df = skew_discretizer.fit_transform(transformed_train_df)
# The trailing tuple is echoed as the cell output: first rows plus (rows, columns).
transformed_train_df.head(), transformed_train_df.shape

(        date  rented_bike_count  hour  temperature  humidity  wind_speed  \
 0 2017-12-01                254     0         -5.2        37         2.2   
 1 2017-12-01                204     1         -5.5        38         0.8   
 2 2017-12-01                173     2         -6.0        39         1.0   
 3 2017-12-01                107     3         -6.2        40         0.9   
 4 2017-12-01                 78     4         -6.0        36         2.3   
 
    dew_point_temperature seasons     holiday functioning_day rainfall_class  \
 0                  -17.6  Winter  No Holiday             Yes             No   
 1                  -17.6  Winter  No Holiday             Yes             No   
 2                  -17.7  Winter  No Holiday             Yes             No   
 3                  -17.6  Winter  No Holiday             Yes             No   
 4                  -18.6  Winter  No Holiday             Yes             No   
 
   snowfall_class visibility_class solar_radiation_cla

##### 4.1.1 Save Binning Transformer

In [281]:
# Save the skew discretizer transformer
# NOTE(review): absolute, machine-specific path; the target directory must already exist.
skew_discretizer_path = "/Users/manueljohn/Training/github-projects/bike-demand-prediction/artifacts/transformer-components/skew_discretizer.pkl"

# cloudpickle is used rather than plain pickle; presumably so the notebook-defined
# SkewDiscretizer class travels with the object - TODO confirm.
with open(skew_discretizer_path, 'wb') as f:
    cloudpickle.dump(skew_discretizer, f)

#### 4.2 Remove Redundant Features

In [282]:
# Remove highly correlated features
def remove_multicollinear_features(df, features_to_remove=('dew_point_temperature',)):
    """Return a copy of ``df`` without the given redundant columns.

    Args:
        df: Input DataFrame.
        features_to_remove: Column label, or iterable of labels, to drop.
            Defaults to ``dew_point_temperature``.

    Returns:
        A new DataFrame without the listed columns; ``df`` itself is not modified.

    Raises:
        KeyError: If a listed column is not present in ``df``.
    """
    # A bare string would otherwise be split into single characters by list().
    if isinstance(features_to_remove, str):
        features_to_remove = [features_to_remove]

    return df.drop(columns=list(features_to_remove))

# create transformer
# FunctionTransformer gives the plain function the transform() interface used by Pipeline.
multicollinear_transformer = FunctionTransformer(func=remove_multicollinear_features)
# transform the dataset
# Rebinds transformed_train_df; a second run would raise because the column is already gone.
transformed_train_df = multicollinear_transformer.transform(transformed_train_df)

# info() prints the column names, non-null counts and dtypes.
transformed_train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7008 entries, 0 to 7007
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   date               7008 non-null   datetime64[ns]
 1   rented_bike_count  7008 non-null   int64         
 2   hour               7008 non-null   int64         
 3   temperature        7008 non-null   float64       
 4   humidity           7008 non-null   int64         
 5   wind_speed         7008 non-null   float64       
 6   visibility         7008 non-null   int64         
 7   solar_radiation    7008 non-null   float64       
 8   rainfall           7008 non-null   float64       
 9   snowfall           7008 non-null   float64       
 10  seasons            7008 non-null   object        
 11  holiday            7008 non-null   object        
 12  functioning_day    7008 non-null   object        
dtypes: datetime64[ns](1), float64(5), int64(4), object(3)
memory us

##### 4.2.1 Save Multicollinearity Reducer

In [283]:
# Save the multicollinear transformer
# NOTE(review): absolute, machine-specific path; the target directory must already exist.
multicollinear_transformer_path = "/Users/manueljohn/Training/github-projects/bike-demand-prediction/artifacts/transformer-components/multicollinear_transformer.pkl"

# Serializes the FunctionTransformer wrapping remove_multicollinear_features.
with open(multicollinear_transformer_path, 'wb') as f:
    cloudpickle.dump(multicollinear_transformer, f)

#### 4.3 Categorical Encoding

In [257]:
# Print the distinct labels of every object-dtype column; these are the values
# the categorical encoding has to cover.
for col in transformed_train_df.select_dtypes(include=[object]).columns:
    print(col, transformed_train_df[col].unique())

seasons ['Winter' 'Spring' 'Summer' 'Autumn']
holiday ['No Holiday' 'Holiday']
functioning_day ['Yes' 'No']
rainfall_class ['No' 'Light' 'Medium']
snowfall_class ['No' 'Light' 'Medium' 'Heavy' 'Extreme']
visibility_class ['Good' 'Moderate' 'Poor']
solar_radiation_class ['Very Low' 'Low' 'Moderate' 'High']


In [258]:
class CategoricalEncoder(BaseEstimator, TransformerMixin):
    """Encode the known categorical columns as integers with fixed mappings.

    ``fit`` records which object-dtype columns are present (``cat_cols``) and
    sets the label-to-integer tables (``cat_cols_mapping``); ``transform``
    replaces each label by its integer code. Null values stay null.
    """

    def fit(self, X, y=None):
        """Record the object-dtype columns of ``X`` and set the encoding tables.

        Raises:
            KeyError: If ``X`` has an object-dtype column with no encoding table.
        """
        self.cat_cols = X.select_dtypes(include=[object]).columns
        self.cat_cols_mapping = {'seasons': {'Spring': 0, 'Summer': 1, 'Autumn': 2, 'Winter': 3},
                                 'holiday': {'No Holiday': 0, 'Holiday': 1},
                                 'functioning_day': {'No': 0, 'Yes': 1},
                                 'rainfall_class': {'No': 0, 'Light': 1, 'Medium': 2},
                                 'snowfall_class': {'No': 0, 'Light': 1, 'Medium': 2, 'Heavy': 3, 'Extreme': 4},
                                 'visibility_class': {'Poor': 0, 'Moderate': 1, 'Good': 2},
                                 'solar_radiation_class': {'Very Low': 0, 'Low': 1, 'Moderate': 2, 'High': 3, 'Extreme': 4}
        }

        # Fail at fit time with a clear message rather than with a bare
        # KeyError in the middle of transform().
        unmapped_cols = [col for col in self.cat_cols if col not in self.cat_cols_mapping]
        if unmapped_cols:
            raise KeyError(f"No encoding defined for categorical column(s): {unmapped_cols}")

        return self

    def transform(self, X):
        """Return a copy of ``X`` with every fitted categorical column integer-encoded.

        Raises:
            ValueError: If a column holds a non-null value that has no code
                (for example ``'Unknown'``, or a column that is already encoded).
        """
        X = X.copy()

        for col in self.cat_cols:
            encoded = X[col].map(self.cat_cols_mapping[col])

            # Series.map turns unseen labels into NaN silently; report them
            # instead so the rows are not quietly lost further downstream.
            unseen = X.loc[encoded.isna() & X[col].notna(), col].unique()
            if len(unseen) > 0:
                raise ValueError(
                    f"Column '{col}' contains value(s) with no encoding: {list(unseen)}"
                )

            X[col] = encoded

        return X
    
# Initialize the CategoricalEncoder
categorical_encoder = CategoricalEncoder()

# Fit and Transform the data
# Rebinds transformed_train_df to the encoded frame, so this cell is not idempotent.
transformed_train_df = categorical_encoder.fit_transform(transformed_train_df)
# info() shows the resulting dtypes and non-null counts.
transformed_train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7008 entries, 0 to 7007
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   date                   7008 non-null   datetime64[ns]
 1   rented_bike_count      7008 non-null   int64         
 2   hour                   7008 non-null   int64         
 3   temperature            7008 non-null   float64       
 4   humidity               7008 non-null   int64         
 5   wind_speed             7008 non-null   float64       
 6   seasons                7008 non-null   int64         
 7   holiday                7008 non-null   int64         
 8   functioning_day        7008 non-null   int64         
 9   rainfall_class         7008 non-null   int64         
 10  snowfall_class         7008 non-null   int64         
 11  visibility_class       7008 non-null   int64         
 12  solar_radiation_class  7008 non-null   int64         
dtypes: 

##### 4.3.1 Save Categorical Encoder

In [284]:
# Save the categorical encoder transformer
# NOTE(review): absolute, machine-specific path; the target directory must already exist.
categorical_encoder_path = "/Users/manueljohn/Training/github-projects/bike-demand-prediction/artifacts/transformer-components/categorical_encoder.pkl"

# Persists the fitted encoder, i.e. its cat_cols and cat_cols_mapping attributes.
with open(categorical_encoder_path, 'wb') as f:
    cloudpickle.dump(categorical_encoder, f)

#### 4.4 Extract Time Features

In [260]:
# Preview the first rows of the frame before the date features are extracted.
transformed_train_df.head()

Unnamed: 0,date,rented_bike_count,hour,temperature,humidity,wind_speed,seasons,holiday,functioning_day,rainfall_class,snowfall_class,visibility_class,solar_radiation_class
0,2017-12-01,254,0,-5.2,37,2.2,3,0,1,0,0,2,0
1,2017-12-01,204,1,-5.5,38,0.8,3,0,1,0,0,2,0
2,2017-12-01,173,2,-6.0,39,1.0,3,0,1,0,0,2,0
3,2017-12-01,107,3,-6.2,40,0.9,3,0,1,0,0,2,0
4,2017-12-01,78,4,-6.0,36,2.3,3,0,1,0,0,2,0


In [261]:
def extract_date_features(df):
    """
    Derive calendar features from the 'date' column and drop that column.

    Args:
    df : pd.DataFrame
        Input DataFrame with a datetime 'date' column. It is not modified.

    Returns:
    pd.DataFrame
        New DataFrame with 'year', 'month', 'day' and 'day_of_week' appended
        (in that order) and without the 'date' column.
    """
    dates = df['date'].dt

    # assign() works on a copy, so the caller's frame keeps its 'date' column.
    with_calendar = df.assign(
        year=dates.year,
        month=dates.month,
        day=dates.day,
        day_of_week=dates.dayofweek,
    )

    return with_calendar.drop(columns=['date'])

# create transformer
# FunctionTransformer gives the plain function the transform() interface used by Pipeline.
date_features_transformer = FunctionTransformer(func=extract_date_features)
# transform the dataset
# Rebinds transformed_train_df; a second run would fail because 'date' has been dropped.
transformed_train_df = date_features_transformer.transform(transformed_train_df)

# The trailing tuple is echoed as the cell output: first rows plus (rows, columns).
transformed_train_df.head(), transformed_train_df.shape

(   rented_bike_count  hour  temperature  humidity  wind_speed  seasons  \
 0                254     0         -5.2        37         2.2        3   
 1                204     1         -5.5        38         0.8        3   
 2                173     2         -6.0        39         1.0        3   
 3                107     3         -6.2        40         0.9        3   
 4                 78     4         -6.0        36         2.3        3   
 
    holiday  functioning_day  rainfall_class  snowfall_class  visibility_class  \
 0        0                1               0               0                 2   
 1        0                1               0               0                 2   
 2        0                1               0               0                 2   
 3        0                1               0               0                 2   
 4        0                1               0               0                 2   
 
    solar_radiation_class  year  month  day  day_of_we

In [262]:
#create lag features
class LagFeatureCreator(BaseEstimator, TransformerMixin):
    """Append lagged copies of the target column as ``lag_1`` .. ``lag_N``.

    Lags are row shifts, so they assume the rows are already in chronological
    order. With one row per hour, as in this dataset, ``lag_days=1`` means the
    value one hour earlier despite the parameter's name.

    Args:
        lag_days: Number of lag columns to create (shifts of 1..lag_days rows).
        target_col: Column whose past values become the lag features.
    """

    def __init__(self, lag_days=1, target_col='rented_bike_count'):
        self.lag_days = lag_days
        self.target_col = target_col

    def fit(self, X, y=None):
        """No-op; the transformer is stateless."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with lag columns added and incomplete rows removed.

        Only rows lacking a lag value (the leading rows of the frame) are
        dropped; nulls in other columns are left for the caller to handle.
        """
        X = X.copy()

        lag_cols = []
        for i in range(1, self.lag_days + 1):
            lag_col = f'lag_{i}'
            X[lag_col] = X[self.target_col].shift(i)
            lag_cols.append(lag_col)

        # Restrict dropna to the lag columns so unrelated nulls are not
        # silently discarded here.
        if lag_cols:
            X = X.dropna(subset=lag_cols)

        return X.reset_index(drop=True)

# create transformer
lag_features_transformer = LagFeatureCreator(lag_days=1)
# transform the dataset
# The result gets its own name (lagged_train_df), so transformed_train_df is not overwritten here.
lagged_train_df = lag_features_transformer.fit_transform(transformed_train_df)

# The trailing tuple is echoed as the cell output: first rows plus (rows, columns).
lagged_train_df.head(), lagged_train_df.shape

(   rented_bike_count  hour  temperature  humidity  wind_speed  seasons  \
 0                204     1         -5.5        38         0.8        3   
 1                173     2         -6.0        39         1.0        3   
 2                107     3         -6.2        40         0.9        3   
 3                 78     4         -6.0        36         2.3        3   
 4                100     5         -6.4        37         1.5        3   
 
    holiday  functioning_day  rainfall_class  snowfall_class  visibility_class  \
 0        0                1               0               0                 2   
 1        0                1               0               0                 2   
 2        0                1               0               0                 2   
 3        0                1               0               0                 2   
 4        0                1               0               0                 2   
 
    solar_radiation_class  year  month  day  day_of_we

##### 4.4.1 Save Time Features Transformer Objects

In [285]:
# date features transformer
# NOTE(review): absolute, machine-specific paths; the target directory must already exist.
date_features_transformer_path = "/Users/manueljohn/Training/github-projects/bike-demand-prediction/artifacts/transformer-components/date_features_transformer.pkl"

# lag features transformer
lag_features_transformer_path = "/Users/manueljohn/Training/github-projects/bike-demand-prediction/artifacts/transformer-components/lag_features_transformer.pkl"

# save transformers (each is written to its own pickle file)
with open(date_features_transformer_path, 'wb') as f:
    cloudpickle.dump(date_features_transformer, f)

with open(lag_features_transformer_path, 'wb') as f:
    cloudpickle.dump(lag_features_transformer, f)

## 5 Create the Pipeline
1. Clean Columns
2. Handling Null Values
3. Binning
4. Removing Redundancy
5. Categorical Encoding
6. Extract Time Features
7. Create Lag Features.

In [293]:
# Stage 1: column-name cleaning followed by null/duplicate handling.
# The steps reuse the transformer objects created in the earlier cells.
cleaning_pipeline = Pipeline([
    ('clean_col_transformer', clean_col_transformer),
    ('imputer', imputer)])

# Stage 2: binning, redundant-feature removal, categorical encoding,
# date-feature extraction and lag creation, in that order.
feature_transformer_pipeline = Pipeline([
    ('skew_discretizer', skew_discretizer),
    ('multicollinear_transformer', multicollinear_transformer),
    ('categorical_encoder', categorical_encoder),
    ('date_features_transformer', date_features_transformer),
    ('lag_features_transformer', lag_features_transformer)
])

# Full Pipeline
# The two stage pipelines are nested as steps of one outer pipeline.
preprocessing_pipeline = Pipeline([
    ('cleaning_pipeline', cleaning_pipeline),
    ('feature_transform_pipeline', feature_transformer_pipeline)
])

# Fit and Transform the data
# Runs on the raw train_df split and overwrites the step-by-step
# transformed_train_df built in the cells above.
transformed_train_df = preprocessing_pipeline.fit_transform(train_df)
transformed_train_df.head(), transformed_train_df.shape

(   rented_bike_count  hour  temperature  humidity  wind_speed  seasons  \
 0                204     1         -5.5        38         0.8        3   
 1                173     2         -6.0        39         1.0        3   
 2                107     3         -6.2        40         0.9        3   
 3                 78     4         -6.0        36         2.3        3   
 4                100     5         -6.4        37         1.5        3   
 
    holiday  functioning_day  rainfall_class  snowfall_class  visibility_class  \
 0        0                1               0               0                 2   
 1        0                1               0               0                 2   
 2        0                1               0               0                 2   
 3        0                1               0               0                 2   
 4        0                1               0               0                 2   
 
    solar_radiation_class  year  month  day  day_of_we

### 5.1 Test Pipeline

In [294]:
# Check: apply the pipeline to the validation split with transform() only (no fit).
# The result is displayed but not stored.
preprocessing_pipeline.transform(val_df)

Unnamed: 0,rented_bike_count,hour,temperature,humidity,wind_speed,seasons,holiday,functioning_day,rainfall_class,snowfall_class,visibility_class,solar_radiation_class,year,month,day,day_of_week,lag_1
0,0,1,20.5,64,0.4,2,0,0,0,0,2,0,2018,9,19,2,0.0
1,0,2,20.0,70,0.2,2,0,0,0,0,2,0,2018,9,19,2,0.0
2,0,3,19.7,70,0.5,2,0,0,0,0,2,0,2018,9,19,2,0.0
3,0,4,19.5,70,0.5,2,0,0,0,0,2,0,2018,9,19,2,0.0
4,0,5,19.3,73,0.9,2,0,0,0,0,2,0,2018,9,19,2,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
870,1251,7,7.3,85,1.4,2,0,1,0,0,2,0,2018,10,25,3,450.0
871,2070,8,7.9,76,1.2,2,0,1,0,0,2,0,2018,10,25,3,1251.0
872,1168,9,10.3,64,1.0,2,0,1,0,0,2,1,2018,10,25,3,2070.0
873,779,10,13.4,53,1.1,2,0,1,0,0,2,2,2018,10,25,3,1168.0


### 5.2 Transform and Save Data

In [296]:
# Save transformed data
# NOTE(review): absolute, machine-specific paths; the target directory must already exist.
transformed_train_path = "/Users/manueljohn/Training/github-projects/bike-demand-prediction/artifacts/transformed_data/transformed_train_data.csv"
transformed_val_path = "/Users/manueljohn/Training/github-projects/bike-demand-prediction/artifacts/transformed_data/transformed_validation_data.csv"
transformed_test_path = "/Users/manueljohn/Training/github-projects/bike-demand-prediction/artifacts/transformed_data/transformed_test_data.csv"

# Transform the validation and test splits with transform() only (no re-fitting).
transformed_val_df = preprocessing_pipeline.transform(val_df)
transformed_test_df = preprocessing_pipeline.transform(test_df)

# Write all three transformed splits to CSV.
save_dataframe(transformed_train_df, transformed_train_path)
save_dataframe(transformed_val_df, transformed_val_path)
save_dataframe(transformed_test_df, transformed_test_path)

### 5.3 Save Pipeline Objects

In [291]:
# save the pipeline
# NOTE(review): absolute, machine-specific path; the target directory must already exist.
# NOTE(review): this cell's recorded execution count (291) is lower than that of the
# cell that builds and fits the pipeline (293), so the saved file may predate the
# final fit - re-run this cell after fitting.
preprocessing_pipeline_path = "/Users/manueljohn/Training/github-projects/bike-demand-prediction/artifacts/pipeline-components/preprocessing_pipeline.pkl"

with open(preprocessing_pipeline_path, 'wb') as f:
    cloudpickle.dump(preprocessing_pipeline, f)