In [1]:
import logging
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import lightgbm as lgb
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler
from sklearn.impute import SimpleImputer
from scipy.stats import boxcox

# Set style
plt.style.use('ggplot')
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")


In [2]:

class BikeDemandDataProcessor:
    def __init__(self, path):
        self.path = path

    def load_data(self, filename):
        full_path = f"{self.path}/{filename}"
        print(f"Loading data from {full_path}")
        try:
            return pd.read_csv(full_path)
        except FileNotFoundError:
            print(f"File {filename} not found in path {self.path}.")
            return None

    def preprocess(self, df):
        df = df.copy()
        df['timestamp'] = pd.to_datetime(df['dteday'] + ' ' + df['hr'].astype(str) + ':00:00', format='%d/%m/%Y %H:%M:%S')
        df.rename(columns={'hr': 'hour', 'yr' : 'year', 'mnth' : 'month', 'cnt' : 'count'}, inplace=True)
        df.drop(['dteday', 'instant'], axis=1, inplace=True)

        # Creating time-based features
        df['payday'] = df['timestamp'].dt.is_month_end.astype(int)
        df['year'] = df['timestamp'].dt.year
        df['day'] = df['timestamp'].dt.day_of_year
        df['day_of_week'] = df['timestamp'].dt.day_of_week
        df['day_of_month'] = df['timestamp'].dt.day
        df['month'] = df['timestamp'].dt.month
        df['week'] = df['timestamp'].dt.isocalendar().week
        df['year_sin'] = np.sin(2 * np.pi * df['year'])
        df['year_cos'] = np.cos(2 * np.pi * df['year'])
        df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12) 
        df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
        df['day_sin'] = np.sin(2 * np.pi * df['day'] / 31)  
        df['day_cos'] = np.cos(2 * np.pi * df['day'] / 31)
        df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
        df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
        df['working_day'] = df['day_of_week'].apply(lambda x: 1 if x < 5 else 0)
        df['weekend'] = df['day_of_week'].apply(lambda x: 1 if x >= 5 else 0)
        df['moonphase'] = df['timestamp'].apply(lambda x: (x.day + x.month * 29.53) % 29.53)
        df['quarter'] = df['timestamp'].dt.quarter
        df['christmas'] = df['timestamp'].apply(lambda x: 1 if x.month == 12 and x.day >= 20 else 0)
        df['day_of_year'] = df['timestamp'].dt.dayofyear

        # Creating rush hour feature
        df['rush_hour'] = df.apply(lambda x: 1 if ((x['hour'] >= 4 and x['hour'] <= 10) or (x['hour'] >= 15 and x['hour'] <= 21)) and x['working_day'] == 1 else 0, axis=1)

        # Convert object columns to category
        for col in df.select_dtypes(include='object').columns:
            df[col] = df[col].astype('category')

        return df.drop_duplicates()
    
    def feature_engineering(self, train_df, val_df):
        train_df = train_df.copy()
        val_df = val_df.copy()

        # Avoid division by zero
        casual_sum = train_df['casual'].sum()
        if casual_sum == 0:
            casual_sum = 1  # Prevent division by zero

        total_ratio_of_registered_uses = train_df['registered'].sum() / casual_sum

        # Average ratios for different time-based groupings
        average_hour_ratio = train_df.groupby('hour').apply(lambda x: x['registered'].sum() / (x['casual'].sum() or 1))
        average_day_ratio = train_df.groupby('day_of_week').apply(lambda x: x['registered'].sum() / (x['casual'].sum() or 1))
        average_week_ratio = train_df.groupby('week').apply(lambda x: x['registered'].sum() / (x['casual'].sum() or 1))
        average_month_ratio = train_df.groupby('month').apply(lambda x: x['registered'].sum() / (x['casual'].sum() or 1))
        average_season_ratio = train_df.groupby('season').apply(lambda x: x['registered'].sum() / (x['casual'].sum() or 1))
        average_weekend_ratio = train_df.groupby('weekend').apply(lambda x: x['registered'].sum() / (x['casual'].sum() or 1))
        average_working_day_ratio = train_df.groupby('working_day').apply(lambda x: x['registered'].sum() / (x['casual'].sum() or 1))

        # Applying ratios to both train and validation sets
        train_df['total_registered_ratio'] = total_ratio_of_registered_uses
        val_df['total_registered_ratio'] = total_ratio_of_registered_uses

        train_df['hour_ratio'] = train_df['hour'].map(average_hour_ratio)
        val_df['hour_ratio'] = val_df['hour'].map(average_hour_ratio)

        train_df['day_ratio'] = train_df['day_of_week'].map(average_day_ratio)
        val_df['day_ratio'] = val_df['day_of_week'].map(average_day_ratio)

        # Now use these ratios wherever needed for working day and weekend conditions
        train_df['working_day_or_weekend_ratio'] = train_df['working_day'].map(average_working_day_ratio).where(train_df['working_day'] == 1, 
                                                                                                                        train_df['weekend'].map(average_weekend_ratio))
        val_df['working_day_or_weekend_ratio'] = val_df['working_day'].map(average_working_day_ratio).where(val_df['working_day'] == 1, 
                                                                                                                    val_df['weekend'].map(average_weekend_ratio))

        train_df['week_ratio'] = train_df['week'].map(average_week_ratio)
        val_df['week_ratio'] = val_df['week'].map(average_week_ratio)

        train_df['month_ratio'] = train_df['month'].map(average_month_ratio)
        val_df['month_ratio'] = val_df['month'].map(average_month_ratio)

        train_df['season_ratio'] = train_df['season'].map(average_season_ratio)
        val_df['season_ratio'] = val_df['season'].map(average_season_ratio)

        # Dropping original target columns
        train_df.drop(['casual', 'registered'], axis=1, inplace=True)
        val_df.drop(['casual', 'registered'], axis=1, inplace=True)

        # Aggregate counts to daily level
        daily_train_df = train_df.groupby(['year', 'month', 'day', 'day_of_year'])[['count']].sum().reset_index()


        # Calculate rolling mean and standard deviation (2-week window)
        rolling_mean = daily_train_df['count'].rolling(window=14, center=True).mean()
        rolling_std = daily_train_df['count'].rolling(window=14, center=True).std()

        # Identify 3-sigma outliers
        daily_train_df['sigma_3_outlier'] = (daily_train_df['count'] > rolling_mean + 3 * rolling_std) | \
                                            (daily_train_df['count'] < rolling_mean - 3 * rolling_std)

        # Find max outlier flag per day_of_year
        day_of_year_outlier = daily_train_df.groupby('day_of_year', as_index=False)['sigma_3_outlier'].max()

        # Merge back into train_df and val_df
        train_df = train_df.merge(day_of_year_outlier, on='day_of_year', how='left')
        val_df = val_df.merge(day_of_year_outlier, on='day_of_year', how='left')

        # Fill NaN values (if no outlier was detected for that day, assume False)
        train_df['sigma_3_outlier'].fillna(False, inplace=True)
        val_df['sigma_3_outlier'].fillna(False, inplace=True)


        return train_df.drop_duplicates(), val_df.drop_duplicates()

        

        return train_df.drop_duplicates(), val_df.drop_duplicates()
    
    def split_and_engineer_data(self, df):
        sorted_df = df.sort_values('timestamp').copy()
        sorted_df.drop('timestamp', axis=1, inplace=True)

        train_df, val_df = train_test_split(sorted_df, test_size=0.2, shuffle=False)

        train_df, val_df = self.feature_engineering(train_df, val_df)

        return train_df, val_df


In [3]:

def inverse_boxcox(y_transformed, lambda_value):
    return (y_transformed * lambda_value + 1) ** (1 / lambda_value) - 1 if lambda_value != 0 else np.exp(y_transformed) - 1

In [4]:

# Load data
parent_path = Path().resolve()
processor = BikeDemandDataProcessor(parent_path)
hour_raw_df = processor.load_data("hour.csv")
hour_processed_df = processor.preprocess(hour_raw_df)


Loading data from /Users/lawrence/Documents/PYTHON/saga_tech_test_2025/hour.csv


In [5]:

# Check for missing data
null_counts = hour_processed_df.isnull().sum()
print(null_counts[null_counts > 0])


Series([], dtype: int64)


In [6]:
# Prepare data
train_df, val_df = processor.split_and_engineer_data(hour_processed_df)

  average_hour_ratio = train_df.groupby('hour').apply(lambda x: x['registered'].sum() / (x['casual'].sum() or 1))
  average_day_ratio = train_df.groupby('day_of_week').apply(lambda x: x['registered'].sum() / (x['casual'].sum() or 1))
  average_week_ratio = train_df.groupby('week').apply(lambda x: x['registered'].sum() / (x['casual'].sum() or 1))
  average_month_ratio = train_df.groupby('month').apply(lambda x: x['registered'].sum() / (x['casual'].sum() or 1))
  average_season_ratio = train_df.groupby('season').apply(lambda x: x['registered'].sum() / (x['casual'].sum() or 1))
  average_weekend_ratio = train_df.groupby('weekend').apply(lambda x: x['registered'].sum() / (x['casual'].sum() or 1))
  average_working_day_ratio = train_df.groupby('working_day').apply(lambda x: x['registered'].sum() / (x['casual'].sum() or 1))
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For 

In [7]:
train_df.columns

Index(['season', 'year', 'month', 'hour', 'holiday', 'weekday', 'workingday',
       'weathersit', 'temp', 'atemp', 'hum', 'windspeed', 'count', 'payday',
       'day', 'day_of_week', 'day_of_month', 'week', 'year_sin', 'year_cos',
       'month_sin', 'month_cos', 'day_sin', 'day_cos', 'hour_sin', 'hour_cos',
       'working_day', 'weekend', 'moonphase', 'quarter', 'christmas',
       'day_of_year', 'rush_hour', 'total_registered_ratio', 'hour_ratio',
       'day_ratio', 'working_day_or_weekend_ratio', 'week_ratio',
       'month_ratio', 'season_ratio', 'sigma_3_outlier'],
      dtype='object')

In [8]:
from sklearn.preprocessing import OrdinalEncoder


# Your feature list - corrected to match actual column names
features_in_importance_order = ['hour', 'atemp', 'hour_ratio', 'temp', 'hour_sin', 'year', 'year_sin', 'rush_hour', 
                                'hour_cos', 'working_day', 'weathersit', 'day_of_week', 'day', 'hum', 'day_of_year', 
                                'day_ratio', 'season', 'week_ratio', 'month_cos', 'weekend', 
                                'windspeed', 'week', 'working_day_or_weekend_ratio', 'holiday', 'day_sin', 
                                'moonphase', 'day_cos', 'month']

# Define features and target
target = 'count'

numeric_columns = ['temp', 'atemp', 'hum', 'windspeed', 'year_sin', 'month_cos', 
                   'day_sin', 'day_cos', 'hour_sin', 'hour_cos', 'moonphase', 'hour_ratio', 
                   'day_ratio', 'week_ratio', 'working_day_or_weekend_ratio']

categorical_label_encode_columns = ['season', 'year', 'month', 'hour', 'day_of_week', 'weathersit','day', 'week']

categorical_one_hot_columns = ['holiday', 'working_day', 'weekend', 'rush_hour']


# Combine features into one list
features = numeric_columns + categorical_label_encode_columns + categorical_one_hot_columns

In [9]:
train_df[features].head()

Unnamed: 0,temp,atemp,hum,windspeed,year_sin,month_cos,day_sin,day_cos,hour_sin,hour_cos,...,month,hour,day_of_week,weathersit,day,week,holiday,working_day,weekend,rush_hour
0,0.24,0.2879,0.81,0.0,-1.295466e-12,0.866025,0.201299,0.97953,0.0,1.0,...,1,0,5,1,1,52,0,0,1,0
1,0.22,0.2727,0.8,0.0,-1.295466e-12,0.866025,0.201299,0.97953,0.258819,0.965926,...,1,1,5,1,1,52,0,0,1,0
2,0.22,0.2727,0.8,0.0,-1.295466e-12,0.866025,0.201299,0.97953,0.5,0.866025,...,1,2,5,1,1,52,0,0,1,0
3,0.24,0.2879,0.75,0.0,-1.295466e-12,0.866025,0.201299,0.97953,0.707107,0.707107,...,1,3,5,1,1,52,0,0,1,0
4,0.24,0.2879,0.75,0.0,-1.295466e-12,0.866025,0.201299,0.97953,0.866025,0.5,...,1,4,5,1,1,52,0,0,1,0


In [10]:
val_df[features].head()

Unnamed: 0,temp,atemp,hum,windspeed,year_sin,month_cos,day_sin,day_cos,hour_sin,hour_cos,...,month,hour,day_of_week,weathersit,day,week,holiday,working_day,weekend,rush_hour
0,0.8,0.7576,0.55,0.1343,-6.349064e-13,-0.5,0.571268,0.820763,1.224647e-16,-1.0,...,8,12,1,2,220,32,0,1,0,0
1,0.8,0.7424,0.52,0.194,-6.349064e-13,-0.5,0.571268,0.820763,-0.258819,-0.965926,...,8,13,1,2,220,32,0,1,0,0
2,0.82,0.7576,0.46,0.0,-6.349064e-13,-0.5,0.571268,0.820763,-0.5,-0.866025,...,8,14,1,2,220,32,0,1,0,0
3,0.8,0.7424,0.52,0.0,-6.349064e-13,-0.5,0.571268,0.820763,-0.7071068,-0.707107,...,8,15,1,1,220,32,0,1,0,1
4,0.76,0.7273,0.66,0.2836,-6.349064e-13,-0.5,0.571268,0.820763,-0.8660254,-0.5,...,8,16,1,3,220,32,0,1,0,1


In [12]:

# Preprocessing pipeline
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", RobustScaler())
])

# One-hot encoding for categorical columns
categorical_onehot_transformer = Pipeline([
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

# Label encoding for other categorical columns
categorical_label_transformer = Pipeline([
    ("encoder", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1))
])

# Preprocessor will be defined inside the loop for each feature subset

# Log1p transformation
y_train, lambda_value = boxcox(train_df['count'] + 1)

# Set parameters for LightGBM
params = {
    'num_leaves': 35,  
    'learning_rate': 0.1,  
    'n_estimators': 5000, 
}

# Track best metrics
best_rmse = float("inf")
best_rmsle = float("inf")
best_features = None

# Create a list to store all results
all_results = []

# First run with all features
n_features = len(features_in_importance_order)
selected_features = features_in_importance_order.copy()

# Evaluate performance by progressively removing features from the end
for i in range(0, len(features_in_importance_order)):
    if i > 0:
        # Remove the last feature for each iteration after the first
        selected_features = features_in_importance_order[:-i]
        print('New selected features:', selected_features)
    
    # Check if all selected features exist in the dataset
    valid_features = [f for f in selected_features if f in train_df.columns]
    if len(valid_features) != len(selected_features):
        logging.warning(f"Some features not found in dataset: {set(selected_features) - set(valid_features)}")
        selected_features = valid_features
    
    # Get training data with selected features
    new_X_train = train_df[selected_features].copy()
    print(f"Using {len(selected_features)} features: {selected_features}")

     # Process validation data and make predictions
    # Create filtered feature lists based on the current selected features
    current_numeric_columns = [col for col in numeric_columns if col in selected_features]
    current_categorical_one_hot_columns = [col for col in categorical_one_hot_columns if col in selected_features]
    current_categorical_label_encode_columns = [col for col in categorical_label_encode_columns if col in selected_features]

    # print(f"Current numeric columns: {current_numeric_columns}")
    # print(f"Current one-hot columns: {current_categorical_one_hot_columns}")
    # print(f"Current label encode columns: {current_categorical_label_encode_columns}")   

    # Create a new preprocessor with current features
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, current_numeric_columns),
            ("cat_onehot", categorical_onehot_transformer, current_categorical_one_hot_columns),
            ("cat_label", categorical_label_transformer, current_categorical_label_encode_columns)
        ],
        remainder='drop',
        n_jobs=-1
    )

    # Fit the preprocessor
    X_train_processed = preprocessor.fit_transform(new_X_train)
    
    # Create and fit LightGBM model
    model = lgb.LGBMRegressor(**params, verbosity=-1)
    model.fit(X_train_processed, y_train)
    
    
    # Fit the preprocessor
    X_train_processed = preprocessor.fit_transform(new_X_train)

    # Process validation data and make predictions
    X_val = val_df[selected_features]
    y_val = val_df[target]
    X_val_processed = preprocessor.transform(X_val)
    y_pred = model.predict(X_val_processed)
    
    # Apply inverse transformation
    y_pred_inv = inverse_boxcox(y_pred, lambda_value)
    
    # Calculate metrics
    mse = mean_squared_error(y_val, y_pred_inv)
    rmse = np.sqrt(mse)
    rmsle = np.sqrt(mean_squared_error(np.log1p(y_val), np.log1p(y_pred_inv)))
    
    # Store results
    all_results.append({
        'n_features': len(selected_features),
        'features': selected_features.copy(),
        'rmse': rmse,
        'rmsle': rmsle
    })
    
    # Log the metrics
    logging.info(f"Features: {len(selected_features)} | RMSE: {rmse:.4f} | RMSLE: {rmsle:.4f}")
    
    # Track the best performing feature subset
    if rmse < best_rmse:
        best_rmse = rmse
        best_rmsle = rmsle
        best_features = selected_features.copy()

# Sort results by RMSE to find optimal feature count
all_results.sort(key=lambda x: x['rmse'])
best_result = all_results[0]

logging.info(f"Best Features ({best_result['n_features']}): {best_result['features']} | Best RMSE: {best_result['rmse']:.4f} | Best RMSLE: {best_result['rmsle']:.4f}")

Using 28 features: ['hour', 'atemp', 'hour_ratio', 'temp', 'hour_sin', 'year', 'year_sin', 'rush_hour', 'hour_cos', 'working_day', 'weathersit', 'day_of_week', 'day', 'hum', 'day_of_year', 'day_ratio', 'season', 'week_ratio', 'month_cos', 'weekend', 'windspeed', 'week', 'working_day_or_weekend_ratio', 'holiday', 'day_sin', 'moonphase', 'day_cos', 'month']


2025-03-08 19:43:21,076 - INFO - Features: 28 | RMSE: 61.0845 | RMSLE: 0.3811


New selected features: ['hour', 'atemp', 'hour_ratio', 'temp', 'hour_sin', 'year', 'year_sin', 'rush_hour', 'hour_cos', 'working_day', 'weathersit', 'day_of_week', 'day', 'hum', 'day_of_year', 'day_ratio', 'season', 'week_ratio', 'month_cos', 'weekend', 'windspeed', 'week', 'working_day_or_weekend_ratio', 'holiday', 'day_sin', 'moonphase', 'day_cos']
Using 27 features: ['hour', 'atemp', 'hour_ratio', 'temp', 'hour_sin', 'year', 'year_sin', 'rush_hour', 'hour_cos', 'working_day', 'weathersit', 'day_of_week', 'day', 'hum', 'day_of_year', 'day_ratio', 'season', 'week_ratio', 'month_cos', 'weekend', 'windspeed', 'week', 'working_day_or_weekend_ratio', 'holiday', 'day_sin', 'moonphase', 'day_cos']


2025-03-08 19:43:28,461 - INFO - Features: 27 | RMSE: 60.8900 | RMSLE: 0.3831


New selected features: ['hour', 'atemp', 'hour_ratio', 'temp', 'hour_sin', 'year', 'year_sin', 'rush_hour', 'hour_cos', 'working_day', 'weathersit', 'day_of_week', 'day', 'hum', 'day_of_year', 'day_ratio', 'season', 'week_ratio', 'month_cos', 'weekend', 'windspeed', 'week', 'working_day_or_weekend_ratio', 'holiday', 'day_sin', 'moonphase']
Using 26 features: ['hour', 'atemp', 'hour_ratio', 'temp', 'hour_sin', 'year', 'year_sin', 'rush_hour', 'hour_cos', 'working_day', 'weathersit', 'day_of_week', 'day', 'hum', 'day_of_year', 'day_ratio', 'season', 'week_ratio', 'month_cos', 'weekend', 'windspeed', 'week', 'working_day_or_weekend_ratio', 'holiday', 'day_sin', 'moonphase']


2025-03-08 19:43:34,750 - INFO - Features: 26 | RMSE: 61.2456 | RMSLE: 0.3762


New selected features: ['hour', 'atemp', 'hour_ratio', 'temp', 'hour_sin', 'year', 'year_sin', 'rush_hour', 'hour_cos', 'working_day', 'weathersit', 'day_of_week', 'day', 'hum', 'day_of_year', 'day_ratio', 'season', 'week_ratio', 'month_cos', 'weekend', 'windspeed', 'week', 'working_day_or_weekend_ratio', 'holiday', 'day_sin']
Using 25 features: ['hour', 'atemp', 'hour_ratio', 'temp', 'hour_sin', 'year', 'year_sin', 'rush_hour', 'hour_cos', 'working_day', 'weathersit', 'day_of_week', 'day', 'hum', 'day_of_year', 'day_ratio', 'season', 'week_ratio', 'month_cos', 'weekend', 'windspeed', 'week', 'working_day_or_weekend_ratio', 'holiday', 'day_sin']


2025-03-08 19:43:41,711 - INFO - Features: 25 | RMSE: 61.7349 | RMSLE: 0.3710


New selected features: ['hour', 'atemp', 'hour_ratio', 'temp', 'hour_sin', 'year', 'year_sin', 'rush_hour', 'hour_cos', 'working_day', 'weathersit', 'day_of_week', 'day', 'hum', 'day_of_year', 'day_ratio', 'season', 'week_ratio', 'month_cos', 'weekend', 'windspeed', 'week', 'working_day_or_weekend_ratio', 'holiday']
Using 24 features: ['hour', 'atemp', 'hour_ratio', 'temp', 'hour_sin', 'year', 'year_sin', 'rush_hour', 'hour_cos', 'working_day', 'weathersit', 'day_of_week', 'day', 'hum', 'day_of_year', 'day_ratio', 'season', 'week_ratio', 'month_cos', 'weekend', 'windspeed', 'week', 'working_day_or_weekend_ratio', 'holiday']


2025-03-08 19:43:48,877 - INFO - Features: 24 | RMSE: 62.7223 | RMSLE: 0.3811


New selected features: ['hour', 'atemp', 'hour_ratio', 'temp', 'hour_sin', 'year', 'year_sin', 'rush_hour', 'hour_cos', 'working_day', 'weathersit', 'day_of_week', 'day', 'hum', 'day_of_year', 'day_ratio', 'season', 'week_ratio', 'month_cos', 'weekend', 'windspeed', 'week', 'working_day_or_weekend_ratio']
Using 23 features: ['hour', 'atemp', 'hour_ratio', 'temp', 'hour_sin', 'year', 'year_sin', 'rush_hour', 'hour_cos', 'working_day', 'weathersit', 'day_of_week', 'day', 'hum', 'day_of_year', 'day_ratio', 'season', 'week_ratio', 'month_cos', 'weekend', 'windspeed', 'week', 'working_day_or_weekend_ratio']


2025-03-08 19:43:55,457 - INFO - Features: 23 | RMSE: 66.8783 | RMSLE: 0.4050


New selected features: ['hour', 'atemp', 'hour_ratio', 'temp', 'hour_sin', 'year', 'year_sin', 'rush_hour', 'hour_cos', 'working_day', 'weathersit', 'day_of_week', 'day', 'hum', 'day_of_year', 'day_ratio', 'season', 'week_ratio', 'month_cos', 'weekend', 'windspeed', 'week']
Using 22 features: ['hour', 'atemp', 'hour_ratio', 'temp', 'hour_sin', 'year', 'year_sin', 'rush_hour', 'hour_cos', 'working_day', 'weathersit', 'day_of_week', 'day', 'hum', 'day_of_year', 'day_ratio', 'season', 'week_ratio', 'month_cos', 'weekend', 'windspeed', 'week']


2025-03-08 19:44:01,259 - INFO - Features: 22 | RMSE: 66.8783 | RMSLE: 0.4050


New selected features: ['hour', 'atemp', 'hour_ratio', 'temp', 'hour_sin', 'year', 'year_sin', 'rush_hour', 'hour_cos', 'working_day', 'weathersit', 'day_of_week', 'day', 'hum', 'day_of_year', 'day_ratio', 'season', 'week_ratio', 'month_cos', 'weekend', 'windspeed']
Using 21 features: ['hour', 'atemp', 'hour_ratio', 'temp', 'hour_sin', 'year', 'year_sin', 'rush_hour', 'hour_cos', 'working_day', 'weathersit', 'day_of_week', 'day', 'hum', 'day_of_year', 'day_ratio', 'season', 'week_ratio', 'month_cos', 'weekend', 'windspeed']


2025-03-08 19:44:06,885 - INFO - Features: 21 | RMSE: 68.7678 | RMSLE: 0.4171


New selected features: ['hour', 'atemp', 'hour_ratio', 'temp', 'hour_sin', 'year', 'year_sin', 'rush_hour', 'hour_cos', 'working_day', 'weathersit', 'day_of_week', 'day', 'hum', 'day_of_year', 'day_ratio', 'season', 'week_ratio', 'month_cos', 'weekend']
Using 20 features: ['hour', 'atemp', 'hour_ratio', 'temp', 'hour_sin', 'year', 'year_sin', 'rush_hour', 'hour_cos', 'working_day', 'weathersit', 'day_of_week', 'day', 'hum', 'day_of_year', 'day_ratio', 'season', 'week_ratio', 'month_cos', 'weekend']


2025-03-08 19:44:12,188 - INFO - Features: 20 | RMSE: 65.8031 | RMSLE: 0.4099


New selected features: ['hour', 'atemp', 'hour_ratio', 'temp', 'hour_sin', 'year', 'year_sin', 'rush_hour', 'hour_cos', 'working_day', 'weathersit', 'day_of_week', 'day', 'hum', 'day_of_year', 'day_ratio', 'season', 'week_ratio', 'month_cos']
Using 19 features: ['hour', 'atemp', 'hour_ratio', 'temp', 'hour_sin', 'year', 'year_sin', 'rush_hour', 'hour_cos', 'working_day', 'weathersit', 'day_of_week', 'day', 'hum', 'day_of_year', 'day_ratio', 'season', 'week_ratio', 'month_cos']


2025-03-08 19:44:17,603 - INFO - Features: 19 | RMSE: 65.8031 | RMSLE: 0.4099


New selected features: ['hour', 'atemp', 'hour_ratio', 'temp', 'hour_sin', 'year', 'year_sin', 'rush_hour', 'hour_cos', 'working_day', 'weathersit', 'day_of_week', 'day', 'hum', 'day_of_year', 'day_ratio', 'season', 'week_ratio']
Using 18 features: ['hour', 'atemp', 'hour_ratio', 'temp', 'hour_sin', 'year', 'year_sin', 'rush_hour', 'hour_cos', 'working_day', 'weathersit', 'day_of_week', 'day', 'hum', 'day_of_year', 'day_ratio', 'season', 'week_ratio']


2025-03-08 19:44:22,687 - INFO - Features: 18 | RMSE: 64.3895 | RMSLE: 0.3987


New selected features: ['hour', 'atemp', 'hour_ratio', 'temp', 'hour_sin', 'year', 'year_sin', 'rush_hour', 'hour_cos', 'working_day', 'weathersit', 'day_of_week', 'day', 'hum', 'day_of_year', 'day_ratio', 'season']
Using 17 features: ['hour', 'atemp', 'hour_ratio', 'temp', 'hour_sin', 'year', 'year_sin', 'rush_hour', 'hour_cos', 'working_day', 'weathersit', 'day_of_week', 'day', 'hum', 'day_of_year', 'day_ratio', 'season']


2025-03-08 19:44:27,627 - INFO - Features: 17 | RMSE: 68.9547 | RMSLE: 0.4211


New selected features: ['hour', 'atemp', 'hour_ratio', 'temp', 'hour_sin', 'year', 'year_sin', 'rush_hour', 'hour_cos', 'working_day', 'weathersit', 'day_of_week', 'day', 'hum', 'day_of_year', 'day_ratio']
Using 16 features: ['hour', 'atemp', 'hour_ratio', 'temp', 'hour_sin', 'year', 'year_sin', 'rush_hour', 'hour_cos', 'working_day', 'weathersit', 'day_of_week', 'day', 'hum', 'day_of_year', 'day_ratio']


2025-03-08 19:44:32,527 - INFO - Features: 16 | RMSE: 68.7064 | RMSLE: 0.4088


New selected features: ['hour', 'atemp', 'hour_ratio', 'temp', 'hour_sin', 'year', 'year_sin', 'rush_hour', 'hour_cos', 'working_day', 'weathersit', 'day_of_week', 'day', 'hum', 'day_of_year']
Using 15 features: ['hour', 'atemp', 'hour_ratio', 'temp', 'hour_sin', 'year', 'year_sin', 'rush_hour', 'hour_cos', 'working_day', 'weathersit', 'day_of_week', 'day', 'hum', 'day_of_year']


2025-03-08 19:44:37,554 - INFO - Features: 15 | RMSE: 67.9343 | RMSLE: 0.4154


New selected features: ['hour', 'atemp', 'hour_ratio', 'temp', 'hour_sin', 'year', 'year_sin', 'rush_hour', 'hour_cos', 'working_day', 'weathersit', 'day_of_week', 'day', 'hum']
Using 14 features: ['hour', 'atemp', 'hour_ratio', 'temp', 'hour_sin', 'year', 'year_sin', 'rush_hour', 'hour_cos', 'working_day', 'weathersit', 'day_of_week', 'day', 'hum']


2025-03-08 19:44:42,393 - INFO - Features: 14 | RMSE: 67.9343 | RMSLE: 0.4154


New selected features: ['hour', 'atemp', 'hour_ratio', 'temp', 'hour_sin', 'year', 'year_sin', 'rush_hour', 'hour_cos', 'working_day', 'weathersit', 'day_of_week', 'day']
Using 13 features: ['hour', 'atemp', 'hour_ratio', 'temp', 'hour_sin', 'year', 'year_sin', 'rush_hour', 'hour_cos', 'working_day', 'weathersit', 'day_of_week', 'day']


2025-03-08 19:44:48,140 - INFO - Features: 13 | RMSE: 74.2683 | RMSLE: 0.4295


New selected features: ['hour', 'atemp', 'hour_ratio', 'temp', 'hour_sin', 'year', 'year_sin', 'rush_hour', 'hour_cos', 'working_day', 'weathersit', 'day_of_week']
Using 12 features: ['hour', 'atemp', 'hour_ratio', 'temp', 'hour_sin', 'year', 'year_sin', 'rush_hour', 'hour_cos', 'working_day', 'weathersit', 'day_of_week']


2025-03-08 19:44:53,774 - INFO - Features: 12 | RMSE: 89.7892 | RMSLE: 0.4938


New selected features: ['hour', 'atemp', 'hour_ratio', 'temp', 'hour_sin', 'year', 'year_sin', 'rush_hour', 'hour_cos', 'working_day', 'weathersit']
Using 11 features: ['hour', 'atemp', 'hour_ratio', 'temp', 'hour_sin', 'year', 'year_sin', 'rush_hour', 'hour_cos', 'working_day', 'weathersit']


2025-03-08 19:44:59,513 - INFO - Features: 11 | RMSE: 93.4754 | RMSLE: 0.5094


New selected features: ['hour', 'atemp', 'hour_ratio', 'temp', 'hour_sin', 'year', 'year_sin', 'rush_hour', 'hour_cos', 'working_day']
Using 10 features: ['hour', 'atemp', 'hour_ratio', 'temp', 'hour_sin', 'year', 'year_sin', 'rush_hour', 'hour_cos', 'working_day']


2025-03-08 19:45:06,514 - INFO - Features: 10 | RMSE: 101.0564 | RMSLE: 0.5464


New selected features: ['hour', 'atemp', 'hour_ratio', 'temp', 'hour_sin', 'year', 'year_sin', 'rush_hour', 'hour_cos']
Using 9 features: ['hour', 'atemp', 'hour_ratio', 'temp', 'hour_sin', 'year', 'year_sin', 'rush_hour', 'hour_cos']


2025-03-08 19:45:13,132 - INFO - Features: 9 | RMSE: 113.4927 | RMSLE: 0.6467


New selected features: ['hour', 'atemp', 'hour_ratio', 'temp', 'hour_sin', 'year', 'year_sin', 'rush_hour']
Using 8 features: ['hour', 'atemp', 'hour_ratio', 'temp', 'hour_sin', 'year', 'year_sin', 'rush_hour']


2025-03-08 19:45:18,185 - INFO - Features: 8 | RMSE: 113.6224 | RMSLE: 0.6444


New selected features: ['hour', 'atemp', 'hour_ratio', 'temp', 'hour_sin', 'year', 'year_sin']
Using 7 features: ['hour', 'atemp', 'hour_ratio', 'temp', 'hour_sin', 'year', 'year_sin']


2025-03-08 19:45:23,188 - INFO - Features: 7 | RMSE: 140.6332 | RMSLE: 0.7396


New selected features: ['hour', 'atemp', 'hour_ratio', 'temp', 'hour_sin', 'year']
Using 6 features: ['hour', 'atemp', 'hour_ratio', 'temp', 'hour_sin', 'year']


2025-03-08 19:45:28,079 - INFO - Features: 6 | RMSE: 140.6332 | RMSLE: 0.7396


New selected features: ['hour', 'atemp', 'hour_ratio', 'temp', 'hour_sin']
Using 5 features: ['hour', 'atemp', 'hour_ratio', 'temp', 'hour_sin']


2025-03-08 19:45:33,298 - INFO - Features: 5 | RMSE: 163.4913 | RMSLE: 0.7683


New selected features: ['hour', 'atemp', 'hour_ratio', 'temp']
Using 4 features: ['hour', 'atemp', 'hour_ratio', 'temp']


2025-03-08 19:45:39,086 - INFO - Features: 4 | RMSE: 163.2931 | RMSLE: 0.7652


New selected features: ['hour', 'atemp', 'hour_ratio']
Using 3 features: ['hour', 'atemp', 'hour_ratio']


2025-03-08 19:45:45,869 - INFO - Features: 3 | RMSE: 162.0745 | RMSLE: 0.7334


New selected features: ['hour', 'atemp']
Using 2 features: ['hour', 'atemp']


2025-03-08 19:45:51,932 - INFO - Features: 2 | RMSE: 162.0697 | RMSLE: 0.7326


New selected features: ['hour']
Using 1 features: ['hour']


2025-03-08 19:45:56,310 - INFO - Features: 1 | RMSE: 183.2609 | RMSLE: 0.7745
2025-03-08 19:45:56,311 - INFO - Best Features (27): ['hour', 'atemp', 'hour_ratio', 'temp', 'hour_sin', 'year', 'year_sin', 'rush_hour', 'hour_cos', 'working_day', 'weathersit', 'day_of_week', 'day', 'hum', 'day_of_year', 'day_ratio', 'season', 'week_ratio', 'month_cos', 'weekend', 'windspeed', 'week', 'working_day_or_weekend_ratio', 'holiday', 'day_sin', 'moonphase', 'day_cos'] | Best RMSE: 60.8900 | Best RMSLE: 0.3831


In [14]:
from sklearn.preprocessing import OrdinalEncoder


# Your feature list - corrected to match actual column names
features_in_importance_order = ['hour', 'atemp' , 'temp', 'year', 'rush_hour', 
                                'working_day', 'weathersit', 'day_of_week', 'day', 'hum', 'day_of_year', 
                                'day_ratio', 'season', 'week_ratio', 'month_cos', 'weekend', 
                                'windspeed', 'week', 'working_day_or_weekend_ratio', 'holiday', 
                                'moonphase']

# Define features and target
target = 'count'

numeric_columns = ['temp', 'atemp', 'hum', 'windspeed', 'month_cos', 
                    'moonphase', 'hour_ratio', 
                   'day_ratio', 'week_ratio', 'working_day_or_weekend_ratio']

categorical_label_encode_columns = ['season', 'year', 'month', 'hour', 'day_of_week', 'weathersit','day', 'week']

categorical_one_hot_columns = ['holiday', 'working_day', 'weekend', 'rush_hour']


# Combine features into one list
features = numeric_columns + categorical_label_encode_columns + categorical_one_hot_columns


# Preprocessing pipeline
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", RobustScaler())
])

# One-hot encoding for categorical columns
categorical_onehot_transformer = Pipeline([
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

# Label encoding for other categorical columns
categorical_label_transformer = Pipeline([
    ("encoder", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1))
])

# Preprocessor will be defined inside the loop for each feature subset

# Log1p transformation
y_train, lambda_value = boxcox(train_df['count'] + 1)

# Set parameters for LightGBM
params = {
    'num_leaves': 35,  
    'learning_rate': 0.1,  
    'n_estimators': 5000, 
}

# Track best metrics
best_rmse = float("inf")
best_rmsle = float("inf")
best_features = None

# Create a list to store all results
all_results = []

# First run with all features
n_features = len(features_in_importance_order)
selected_features = features_in_importance_order.copy()

# Evaluate performance by progressively removing features from the end
for i in range(0, len(features_in_importance_order)):
    if i > 0:
        # Remove the last feature for each iteration after the first
        selected_features = features_in_importance_order[:-i]
        print('New selected features:', selected_features)
    
    # Check if all selected features exist in the dataset
    valid_features = [f for f in selected_features if f in train_df.columns]
    if len(valid_features) != len(selected_features):
        logging.warning(f"Some features not found in dataset: {set(selected_features) - set(valid_features)}")
        selected_features = valid_features
    
    # Get training data with selected features
    new_X_train = train_df[selected_features].copy()
    print(f"Using {len(selected_features)} features: {selected_features}")

     # Process validation data and make predictions
    # Create filtered feature lists based on the current selected features
    current_numeric_columns = [col for col in numeric_columns if col in selected_features]
    current_categorical_one_hot_columns = [col for col in categorical_one_hot_columns if col in selected_features]
    current_categorical_label_encode_columns = [col for col in categorical_label_encode_columns if col in selected_features]

    # print(f"Current numeric columns: {current_numeric_columns}")
    # print(f"Current one-hot columns: {current_categorical_one_hot_columns}")
    # print(f"Current label encode columns: {current_categorical_label_encode_columns}")   

    # Create a new preprocessor with current features
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, current_numeric_columns),
            ("cat_onehot", categorical_onehot_transformer, current_categorical_one_hot_columns),
            ("cat_label", categorical_label_transformer, current_categorical_label_encode_columns)
        ],
        remainder='drop',
        n_jobs=-1
    )

    # Fit the preprocessor
    X_train_processed = preprocessor.fit_transform(new_X_train)
    
    # Create and fit LightGBM model
    model = lgb.LGBMRegressor(**params, verbosity=-1)
    model.fit(X_train_processed, y_train)
    
    
    # Fit the preprocessor
    X_train_processed = preprocessor.fit_transform(new_X_train)

    # Process validation data and make predictions
    X_val = val_df[selected_features]
    y_val = val_df[target]
    X_val_processed = preprocessor.transform(X_val)
    y_pred = model.predict(X_val_processed)
    
    # Apply inverse transformation
    y_pred_inv = inverse_boxcox(y_pred, lambda_value)
    
    # Calculate metrics
    mse = mean_squared_error(y_val, y_pred_inv)
    rmse = np.sqrt(mse)
    rmsle = np.sqrt(mean_squared_error(np.log1p(y_val), np.log1p(y_pred_inv)))
    
    # Store results
    all_results.append({
        'n_features': len(selected_features),
        'features': selected_features.copy(),
        'rmse': rmse,
        'rmsle': rmsle
    })
    
    # Log the metrics
    logging.info(f"Features: {len(selected_features)} | RMSE: {rmse:.4f} | RMSLE: {rmsle:.4f}")
    
    # Track the best performing feature subset
    if rmse < best_rmse:
        best_rmse = rmse
        best_rmsle = rmsle
        best_features = selected_features.copy()

# Sort results by RMSE to find optimal feature count
all_results.sort(key=lambda x: x['rmse'])
best_result = all_results[0]

logging.info(f"Best Features ({best_result['n_features']}): {best_result['features']} | Best RMSE: {best_result['rmse']:.4f} | Best RMSLE: {best_result['rmsle']:.4f}")

Using 21 features: ['hour', 'atemp', 'temp', 'year', 'rush_hour', 'working_day', 'weathersit', 'day_of_week', 'day', 'hum', 'day_of_year', 'day_ratio', 'season', 'week_ratio', 'month_cos', 'weekend', 'windspeed', 'week', 'working_day_or_weekend_ratio', 'holiday', 'moonphase']


2025-03-08 19:53:07,916 - INFO - Features: 21 | RMSE: 63.1772 | RMSLE: 0.3879


New selected features: ['hour', 'atemp', 'temp', 'year', 'rush_hour', 'working_day', 'weathersit', 'day_of_week', 'day', 'hum', 'day_of_year', 'day_ratio', 'season', 'week_ratio', 'month_cos', 'weekend', 'windspeed', 'week', 'working_day_or_weekend_ratio', 'holiday']
Using 20 features: ['hour', 'atemp', 'temp', 'year', 'rush_hour', 'working_day', 'weathersit', 'day_of_week', 'day', 'hum', 'day_of_year', 'day_ratio', 'season', 'week_ratio', 'month_cos', 'weekend', 'windspeed', 'week', 'working_day_or_weekend_ratio', 'holiday']


2025-03-08 19:53:16,238 - INFO - Features: 20 | RMSE: 63.2579 | RMSLE: 0.3824


New selected features: ['hour', 'atemp', 'temp', 'year', 'rush_hour', 'working_day', 'weathersit', 'day_of_week', 'day', 'hum', 'day_of_year', 'day_ratio', 'season', 'week_ratio', 'month_cos', 'weekend', 'windspeed', 'week', 'working_day_or_weekend_ratio']
Using 19 features: ['hour', 'atemp', 'temp', 'year', 'rush_hour', 'working_day', 'weathersit', 'day_of_week', 'day', 'hum', 'day_of_year', 'day_ratio', 'season', 'week_ratio', 'month_cos', 'weekend', 'windspeed', 'week', 'working_day_or_weekend_ratio']


2025-03-08 19:53:25,003 - INFO - Features: 19 | RMSE: 67.3725 | RMSLE: 0.4107


New selected features: ['hour', 'atemp', 'temp', 'year', 'rush_hour', 'working_day', 'weathersit', 'day_of_week', 'day', 'hum', 'day_of_year', 'day_ratio', 'season', 'week_ratio', 'month_cos', 'weekend', 'windspeed', 'week']
Using 18 features: ['hour', 'atemp', 'temp', 'year', 'rush_hour', 'working_day', 'weathersit', 'day_of_week', 'day', 'hum', 'day_of_year', 'day_ratio', 'season', 'week_ratio', 'month_cos', 'weekend', 'windspeed', 'week']


2025-03-08 19:53:32,467 - INFO - Features: 18 | RMSE: 67.3725 | RMSLE: 0.4107


New selected features: ['hour', 'atemp', 'temp', 'year', 'rush_hour', 'working_day', 'weathersit', 'day_of_week', 'day', 'hum', 'day_of_year', 'day_ratio', 'season', 'week_ratio', 'month_cos', 'weekend', 'windspeed']
Using 17 features: ['hour', 'atemp', 'temp', 'year', 'rush_hour', 'working_day', 'weathersit', 'day_of_week', 'day', 'hum', 'day_of_year', 'day_ratio', 'season', 'week_ratio', 'month_cos', 'weekend', 'windspeed']


2025-03-08 19:53:40,074 - INFO - Features: 17 | RMSE: 70.3370 | RMSLE: 0.4178


New selected features: ['hour', 'atemp', 'temp', 'year', 'rush_hour', 'working_day', 'weathersit', 'day_of_week', 'day', 'hum', 'day_of_year', 'day_ratio', 'season', 'week_ratio', 'month_cos', 'weekend']
Using 16 features: ['hour', 'atemp', 'temp', 'year', 'rush_hour', 'working_day', 'weathersit', 'day_of_week', 'day', 'hum', 'day_of_year', 'day_ratio', 'season', 'week_ratio', 'month_cos', 'weekend']


2025-03-08 19:53:47,339 - INFO - Features: 16 | RMSE: 68.8141 | RMSLE: 0.4186


New selected features: ['hour', 'atemp', 'temp', 'year', 'rush_hour', 'working_day', 'weathersit', 'day_of_week', 'day', 'hum', 'day_of_year', 'day_ratio', 'season', 'week_ratio', 'month_cos']
Using 15 features: ['hour', 'atemp', 'temp', 'year', 'rush_hour', 'working_day', 'weathersit', 'day_of_week', 'day', 'hum', 'day_of_year', 'day_ratio', 'season', 'week_ratio', 'month_cos']


2025-03-08 19:53:53,996 - INFO - Features: 15 | RMSE: 68.8141 | RMSLE: 0.4186


New selected features: ['hour', 'atemp', 'temp', 'year', 'rush_hour', 'working_day', 'weathersit', 'day_of_week', 'day', 'hum', 'day_of_year', 'day_ratio', 'season', 'week_ratio']
Using 14 features: ['hour', 'atemp', 'temp', 'year', 'rush_hour', 'working_day', 'weathersit', 'day_of_week', 'day', 'hum', 'day_of_year', 'day_ratio', 'season', 'week_ratio']


2025-03-08 19:54:00,274 - INFO - Features: 14 | RMSE: 67.7643 | RMSLE: 0.4090


New selected features: ['hour', 'atemp', 'temp', 'year', 'rush_hour', 'working_day', 'weathersit', 'day_of_week', 'day', 'hum', 'day_of_year', 'day_ratio', 'season']
Using 13 features: ['hour', 'atemp', 'temp', 'year', 'rush_hour', 'working_day', 'weathersit', 'day_of_week', 'day', 'hum', 'day_of_year', 'day_ratio', 'season']


2025-03-08 19:54:07,164 - INFO - Features: 13 | RMSE: 67.7123 | RMSLE: 0.4196


New selected features: ['hour', 'atemp', 'temp', 'year', 'rush_hour', 'working_day', 'weathersit', 'day_of_week', 'day', 'hum', 'day_of_year', 'day_ratio']
Using 12 features: ['hour', 'atemp', 'temp', 'year', 'rush_hour', 'working_day', 'weathersit', 'day_of_week', 'day', 'hum', 'day_of_year', 'day_ratio']


2025-03-08 19:54:15,739 - INFO - Features: 12 | RMSE: 66.6147 | RMSLE: 0.4222


New selected features: ['hour', 'atemp', 'temp', 'year', 'rush_hour', 'working_day', 'weathersit', 'day_of_week', 'day', 'hum', 'day_of_year']
Using 11 features: ['hour', 'atemp', 'temp', 'year', 'rush_hour', 'working_day', 'weathersit', 'day_of_week', 'day', 'hum', 'day_of_year']


2025-03-08 19:54:22,106 - INFO - Features: 11 | RMSE: 67.9929 | RMSLE: 0.4258


New selected features: ['hour', 'atemp', 'temp', 'year', 'rush_hour', 'working_day', 'weathersit', 'day_of_week', 'day', 'hum']
Using 10 features: ['hour', 'atemp', 'temp', 'year', 'rush_hour', 'working_day', 'weathersit', 'day_of_week', 'day', 'hum']


2025-03-08 19:54:30,176 - INFO - Features: 10 | RMSE: 67.9929 | RMSLE: 0.4258


New selected features: ['hour', 'atemp', 'temp', 'year', 'rush_hour', 'working_day', 'weathersit', 'day_of_week', 'day']
Using 9 features: ['hour', 'atemp', 'temp', 'year', 'rush_hour', 'working_day', 'weathersit', 'day_of_week', 'day']


2025-03-08 19:54:42,977 - INFO - Features: 9 | RMSE: 75.8844 | RMSLE: 0.4466


New selected features: ['hour', 'atemp', 'temp', 'year', 'rush_hour', 'working_day', 'weathersit', 'day_of_week']
Using 8 features: ['hour', 'atemp', 'temp', 'year', 'rush_hour', 'working_day', 'weathersit', 'day_of_week']


2025-03-08 19:54:51,567 - INFO - Features: 8 | RMSE: 88.2216 | RMSLE: 0.5033


New selected features: ['hour', 'atemp', 'temp', 'year', 'rush_hour', 'working_day', 'weathersit']
Using 7 features: ['hour', 'atemp', 'temp', 'year', 'rush_hour', 'working_day', 'weathersit']


2025-03-08 19:54:56,955 - INFO - Features: 7 | RMSE: 93.1288 | RMSLE: 0.5121


New selected features: ['hour', 'atemp', 'temp', 'year', 'rush_hour', 'working_day']
Using 6 features: ['hour', 'atemp', 'temp', 'year', 'rush_hour', 'working_day']


2025-03-08 19:55:02,341 - INFO - Features: 6 | RMSE: 100.2434 | RMSLE: 0.5424


New selected features: ['hour', 'atemp', 'temp', 'year', 'rush_hour']
Using 5 features: ['hour', 'atemp', 'temp', 'year', 'rush_hour']


2025-03-08 19:55:08,804 - INFO - Features: 5 | RMSE: 113.6144 | RMSLE: 0.6490


New selected features: ['hour', 'atemp', 'temp', 'year']
Using 4 features: ['hour', 'atemp', 'temp', 'year']


2025-03-08 19:55:16,277 - INFO - Features: 4 | RMSE: 140.1900 | RMSLE: 0.7318


New selected features: ['hour', 'atemp', 'temp']
Using 3 features: ['hour', 'atemp', 'temp']


2025-03-08 19:55:21,923 - INFO - Features: 3 | RMSE: 163.5142 | RMSLE: 0.7660


New selected features: ['hour', 'atemp']
Using 2 features: ['hour', 'atemp']


2025-03-08 19:55:30,451 - INFO - Features: 2 | RMSE: 162.0697 | RMSLE: 0.7326


New selected features: ['hour']
Using 1 features: ['hour']


2025-03-08 19:55:37,587 - INFO - Features: 1 | RMSE: 183.2609 | RMSLE: 0.7745
2025-03-08 19:55:37,588 - INFO - Best Features (21): ['hour', 'atemp', 'temp', 'year', 'rush_hour', 'working_day', 'weathersit', 'day_of_week', 'day', 'hum', 'day_of_year', 'day_ratio', 'season', 'week_ratio', 'month_cos', 'weekend', 'windspeed', 'week', 'working_day_or_weekend_ratio', 'holiday', 'moonphase'] | Best RMSE: 63.1772 | Best RMSLE: 0.3879


In [31]:
from sklearn.feature_selection import RFECV

from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import OrdinalEncoder


# Your feature list - corrected to match actual column names
features_in_importance_order = ['hour', 'atemp' , 'temp', 'year', 'rush_hour', 
                                'working_day', 'weathersit', 'day_of_week', 'day', 'hum', 'day_of_year', 
                                'day_ratio', 'season', 'week_ratio', 'month_cos', 'weekend', 
                                'windspeed', 'week', 'working_day_or_weekend_ratio', 'holiday', 
                                'moonphase']

# Define features and target
target = 'count'

numeric_columns = ['temp', 'atemp', 'hum', 'windspeed', 'month_cos', 
                    'moonphase', 'hour_ratio', 
                   'day_ratio', 'week_ratio', 'working_day_or_weekend_ratio']

categorical_label_encode_columns = ['season', 'year', 'month', 'hour', 'day_of_week', 'weathersit','day', 'week']

categorical_one_hot_columns = ['holiday', 'working_day', 'weekend', 'rush_hour']


# Combine features into one list
features = numeric_columns + categorical_label_encode_columns + categorical_one_hot_columns

# Preprocessing pipeline
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", RobustScaler())
])

# One-hot encoding for categorical columns
categorical_onehot_transformer = Pipeline([
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

# Label encoding for other categorical columns
categorical_label_transformer = Pipeline([
    ("encoder", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1))
])

# Set parameters for LightGBM
params = {
    'num_leaves': 35,  
    'learning_rate': 0.1,  
    'n_estimators': 5000, 
}

# Create a preprocessor with all features
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_columns),
        ("cat_onehot", categorical_onehot_transformer, categorical_one_hot_columns),
        ("cat_label", categorical_label_transformer, categorical_label_encode_columns)
    ],
    remainder='drop',
    n_jobs=-1
)

regressor = lgb.LGBMRegressor(**params, verbosity=-1)

transformer = QuantileTransformer(output_distribution='normal')

regr = TransformedTargetRegressor(regressor=regressor,
                                  transformer=transformer)

tcsv = TimeSeriesSplit(5)

rfetscv = RFECV(
    estimator=regr,
    step=1,
    cv=tcsv,
    scoring="neg_mean_squared_error",
    min_features_to_select=3,
    n_jobs=2,
)

reg1 = Pipeline([
    ('preprocessor', preprocessor),
    ('feature_eliminator', rfetscv),
    ('model', regr)
])

mse_scores = cross_val_score(reg1, X_train, y_train, scoring='neg_mean_squared_error', cv=tcsv)
rmse_scores = np.sqrt(-mse_scores)

print('Average RMSE:', np.mean(rmse_scores))
print('number of features:', reg1['feature_eliminator'].n_features_)
print('Optimal number of features:', rfetscv.n_features_)
print('Optimal features:', X_train.columns[rfetscv.support_])
X_train = train_df[features].copy()

y_train, lambda_value = train_df['count']

mse_scores = cross_val_score(reg1, X_train, y_train, scoring='neg_mean_squared_error', cv=tcsv)
rmse_scores = np.sqrt(-mse_scores)

print('Average RMSE:', np.mean(rmse_scores))
print('number of features:', reg1['m'].n_features_in_)



ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
joblib.externals.loky.process_executor._RemoteTraceback: 
"""
Traceback (most recent call last):
  File "/Users/lawrence/Documents/PYTHON/saga_tech_test_2025/venv/lib/python3.11/site-packages/joblib/externals/loky/process_executor.py", line 463, in _process_worker
    r = call_item()
        ^^^^^^^^^^^
  File "/Users/lawrence/Documents/PYTHON/saga_tech_test_2025/venv/lib/python3.11/site-packages/joblib/externals/loky/process_executor.py", line 291, in __call__
    return self.fn(*self.args, **self.kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/lawrence/Documents/PYTHON/saga_tech_test_2025/venv/lib/python3.11/site-packages/joblib/parallel.py", line 598, in __call__
    return [func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/lawrence/Documents/PYTHON/saga_tech_test_2025/venv/lib/python3.11/site-packages/joblib/parallel.py", line 598, in <listcomp>
    return [func(*args, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^
  File "/Users/lawrence/Documents/PYTHON/saga_tech_test_2025/venv/lib/python3.11/site-packages/sklearn/utils/parallel.py", line 139, in __call__
    return self.function(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/lawrence/Documents/PYTHON/saga_tech_test_2025/venv/lib/python3.11/site-packages/sklearn/feature_selection/_rfe.py", line 52, in _rfe_single_fit
    rfe._fit(
  File "/Users/lawrence/Documents/PYTHON/saga_tech_test_2025/venv/lib/python3.11/site-packages/sklearn/feature_selection/_rfe.py", line 335, in _fit
    importances = _get_feature_importances(
                  ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/lawrence/Documents/PYTHON/saga_tech_test_2025/venv/lib/python3.11/site-packages/sklearn/feature_selection/_base.py", line 234, in _get_feature_importances
    raise ValueError(
ValueError: when `importance_getter=='auto'`, the underlying estimator TransformedTargetRegressor should have `coef_` or `feature_importances_` attribute. Either pass a fitted estimator to feature selector or call fit before calling transform.
"""

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/Users/lawrence/Documents/PYTHON/saga_tech_test_2025/venv/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/lawrence/Documents/PYTHON/saga_tech_test_2025/venv/lib/python3.11/site-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/lawrence/Documents/PYTHON/saga_tech_test_2025/venv/lib/python3.11/site-packages/sklearn/pipeline.py", line 654, in fit
    Xt = self._fit(X, y, routed_params, raw_params=params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/lawrence/Documents/PYTHON/saga_tech_test_2025/venv/lib/python3.11/site-packages/sklearn/pipeline.py", line 588, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/lawrence/Documents/PYTHON/saga_tech_test_2025/venv/lib/python3.11/site-packages/joblib/memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/lawrence/Documents/PYTHON/saga_tech_test_2025/venv/lib/python3.11/site-packages/sklearn/pipeline.py", line 1551, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/lawrence/Documents/PYTHON/saga_tech_test_2025/venv/lib/python3.11/site-packages/sklearn/utils/_set_output.py", line 319, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/lawrence/Documents/PYTHON/saga_tech_test_2025/venv/lib/python3.11/site-packages/sklearn/base.py", line 921, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/lawrence/Documents/PYTHON/saga_tech_test_2025/venv/lib/python3.11/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^
  File "/Users/lawrence/Documents/PYTHON/saga_tech_test_2025/venv/lib/python3.11/site-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/lawrence/Documents/PYTHON/saga_tech_test_2025/venv/lib/python3.11/site-packages/sklearn/feature_selection/_rfe.py", line 873, in fit
    scores_features = parallel(
                      ^^^^^^^^^
  File "/Users/lawrence/Documents/PYTHON/saga_tech_test_2025/venv/lib/python3.11/site-packages/sklearn/utils/parallel.py", line 77, in __call__
    return super().__call__(iterable_with_config)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/lawrence/Documents/PYTHON/saga_tech_test_2025/venv/lib/python3.11/site-packages/joblib/parallel.py", line 2007, in __call__
    return output if self.return_generator else list(output)
                                                ^^^^^^^^^^^^
  File "/Users/lawrence/Documents/PYTHON/saga_tech_test_2025/venv/lib/python3.11/site-packages/joblib/parallel.py", line 1650, in _get_outputs
    yield from self._retrieve()
  File "/Users/lawrence/Documents/PYTHON/saga_tech_test_2025/venv/lib/python3.11/site-packages/joblib/parallel.py", line 1754, in _retrieve
    self._raise_error_fast()
  File "/Users/lawrence/Documents/PYTHON/saga_tech_test_2025/venv/lib/python3.11/site-packages/joblib/parallel.py", line 1789, in _raise_error_fast
    error_job.get_result(self.timeout)
  File "/Users/lawrence/Documents/PYTHON/saga_tech_test_2025/venv/lib/python3.11/site-packages/joblib/parallel.py", line 745, in get_result
    return self._return_or_raise()
           ^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/lawrence/Documents/PYTHON/saga_tech_test_2025/venv/lib/python3.11/site-packages/joblib/parallel.py", line 763, in _return_or_raise
    raise self._result
ValueError: when `importance_getter=='auto'`, the underlying estimator TransformedTargetRegressor should have `coef_` or `feature_importances_` attribute. Either pass a fitted estimator to feature selector or call fit before calling transform.


In [None]:

import matplotlib.pyplot as plt

# Fit the pipeline to access the RFECV metrics
print("Fitting the pipeline to access feature selection metrics...")
reg1.fit(X_train, y_train)

# Display feature selection information
print('Number of features selected by RFECV:', reg1.named_steps['feature_eliminator'].n_features_)

# Get and display the selected features
selected_features_mask = reg1.named_steps['feature_eliminator'].support_
selected_feature_names = np.array(features)[selected_features_mask].tolist()
print('Selected features:', selected_feature_names)

# Plot the feature selection scores
plt.figure(figsize=(10, 6))
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (neg_mean_squared_error)")
plt.plot(range(1, len(reg1.named_steps['feature_eliminator'].grid_scores_) + 1), 
         reg1.named_steps['feature_eliminator'].grid_scores_)
plt.title("Optimal Number of Features")
plt.tight_layout()
plt.show()

# Display feature ranking information (1 = selected)
feature_ranking = dict(zip(features, reg1.named_steps['feature_eliminator'].ranking_))
sorted_ranking = {k: v for k, v in sorted(feature_ranking.items(), key=lambda item: item[1])}
print("\nFeature Ranking (1 = selected):")
for feature, rank in sorted_ranking.items():
    print(f"{feature}: {rank}")

# Calculate performance metrics
y_pred_train = reg1.predict(X_train)
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
r2_train = r2_score(y_train, y_pred_train)

print(f"\nTraining RMSE: {rmse_train:.4f}")
print(f"Training R²: {r2_train:.4f}")

# Evaluate on validation data
X_val_full = val_df[features]
y_val_true = val_df[target]
y_val_pred = reg1.predict(X_val_full)
y_val_pred_inv = inverse_boxcox(y_val_pred, lambda_value)

rmse_val = np.sqrt(mean_squared_error(y_val_true, y_val_pred_inv))
rmsle_val = np.sqrt(mean_squared_error(np.log1p(y_val_true), np.log1p(y_val_pred_inv)))
r2_val = r2_score(y_val_true, y_val_pred_inv)

print(f"\nValidation RMSE: {rmse_val:.4f}")
print(f"Validation RMSLE: {rmsle_val:.4f}")
print(f"Validation R²: {r2_val:.4f}")

In [27]:
# Recreate the timestamp for training data
train_df['timestamp'] = pd.to_datetime({
	'year': train_df['year'], 
	'month': train_df['month'], 
	'day': train_df['day_of_month']
}) + pd.to_timedelta(train_df['hour'], unit='h')

val_df['timestamp'] = pd.to_datetime({
	'year': val_df['year'], 
	'month': val_df['month'], 
	'day': val_df['day_of_month']
}) + pd.to_timedelta(val_df['hour'], unit='h')

# Residuals and diagnostics
residuals = y_val - y_pred_inv
plt.figure(figsize=(12, 6))
sns.histplot(residuals, kde=True, color='blue')
plt.title("Residuals Distribution")
plt.xlabel("Residuals")
plt.ylabel("Frequency")
plt.show()

# Predicted vs Actual
plt.figure(figsize=(12, 6))
plt.scatter(y_val, y_pred_inv, alpha=0.5, color='green')
plt.plot([y_val.min(), y_val.max()], [y_val.min(), y_val.max()], 'r--')
plt.xlabel('True values')
plt.ylabel('Predicted values')
plt.title('Predicted vs Actual')
plt.show()

# Time series plot with rolling average
plt.figure(figsize=(15, 8))
plt.plot(train_df['timestamp'], train_df[target], label='Actual (Train)', color='blue', alpha=0.5)
plt.plot(val_df['timestamp'], y_val, label='Actual (Validation)', color='blue', alpha=0.5)
plt.plot(val_df['timestamp'], y_pred_inv, label='Predicted (Validation)', color='orange', alpha=0.5)
plt.plot(train_df['timestamp'], train_df[target].rolling(window=24).mean(), label='Actual (Train, Rolling Avg)', color='blue', linewidth=2)
plt.plot(val_df['timestamp'], y_val.rolling(window=24).mean(), label='Actual (Validation, Rolling Avg)', color='blue', linewidth=2)
plt.plot(val_df['timestamp'], pd.Series(y_pred_inv).rolling(window=24).mean(), label='Predicted (Validation, Rolling Avg)', color='orange', linewidth=2)
plt.xlabel('Timestamp')
plt.ylabel('Count')
plt.title('Time Series of Actual vs Predicted Counts with Rolling Average')
plt.legend()
plt.show()


Index(['temp', 'atemp', 'hum', 'windspeed', 'year_sin', 'year_cos',
       'month_sin', 'month_cos', 'day_sin', 'day_cos', 'hour_sin', 'hour_cos',
       'moonphase', 'total_registered_ratio', 'hour_ratio', 'day_ratio',
       'week_ratio', 'month_ratio', 'season_ratio',
       'working_day_or_weekend_ratio', 'season', 'year', 'month', 'hour',
       'day_of_week', 'weathersit', 'day', 'week', 'quarter',
       'sigma_3_outlier', 'holiday', 'payday', 'working_day', 'weekend',
       'rush_hour'],
      dtype='object')