In [1]:
import pandas as pd
import numpy as np
import holidays
from typing import Tuple, List

import xgboost as xgb
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score


In [2]:
# Load data
train_df = pd.read_parquet('data/train.parquet')
test_df = pd.read_parquet('data/final_test.parquet')
weather_df = pd.read_csv('external_data/external_data.csv')


In [3]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 496827 entries, 48321 to 929187
Data columns (total 12 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   counter_id                 496827 non-null  category      
 1   counter_name               496827 non-null  category      
 2   site_id                    496827 non-null  int64         
 3   site_name                  496827 non-null  category      
 4   bike_count                 496827 non-null  float64       
 5   date                       496827 non-null  datetime64[us]
 6   counter_installation_date  496827 non-null  datetime64[us]
 7   coordinates                496827 non-null  category      
 8   counter_technical_id       496827 non-null  category      
 9   latitude                   496827 non-null  float64       
 10  longitude                  496827 non-null  float64       
 11  log_bike_count             496827 non-null  float64  

In [4]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51440 entries, 0 to 51439
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   counter_id                 51440 non-null  category      
 1   counter_name               51440 non-null  category      
 2   site_id                    51440 non-null  int64         
 3   site_name                  51440 non-null  category      
 4   date                       51440 non-null  datetime64[us]
 5   counter_installation_date  51440 non-null  datetime64[us]
 6   coordinates                51440 non-null  category      
 7   counter_technical_id       51440 non-null  category      
 8   latitude                   51440 non-null  float64       
 9   longitude                  51440 non-null  float64       
dtypes: category(5), datetime64[us](2), float64(2), int64(1)
memory usage: 2.2 MB


In [5]:
weather_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3322 entries, 0 to 3321
Data columns (total 59 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   numer_sta  3322 non-null   int64  
 1   date       3322 non-null   object 
 2   pmer       3322 non-null   int64  
 3   tend       3322 non-null   int64  
 4   cod_tend   3322 non-null   int64  
 5   dd         3322 non-null   int64  
 6   ff         3322 non-null   float64
 7   t          3322 non-null   float64
 8   td         3322 non-null   float64
 9   u          3322 non-null   int64  
 10  vv         3322 non-null   int64  
 11  ww         3322 non-null   int64  
 12  w1         3315 non-null   float64
 13  w2         3312 non-null   float64
 14  n          3166 non-null   float64
 15  nbas       3317 non-null   float64
 16  hbas       2869 non-null   float64
 17  cl         2909 non-null   float64
 18  cm         1941 non-null   float64
 19  ch         1678 non-null   float64
 20  pres    

In [6]:
# Preprocess data
if 'bike_count' in train_df.columns:
    train_df = train_df.drop(columns=['bike_count'])


In [7]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 496827 entries, 48321 to 929187
Data columns (total 11 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   counter_id                 496827 non-null  category      
 1   counter_name               496827 non-null  category      
 2   site_id                    496827 non-null  int64         
 3   site_name                  496827 non-null  category      
 4   date                       496827 non-null  datetime64[us]
 5   counter_installation_date  496827 non-null  datetime64[us]
 6   coordinates                496827 non-null  category      
 7   counter_technical_id       496827 non-null  category      
 8   latitude                   496827 non-null  float64       
 9   longitude                  496827 non-null  float64       
 10  log_bike_count             496827 non-null  float64       
dtypes: category(5), datetime64[us](2), float64(3), int64(

In [8]:
# Preprocess weather data
weather_df['date'] = pd.to_datetime(weather_df['date']).astype('datetime64[ns]')
weather_df.sort_values('date', inplace=True)

# Preprocess train data
train_df['date'] = pd.to_datetime(train_df['date']).astype('datetime64[ns]')
train_df.sort_values('date', inplace=True)

test_df['date'] = pd.to_datetime(test_df['date']).astype('datetime64[ns]')
test_df.sort_values('date', inplace=True)

merged_df_train = pd.merge(train_df, weather_df, on='date', how= 'inner')
merged_df_test = pd.merge(test_df, weather_df, on='date', how='inner')

print(merged_df_train.head(5))
print(merged_df_test.head(5))

            counter_id                       counter_name    site_id  \
0  100056329-104056329       Pont Charles De Gaulle NE-SO  100056329   
1  100056334-104056334               38 rue Turbigo SO-NE  100056334   
2  100047542-103047542  Face au 48 quai de la marne NE-SO  100047542   
3  100057329-103057329   Totem 85 quai d'Austerlitz SE-NO  100057329   
4  100047547-103047547           6 rue Julia Bartet SO-NE  100047547   

                     site_name                date counter_installation_date  \
0       Pont Charles De Gaulle 2020-09-01 03:00:00                2019-12-12   
1               38 rue Turbigo 2020-09-01 03:00:00                2019-12-10   
2  Face au 48 quai de la marne 2020-09-01 03:00:00                2018-11-29   
3   Totem 85 quai d'Austerlitz 2020-09-01 03:00:00                2020-02-18   
4           6 rue Julia Bartet 2020-09-01 03:00:00                2018-11-28   

        coordinates counter_technical_id  latitude  longitude  ...  hnuage1  \
0  48.8

In [9]:
merged_df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165368 entries, 0 to 165367
Data columns (total 69 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   counter_id                 165368 non-null  category      
 1   counter_name               165368 non-null  category      
 2   site_id                    165368 non-null  int64         
 3   site_name                  165368 non-null  category      
 4   date                       165368 non-null  datetime64[ns]
 5   counter_installation_date  165368 non-null  datetime64[us]
 6   coordinates                165368 non-null  category      
 7   counter_technical_id       165368 non-null  category      
 8   latitude                   165368 non-null  float64       
 9   longitude                  165368 non-null  float64       
 10  log_bike_count             165368 non-null  float64       
 11  numer_sta                  165368 non-null  int64   

In [10]:
merged_df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17148 entries, 0 to 17147
Data columns (total 68 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   counter_id                 17148 non-null  category      
 1   counter_name               17148 non-null  category      
 2   site_id                    17148 non-null  int64         
 3   site_name                  17148 non-null  category      
 4   date                       17148 non-null  datetime64[ns]
 5   counter_installation_date  17148 non-null  datetime64[us]
 6   coordinates                17148 non-null  category      
 7   counter_technical_id       17148 non-null  category      
 8   latitude                   17148 non-null  float64       
 9   longitude                  17148 non-null  float64       
 10  numer_sta                  17148 non-null  int64         
 11  pmer                       17148 non-null  int64         
 12  tend

In [11]:
print(type(merged_df_train))
print(merged_df_train.head())
print(type(merged_df_test))
print(merged_df_test.head())


<class 'pandas.core.frame.DataFrame'>
            counter_id                       counter_name    site_id  \
0  100056329-104056329       Pont Charles De Gaulle NE-SO  100056329   
1  100056334-104056334               38 rue Turbigo SO-NE  100056334   
2  100047542-103047542  Face au 48 quai de la marne NE-SO  100047542   
3  100057329-103057329   Totem 85 quai d'Austerlitz SE-NO  100057329   
4  100047547-103047547           6 rue Julia Bartet SO-NE  100047547   

                     site_name                date counter_installation_date  \
0       Pont Charles De Gaulle 2020-09-01 03:00:00                2019-12-12   
1               38 rue Turbigo 2020-09-01 03:00:00                2019-12-10   
2  Face au 48 quai de la marne 2020-09-01 03:00:00                2018-11-29   
3   Totem 85 quai d'Austerlitz 2020-09-01 03:00:00                2020-02-18   
4           6 rue Julia Bartet 2020-09-01 03:00:00                2018-11-28   

        coordinates counter_technical_id  latitu

In [12]:
print(merged_df_train.columns)
print(merged_df_test.columns)


Index(['counter_id', 'counter_name', 'site_id', 'site_name', 'date',
       'counter_installation_date', 'coordinates', 'counter_technical_id',
       'latitude', 'longitude', 'log_bike_count', 'numer_sta', 'pmer', 'tend',
       'cod_tend', 'dd', 'ff', 't', 'td', 'u', 'vv', 'ww', 'w1', 'w2', 'n',
       'nbas', 'hbas', 'cl', 'cm', 'ch', 'pres', 'niv_bar', 'geop', 'tend24',
       'tn12', 'tn24', 'tx12', 'tx24', 'tminsol', 'sw', 'tw', 'raf10',
       'rafper', 'per', 'etat_sol', 'ht_neige', 'ssfrai', 'perssfrai', 'rr1',
       'rr3', 'rr6', 'rr12', 'rr24', 'phenspe1', 'phenspe2', 'phenspe3',
       'phenspe4', 'nnuage1', 'ctype1', 'hnuage1', 'nnuage2', 'ctype2',
       'hnuage2', 'nnuage3', 'ctype3', 'hnuage3', 'nnuage4', 'ctype4',
       'hnuage4'],
      dtype='object')
Index(['counter_id', 'counter_name', 'site_id', 'site_name', 'date',
       'counter_installation_date', 'coordinates', 'counter_technical_id',
       'latitude', 'longitude', 'numer_sta', 'pmer', 'tend', 'cod_tend', 

In [13]:
# Drop irrelevant columns
irrelevant_columns = [
    'counter_id', 'counter_name', 'site_id', 'site_name', 'coordinates', 
    'counter_technical_id', 'latitude', 'longitude', 'counter_installation_date', 
    'numer_sta', 'niv_bar', 'geop', 'tn24', 'tx24', 'sw', 'tw', 'phenspe1', 'phenspe2', 'phenspe3', 'phenspe4'
]
merged_df_train.drop(columns=irrelevant_columns, inplace=True)
merged_df_test.drop(columns=irrelevant_columns, inplace=True)

print(type(merged_df_train))
print(type(merged_df_test))



<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


In [14]:
# Handle missing data
# Drop columns with more than 50% missing values
threshold = 0.5 * len(merged_df_train)
merged_df_train.dropna(axis=1, thresh=threshold, inplace=True)
#merged_df_test = merged_df_test.drop(columns=[col for col in merged_df_test.columns if col not in merged_df_train.columns], inplace=True)

In [15]:
print(merged_df_train.columns)
print(merged_df_test.columns)

Index(['date', 'log_bike_count', 'pmer', 'tend', 'cod_tend', 'dd', 'ff', 't',
       'td', 'u', 'vv', 'ww', 'w1', 'w2', 'n', 'nbas', 'hbas', 'cl', 'cm',
       'pres', 'tend24', 'raf10', 'rafper', 'per', 'etat_sol', 'ht_neige',
       'ssfrai', 'perssfrai', 'rr1', 'rr3', 'rr6', 'rr12', 'rr24', 'nnuage1',
       'ctype1', 'hnuage1', 'nnuage2', 'hnuage2'],
      dtype='object')
Index(['date', 'pmer', 'tend', 'cod_tend', 'dd', 'ff', 't', 'td', 'u', 'vv',
       'ww', 'w1', 'w2', 'n', 'nbas', 'hbas', 'cl', 'cm', 'ch', 'pres',
       'tend24', 'tn12', 'tx12', 'tminsol', 'raf10', 'rafper', 'per',
       'etat_sol', 'ht_neige', 'ssfrai', 'perssfrai', 'rr1', 'rr3', 'rr6',
       'rr12', 'rr24', 'nnuage1', 'ctype1', 'hnuage1', 'nnuage2', 'ctype2',
       'hnuage2', 'nnuage3', 'ctype3', 'hnuage3', 'nnuage4', 'ctype4',
       'hnuage4'],
      dtype='object')


In [16]:
merged_df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165368 entries, 0 to 165367
Data columns (total 38 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   date            165368 non-null  datetime64[ns]
 1   log_bike_count  165368 non-null  float64       
 2   pmer            165368 non-null  int64         
 3   tend            165368 non-null  int64         
 4   cod_tend        165368 non-null  int64         
 5   dd              165368 non-null  int64         
 6   ff              165368 non-null  float64       
 7   t               165368 non-null  float64       
 8   td              165368 non-null  float64       
 9   u               165368 non-null  int64         
 10  vv              165368 non-null  int64         
 11  ww              165368 non-null  int64         
 12  w1              164980 non-null  float64       
 13  w2              164816 non-null  float64       
 14  n               157746 non-null  flo

In [17]:
merged_df_test.drop(columns=[col for col in merged_df_test.columns if col not in merged_df_train.columns], inplace=True)

In [18]:
merged_df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17148 entries, 0 to 17147
Data columns (total 37 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   date       17148 non-null  datetime64[ns]
 1   pmer       17148 non-null  int64         
 2   tend       17148 non-null  int64         
 3   cod_tend   17148 non-null  int64         
 4   dd         17148 non-null  int64         
 5   ff         17148 non-null  float64       
 6   t          17148 non-null  float64       
 7   td         17148 non-null  float64       
 8   u          17148 non-null  int64         
 9   vv         17148 non-null  int64         
 10  ww         17148 non-null  int64         
 11  w1         17148 non-null  float64       
 12  w2         17148 non-null  float64       
 13  n          16165 non-null  float64       
 14  nbas       17148 non-null  float64       
 15  hbas       14825 non-null  float64       
 16  cl         15067 non-null  float64      

In [19]:
def add_additional_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add additional features like day of week, month, day of year, and holiday indicator.
    """
    # Convert 'date' column to datetime if not already in datetime format
    if not np.issubdtype(df['date'].dtype, np.datetime64):
        df['date'] = pd.to_datetime(df['date'])

    # Date-based features
    df['day_of_week'] = df['date'].dt.dayofweek
    df['month'] = df['date'].dt.month
    df['day_of_year'] = df['date'].dt.dayofyear

    # Holiday indicator feature (Example: US Holidays)
    us_holidays = holidays.US(years=2023)  # Adjust the year as needed
    df['is_holiday'] = df['date'].dt.date.isin(us_holidays.keys())

    
    return df

# Apply feature engineering to train and test data
merged_df_train = add_additional_features(merged_df_train)
merged_df_test = add_additional_features(merged_df_test)

print(merged_df_train.columns)
print(merged_df_test.columns)

Index(['date', 'log_bike_count', 'pmer', 'tend', 'cod_tend', 'dd', 'ff', 't',
       'td', 'u', 'vv', 'ww', 'w1', 'w2', 'n', 'nbas', 'hbas', 'cl', 'cm',
       'pres', 'tend24', 'raf10', 'rafper', 'per', 'etat_sol', 'ht_neige',
       'ssfrai', 'perssfrai', 'rr1', 'rr3', 'rr6', 'rr12', 'rr24', 'nnuage1',
       'ctype1', 'hnuage1', 'nnuage2', 'hnuage2', 'day_of_week', 'month',
       'day_of_year', 'is_holiday'],
      dtype='object')
Index(['date', 'pmer', 'tend', 'cod_tend', 'dd', 'ff', 't', 'td', 'u', 'vv',
       'ww', 'w1', 'w2', 'n', 'nbas', 'hbas', 'cl', 'cm', 'pres', 'tend24',
       'raf10', 'rafper', 'per', 'etat_sol', 'ht_neige', 'ssfrai', 'perssfrai',
       'rr1', 'rr3', 'rr6', 'rr12', 'rr24', 'nnuage1', 'ctype1', 'hnuage1',
       'nnuage2', 'hnuage2', 'day_of_week', 'month', 'day_of_year',
       'is_holiday'],
      dtype='object')


In [20]:
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import StackingRegressor, RandomForestRegressor
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import xgboost as xgb
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer


def build_optimized_pipeline(X: pd.DataFrame) -> Pipeline:
    """
    Build an optimized pipeline with advanced preprocessing and ensemble models.
    """
    # Separate features into numerical and categorical
    selected_features_numeric = X.select_dtypes(include=['float64', 'int64']).columns
    selected_features_categorical = X.select_dtypes(include=['object', 'category']).columns

    # Numeric transformer with scaling and imputation
    numeric_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),  # Handle missing values
        ('scaler', StandardScaler())
    ])

    # Categorical transformer with imputation and one-hot encoding
    categorical_transformer = Pipeline([
        ('imputer', KNNImputer(n_neighbors=5)),  # Handle missing values
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Preprocessor with column transformers
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, selected_features_numeric),
            ('cat', categorical_transformer, selected_features_categorical)
        ]
    )

    # Feature selection using mutual information regression
    feature_selection = SelectKBest(score_func=mutual_info_regression, k=10)

    # Stacked regressor with XGBoost and Random Forest
    stacked_model = StackingRegressor(
        estimators=[
            ('rf', RandomForestRegressor(n_estimators=100, random_state=42)),
            ('xgb', xgb.XGBRegressor(
                n_estimators=500,
                learning_rate=0.05,
                max_depth=4,
                random_state=42))
        ],
        final_estimator=xgb.XGBRegressor(
            n_estimators=100,
            learning_rate=0.1,
            max_depth=3,
            random_state=42)
    )

    # Final pipeline
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('feature_selection', feature_selection),
        ('regressor', stacked_model)
    ])
    
    return pipeline


def evaluate_pipeline(pipeline: Pipeline, X: pd.DataFrame, y: pd.Series) -> Pipeline:
    """
    Evaluate pipeline performance using various metrics.
    """
    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Fit the pipeline
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    # Metrics
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring='r2')

    # Output results
    print("Model Evaluation Metrics:")
    print(f"Root Mean Squared Error (RMSE): {np.sqrt(mse):.4f}")
    print(f"R-squared Score (R²): {r2:.4f}")
    print("Cross-validation R² scores:", cv_scores)
    print(f"Mean CV R² score: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

    return pipeline


def predict_test_data(pipeline: Pipeline, test_data: pd.DataFrame) -> pd.DataFrame:
    """
    Generate predictions for test data.
    """
    predictions = pipeline.predict(test_data)
    return pd.DataFrame({
        'Id': range(len(predictions)),
        'log_bike_count': predictions
    })



In [21]:
# Ensure target variable exists in training data
if 'log_bike_count' not in merged_df_train.columns:
    raise ValueError("'log_bike_count' is missing in the training data.")


In [22]:
# Separate features and target
target_col = 'log_bike_count'
X = merged_df_train.drop(columns=[target_col])
y = merged_df_train[target_col]

# Build pipeline
pipeline = build_optimized_pipeline(X)

# Evaluate pipeline
pipeline = evaluate_pipeline(pipeline, X, y)

Model Evaluation Metrics:
Root Mean Squared Error (RMSE): 0.9288
R-squared Score (R²): 0.6853
Cross-validation R² scores: [ 0.08115468 -0.05751698  0.11307078  0.20331312  0.16847639]
Mean CV R² score: 0.1017 (+/- 0.1804)


In [23]:
prin("f")

NameError: name 'prin' is not defined

In [None]:
numerical_features = merged_df_train.select_dtypes(include=['float64', 'int64']).columns
categorical_features = merged_df_train.select_dtypes(include=['category']).columns

In [None]:
def build_model_pipeline(X: pd.DataFrame) -> Pipeline:
    """
    Construct ML pipeline with preprocessing and XGBoost.
    """
    # Separate features into numerical and categorical
    selected_features_numeric = X.select_dtypes(include=['float64', 'int64']).columns
    selected_features_categorical = X.select_dtypes(include=['object', 'category']).columns

    # Numeric transformer with scaling and imputation
    numeric_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),  # Handle missing values
        ('scaler', StandardScaler())
    ])

    # Categorical transformer with imputation and one-hot encoding
    categorical_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),  # Handle missing values
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Preprocessor with column transformers
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, selected_features_numeric),
            ('cat', categorical_transformer, selected_features_categorical)
        ]
    )

    # Final pipeline
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('feature_selection', SelectKBest(score_func=f_regression, k=20)),
        ('regressor', xgb.XGBRegressor(
            n_estimators=500,
            learning_rate=0.05,
            max_depth=4,
            random_state=42
        ))
    ])
    return pipeline


def evaluate_model(pipeline: Pipeline, X: pd.DataFrame, y: pd.Series) -> Pipeline:
    """
    Evaluate model performance using various metrics.
    """
    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fit the pipeline
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    # Metrics
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring='r2')

    # Output results
    print("Model Evaluation Metrics:")
    print(f"Root Mean Squared Error (RMSE): {np.sqrt(mse):.4f}")
    print(f"R-squared Score (R²): {r2:.4f}")
    print("Cross-validation R² scores:", cv_scores)
    print(f"Mean CV R² score: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

    return pipeline


def predict(pipeline: Pipeline, test_data: pd.DataFrame) -> pd.DataFrame:
    """
    Generate predictions for test data.
    """
    predictions = pipeline.predict(test_data)
    return pd.DataFrame({
        'Id': range(len(predictions)),
        'log_bike_count': predictions
    })





In [None]:
# Separate features and target
target_col = 'log_bike_count'
X = merged_df_train.drop(columns=[target_col])
y = merged_df_train[target_col]

# Build pipeline
pipeline = build_model_pipeline(X)

# Evaluate pipeline
pipeline = evaluate_model(pipeline, X, y)


Model Evaluation Metrics:
Root Mean Squared Error (RMSE): 1.1840
R-squared Score (R²): 0.4899
Cross-validation R² scores: [ 0.19460499 -0.11880227  0.15578018  0.20220097  0.20338955]
Mean CV R² score: 0.1274 (+/- 0.2487)


In [None]:
# Align test data columns with training data
test_X = merged_df_test[X.columns]

In [None]:
# Generate submission file
submission = predict(pipeline, test_X)
submission.to_csv('submission.csv', index=False)
print("Submission file saved as 'submission.csv'.")

Submission file saved as 'submission.csv'.
