In [1]:
%pip install -U scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading scikit_learn-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.3/13.3 MB[0m [31m79.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.2.2
    Uninstalling scikit-learn-1.2.2:
      Successfully uninstalled scikit-learn-1.2.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
spopt 0.6.0 requires shapely>=2.0.1, but you have shapely 1.8.5.post1 which is incompatible.[0m[31m
[0mSuccessfully installed scikit-learn-1.5.0
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np

import lightgbm as lgb
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import root_mean_squared_log_error
from sklearn.preprocessing import LabelEncoder

In [3]:
N_JOBS = 1

# Load the datasets

In [4]:
training_data_path = '../input/ml-competition-2024-for-ukrainians/train.csv'
test_data_path = '../input/ml-competition-2024-for-ukrainians/test.csv'
train_df = pd.read_csv(training_data_path)
test_df = pd.read_csv(test_data_path)

print(train_df.shape, test_df.shape)

(378428, 13) (252286, 12)


# Preprocess data

In [5]:
def process_visibility(train_df, test_df):
    train_df["Item_Visibility"] = train_df["Item_Visibility"].replace(0, np.nan)
    test_df["Item_Visibility"] = test_df["Item_Visibility"].replace(0, np.nan)

    train_df["Item_Visibility"] = np.log1p(train_df["Item_Visibility"])
    test_df["Item_Visibility"] = np.log1p(test_df["Item_Visibility"])

    mean_visibility = train_df["Item_Visibility"].mean()
    print(f"mean_visibility: {mean_visibility}")

    train_df["Item_Visibility"] = train_df["Item_Visibility"].fillna(mean_visibility)
    test_df["Item_Visibility"] = test_df["Item_Visibility"].fillna(mean_visibility)

    return train_df, test_df

train_df, test_df = process_visibility(train_df, test_df)

mean_visibility: 0.06264653074731183


In [6]:
train_df["Item_Outlet_Sales"] = np.log1p(train_df["Item_Outlet_Sales"].values)

In [7]:
target_mean = train_df["Item_Outlet_Sales"].mean()
target_std = train_df["Item_Outlet_Sales"].std()
print(f"target_mean: {target_mean}, target_std: {target_std}")

target_mean: 7.360180293004388, target_std: 0.8535593658882088


# Extract features

In [8]:
def clean_Item_Fat_Content(x):
    if x == "LF" or x == "low fat":
        return "Low Fat"
    elif x == "reg":
        return "Regular"
    else:
        return x
    
def mrp2class(v):
    b0 = 70
    b1 = 135
    b2 = 204
    if v < b0:
        return 0
    elif v >= b0 and v < b1:
        return 1
    elif v >= b1 and v < b2:
        return 2
    else:
        return 3

def extract_features(df):
    df["Item_Fat_Content"] = df["Item_Fat_Content"].apply(lambda x: clean_Item_Fat_Content(x))
    df["Item_MRP_class"] = df["Item_MRP"].apply(lambda x: mrp2class(x))

In [9]:
extract_features(train_df)
extract_features(test_df)

# Generate encodings

## Target encoding

In [10]:
def target_encode_cv(train_df, test_df, target_column, categorical_column, n_splits=5):
    """
    Encodes a categorical variable with the mean of the target variable using cross-validation to avoid target leakage.

    :param train_df: pandas DataFrame containing the training data
    :param test_df: pandas DataFrame containing the test data
    :param target_column: name of the target column in the training data
    :param categorical_column: name of the categorical column to be encoded
    :param n_splits: number of splits for cross-validation
    :return: Two DataFrames with the new encoded feature added to both train and test data
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=0)
    train_df[f'{categorical_column}_TE'] = np.nan
    
    # Create a temporary DataFrame for test encoding
    test_encoded = pd.DataFrame()
    
    for train_index, val_index in kf.split(train_df):
        X_train, X_val = train_df.iloc[train_index], train_df.iloc[val_index]
        
        # Compute the mean of the target for each category in the training fold
        category_target_mean = X_train.groupby(categorical_column)[target_column].mean()
        
        # Map the means to the validation fold
        train_df.loc[train_df.index[val_index], f'{categorical_column}_TE'] = \
            train_df.loc[train_df.index[val_index], categorical_column].map(category_target_mean)
        
        # Update test encoding by accumulating results from each fold
        fold_test_encoded = test_df[categorical_column].map(category_target_mean)
        test_encoded = pd.concat([test_encoded, fold_test_encoded], axis=1)
    
    # Compute the mean for test encoding over all folds
    test_df[f'{categorical_column}_TE'] = test_encoded.mean(axis=1)
    
    # Handle missing values in test set that were not present in training set
    test_df[f'{categorical_column}_TE'].fillna(train_df[target_column].mean(), inplace=True)
    
    return train_df, test_df

In [11]:
te_cols = []
cols_for_te = [
    "Item_Identifier",
    "Item_Type",
    "Outlet_Identifier",
    "Item_MRP",
    "Item_Weight",
    "Item_Visibility",
]
for c in cols_for_te:
    train_df, test_df = target_encode_cv(train_df, test_df, "Item_Outlet_Sales", c)
    te_cols.append(f"{c}_TE")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df[f'{categorical_column}_TE'].fillna(train_df[target_column].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df[f'{categorical_column}_TE'].fillna(train_df[target_column].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace m

## Frequency encoding

In [12]:
def frequency_encoding(train_df, test_df, categorical_columns):
    """
    Perform frequency encoding for the specified categorical columns in the train and test dataframes.

    Args:
    train_df (pd.DataFrame): The training dataframe containing categorical columns to encode.
    test_df (pd.DataFrame): The test dataframe containing categorical columns to encode.
    categorical_columns (list): A list of column names to apply frequency encoding on.

    Returns:
    pd.DataFrame, pd.DataFrame: The train and test dataframes with the specified categorical columns frequency encoded.
    """
    
    train_df_encoded = train_df.copy()  # Make a copy of the training dataframe to avoid modifying the original one
    test_df_encoded = test_df.copy()    # Make a copy of the test dataframe to avoid modifying the original one
    
    for column in categorical_columns:
        freq_map = train_df[column].value_counts().to_dict()  # Calculate the frequency of each category from the training data
        train_df_encoded[column + "_FE"] = train_df[column].map(freq_map)  # Map the frequencies to the original column values in the training data
        test_df_encoded[column + "_FE"] = test_df[column].map(freq_map).fillna(0)  # Map the frequencies to the original column values in the test data, filling NaN with 0
    
    return train_df_encoded, test_df_encoded

In [13]:
fe_cols = [
    "Item_Identifier",
    "Item_Type",
    "Outlet_Identifier",
    "Item_MRP",
    "Item_Weight",
    "Item_Visibility",
]
train_df, test_df = frequency_encoding(train_df, test_df, fe_cols)

## Interaction-based encoding

In [14]:
def interaction_based_encoding(train_df, test_df, categorical_column, numerical_column, target_column, n_splits=5, random_state=42):
    """
    Perform Interaction-Based Encoding for a specified categorical and numerical column using K-Fold cross-validation to avoid target leakage.

    :param train_df: pandas DataFrame containing the training data.
    :param test_df: pandas DataFrame containing the test data.
    :param categorical_column: The name of the categorical column to encode.
    :param numerical_column: The name of the numerical column to interact with.
    :param target_column: The name of the target column.
    :param n_splits: Number of splits for K-Fold cross-validation.
    :param random_state: Random state for reproducibility.
    :return: Tuple of encoded training and test DataFrames.
    """
    # Initialize the KFold cross-validator
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    
    # Initialize an empty array to hold the encoded values
    train_encoded = np.zeros(len(train_df))
    
    # Perform K-Fold cross-validation
    for train_index, val_index in kf.split(train_df):
        train_fold, val_fold = train_df.iloc[train_index], train_df.iloc[val_index]
        
        # Calculate the mean of the numerical column for each category in the training fold
        category_stats = train_fold.groupby(categorical_column)[numerical_column].mean()
        
        # Create a dictionary for quick lookup
        category_mean_dict = category_stats.to_dict()
        
        # Encode the validation fold using the training fold's category stats
        val_fold_encoded = val_fold[categorical_column].map(category_mean_dict).fillna(train_df[numerical_column].mean())
        train_encoded[val_index] = val_fold_encoded
    
    # Assign the encoded values back to the training DataFrame
    train_df[categorical_column + '_IE_' + numerical_column] = train_encoded
    
    # Calculate the category statistics for the entire training set to encode the test set
    category_stats = train_df.groupby(categorical_column)[numerical_column].mean()
    category_mean_dict = category_stats.to_dict()
    
    # Encode the test data (handling unknown categories by assigning them the global mean of the numerical column)
    test_encoded = test_df[categorical_column].map(category_mean_dict).fillna(train_df[numerical_column].mean())
    test_df[categorical_column + '_IE_' + numerical_column] = test_encoded
    
    return train_df, test_df

In [15]:
ie_col_pairs = [
    ("Item_Identifier", "Item_MRP"),
    ("Item_Identifier", "Item_Weight"),
    ("Item_Identifier", "Item_Visibility"),
    ("Item_Type", "Item_MRP"),
    ("Item_Type", "Item_Weight"),
    ("Item_Type", "Item_Visibility"),
    ("Outlet_Identifier", "Item_MRP"),
    ("Outlet_Identifier", "Item_Weight"),
    ("Outlet_Identifier", "Item_Visibility"),
]
ie_cols = []
for cat_c, num_c in ie_col_pairs:
    train_df, test_df = interaction_based_encoding(train_df, test_df, cat_c, num_c, "Item_Outlet_Sales")
    ie_cols.append(cat_c + "_IE_" + num_c)

# Label encoding

In [16]:
def label_encode(train_df, test_df):
    label_encoders = {}
    cat_cols = []
    for column in train_df.columns:
        if train_df[column].dtype == 'object' or train_df[column].dtype == 'category':
            print(f"LE: {column}")
            le = LabelEncoder()
            # Fit on both training and test data to ensure consistency
            le.fit(list(train_df[column].astype(str)) + list(test_df[column].astype(str)))
            train_df[column] = le.transform(train_df[column].astype(str))
            test_df[column] = le.transform(test_df[column].astype(str))
            label_encoders[column] = le
            cat_cols.append(column)
    return train_df, test_df, label_encoders, cat_cols

# Apply label encoding
train_df, test_df, _, cat_cols = label_encode(train_df, test_df)
print(train_df.shape, test_df.shape)

LE: Item_Identifier
LE: Item_Fat_Content
LE: Item_Type
LE: Outlet_Identifier
LE: Outlet_Size
LE: Outlet_Location_Type
LE: Outlet_Type
(378428, 35) (252286, 34)


# Prepare data for tree-based models

In [17]:
tree_drop_cols=[
    "id", # ID
    "Item_Outlet_Sales", # TARGET
    "Outlet_Establishment_Year",
    "Item_Fat_Content",
    "Outlet_Size",
    "Outlet_Location_Type",
    "Outlet_Type",
    "Item_MRP_class",
]

tree_train_features = train_df.drop(columns=tree_drop_cols, errors="ignore")
target = train_df['Item_Outlet_Sales'].values

# Handling missing values
features_mean = tree_train_features.mean()
tree_train_features = tree_train_features.fillna(features_mean)
tree_test_features = test_df.drop(columns=tree_drop_cols, errors="ignore").fillna(features_mean)

print(f"tree_train_features.shape: {tree_train_features.shape}, tree_test_features.shape: {tree_test_features.shape}")
print(f"tree_train_features.columns: {tree_train_features.columns.values}")

tree_train_features.shape: (378428, 27), tree_test_features.shape: (252286, 27)
tree_train_features.columns: ['Item_Identifier' 'Item_Weight' 'Item_Visibility' 'Item_Type' 'Item_MRP'
 'Outlet_Identifier' 'Item_Identifier_TE' 'Item_Type_TE'
 'Outlet_Identifier_TE' 'Item_MRP_TE' 'Item_Weight_TE'
 'Item_Visibility_TE' 'Item_Identifier_FE' 'Item_Type_FE'
 'Outlet_Identifier_FE' 'Item_MRP_FE' 'Item_Weight_FE'
 'Item_Visibility_FE' 'Item_Identifier_IE_Item_MRP'
 'Item_Identifier_IE_Item_Weight' 'Item_Identifier_IE_Item_Visibility'
 'Item_Type_IE_Item_MRP' 'Item_Type_IE_Item_Weight'
 'Item_Type_IE_Item_Visibility' 'Outlet_Identifier_IE_Item_MRP'
 'Outlet_Identifier_IE_Item_Weight' 'Outlet_Identifier_IE_Item_Visibility']


# Prepare data for linear models

In [18]:
lin_drop_cols=[
    "id", # ID
    "Item_Outlet_Sales", # TARGET
    "Outlet_Establishment_Year",
    "Item_Fat_Content",
    "Outlet_Size",
    "Outlet_Location_Type",
    "Outlet_Type",
]

# Prepare datasets for linear models
lin_train_features = train_df.drop(columns=lin_drop_cols, errors="ignore")
target = train_df['Item_Outlet_Sales']

# Handling missing values
features_mean = lin_train_features.mean()
lin_train_features = lin_train_features.fillna(features_mean)
lin_test_features = test_df.drop(columns=lin_drop_cols, errors="ignore").fillna(features_mean)

ohe_cols = ["Item_Identifier", "Item_Type", "Outlet_Identifier", "Item_MRP_class"]
lin_train_features = pd.get_dummies(lin_train_features, dummy_na=True, columns=ohe_cols)
lin_test_features = pd.get_dummies(lin_test_features, dummy_na=True, columns=ohe_cols)

ss_cols = ["Item_Weight", "Item_Visibility", "Item_MRP"] + \
            te_cols + [v + "_FE" for v in fe_cols] + ie_cols

print(f"StandardScaler cols: {ss_cols}")
for c in ss_cols:
    ss = StandardScaler()
    lin_train_features[c] = ss.fit_transform(lin_train_features[c].values.reshape(-1, 1))
    lin_test_features[c] = ss.transform(lin_test_features[c].values.reshape(-1, 1))

print(f"lin_train_features.shape: {lin_train_features.shape}, lin_test_features.shape: {lin_test_features.shape}")

StandardScaler cols: ['Item_Weight', 'Item_Visibility', 'Item_MRP', 'Item_Identifier_TE', 'Item_Type_TE', 'Outlet_Identifier_TE', 'Item_MRP_TE', 'Item_Weight_TE', 'Item_Visibility_TE', 'Item_Identifier_FE', 'Item_Type_FE', 'Outlet_Identifier_FE', 'Item_MRP_FE', 'Item_Weight_FE', 'Item_Visibility_FE', 'Item_Identifier_IE_Item_MRP', 'Item_Identifier_IE_Item_Weight', 'Item_Identifier_IE_Item_Visibility', 'Item_Type_IE_Item_MRP', 'Item_Type_IE_Item_Weight', 'Item_Type_IE_Item_Visibility', 'Outlet_Identifier_IE_Item_MRP', 'Outlet_Identifier_IE_Item_Weight', 'Outlet_Identifier_IE_Item_Visibility']
lin_train_features.shape: (378428, 1617), lin_test_features.shape: (252286, 1617)


# Train models

In [19]:
%%time
kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold_scores = []
test_predictions = []
feature_importance_values = np.zeros(tree_train_features.shape[1])

ens_w_buf, ens_b_buf = [], []

valid_x_buf = []
valid_y_buf = []

# Cross-validation loop
for fold_i, (train_index, valid_index) in enumerate(kf.split(tree_train_features, target)):
    print(f"Fold {fold_i+1}/{kf.n_splits}")
    
    X_train, X_valid = tree_train_features.iloc[train_index], tree_train_features.iloc[valid_index]
    X_train_lin, X_valid_lin = lin_train_features.iloc[train_index], lin_train_features.iloc[valid_index]

    y_train, y_valid = target.iloc[train_index], target.iloc[valid_index]

    ens_p = []
    test_p = []

    # Training the LightGBM regressor
    model = LGBMRegressor(
        max_depth=26,
        n_estimators=10000,
        learning_rate=0.0021993020095881746,
        num_leaves=217,
        feature_fraction=0.6039737774032918,
        bagging_fraction=0.8990670527363338,
        min_split_gain=0.2865758698524692,
        reg_alpha=0.20081754747685004,
        reg_lambda=0.5795389842147277,
        random_state=0,
        n_jobs=N_JOBS,
        verbose=-1
    )
    model.fit(X_train, y_train,
        eval_metric="rmse",
        eval_set=[(X_valid, y_valid)],
        callbacks=[lgb.early_stopping(200)]
    )

    # Update feature importance
    feature_importance_values += model.feature_importances_ / kf.n_splits

    # Predicting and calculating score on the validation set
    y_pred = model.predict(X_valid, num_iteration=model._best_iteration)
    ens_p.append(y_pred)
    y_pred = np.expm1(y_pred)
    y_pred = np.clip(y_pred, 0, None)

    score = root_mean_squared_log_error(np.expm1(y_valid), y_pred)
    print(f"LGBM RMSLE: {score}")

    # Making predictions on the test set
    p = model.predict(tree_test_features, num_iteration=model._best_iteration)
    test_p.append(p)

    # Training the XGB regressor
    model = XGBRegressor(
        booster="gbtree",
        tree_method="hist",
        n_estimators=10000,
        learning_rate=0.010124129393208042,
        max_leaves=244,
        subsample=0.9179406742270393,
        colsample_bytree=0.7376895006776901,
        reg_lambda=0.2913954318201704,
        alpha=0.6967736207395685,
        max_depth=13,
        random_state=0,
        n_jobs=N_JOBS,
        early_stopping_rounds=200
    )

    model.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        verbose=1000,
    )

    # Predicting and calculating score on the validation set
    y_pred = model.predict(X_valid, iteration_range=(0, model.best_iteration))
    ens_p.append(y_pred)
    y_pred = np.expm1(y_pred)
    y_pred = np.clip(y_pred, 0, None)
    score = root_mean_squared_log_error(np.expm1(y_valid), y_pred)
    print(f"XGB RMSLE: {score}")

    # Making predictions on the test set
    p = model.predict(tree_test_features, iteration_range=(0, model.best_iteration))
    test_p.append(p)

    # HistGradientBoostingRegressor
    model = HistGradientBoostingRegressor(
        max_iter=10000,
        max_depth=25,
        learning_rate=0.010820368051329765,
        max_leaf_nodes=68,
        max_features=0.6429655166020577,
        l2_regularization=1.487687838498507,
        early_stopping=True,
        validation_fraction=0.1,
        n_iter_no_change=200,
        random_state=0,
    )

    model.fit(
        X_train,
        y_train
    )

    # Predicting and calculating score on the validation set
    y_pred = model.predict(X_valid)
    ens_p.append(y_pred)
    y_pred = np.expm1(y_pred)
    y_pred = np.clip(y_pred, 0, None)
    score = root_mean_squared_log_error(np.expm1(y_valid), y_pred)
    print(f"HGBR RMSLE: {score}")

    # Making predictions on the test set
    p = model.predict(tree_test_features)
    test_p.append(p)

    # LR
    model = LinearRegression( 
        n_jobs=N_JOBS
    )

    model.fit(
        X_train_lin,
        y_train
    )

    # Predicting and calculating score on the validation set
    y_pred = model.predict(X_valid_lin)
    ens_p.append(y_pred)
    y_pred = np.expm1(y_pred)
    y_pred = np.clip(y_pred, 0, None)
    score = root_mean_squared_log_error(np.expm1(y_valid), y_pred)
    print(f"LR RMSLE: {score}")

    # Making predictions on the test set
    p = model.predict(lin_test_features)
    test_p.append(p)

    # RandomForestRegressor
    model = RandomForestRegressor(
        max_depth=15,
        max_features=0.6462408415412457,
        max_samples=0.7813502358248877,
        n_estimators=358,
        random_state=0,
        n_jobs=N_JOBS
    )

    model.fit(
        X_train,
        y_train
    )

    # Predicting and calculating score on the validation set
    y_pred = model.predict(X_valid)
    ens_p.append(y_pred)
    y_pred = np.expm1(y_pred)
    y_pred = np.clip(y_pred, 0, None)
    score = root_mean_squared_log_error(np.expm1(y_valid), y_pred)
    print(f"RF RMSLE: {score}")

    # Making predictions on the test set
    p = model.predict(tree_test_features)
    test_p.append(p)

    # Evaluate ensemble
    ens_preds = np.mean(ens_p, axis=0)
    ens_preds = np.expm1(ens_preds)
    ens_preds = np.clip(ens_preds, 0, None)
    score = root_mean_squared_log_error(np.expm1(y_valid), ens_preds)
    print(f"ENS RMSLE: {score}")

    X_ens = np.array(ens_p).transpose()
    ens_model = LinearRegression(n_jobs=N_JOBS)
    ens_model.fit(X_ens, y_valid)
    ens_w = ens_model.coef_
    ens_b = ens_model.intercept_
    print(f"ENS weights: {ens_w}, bias: {ens_b}")
    ens_w_buf.append(ens_w)
    ens_b_buf.append(ens_b)

    ens_preds = ens_model.predict(X_ens)
    ens_preds = np.expm1(ens_preds)
    ens_preds = np.clip(ens_preds, 0, None)
    score = root_mean_squared_log_error(np.expm1(y_valid), ens_preds)
    print(f"LR ENS RMSLE: {score}")
    fold_scores.append(score)

    valid_x_buf.append(X_ens)
    valid_y_buf.append(y_valid)

    test_predictions.append(test_p)

Fold 1/5
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[8625]	valid_0's rmse: 0.693247	valid_0's l2: 0.480591
LGBM RMSLE: 0.693247018826365
[0]	validation_0-rmse:0.84844
[1000]	validation_0-rmse:0.69388
[2000]	validation_0-rmse:0.69347
[2144]	validation_0-rmse:0.69348
XGB RMSLE: 0.6934479230552235
HGBR RMSLE: 0.6947788394344074
LR RMSLE: 0.6999175529399382
RF RMSLE: 0.6960316723849539
ENS RMSLE: 0.6922866308145504
ENS weights: [ 0.51133622  0.3861403  -0.16600238  0.2957164  -0.00523254], bias: -0.15891990366682318
LR ENS RMSLE: 0.69169530491441
Fold 2/5
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[8454]	valid_0's rmse: 0.693683	valid_0's l2: 0.481197
LGBM RMSLE: 0.6936834658800312
[0]	validation_0-rmse:0.85010
[1000]	validation_0-rmse:0.69441
[2000]	validation_0-rmse:0.69407
[2280]	validation_0-rmse:0.69404
XGB RMSLE: 0.6940297799728722
HGBR RMSLE: 0.6950693396969411
LR RMSLE: 0.701

In [20]:
average_score = np.mean(fold_scores)
std_score = np.std(fold_scores)
print(f"Average RMSLE: {average_score}, Std of RMSLE: {std_score}")

Average RMSLE: 0.6940644404472316, Std of RMSLE: 0.0022774572798067183


# Combine predictions with Linear Regression model

In [21]:
print("Training ensembling LR model")
ens_model = LinearRegression(n_jobs=N_JOBS)
valid_x_buf = np.concatenate(valid_x_buf, axis=0)
valid_y_buf = np.concatenate(valid_y_buf, axis=0)
print(f"valid_x_buf.shape: {valid_x_buf.shape}, valid_y_buf.shape: {valid_y_buf.shape}")
ens_model.fit(valid_x_buf, valid_y_buf)
print(f"ENS Model weights: {ens_model.coef_}, bias: {ens_model.intercept_}")

valid_p = ens_model.predict(valid_x_buf)
valid_p = np.expm1(valid_p)
valid_p = np.clip(valid_p, 0, None)
average_score = root_mean_squared_log_error(np.expm1(valid_y_buf), valid_p)
print(f"LR ENS RMSLE: {average_score}")


for i, tp in enumerate(test_predictions):
    tp = np.array(tp).transpose()
    print(f"Test {i} X shape: {tp.shape}")
    tp = ens_model.predict(tp)
    print(f"Test {i} preds shape: {tp.shape}")
    test_predictions[i] = tp
test_predictions = np.mean(test_predictions, axis=0)
test_predictions = np.expm1(test_predictions)
test_predictions = np.clip(test_predictions, 0, None)
print(f"test_predictions.shape: {test_predictions.shape}")

Training ensembling LR model
valid_x_buf.shape: (378428, 5), valid_y_buf.shape: (378428,)
ENS Model weights: [ 0.54318574  0.33390037 -0.13402332  0.28448649 -0.01164513], bias: -0.11687988031950436
LR ENS RMSLE: 0.6940962480552789
Test 0 X shape: (252286, 5)
Test 0 preds shape: (252286,)
Test 1 X shape: (252286, 5)
Test 1 preds shape: (252286,)
Test 2 X shape: (252286, 5)
Test 2 preds shape: (252286,)
Test 3 X shape: (252286, 5)
Test 3 preds shape: (252286,)
Test 4 X shape: (252286, 5)
Test 4 preds shape: (252286,)
test_predictions.shape: (252286,)


# Identify top K important features

In [22]:
feature_importances = pd.DataFrame({
    'feature': tree_train_features.columns,
    'importance': feature_importance_values
}).sort_values(by='importance', ascending=False).head(40)

print("Top Important Features:")
print(feature_importances)

Top Important Features:
                                 feature  importance
9                            Item_MRP_TE    120044.4
15                           Item_MRP_FE    109686.2
2                        Item_Visibility    105434.6
4                               Item_MRP    105136.0
11                    Item_Visibility_TE    100206.4
10                        Item_Weight_TE     94601.8
6                     Item_Identifier_TE     91258.0
18           Item_Identifier_IE_Item_MRP     90031.4
20    Item_Identifier_IE_Item_Visibility     86034.6
19        Item_Identifier_IE_Item_Weight     84884.4
1                            Item_Weight     80452.4
17                    Item_Visibility_FE     76003.4
12                    Item_Identifier_FE     71442.8
16                        Item_Weight_FE     70657.6
0                        Item_Identifier     69486.8
8                   Outlet_Identifier_TE     66436.8
26  Outlet_Identifier_IE_Item_Visibility     52568.0
22              Item_T

# Prepare the submission file with predictions

In [23]:
submission = pd.DataFrame({
    'id': test_df['id'],
    'Item_Outlet_Sales': test_predictions
})

# Save the submission file
submission_file_path = 'submission.csv'
submission.to_csv(submission_file_path, index=False)
print(submission.head(10))

print("Submission file saved successfully.")

       id  Item_Outlet_Sales
0  378428        3008.187088
1  378429        2733.251288
2  378430        2212.245846
3  378431        1715.024614
4  378432        1508.334617
5  378433        1960.262250
6  378434        2333.657808
7  378435        2349.407367
8  378436        1605.060827
9  378437        2111.172706
Submission file saved successfully.
