In [1]:
%pip install -U scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading scikit_learn-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.3/13.3 MB[0m [31m57.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.2.2
    Uninstalling scikit-learn-1.2.2:
      Successfully uninstalled scikit-learn-1.2.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
spopt 0.6.0 requires shapely>=2.0.1, but you have shapely 1.8.5.post1 which is incompatible.[0m[31m
[0mSuccessfully installed scikit-learn-1.5.0
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np

import lightgbm as lgb
from lightgbm import LGBMRegressor

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import root_mean_squared_log_error

In [3]:
# Read the competition's train and test CSV files into DataFrames
training_data_path = '../input/ml-competition-2024-for-ukrainians/train.csv'
test_data_path = '../input/ml-competition-2024-for-ukrainians/test.csv'

train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (training_data_path, test_data_path)
)

# Quick sanity check of the loaded row/column counts
print(f"train_df.shape: {train_df.shape}, test_df.shape: {test_df.shape}")

train_df.shape: (378428, 13), test_df.shape: (252286, 12)


In [4]:
# Function to apply label encoding to columns that contain strings
def label_encode(train_df, test_df):
    """Label-encode the string/categorical columns of the train and test frames.

    Every column of ``train_df`` whose dtype is object, a pandas string dtype,
    or categorical is cast to ``str`` and replaced by integer codes from a
    ``LabelEncoder``. When the column also exists in ``test_df`` the encoder
    is fitted on the values of both frames, so both share one code mapping.
    A column that exists only in ``train_df`` is encoded from the training
    values alone.

    The inputs are not modified; encoded copies are returned.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training data.
    test_df : pandas.DataFrame
        Test data.

    Returns
    -------
    tuple
        ``(train_df, test_df, label_encoders, cat_cols)`` where the frames are
        encoded copies, ``label_encoders`` maps column name to its fitted
        encoder, and ``cat_cols`` lists the encoded columns in column order.
    """
    # Work on copies so the caller's frames are left untouched.
    train_df = train_df.copy()
    test_df = test_df.copy()

    label_encoders = {}
    cat_cols = []
    for column in train_df.columns:
        dtype = train_df[column].dtype
        is_categorical = (
            pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)  # dedicated pandas string dtypes
            or isinstance(dtype, pd.CategoricalDtype)
        )
        if not is_categorical:
            continue

        print(f"LE: {column}")
        train_values = train_df[column].astype(str)
        # A column may exist only in the training frame; do not touch test_df then.
        in_test = column in test_df.columns
        test_values = test_df[column].astype(str) if in_test else None

        # Fit on both training and test data to ensure consistency
        fit_values = list(train_values)
        if in_test:
            fit_values += list(test_values)

        le = LabelEncoder()
        le.fit(fit_values)
        train_df[column] = le.transform(train_values)
        if in_test:
            test_df[column] = le.transform(test_values)

        label_encoders[column] = le
        cat_cols.append(column)
    return train_df, test_df, label_encoders, cat_cols

# Encode the string columns of both frames; the fitted encoders are discarded
encoded = label_encode(train_df, test_df)
train_df, test_df, _, cat_cols = encoded
print(train_df.shape, test_df.shape)

LE: Item_Identifier
LE: Item_Fat_Content
LE: Item_Type
LE: Outlet_Identifier
LE: Outlet_Size
LE: Outlet_Location_Type
LE: Outlet_Type
(378428, 13) (252286, 12)


In [5]:
# Extra columns to exclude from the model inputs (none at the moment)
drop_cols = []

# The row id and the target itself must never be used as predictors
non_feature_cols = ['id', 'Item_Outlet_Sales'] + drop_cols

target = train_df['Item_Outlet_Sales']
features = train_df.drop(columns=non_feature_cols)
test_features = test_df.drop(columns=['id'] + drop_cols)

print(f"features.shape: {features.shape}, test_features.shape: {test_features.shape}")
print(f"features.columns: {features.columns.values}")

features.shape: (378428, 11), test_features.shape: (252286, 11)
features.columns: ['Item_Identifier' 'Item_Weight' 'Item_Fat_Content' 'Item_Visibility'
 'Item_Type' 'Item_MRP' 'Outlet_Identifier' 'Outlet_Establishment_Year'
 'Outlet_Size' 'Outlet_Location_Type' 'Outlet_Type']


In [6]:
# 5-fold cross-validation of a LightGBM regressor trained on log1p(target).
# Each fold contributes 1/n_splits of the final test prediction and of the
# accumulated feature importances.
kf = KFold(n_splits=5, shuffle=True, random_state=0)
fold_scores = []
test_predictions = np.zeros(test_features.shape[0])
feature_importance_values = np.zeros(features.shape[1])

for fold_i, (train_index, val_index) in enumerate(kf.split(features, target)):
    print(f"Fold {fold_i+1}/{kf.n_splits}")

    X_train, X_val = features.iloc[train_index], features.iloc[val_index]

    # Keep the untransformed validation target for scoring, so the metric is not
    # computed on an expm1(log1p(.)) round trip of the true values.
    y_val_raw = target.iloc[val_index]

    # The model is fitted in log1p space: RMSE there corresponds to RMSLE on the
    # original scale, which is the metric reported below.
    y_train = np.log1p(target.iloc[train_index])
    y_val = np.log1p(y_val_raw)

    # Training the LightGBM regressor
    model = LGBMRegressor(
        max_depth=16,
        n_estimators=3000,
        learning_rate=0.05,
        random_state=0,
        verbose=-1
    )
    model.fit(X_train, y_train,
        eval_metric="rmse",
        eval_set=[(X_val, y_val)],
        callbacks=[lgb.early_stopping(200)]
    )

    # Public accessor for the early-stopping result (instead of the private
    # `_best_iteration` attribute).
    best_iteration = model.best_iteration_

    # Update feature importance
    feature_importance_values += model.feature_importances_ / kf.n_splits

    # Predicting and calculating score on the validation set
    y_pred = model.predict(X_val, num_iteration=best_iteration)
    y_pred = np.expm1(y_pred)
    # Negative log-space predictions map below zero after expm1; clip them
    # because the log-based metric requires non-negative values.
    y_pred = np.clip(y_pred, 0, None)
    score = root_mean_squared_log_error(y_val_raw, y_pred)
    fold_scores.append(score)
    print(f"LGBM RMSLE: {score}")

    # Making predictions on the test set
    p = model.predict(test_features, num_iteration=best_iteration)
    p = np.expm1(p)
    p = np.clip(p, 0, None)
    test_predictions += p

# Averaging predictions over all folds
test_predictions /= kf.n_splits

# Print average and std scores
average_score = np.mean(fold_scores)
std_score = np.std(fold_scores)
print(f"Average RMSLE: {average_score}, Std of RMSLE: {std_score}")

Fold 1/5
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[2196]	valid_0's rmse: 0.703793	valid_0's l2: 0.495325
LGBM RMSLE: 0.7037931148293433
Fold 2/5
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[2166]	valid_0's rmse: 0.708253	valid_0's l2: 0.501623
LGBM RMSLE: 0.7082531812546207
Fold 3/5
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[2179]	valid_0's rmse: 0.709045	valid_0's l2: 0.502745
LGBM RMSLE: 0.709045286206613
Fold 4/5
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[2065]	valid_0's rmse: 0.711159	valid_0's l2: 0.505747
LGBM RMSLE: 0.7111587729050091
Fold 5/5
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[2203]	valid_0's rmse: 0.703039	valid_0's l2: 0.494263
LGBM RMSLE: 0.7030387618845727
Average RMSLE: 0.7070578234160317, Std of

In [7]:
# Rank the features by their fold-averaged importance and keep at most the top 20
importance_table = pd.DataFrame({
    'feature': features.columns,
    'importance': feature_importance_values
})
feature_importances = (
    importance_table
    .sort_values(by='importance', ascending=False)
    .head(20)
)

print("Top 20 Important Features:")
print(feature_importances)

Top 20 Important Features:
                      feature  importance
5                    Item_MRP     14865.4
3             Item_Visibility     12764.4
1                 Item_Weight     11741.6
0             Item_Identifier     11251.0
4                   Item_Type      4665.0
6           Outlet_Identifier      3638.4
7   Outlet_Establishment_Year      2598.8
2            Item_Fat_Content      1417.6
10                Outlet_Type       672.6
8                 Outlet_Size       670.4
9        Outlet_Location_Type       568.8


In [8]:
# Build the submission table: one fold-averaged prediction per test id
submission_file_path = 'submission.csv'
submission = pd.DataFrame({
    'id': test_df['id'],
    'Item_Outlet_Sales': test_predictions
})

# Write the CSV without the DataFrame index, then preview the first rows
submission.to_csv(submission_file_path, index=False)
print(submission.head(10))

print("Submission file saved successfully.")

       id  Item_Outlet_Sales
0  378428        3031.709323
1  378429        2515.805016
2  378430        2116.692981
3  378431        1634.179334
4  378432        1404.451532
5  378433        2256.373321
6  378434        2557.690224
7  378435        2383.620247
8  378436        1604.576005
9  378437        2026.743839
Submission file saved successfully.
