id

- **Description**: A unique identifier for each row.
- **Relevance**: Not useful for prediction. It serves only as an identifier.

Item_Identifier

- **Description**: A unique identifier for each product.
- **Relevance**: Generally not directly useful for prediction. However, it can be used for aggregating statistics like mean sales per item.

Item_Weight

- **Description**: The weight of the product.
- **Relevance**: Potentially useful for prediction as it can impact transportation costs and consumer preference.

Item_Fat_Content

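- **Description**: Describes whether the product is low fat or regular. The raw labels are spelled inconsistently (e.g., "LF", "low fat", "reg") and are normalized to "Low Fat" / "Regular" before modelling.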
- **Relevance**: Useful for prediction as it can influence consumer choices and sales.

Item_Visibility

- **Description**: The percentage of total display area of all products in a store allocated to this particular product.
- **Relevance**: Highly relevant for prediction. Higher visibility can lead to higher sales.

Item_Type

- **Description**: The category to which the product belongs.
- **Relevance**: Highly relevant for prediction as different categories may have different sales patterns.

Item_MRP

- **Description**: Maximum Retail Price (list price) of the product.
- **Relevance**: Highly relevant for prediction as price is a major factor influencing sales.

Outlet_Identifier

- **Description**: A unique identifier for the store.
- **Relevance**: Useful for prediction to capture store-specific effects, such as location and size.

Outlet_Establishment_Year

- **Description**: The year the store was established.
- **Relevance**: Potentially relevant for prediction. Older stores might have more established customer bases or different sales trends.

Outlet_Size

- **Description**: The size of the store, recorded as a categorical size class (the code below one-hot encodes it) rather than as a square-footage measurement.
- **Relevance**: Highly relevant for prediction as store size can influence the number of products displayed and sales.

Outlet_Location_Type

- **Description**: The type of city in which the store is located (e.g., Tier 1, Tier 2).
- **Relevance**: Highly relevant for prediction. Location type can impact foot traffic and sales.

Outlet_Type

- **Description**: The type of store (e.g., Grocery Store, Supermarket Type1).
- **Relevance**: Highly relevant for prediction as different types of stores may have different customer bases and sales patterns.

Item_Outlet_Sales

- **Description**: The sales of the product in a particular store.
- **Relevance**: This is the target variable to predict, not an input feature. The models below are fitted on its `log1p` transform.


In [None]:
from typing import Any, Callable, Dict, List, Tuple
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    OneHotEncoder,
    StandardScaler,
    MinMaxScaler,
    RobustScaler,
    Normalizer,
)
from xgboost import XGBRegressor

In [None]:
class FatContentMapper(BaseEstimator, TransformerMixin):
    """Normalize the spelling variants found in ``Item_Fat_Content``.

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    returns a copy of the input frame in which the known aliases
    ("low fat", "LF", "reg") are replaced by their canonical labels.
    """

    def __init__(self):
        # Alias -> canonical label; values not listed here are left untouched.
        self.mapping = {"low fat": "Low Fat", "LF": "Low Fat", "reg": "Regular"}

    def fit(self, X, y=None):
        """Do nothing; present for scikit-learn pipeline compatibility."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``Item_Fat_Content`` aliases replaced."""
        normalized = X["Item_Fat_Content"].replace(self.mapping)
        result = X.copy()
        result["Item_Fat_Content"] = normalized
        return result


class ItemIdentifierFrequencyMapper(BaseEstimator, TransformerMixin):
    """Replace each ``Item_Identifier`` with its occurrence count seen in ``fit``.

    ``fit`` stores ``value_counts()`` of the identifier column in
    ``item_identifier_counts``; ``transform`` maps every identifier to that
    count on a copy of the input frame.
    """

    def fit(self, X, y=None):
        """Count how often each identifier occurs in ``X``."""
        self.item_identifier_counts = X["Item_Identifier"].value_counts()
        return self

    def transform(self, X):
        """Return a copy of ``X`` with identifiers replaced by fitted counts.

        Identifiers that were not present at fit time (and missing values)
        have no entry in the fitted counts; they are given a count of 0
        instead of NaN so the column stays integer-typed on every input.
        """
        X = X.copy()
        counts = X["Item_Identifier"].map(self.item_identifier_counts)
        X["Item_Identifier"] = counts.fillna(0).astype("int64")
        return X


class Log1pTransformer(BaseEstimator, TransformerMixin):
    """Stateless ``log(1 + x)`` transform with its exact inverse."""

    def fit(self, X, y=None):
        """Do nothing; present for scikit-learn pipeline compatibility."""
        return self

    def transform(self, X):
        """Apply ``numpy.log1p`` element-wise to ``X``."""
        logged = np.log1p(X)
        return logged

    def inverse_transform(self, X):
        """Undo :meth:`transform` by applying ``numpy.expm1`` element-wise."""
        restored = np.expm1(X)
        return restored


def perturb_data(df, n_samples=1000, noise_level=0.05):
    """
    Generate new samples by adding Gaussian noise to the original data.

    Rows are drawn from ``df`` uniformly at random with replacement. Every
    numeric column of each drawn row then receives zero-mean Gaussian noise
    whose standard deviation is ``noise_level`` times that column's standard
    deviation in ``df``. Non-numeric columns are copied unchanged and the
    drawn rows keep their original index labels.

    Parameters:
    df (DataFrame): The original high-sales data.
    n_samples (int): Number of new samples to generate.
    noise_level (float): Noise standard deviation, as a fraction of each
        numeric column's standard deviation.

    Returns:
    DataFrame: A DataFrame with the new samples. When ``n_samples`` is not
    positive, an empty frame with the same columns as ``df``.
    """
    if n_samples <= 0:
        return df.iloc[:0].copy()

    # One vectorized draw replaces n_samples separate single-row draws.
    sampled = df.sample(n=n_samples, replace=True)

    for col in df.select_dtypes(include=[np.number]).columns:
        scale = noise_level * df[col].std()
        # std() is NaN when df has a single row; without this guard the
        # noise, and therefore the whole column, would become NaN.
        if pd.isna(scale):
            scale = 0.0
        sampled[col] = sampled[col] + np.random.normal(0.0, scale, size=n_samples)

    return sampled


def augment_data(df, n_samples=1000):
    """
    Augment data by creating new samples with slight modifications.

    Rows are drawn from ``df`` uniformly at random with replacement. Every
    numeric value of each drawn row is then multiplied by its own factor
    drawn uniformly from [0.9, 1.1). Non-numeric columns are copied unchanged
    and the drawn rows keep their original index labels.

    Parameters:
    df (DataFrame): The original high-sales data.
    n_samples (int): Number of new samples to generate.

    Returns:
    DataFrame: A DataFrame with the augmented samples. When ``n_samples`` is
    not positive, an empty frame with the same columns as ``df``.
    """
    if n_samples <= 0:
        return df.iloc[:0].copy()

    # One vectorized draw replaces n_samples separate single-row draws.
    augmented = df.sample(n=n_samples, replace=True)

    # An independent scaling factor per cell, generated one column at a time.
    for col in df.select_dtypes(include=[np.number]).columns:
        augmented[col] = augmented[col] * np.random.uniform(0.9, 1.1, size=n_samples)

    return augmented


def combine_with_synth_data(og_df: pd.DataFrame) -> pd.DataFrame:
    """Append synthetic high-sales rows to ``og_df`` and return the result.

    For each sales quantile threshold, the rows of ``og_df`` whose
    ``Item_Outlet_Sales`` exceed it are used as seeds for ``perturb_data``
    and ``augment_data``. Each generator produces ``coef * len(seed rows)``
    samples. The thresholds are nested, so the highest-selling rows are
    seeds at several levels.

    Parameters:
    og_df (DataFrame): Original data; must contain ``Item_Outlet_Sales``.

    Returns:
    DataFrame: ``og_df`` followed by all generated rows, with a fresh
    0..n-1 index. ``og_df`` itself is not modified.
    """
    og_sales = og_df["Item_Outlet_Sales"]
    # (sales quantile threshold, samples generated per selected row)
    quantiles_with_coefs = [(0.90, 0.5), (0.99, 2.0), (0.999, 4.0)]

    frames = [og_df]
    for quantile, coef in quantiles_with_coefs:
        selected_rows = og_df[og_sales > og_sales.quantile(quantile)]
        n_samples = int(coef * len(selected_rows))
        print("Number of selected rows to generate new data from: ", len(selected_rows))
        print("Number of samples to generate per synth: ", n_samples)
        frames.append(
            perturb_data(selected_rows, n_samples=n_samples, noise_level=0.05)
        )
        frames.append(augment_data(selected_rows, n_samples=n_samples))

    # The original built the combined frame but never returned it.
    return pd.concat(frames, ignore_index=True)


def drop_n_largest(df, column, n):
    """Return ``df`` without the ``n`` rows holding the largest ``column`` values.

    The input frame is left unchanged; rows are removed by index label.
    """
    return df.drop(df.nlargest(n, column).index)


def check_consistency(df):
    """Count distinct attribute values per outlet and per item.

    Returns a pair ``(outlet_inconsistencies, item_inconsistencies)``. Each is
    a DataFrame indexed by the identifier whose cells hold the number of
    distinct non-null values observed for that attribute. A count above 1
    means the attribute is not constant for that identifier.
    """

    def distinct_counts(key, columns):
        # nunique() per group: one row per identifier, one column per attribute.
        return df.groupby(key)[columns].nunique()

    outlet_inconsistencies = distinct_counts(
        "Outlet_Identifier",
        [
            "Outlet_Establishment_Year",
            "Outlet_Size",
            "Outlet_Location_Type",
            "Outlet_Type",
        ],
    )
    item_inconsistencies = distinct_counts(
        "Item_Identifier",
        ["Item_Weight", "Item_Fat_Content", "Item_Type"],
    )

    return outlet_inconsistencies, item_inconsistencies


def print_category_info(df: pd.DataFrame, categorical_cols: List[str]):
    """Print the distinct values of each categorical column and the encoded width.

    For every column in ``categorical_cols`` this prints the column name, its
    unique values and how many there are. It then prints the shape that the
    selected columns take after one-hot encoding.

    Parameters:
    df (DataFrame): Data containing the categorical columns.
    categorical_cols (List[str]): Names of the columns to report on.
    """
    print(f"# of categories: {len(categorical_cols)}")
    for coll_name in categorical_cols:
        values = df[coll_name].unique()
        print(f"{coll_name} {values} {len(values)}")

    ohencoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
    # The shape used to be computed and discarded, which only displays a value
    # as the last expression of a notebook cell, never inside a function.
    encoded_shape = ohencoder.fit_transform(df[categorical_cols]).shape
    print(f"One-hot encoded shape: {encoded_shape}")


def category_bar_plot(category_counts):
    """Show a bar chart with one bar per category.

    ``category_counts`` is expected to expose ``.index`` (category labels)
    and ``.values`` (bar heights), as a pandas ``value_counts()`` result does.
    """
    labels = category_counts.index
    heights = category_counts.values

    plt.figure(figsize=(10, 6))
    sns.barplot(x=labels, y=heights, palette="viridis")

    plt.title("Category Occurrences")
    plt.xlabel("Item Type")
    plt.ylabel("Count")
    # Slanted, right-anchored tick labels keep long category names readable.
    plt.xticks(rotation=45, ha="right", fontsize=10, fontweight="bold")
    plt.show()


def rmsle(y_true, y_pred):
    """Return the root mean squared logarithmic error of ``y_pred`` vs ``y_true``.

    Computes ``sqrt(mean((log1p(y_pred) - log1p(y_true)) ** 2))``. Inputs are
    compared position by position: both are converted to plain arrays first,
    so two pandas Series with different indexes are not aligned by label
    (alignment would produce NaN for every non-matching label).

    Raises:
    ValueError: If the two inputs do not have the same length.
    """
    if len(y_true) != len(y_pred):
        raise ValueError(
            f"y_true and y_pred must have the same length, "
            f"got {len(y_true)} and {len(y_pred)}"
        )

    true_values = np.asarray(y_true, dtype=float)
    pred_values = np.asarray(y_pred, dtype=float)
    return np.sqrt(np.mean((np.log1p(pred_values) - np.log1p(true_values)) ** 2))


# scikit-learn scorer built from rmsle. greater_is_better=False declares the
# metric a loss, so the scorer reports the negated value (higher is better).
rmsle_scorer = make_scorer(rmsle, greater_is_better=False)


def plot_predictions(
    model: RegressorMixin,
    X_train: np.ndarray,
    y_train: np.ndarray,
    X_val: np.ndarray,
    y_val: np.ndarray,
    output_transform: Callable | None = None,
) -> Tuple[np.ndarray, float]:
    """Score a fitted model on train and validation data and plot the results.

    Prints the RMSLE for both splits and draws a 2x3 grid (row 0 for train,
    row 1 for validation): a histogram of targets, a histogram of predictions
    and an overlay of both series by sample position.

    Parameters:
    model: Fitted estimator exposing ``predict``.
    X_train, y_train: Training features and targets.
    X_val, y_val: Validation features and targets.
    output_transform: Optional callable applied to BOTH predictions and
        targets before scoring and plotting (e.g. an inverse target
        transform).

    Returns:
    Tuple of the (transformed) validation predictions and validation RMSLE.
    """

    # Named so that it does not shadow the module-level ``predict`` function.
    def _predict_and_score(X, y):
        preds = model.predict(X)
        if output_transform is not None:
            preds = output_transform(preds)
            y = output_transform(y)

        return preds, y, rmsle(y, preds)

    preds_train, y_train, score_train = _predict_and_score(X_train, y_train)
    preds_val, y_val, score_val = _predict_and_score(X_val, y_val)
    print("Train score", score_train)
    print("Evaluation score", score_val)

    _, axes = plt.subplots(2, 3, figsize=(15, 10))

    def _plot_row(row_axes, pred, y, ds_type):
        t_values, t_bins = np.histogram(y)
        row_axes[0].stairs(t_values, t_bins, fill=True, color="blue")
        row_axes[0].set_title(f"{ds_type} Target values")

        values, bins = np.histogram(pred)
        row_axes[1].stairs(values, bins, fill=True, color="red")
        row_axes[1].set_title("Predicted values")

        row_axes[2].plot(range(len(y)), y, c="blue", label="Target")
        row_axes[2].plot(range(len(pred)), pred, c="red", label="Predicted")
        row_axes[2].set_title("Target vs Predicted")
        # The line labels above are only visible once a legend is drawn.
        row_axes[2].legend()

    _plot_row(axes[0], preds_train, y_train, f"[Train {score_train:.3f}]")
    _plot_row(axes[1], preds_val, y_val, f"[Validation {score_val:.3f}]")

    return preds_val, score_val


# Output locations, relative to the working directory:
# prediction CSVs are written under PREDICTIONS_DIR and pickled models under
# MODELS_DIR (see predict()).
PREDICTIONS_DIR = Path("predictions")
MODELS_DIR = Path("models")


def predict(
    model: RegressorMixin,
    test: pd.DataFrame,
    file_name: str | None,
    output_transform: Callable | None = None,
) -> pd.DataFrame:
    """Predict on ``test`` and optionally persist the model and predictions.

    Parameters:
    model: Fitted estimator exposing ``predict``.
    test: Test features; must contain an ``id`` column, which is copied into
        the result.
    file_name: Base name (without extension) for the outputs. When given, the
        model is pickled to ``MODELS_DIR/<file_name>.pkl`` and the predictions
        are written to ``PREDICTIONS_DIR/<file_name>.csv``. ``None`` skips
        all writing.
    output_transform: Optional callable applied to the raw predictions
        (e.g. an inverse target transform).

    Returns:
    DataFrame with columns ``id`` and ``Item_Outlet_Sales``. Predictions are
    cast to int32, which drops the fractional part.
    """
    test_pred = model.predict(test)
    if output_transform is not None:
        test_pred = output_transform(test_pred)

    test_result = pd.DataFrame(
        {"id": test["id"].values, "Item_Outlet_Sales": test_pred.astype(np.int32)}
    )

    if file_name is not None:
        # Create the output directories on first use so that a fresh checkout
        # does not fail with FileNotFoundError.
        MODELS_DIR.mkdir(parents=True, exist_ok=True)
        PREDICTIONS_DIR.mkdir(parents=True, exist_ok=True)

        joblib.dump(model, MODELS_DIR / f"{file_name}.pkl")
        test_result.to_csv(PREDICTIONS_DIR / f"{file_name}.csv", index=False)

    return test_result


def compare_distributions(og_df, gen_df):
    """Print summary statistics and overlay per-column histograms of two frames.

    Output order: ``describe()`` for the original then the generated frame,
    followed by the per-column missing-value counts in the same order. One
    figure is then shown for every column of ``og_df`` that also exists in
    ``gen_df``, with the original in blue and the generated data in red.
    """
    frames = (og_df, gen_df)

    # Summary statistics
    for frame in frames:
        print(frame.describe())

    # Missing values per column
    for frame in frames:
        print(frame.isnull().sum())

    shared_columns = [name for name in og_df.columns if name in gen_df.columns]
    for column in shared_columns:
        plt.figure(figsize=(12, 6))
        sns.histplot(og_df[column], color="blue", label="Original", kde=True)
        sns.histplot(gen_df[column], color="red", label="Generated", kde=True)
        plt.title(f"Distribution of {column}")
        plt.legend()
        plt.show()

In [None]:
# Raw inputs: the train/test splits plus the original source dataset.
train_df = pd.read_csv("datasets/train.csv")
test_df = pd.read_csv("datasets/test.csv")
og_df = pd.read_csv("datasets/Big Sales Data.csv")

# "id" is only a row identifier, so it is removed before modelling.
train_df = train_df.drop(columns="id")
features = train_df.drop(columns=["Item_Outlet_Sales"])

# Models are trained on log1p(sales); the same transformer instance is used
# later to map predictions back to the original scale.
log1p_transformer = Log1pTransformer()
target = log1p_transformer.transform(train_df["Item_Outlet_Sales"])

# Columns that are imputed and scaled.
numeric_cols = [
    "Item_Weight",
    "Item_Visibility",
    "Item_MRP",
    "Outlet_Establishment_Year",
]

# Columns that are routed to the categorical branch of the preprocessor.
categorical_cols = [
    "Item_Identifier",
    "Item_Fat_Content",
    "Item_Type",
    "Outlet_Identifier",
    "Outlet_Size",
    "Outlet_Location_Type",
    "Outlet_Type",
]


def new_pipeline(
    model, scaler=None, one_hot=True, item_ident_freq_mapper=True
) -> Pipeline:
    """Build a preprocessing + model pipeline.

    The preprocessor runs, in order: ``FatContentMapper``; optionally
    ``ItemIdentifierFrequencyMapper``; then a ``ColumnTransformer`` that
    median-imputes and scales the module-level ``numeric_cols`` and either
    one-hot encodes or passes through the module-level ``categorical_cols``.
    All other columns are dropped.

    Parameters:
    model: Estimator placed in the final ``"model"`` step.
    scaler: Scaler for the numeric columns. ``None`` (default) creates a new
        ``StandardScaler`` for this pipeline.
    one_hot: One-hot encode the categorical columns when True, otherwise
        pass them through unchanged.
    item_ident_freq_mapper: Replace ``Item_Identifier`` with its frequency
        when True and the column is listed in ``categorical_cols``.

    Returns:
    Pipeline with steps ``"preprocessor"`` and ``"model"``.
    """
    # A scaler instance used as the default argument is created once at
    # definition time and would be shared (and refitted in place) by every
    # pipeline built with the default. Create one per call instead.
    if scaler is None:
        scaler = StandardScaler()

    numeric_transformer = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", scaler),
        ]
    )

    categorical_transformer = "passthrough"
    if one_hot:
        categorical_transformer = Pipeline(
            steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))]
        )

    # Collect the steps first so the Pipeline is constructed in one go rather
    # than built empty and mutated afterwards.
    preprocessing_steps = [("fat_content_mapper", FatContentMapper())]

    if item_ident_freq_mapper and "Item_Identifier" in categorical_cols:
        preprocessing_steps.append(
            ("item_identifier_frequency_mapper", ItemIdentifierFrequencyMapper())
        )

    preprocessing_steps.append(
        (
            "column_transformer",
            ColumnTransformer(
                transformers=[
                    ("num", numeric_transformer, numeric_cols),
                    ("cat", categorical_transformer, categorical_cols),
                ],
                remainder="drop",
            ),
        )
    )

    preprocessor = Pipeline(steps=preprocessing_steps)

    return Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])


def train_and_evaluate(
    model: RegressorMixin,
    features: pd.DataFrame,
    target: pd.Series,
    test: pd.DataFrame,
    param_grid: Dict[str, Any] | None = None,
    out_file_name: str | None = None,
    cv=4,
) -> RegressorMixin:
    """Fit ``model`` (optionally via grid search), evaluate it and predict on ``test``.

    The data is split 80/20 into train and validation sets with a fixed seed.
    Scores and plots are reported on the original sales scale by applying the
    module-level ``log1p_transformer.inverse_transform``, so ``target`` is
    expected to be log1p-transformed.

    Parameters:
    model: Estimator or pipeline to fit.
    features: Feature frame.
    target: log1p-transformed target aligned with ``features``.
    test: Test frame passed to ``predict``.
    param_grid: Grid for ``GridSearchCV``; ``None`` fits ``model`` directly.
    out_file_name: Base name for the saved model and predictions, or ``None``
        to skip saving.
    cv: Number of cross-validation folds used by the grid search.

    Returns:
    The fitted model (the best estimator when a grid search was run).
    """
    X_train, X_val, y_train, y_val = train_test_split(
        features, target, test_size=0.2, random_state=99
    )

    if param_grid is None:
        best_model = model.fit(X_train, y_train)
    else:
        grid_search = GridSearchCV(
            model,
            param_grid=param_grid,
            # The target is already log1p-transformed, so RMSE in this space
            # equals RMSLE on the original sales scale, which is the metric
            # reported below. Scoring with RMSLE here would apply the log a
            # second time and select on a different metric.
            scoring="neg_root_mean_squared_error",
            cv=cv,
        )

        grid_search.fit(X_train, y_train)

        print("Best Hyperparameters:", grid_search.best_params_)
        print("Best Model:", grid_search.best_estimator_)

        best_model = grid_search.best_estimator_

    plot_predictions(
        best_model,
        X_train,
        y_train,
        X_val,
        y_val,
        output_transform=log1p_transformer.inverse_transform,
    )

    predict(
        best_model,
        test,
        out_file_name,
        output_transform=log1p_transformer.inverse_transform,
    )

    return best_model

In [None]:
# LightGBM: grid over tree complexity, learning rate and number of trees.
# The "model__" prefix routes each parameter to the pipeline's "model" step.
param_grid = {
    "model__num_leaves": [31, 50, 70],
    "model__learning_rate": [0.01, 0.05, 0.1, 0.2],
    "model__n_estimators": [50, 100, 200, 250],
}

lgbm_model = LGBMRegressor()
lgbm_pipeline = new_pipeline(lgbm_model, one_hot=True, item_ident_freq_mapper=True)

# Runs the grid search, plots train/validation predictions and writes the
# model and test predictions under the name "lgbm_v2_output".
best_lgbm_model = train_and_evaluate(
    lgbm_pipeline,
    features,
    target,
    test_df,
    param_grid,
    out_file_name="lgbm_v2_output",
    cv=4,
)

In [None]:
# XGBoost: grid over depth, learning rate, number of trees and row/column
# subsampling.
#
# Best Hyperparameters:
# {'model__colsample_bytree': 0.8, 'model__learning_rate': 0.1, 'model__max_depth': 7,
# 'model__n_estimators': 200, 'model__subsample': 1.0}

param_grid = {
    "model__max_depth": [3, 5, 7],
    "model__learning_rate": [0.01, 0.1, 0.2],
    "model__n_estimators": [50, 100, 200],
    "model__subsample": [0.6, 0.8, 1.0],
    "model__colsample_bytree": [0.6, 0.8, 1.0],
}


xgb_model = XGBRegressor()
# The pipeline must wrap xgb_model. It was built around lgbm_model, so this
# cell silently tuned and saved a second LightGBM model as "xgb_v1_output".
xgb_pipeline = new_pipeline(xgb_model, one_hot=True, item_ident_freq_mapper=True)

best_xgb_model = train_and_evaluate(
    xgb_pipeline,
    features,
    target,
    test_df,
    param_grid,
    out_file_name="xgb_v1_output",
    cv=4,
)

In [None]:
# Ridge regression baseline. The "model__" prefix routes each parameter to
# the pipeline's "model" step.
param_grid = {
    "model__alpha": [0.1, 1.0, 10.0, 100.0],  # Regularization strength
    "model__solver": [
        # "auto",
        # "svd",
        # "cholesky",
        # "lsqr",
        # "sparse_cg",
        "sag",
        # "saga",
    ],  # Solvers
}


lr_model = Ridge()
lr_pipeline = new_pipeline(
    lr_model, scaler=MinMaxScaler(), one_hot=True, item_ident_freq_mapper=True
)


# Train the pipeline, not the bare Ridge estimator, which would receive the
# raw, unencoded frame. The result gets its own name instead of overwriting
# best_xgb_model.
best_lr_model = train_and_evaluate(
    lr_pipeline,
    features,
    target,
    test_df,
    param_grid,
    out_file_name="lr_v1_output",
    cv=4,
)

In [None]:
# CatBoost. The grid below is kept commented out for reference; this cell
# instead trains once with the fixed parameters in best_cat_boost_params.
# param_grid = {
#     "model__depth": [7, 9, 12],
#     "model__learning_rate": [0.2],
#     "model__iterations": [
#         1000,
#         2000,
#         5000,
#     ],  # CatBoost parameter for n_estimators
#     "model__subsample": [0.8, 1.0],
#     "model__colsample_bylevel": [
#         0.8,
#         1.0,
#     ],
# }
# best_cat_boost_params = {}

# param_grid=None makes train_and_evaluate fit the pipeline directly, without
# a grid search.
param_grid = None
best_cat_boost_params = dict(
    colsample_bylevel=1.0, depth=7, iterations=1000, learning_rate=0.2, subsample=1.0
)

cat_model = CatBoostRegressor(**best_cat_boost_params, verbose=True)
cat_pipeline = new_pipeline(cat_model, one_hot=True, item_ident_freq_mapper=True)

# Saved under "cat_v8_output"; the feature-importance cell loads this model.
best_cat_model = train_and_evaluate(
    cat_pipeline,
    features,
    target,
    test_df,
    param_grid,
    out_file_name="cat_v8_output",
    cv=4,
)

In [None]:
# CatBoost with the same fixed parameters as the v8 run, but with numeric
# features scaled by MinMaxScaler. param_grid=None skips the grid search.
param_grid = None
best_cat_boost_params = dict(
    colsample_bylevel=1.0, depth=7, iterations=1000, learning_rate=0.2, subsample=1.0
)

model = CatBoostRegressor(**best_cat_boost_params, verbose=True)
# Use the estimator created above. Passing cat_model reused the object that
# was already fitted inside the v8 pipeline and left `model` unused.
cat_pipeline = new_pipeline(
    model, scaler=MinMaxScaler(), one_hot=True, item_ident_freq_mapper=True
)

best_cat_model = train_and_evaluate(
    cat_pipeline,
    features,
    target,
    test_df,
    param_grid,
    out_file_name="cat_v9_output",
    cv=4,
)

# Prev:
# Train score 0.6746445262930283
# Evaluation score 0.7104292188009841

In [None]:
from collections import defaultdict

# Feature importances of the saved CatBoost pipeline, with the one-hot
# columns folded back into one value per original categorical feature.
# NOTE: joblib.load unpickles arbitrary objects; only load files produced by
# this notebook.
p = joblib.load(MODELS_DIR / "cat_v8_output.pkl")

importances = p["model"].feature_importances_
categorical_cols_hot_enc = (
    p["preprocessor"]["column_transformer"]["cat"]
    .get_feature_names_out(categorical_cols)
    .tolist()
)


# Number of one-hot columns produced for each categorical feature. Encoded
# names have the form "<feature>_<category>", so the "_" is part of the
# prefix; otherwise a feature whose name is a prefix of another feature's
# name would also count that feature's columns.
cat_lengths = defaultdict(int)
for cat in categorical_cols:
    prefix = f"{cat}_"
    cat_lengths[cat] = sum(
        cat_hot_enc.startswith(prefix) for cat_hot_enc in categorical_cols_hot_enc
    )

# The column transformer emits the numeric columns first, then the one-hot
# blocks in categorical_cols order; sum each block into a single importance.
prev = len(numeric_cols)
new_importances = importances.tolist()[:prev]
for cat in categorical_cols:
    cat_len = cat_lengths[cat]
    new_importances.append(sum(importances[prev : prev + cat_len]))
    prev += cat_len


feature_names = numeric_cols + categorical_cols

feature_importances = pd.DataFrame(
    {"Feature": feature_names, "Importance": new_importances}
)
feature_importances = feature_importances.sort_values(by="Importance", ascending=False)

# Plot the feature importances
plt.figure(figsize=(10, 8))
plt.barh(feature_importances["Feature"], feature_importances["Importance"])
plt.xlabel("Feature Importance")
plt.ylabel("Feature")
# The loaded model is the CatBoost pipeline, not LightGBM.
plt.title("Feature Importance from CatBoost Model")
plt.gca().invert_yaxis()
plt.show()