In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import StackingRegressor
from sklearn.model_selection import GridSearchCV

In [4]:

from sklearn.metrics import mean_squared_error

# Load datasets with optimized data types
def load_data():
    sales_dtypes = {
        "id": "category", "item_id": "category", "dept_id": "category",
        "cat_id": "category", "store_id": "category", "state_id": "category"
    }
    calendar_dtypes = {
        "event_name_1": "category", "event_type_1": "category",
        "event_name_2": "category", "event_type_2": "category"
    }
    prices_dtypes = {
        "store_id": "category", "item_id": "category"
    }

    # Load datasets
    sales_data = pd.read_csv(r"C:\Users\user\Downloads\m5-forecasting-accuracy\sales_train_validation.csv", dtype=sales_dtypes)
    calendar = pd.read_csv(r"C:\Users\user\Downloads\m5-forecasting-accuracy\calendar.csv", dtype=calendar_dtypes)
    prices = pd.read_csv(r"C:\Users\user\Downloads\m5-forecasting-accuracy\sell_prices.csv", dtype=prices_dtypes)
    return sales_data, calendar, prices

In [5]:
def preprocess_data_in_chunks(sales_data, calendar, prices, chunk_size=10000):
    calendar = calendar[["d", "wm_yr_wk", "date"]]
    prices = prices[["store_id", "item_id", "wm_yr_wk", "sell_price"]]

    processed_chunks = []

    # Process in smaller chunks
    for start_row in range(0, len(sales_data), chunk_size):
        chunk = sales_data.iloc[start_row:start_row + chunk_size]

        # Melt chunk
        melted_chunk = pd.melt(chunk, id_vars=["id", "item_id", "dept_id", "cat_id", "store_id", "state_id"],
                               var_name="d", value_name="sales")

        # Merge with calendar
        merged_chunk = melted_chunk.merge(calendar, how="left", on="d")

        # Merge with prices
        merged_chunk = merged_chunk.merge(prices, how="left", on=["store_id", "item_id", "wm_yr_wk"])

        # Add time-based features
        merged_chunk['date'] = pd.to_datetime(merged_chunk['date'])
        merged_chunk['dayofweek'] = merged_chunk['date'].dt.dayofweek
        merged_chunk['month'] = merged_chunk['date'].dt.month
        merged_chunk['year'] = merged_chunk['date'].dt.year

        # Drop unnecessary columns
        merged_chunk = merged_chunk.drop(["date", "wm_yr_wk"], axis=1)

        # Separate numeric and non-numeric columns
        numeric_cols = merged_chunk.select_dtypes(include=['number']).columns
        non_numeric_cols = merged_chunk.select_dtypes(exclude=['number']).columns

        # Convert categorical columns to object type temporarily
        for col in non_numeric_cols:
            if merged_chunk[col].dtype.name == "category":
                merged_chunk[col] = merged_chunk[col].astype("object")

        # Fill missing values
        merged_chunk[numeric_cols] = merged_chunk[numeric_cols].fillna(0)  # Fill numeric columns with 0
        merged_chunk[non_numeric_cols] = merged_chunk[non_numeric_cols].fillna("Unknown")  # Fill non-numeric with "Unknown"

        # Convert object columns back to category type for memory efficiency
        for col in non_numeric_cols:
            if merged_chunk[col].dtype.name == "object":
                merged_chunk[col] = merged_chunk[col].astype("category")

        # Append processed chunk
        processed_chunks.append(merged_chunk)

    # Combine all chunks into a single dataframe
    processed_data = pd.concat(processed_chunks, ignore_index=True)
    return processed_data


In [6]:
# Load the data
sales_data, calendar, prices = load_data()

In [7]:
# Preprocess the data in chunks
processed_data = preprocess_data_in_chunks(sales_data, calendar, prices)

In [9]:
# Prepare features (X) and target (y)
X = processed_data.drop(["sales"], axis=1)
y = processed_data["sales"]

In [10]:
# Ensure y is a Series with numeric values
y = y.squeeze()  # Ensure 1D structure
y = pd.to_numeric(y, errors="coerce").fillna(0)  # Ensure numeric type and handle missing values

# Split data for training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Include reduce_cardinality function here
def reduce_cardinality(data, threshold=50):
    for col in data.select_dtypes(include=["category", "object"]).columns:
        if data[col].nunique() > threshold:
            top_categories = data[col].value_counts().nlargest(threshold).index
            data[col] = data[col].apply(lambda x: x if x in top_categories else "Other")
    return data

# Step 1: Reduce cardinality for categorical columns
X_train = reduce_cardinality(X_train.copy())
X_test = reduce_cardinality(X_test.copy())



In [None]:
# Step 2: Sparse one-hot encoding
def one_hot_encode_sparse(data, columns):
    from sklearn.feature_extraction.text import CountVectorizer
    from scipy.sparse import hstack
    
    vectorizers = {}
    sparse_matrices = []
    
    for col in columns:
        vec = CountVectorizer(tokenizer=lambda x: [x], lowercase=False, binary=True)
        sparse_matrix = vec.fit_transform(data[col].astype(str))
        sparse_matrices.append(sparse_matrix)
        vectorizers[col] = vec
    
    return hstack(sparse_matrices), vectorizers

categorical_cols = X_train.select_dtypes(include=["category", "object"]).columns
X_train_sparse, train_vectorizers = one_hot_encode_sparse(X_train, categorical_cols)
X_test_sparse = hstack([train_vectorizers[col].transform(X_test[col].astype(str)) for col in categorical_cols])

# Step 3: Combine sparse one-hot with numeric columns
numeric_cols = X_train.select_dtypes(include=["number"]).columns
X_train_numeric = X_train[numeric_cols].to_numpy()
X_test_numeric = X_test[numeric_cols].to_numpy()

X_train_final = hstack([X_train_numeric, X_train_sparse])
X_test_final = hstack([X_test_numeric, X_test_sparse])

In [34]:
!pip install lightgbm



In [37]:
from lightgbm import LGBMRegressor
# Train and evaluate models
models = {
    "Random Forest": RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42),
    "LightGBM": LGBMRegressor(n_estimators=100, max_depth=10, random_state=42),
    "SVR": SVR(kernel='rbf', C=1.0, epsilon=0.1)
}

results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    results[name] = rmse
    print(f"{name} RMSE: {rmse:.4f}")

MemoryError: Unable to allocate 3.82 GiB for an array with shape (11, 46661896) and data type object

In [None]:
print(processed_data["sales"].head())
print(processed_data["sales"].dtype)