In [None]:
# Kedro IPython line magic: (re)loads the Kedro project into this kernel.
# The `catalog` object used by the data-loading cells below is expected to
# come from this call — it is not defined anywhere else in the notebook.
%reload_kedro

In [None]:
import lightgbm as lgb
import matplotlib.pyplot as plt
import optuna
from pyspark.sql.functions import col, year, month, dayofmonth, dayofweek, when
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the "train" dataset through the Kedro data catalog. Later cells call
# `.withColumn(...)` and `.toPandas()` on it, i.e. treat it as a Spark DataFrame.
train_df = catalog.load("train")

In [None]:
# Load the "test" dataset through the Kedro data catalog. It goes through the
# same `preprocess_data` / `.toPandas()` path as the training data further down.
test_df = catalog.load("test")

In [None]:
def preprocess_data(df):
    """Add calendar features derived from the ``date`` column.

    Args:
        df: Spark DataFrame containing a ``date`` column that Spark's date
            functions (``year``, ``month``, ``dayofmonth``, ``dayofweek``)
            can read.

    Returns:
        A new Spark DataFrame with these integer columns added (or replaced,
        if columns with the same names already exist):

        * ``year``, ``month``, ``day`` – calendar components of ``date``.
        * ``day_of_week`` – Spark's ``dayofweek`` value, where 1 is Sunday
          and 7 is Saturday.
        * ``is_weekend`` – 1 when the date falls on Saturday or Sunday,
          otherwise 0.
    """
    date_col = col('date')
    return (
        df.withColumn('year', year(date_col))
        .withColumn('month', month(date_col))
        .withColumn('day', dayofmonth(date_col))
        .withColumn('day_of_week', dayofweek(date_col))
        # Spark numbers the week Sunday=1 ... Saturday=7, so the weekend is
        # {1, 7}. A `>= 6` test would flag Friday/Saturday and miss Sunday.
        .withColumn('is_weekend', when(col('day_of_week').isin(1, 7), 1).otherwise(0))
    )

In [None]:
def objective(trial, num_boost_round=1000):
    """Optuna objective: train LightGBM on the training split and score it by MAPE.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_val`` and ``y_val``
    produced by the train/validation split cell.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        num_boost_round: Upper bound on boosting rounds. Early stopping
            (patience 50 on the validation MAPE) normally ends training
            earlier.

    Returns:
        Validation MAPE, as computed by scikit-learn's
        ``mean_absolute_percentage_error``, at the best iteration.

    Side effects:
        Stores the best iteration on the trial as user attribute
        ``'best_iteration'`` so the round count that produced the score can be
        recovered from the study afterwards.
    """
    param = {
        'objective': 'regression',
        'metric': 'mape',
        'boosting_type': 'gbdt',
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform,
        # and plain suggest_float replaces the deprecated suggest_uniform.
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1e-1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        # NOTE(review): LightGBM only applies `subsample` (bagging_fraction) when
        # `subsample_freq` / `bagging_freq` is > 0. No frequency is set here, so
        # this parameter is presumably inert — confirm, and set the frequency for
        # both the search and the final model if bagging is actually wanted.
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-2, 10, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-2, 10, log=True),
    }

    dtrain = lgb.Dataset(X_train, label=y_train)
    dval = lgb.Dataset(X_val, label=y_val, reference=dtrain)

    # Without an explicit num_boost_round, lgb.train stops at 100 rounds, which
    # starves low learning rates and does not match a 1000-tree final model.
    model = lgb.train(
        param,
        dtrain,
        num_boost_round=num_boost_round,
        valid_sets=[dval],
        callbacks=[lgb.early_stopping(stopping_rounds=50), lgb.log_evaluation(50)],
    )

    # Fall back to the number of trained rounds if no best iteration was recorded.
    best_iteration = model.best_iteration or model.current_iteration()
    trial.set_user_attr('best_iteration', best_iteration)

    y_pred = model.predict(X_val, num_iteration=best_iteration)
    mape = mean_absolute_percentage_error(y_val, y_pred)

    return mape

In [None]:
# Column roles, shared by the training cell and the inference cell
categorical_columns = ['country', 'store', 'product']
features = ['country', 'store', 'product', 'year', 'month', 'day', 'day_of_week', 'is_weekend']
target = 'num_sold'

# Add the calendar features (Spark side)
train_df = preprocess_data(train_df)

# Convert to Pandas DataFrame for compatibility with LightGBM and Optuna
train_pd = train_df.toPandas()

# Rows without a target carry no training signal. Zero-filling them would
# plant fake "0 sold" observations and, because MAPE divides by the true
# value, make the metric explode on those rows. Drop them instead.
train_pd = train_pd.dropna(subset=[target]).reset_index(drop=True)

# Any remaining gaps (non-target columns) are filled with zeros, as before
train_pd = train_pd.fillna(0)

# Encode categorical features; keep the fitted encoders so the test set can
# be mapped to the same integer codes later
label_encoders = {}
for column in categorical_columns:
    le = LabelEncoder()
    train_pd[column] = le.fit_transform(train_pd[column])
    label_encoders[column] = le

X = train_pd[features]
y = train_pd[target]

# Hold out 20% for validation; fixed seed keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Create a study object and optimize. The sampler is seeded so the sequence of
# suggested hyperparameters is repeatable (the wall-clock `timeout` can still
# change how many trials complete on a given machine).
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100, timeout=600)

# Print the best hyperparameters
print('Best hyperparameters:', study.best_params)

# Train the final model with the best hyperparameters.
# Work on a copy so the study's own record is never modified.
best_params = dict(study.best_params)

# Prefer the boosting-round count recorded on the best trial (user attribute
# 'best_iteration') so the final model uses the same number of trees that
# produced the best validation score; fall back to a fixed 1000 rounds when
# no such attribute is present.
best_params['n_estimators'] = study.best_trial.user_attrs.get('best_iteration') or 1000

final_model = lgb.LGBMRegressor(**best_params)
final_model.fit(X, y)

In [None]:
# Derive the calendar features on the Spark side, then materialise as pandas
# with missing values replaced by zeros (same treatment as the training data).
test_df = preprocess_data(test_df)
test_pd = test_df.toPandas()
test_pd = test_pd.fillna(0)

# Map the categorical columns to the integer codes learned on the training
# data, reusing the fitted encoders. `transform` raises on labels that were
# not seen during fitting.
for cat_col in ['country', 'store', 'product']:
    test_pd[cat_col] = label_encoders[cat_col].transform(test_pd[cat_col])

X_test = test_pd[features]

# Score the test set, store the raw predictions, then round them to whole
# units for the submission.
raw_predictions = final_model.predict(X_test)
test_pd['num_sold'] = raw_predictions
test_pd['num_sold'] = test_pd['num_sold'].round().astype(int)

# Submission keeps only the row identifier and the predicted count
submission = test_pd.loc[:, ['id', 'num_sold']]

In [None]:
# Kedro IPython line magic, invoked a second time just before the export cell.
# NOTE(review): nothing visible between the first reload and this one changes
# the Kedro project, and the export below writes the CSV with pandas directly
# rather than through the catalog — confirm whether this reload is still needed.
%reload_kedro

In [None]:
# Persist the id / num_sold predictions as a CSV without the pandas index.
# The path is relative to the notebook's working directory.
submission_path = '../data/07_model_output/my_first_submission.csv'
submission.to_csv(submission_path, index=False)

In [None]:
# Audit the tuned model on data with known targets.
#
# The test frame cannot be used for scoring: `test_pd[target]` was overwritten
# with final_model's own rounded predictions when the submission was built, so
# comparing it with `final_model.predict(X_test)` always yields a MAPE of 0.
# The hold-out validation split is used instead.

# In-sample fit of the final model (trained on all of X)
train_predictions = final_model.predict(X).round().astype(int)
train_mape = mean_absolute_percentage_error(y, train_predictions)

# final_model has seen X_val (it was fitted on all of X), so an out-of-sample
# figure needs a model with the same hyperparameters fitted on X_train only.
holdout_model = lgb.LGBMRegressor(**final_model.get_params())
holdout_model.fit(X_train, y_train)
val_predictions = holdout_model.predict(X_val).round().astype(int)
val_mape = mean_absolute_percentage_error(y_val, val_predictions)

# Test-set predictions are kept for inspection only; there is nothing to
# score them against.
test_predictions = final_model.predict(X_test).round().astype(int)

# Print the results
print(f"Training MAPE: {train_mape:.4f}")
print(f"Validation MAPE: {val_mape:.4f}")

# Actual-vs-predicted scatter plots; the red line marks perfect predictions
fig, (ax_train, ax_val) = plt.subplots(1, 2, figsize=(14, 7))

ax_train.scatter(y, train_predictions, alpha=0.3)
ax_train.plot([y.min(), y.max()], [y.min(), y.max()], 'r', lw=2)
ax_train.set(xlabel='Actual', ylabel='Predicted', title='Training Data')

ax_val.scatter(y_val, val_predictions, alpha=0.3)
ax_val.plot([y_val.min(), y_val.max()], [y_val.min(), y_val.max()], 'r', lw=2)
ax_val.set(xlabel='Actual', ylabel='Predicted', title='Validation Data (hold-out)')

fig.tight_layout()
plt.show()