In [2]:
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
import numpy as np

# ---------------------------------------------------------------------------
# Pipeline: read the two CSVs, label-encode every object-dtype column using a
# coding shared by train and test, estimate RMSE with 5-fold cross-validation,
# then refit on all training rows and write clipped test predictions to disk.
# ---------------------------------------------------------------------------

# Read the training and test sets from the working directory.
df_train = pd.read_csv('train_v9rqX0R.csv')
df_test = pd.read_csv('test_AbJTz2l.csv')

# 'Item_Weight' and 'Outlet_Size' are excluded from the feature set; remove
# them from both frames in place.
for frame in (df_train, df_test):
    frame.drop(columns=['Item_Weight', 'Outlet_Size'], inplace=True)

# The sales column is the regression target; everything else is a feature.
y = df_train['Item_Outlet_Sales']
X = df_train.drop(columns=['Item_Outlet_Sales'])

# Stack the training features on top of the test rows so that each
# categorical column receives one integer coding covering both sets.
combined = pd.concat([X, df_test])

# One LabelEncoder per object-dtype column, kept in a dict keyed by column
# name so the fitted encoders remain available after this cell runs.
categorical_cols = combined.select_dtypes(include=['object']).columns
label_encoders = {col: LabelEncoder() for col in categorical_cols}
for col, le in label_encoders.items():
    combined[col] = le.fit_transform(combined[col])

# Undo the stacking positionally: the first len(df_train) rows are the
# training rows, the remainder are the test rows.
n_train = len(df_train)
X = combined.iloc[:n_train]
X_test = combined.iloc[n_train:]

# 5-fold cross-validation with shuffled, seeded splits.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
xgb_model = XGBRegressor(random_state=42)

rmse_scores = []
for train_index, val_index in kf.split(X):
    # Positional row selection for this fold's training and validation parts.
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_val, y_val = X.iloc[val_index], y.iloc[val_index]

    # Fit on the training part, then score the held-out part.
    xgb_model.fit(X_train, y_train)
    y_val_pred = xgb_model.predict(X_val)

    # RMSE is the square root of the fold's mean squared error.
    rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
    rmse_scores.append(rmse)

# Report the mean of the per-fold RMSE values.
avg_rmse = np.mean(rmse_scores)
print(f"Average RMSE from K-Fold Cross-Validation: {avg_rmse}")

# Refit on every training row before predicting the test set.
xgb_model.fit(X, y)

# Negative predictions are raised to zero; there is no upper bound.
raw_test_pred = xgb_model.predict(X_test)
y_test_pred = np.clip(raw_test_pred, 0, None)

# The submission carries the original (un-encoded) identifiers taken from
# df_test, followed by the clipped predictions.
submission = df_test[['Item_Identifier', 'Outlet_Identifier']].copy()
submission['Item_Outlet_Sales'] = y_test_pred
submission.to_csv('submission.csv', index=False)
print("Predictions saved to 'submission.csv'")


Average RMSE from K-Fold Cross-Validation: 1178.8608121930597
Predictions saved to 'submission.csv'
