In [6]:
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer

# Load the merged dataset
data = pd.read_csv('merged_data_updated.csv')

# Count missing values per column and report only the columns that have any
missing_values = data.isnull().sum()
print("Missing values in each column:\n", missing_values[missing_values > 0])

# A column whose missing count equals the row count holds no values at all
all_missing_columns = missing_values[missing_values == len(data)].index
print("Columns with all missing values:\n", all_missing_columns)

# Drop columns with all missing values
data_dropped = data.drop(columns=all_missing_columns)

# Show which columns survive the drop
print("Remaining columns after dropping:\n", data_dropped.columns)

# Separate numeric and non-numeric columns; each group gets its own strategy
numeric_cols = data_dropped.select_dtypes(include=['number']).columns
non_numeric_cols = data_dropped.select_dtypes(exclude=['number']).columns


def _impute_frame(frame, imputer):
    """Fit *imputer* on *frame* and return the filled values as a DataFrame.

    The result carries the column labels and the row index of *frame*, so the
    imputed pieces can be concatenated side by side without misaligning rows.
    A frame with no rows or no columns is returned as a copy, because the
    imputer raises on empty input.
    """
    if frame.shape[0] == 0 or frame.shape[1] == 0:
        return frame.copy()
    return pd.DataFrame(
        imputer.fit_transform(frame),
        columns=frame.columns,
        index=frame.index,
    )


# Handle missing values for numeric columns (column mean).
# NOTE(review): this fills every numeric column, which would include the
# prediction target if it has gaps -- confirm that training code does not
# treat mean-filled targets as real observations.
imputer_numeric = SimpleImputer(strategy='mean')
data_numeric_imputed = _impute_frame(data_dropped[numeric_cols], imputer_numeric)

# Handle missing values for non-numeric columns (most frequent value)
imputer_non_numeric = SimpleImputer(strategy='most_frequent')
data_non_numeric_imputed = _impute_frame(data_dropped[non_numeric_cols], imputer_non_numeric)

# Combine the imputed data (numeric columns first, then non-numeric)
data_imputed = pd.concat([data_numeric_imputed, data_non_numeric_imputed], axis=1)

# Check the columns after imputation
print("Columns after imputation:\n", data_imputed.columns)

# Reconstruct the ProductCategory column from one-hot encoded columns
def reconstruct_product_category(row):
    """Map a row's one-hot ``ProductCategory_*`` flags back to a category label.

    The flags are examined in the order MenClothing, OtherClothing,
    WomenClothing. The first flag equal to 1 decides the result, and the
    remaining flags are not read once a match is found.

    Args:
        row: Object indexable by the ``ProductCategory_<name>`` labels, such
            as a DataFrame row.

    Returns:
        The matching category name, or None when no flag equals 1.
    """
    for label in ('MenClothing', 'OtherClothing', 'WomenClothing'):
        if row[f'ProductCategory_{label}'] == 1:
            return label
    return None

# Rebuild a single ProductCategory label from the one-hot indicator columns
data_imputed['ProductCategory'] = data_imputed.apply(reconstruct_product_category, axis=1)

# Check if ProductCategory was constructed properly
print("Unique values in ProductCategory column:\n", data_imputed['ProductCategory'].unique())

category_names = ['MenClothing', 'OtherClothing', 'WomenClothing']
onehot_cols = [f'ProductCategory_{cat}' for cat in category_names]
target_col = 'Sales(In ThousandDollars)'

# Feature selection and preprocessing.
# Year, Month and the one-hot category indicators stay in the feature set:
# they are the only inputs the submission file provides, so a model trained
# without them can only return one identical prediction for every row.
features = data_imputed.drop(columns=[target_col, 'ProductCategory'], errors='ignore')
# The indicators may arrive as bool/object columns; the model needs numbers.
features[onehot_cols] = features[onehot_cols].astype(float)
target = data_imputed[target_col]

# Train only on rows whose target was present in the raw data. Rows whose
# target was filled in by imputation would teach the model the column mean.
# `data` and `data_imputed` have the same row order, so a positional mask works.
observed = data[target_col].notna().to_numpy()
features = features.loc[observed]
target = target.loc[observed]

# Drop one-hot encoded ProductCategory columns from the working frame now that
# the label column and the feature matrix have both been derived from them
data_imputed = data_imputed.drop(columns=onehot_cols)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Gradient Boosting Regressor (seeded so repeated runs give the same model)
gbr = GradientBoostingRegressor(random_state=42)
gbr.fit(X_train, y_train)
gbr_predictions = gbr.predict(X_test)
gbr_mae = mean_absolute_error(y_test, gbr_predictions)
print(f'Gradient Boosting Regressor MAE: {gbr_mae}')

# Column order the fitted model expects at prediction time
feature_cols = list(features.columns)

# Fallback values for features the submission file cannot supply. The
# training mean keeps those inputs inside the range the model was fitted on,
# whereas a literal 0 can lie far outside it.
# NOTE(review): if real values for these columns exist for the submission
# period, merging them in would be better than any constant fill.
feature_means = X_train.mean(numeric_only=True)

# Load submission file
submission = pd.read_csv('submission.csv')

# Build the prediction matrix for all submission rows at once
submission_features = submission[['Year', 'Month']].copy()
for cat, col in zip(category_names, onehot_cols):
    submission_features[col] = (submission['ProductCategory'] == cat).astype(float)

# Align to the training columns: columns the model never saw are discarded,
# columns missing from the submission are added and filled with the training
# mean (or 0 when no numeric mean is available for that column)
submission_features = (
    submission_features
    .reindex(columns=feature_cols)
    .fillna(feature_means)
    .fillna(0)
)

# Fill in the submission file; the column is float from the start so the
# predictions are stored without a dtype change
submission[target_col] = 0.0
if not submission.empty:
    submission[target_col] = gbr.predict(submission_features)

# Save the updated submission file
submission.to_csv('submission_updated.csv', index=False)

Missing values in each column:
 Monthly Nominal GDP Index (inMillion$)         273
Monthly Real GDP Index (inMillion$)            273
CPI                                            273
unemployment rate                              273
CommercialBankInterestRateonCreditCardPlans    273
                                              ... 
Wind (km/h) low                                669
Wind (km/h) avg                                669
Wind (km/h) high                               669
Precip. (mm) sum                               669
WeatherEvent                                   669
Length: 63, dtype: int64
Columns with all missing values:
 Index(['Day', 'Temp high (°C)', 'Temp avg (°C)', 'Temp low (°C)',
       'Dew Point high (°C)', 'Dew Point avg (°C)', 'Dew Point low (°C)',
       'Humidity (%) high', 'Humidity (%) avg', 'Humidity (%) low',
       'Sea Level Press. (hPa) high', 'Sea Level Press. (hPa) avg',
       'Sea Level Press. (hPa) low', 'Visibility (km) high',
       'Vis