In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import lightgbm as lgb
import os

# Load data
train_path = "./playground-series-s5e2/train.csv"
test_path = "./playground-series-s5e2/test.csv"
submission_path = "./playground-series-s5e2/sample_submission.csv"

df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)
df_submission = pd.read_csv(submission_path)

# Exploratory Data Analysis
print("Train Data Shape:", df_train.shape)
print("Test Data Shape:", df_test.shape)
print("Missing Values in Train:\n", df_train.isnull().sum())
print("Missing Values in Test:\n", df_test.isnull().sum())
print("Columns in Train:", df_train.columns)

df_train.head()

# Ensure 'Price' column exists
if 'Price' not in df_train.columns:
    raise KeyError("Column 'Price' not found in training data. Check dataset contents.")

# Handle missing values separately for numeric and categorical columns
for col in df_train.select_dtypes(include=['number']).columns:
    df_train[col].fillna(df_train[col].median(), inplace=True)

for col in df_train.select_dtypes(include=['object']).columns:
    df_train[col].fillna(df_train[col].mode()[0], inplace=True)

# Convert categorical variables to numeric
categorical_cols = df_train.select_dtypes(include=['object']).columns
for col in categorical_cols:
    lbl = LabelEncoder()
    df_train[col] = lbl.fit_transform(df_train[col])

# Feature Engineering & Preprocessing
X = df_train.drop(columns=['Price', 'id'])  # Keep 'id' separate for submission
y = df_train['Price']
X_test = df_test.drop(columns=['id'])

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Model Training (Random Forest Regressor)
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, y_train)
y_pred = rf_model.predict(X_val_scaled)
print("Random Forest MAE:", mean_absolute_error(y_val, y_pred))
print("Random Forest RMSE:", mean_squared_error(y_val, y_pred, squared=False))

# LightGBM Regressor
lgb_model = lgb.LGBMRegressor(n_estimators=500, learning_rate=0.05)
lgb_model.fit(X_train_scaled, y_train)
lgb_pred = lgb_model.predict(X_val_scaled)
print("LightGBM MAE:", mean_absolute_error(y_val, lgb_pred))
print("LightGBM RMSE:", mean_squared_error(y_val, lgb_pred, squared=False))

# Generate Predictions for Submission
test_predictions = lgb_model.predict(X_test_scaled)
df_submission['Price'] = test_predictions
df_submission.to_csv('submission.csv', index=False)

print("Submission file saved as submission.csv")


Train Data Shape: (300000, 11)
Test Data Shape: (200000, 10)
Missing Values in Train:
 id                         0
Brand                   9705
Material                8347
Size                    6595
Compartments               0
Laptop Compartment      7444
Waterproof              7050
Style                   7970
Color                   9950
Weight Capacity (kg)     138
Price                      0
dtype: int64
Missing Values in Test:
 id                         0
Brand                   6227
Material                5613
Size                    4381
Compartments               0
Laptop Compartment      4962
Waterproof              4811
Style                   5153
Color                   6785
Weight Capacity (kg)      77
dtype: int64
Columns in Train: Index(['id', 'Brand', 'Material', 'Size', 'Compartments', 'Laptop Compartment',
       'Waterproof', 'Style', 'Color', 'Weight Capacity (kg)', 'Price'],
      dtype='object')


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train[col].fillna(df_train[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train[col].fillna(df_train[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on whi

ValueError: could not convert string to float: 'Puma'

In [8]:
print(df_train.columns)

Index(['id', 'Brand', 'Material', 'Size', 'Compartments', 'Laptop Compartment',
       'Waterproof', 'Style', 'Color', 'Weight Capacity (kg)', 'Price'],
      dtype='object')
