<h2>Mix Submission 01 (39.14690)</h2>

In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import make_scorer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
import lightgbm as lgb
import xgboost as xgb
from sklearn.neural_network import MLPRegressor

In [16]:
# Load data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [17]:
# Preprocessing for binary columns
binary_cols = ['Laptop Compartment', 'Waterproof']
for col in binary_cols:
    mode_val = train[col].mode()[0]
    train[col] = train[col].fillna(mode_val)
    test[col] = test[col].fillna(mode_val)
    train[col] = train[col].map({'Yes': 1, 'No': 0}).astype(int)
    test[col] = test[col].map({'Yes': 1, 'No': 0}).astype(int)

In [18]:
# Standardize categorical columns
categorical_cols = ['Brand', 'Material', 'Size', 'Style', 'Color']
for col in categorical_cols:
    train[col] = train[col].str.strip().str.lower()
    test[col] = test[col].str.strip().str.lower()

In [19]:
# Prepare data
X_train = train.drop(['id', 'Price'], axis=1)
y_train = train['Price']
X_test = test.drop('id', axis=1)

In [20]:
# Define preprocessing pipelines
num_features = ['Compartments', 'Weight Capacity (kg)'] + binary_cols
cat_features = categorical_cols

In [21]:
# Preprocessor for linear models and neural networks
preprocessor_linear = ColumnTransformer([
    ('num', Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ]), num_features),
    ('cat', Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ]), cat_features)
])

In [22]:
# Preprocessor for tree-based models
preprocessor_tree = ColumnTransformer([
    ('num', SimpleImputer(strategy='mean'), num_features),
    ('cat', Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
    ]), cat_features)
])

In [23]:
# Define models with hyperparameters
models = {
    'LinearRegression': Pipeline([
        ('preprocessor', preprocessor_linear),
        ('model', LinearRegression())
    ]),
    'LightGBM': Pipeline([
        ('preprocessor', preprocessor_tree),
        ('model', lgb.LGBMRegressor(
            n_estimators=500,
            learning_rate=0.05,
            num_leaves=32,
            random_state=42
        ))
    ]),
    'XGBoost': Pipeline([
        ('preprocessor', preprocessor_tree),
        ('model', xgb.XGBRegressor(
            n_estimators=500,
            learning_rate=0.05,
            max_depth=5,
            random_state=42
        ))
    ]),
    'NeuralNetwork': Pipeline([
        ('preprocessor', preprocessor_linear),
        ('model', MLPRegressor(
            hidden_layer_sizes=(128, 64),
            early_stopping=True,
            random_state=42,
            max_iter=500
        ))
    ])
}

In [24]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [25]:
# Evaluate models using cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
rmse_scorer = make_scorer(lambda y, y_pred: np.sqrt(np.mean((y - y_pred)**2)))

results = {}
for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, cv=kf, scoring=rmse_scorer)
    results[name] = np.mean(scores)
    print(f"{name:15} RMSE: {np.mean(scores):.4f} ± {np.std(scores):.4f}")

LinearRegression RMSE: 39.0204 ± 0.0557
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004289 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 291
[LightGBM] [Info] Number of data points in the train set: 240000, number of used features: 9
[LightGBM] [Info] Start training from score 81.448481
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004561 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 291
[LightGBM] [Info] Number of data points in the train set: 240000, number of used features: 9
[LightGBM] [Info] Start training from score 81.385450
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003977 seconds.
You can set `force_row_wise=true` to remove 

In [26]:
# Select best model
best_model_name = min(results, key=results.get)
print(f"\nBest model: {best_model_name}")


Best model: LinearRegression


In [27]:
# Train final model
best_model = models[best_model_name].fit(X_train, y_train)

In [28]:
# Generate predictions
predictions = best_model.predict(X_test)

In [29]:
# Create submission file
submission = pd.DataFrame({
    'id': test['id'],
    'Price': predictions.round(3)
})
submission.to_csv('Mix01.csv', index=False)