In [5]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error
import xgboost as xgb

In [2]:
# Load the datasets: both CSV files are read via paths relative to the working directory
train_df = pd.read_csv(filepath_or_buffer='train.csv')
test_df = pd.read_csv(filepath_or_buffer='test.csv')

In [7]:
# Group the training columns by dtype. Both column lists are derived from train_df only,
# so they describe the training schema (which is what the later cells index with).
numeric_dtypes = ['int64', 'float64']
object_dtypes = ['object']
numerical_cols = train_df.select_dtypes(include=numeric_dtypes).columns
categorical_cols = train_df.select_dtypes(include=object_dtypes).columns

In [8]:
# Fill missing values for numerical columns.
# The medians are computed once from the training data and reused for the test data, so
# both frames are imputed with the same statistics.
train_medians = train_df[numerical_cols].median()
train_df[numerical_cols] = train_df[numerical_cols].fillna(train_medians)

# numerical_cols was derived from train_df, so it can name columns that test_df does not
# have (e.g. the target column); indexing test_df with those would raise KeyError.
shared_numerical_cols = [col for col in numerical_cols if col in test_df.columns]
if shared_numerical_cols:
    test_df[shared_numerical_cols] = test_df[shared_numerical_cols].fillna(
        train_medians[shared_numerical_cols]
    )


In [9]:
# Replace missing categorical values with an explicit 'Unknown' label in both frames
for frame in (train_df, test_df):
    frame[categorical_cols] = frame[categorical_cols].fillna('Unknown')

In [10]:
# Convert categorical variables to dummy/indicator variables.
# The two frames are encoded independently, so their resulting columns can differ.
train_df = pd.get_dummies(data=train_df)
test_df = pd.get_dummies(data=test_df)

In [11]:
# Ensure the train and test datasets have the same columns.
# join='left' keeps exactly train_df's columns: columns that exist only in test_df are
# dropped, and columns missing from test_df are created as NaN and then filled with 0.
train_df, test_df = train_df.align(test_df, join='left', axis=1)
test_df = test_df.fillna(0)

# The left join also hands test_df the target column ('price', split off as y further
# down). It is not a model input, so remove it to keep test_df a pure feature table.
test_df = test_df.drop(columns=['price'], errors='ignore')

In [13]:
# Create new feature: car_age = 2024 - model_year, then discard model_year itself
reference_year = 2024
train_df = train_df.assign(car_age=reference_year - train_df['model_year']).drop(columns=['model_year'])
test_df = test_df.assign(car_age=reference_year - test_df['model_year']).drop(columns=['model_year'])

In [14]:
# Features and target: 'price' is the value to predict, every other column is an input
target_col = 'price'
y = train_df[target_col]
X = train_df.drop(columns=[target_col])

In [15]:
# Hold out 20% of the rows for validation; the fixed random_state makes the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [16]:
# Instantiate the three candidate regressors (nothing is fitted in this cell).
# They all receive the same seed.
model_seed = 42
rf_model = RandomForestRegressor(random_state=model_seed)
gb_model = GradientBoostingRegressor(random_state=model_seed)
xgb_model = xgb.XGBRegressor(random_state=model_seed)

In [17]:
# Create a function to evaluate models using cross-validation
def evaluate_model(model, X_train, y_train, cv=5):
    """Return the mean cross-validated RMSE of ``model`` on the given data.

    Parameters
    ----------
    model
        Estimator forwarded to ``cross_val_score``.
    X_train
        Feature matrix used for cross-validation.
    y_train
        Target values matching ``X_train``.
    cv : default 5
        Forwarded unchanged as the ``cv`` argument of ``cross_val_score``; the
        default keeps the original 5-fold behaviour.

    Returns
    -------
    The mean of the per-fold scores with the sign flipped, i.e. a positive RMSE
    where lower is better.
    """
    # The scorer name is 'neg_root_mean_squared_error': scores come back negated,
    # so the sign is flipped again before returning.
    fold_scores = cross_val_score(
        model,
        X_train,
        y_train,
        scoring='neg_root_mean_squared_error',
        cv=cv,
    )
    return -fold_scores.mean()

In [None]:
# Evaluate models: score every candidate with evaluate_model on the training split,
# in the order random forest, gradient boosting, XGBoost
rf_rmse, gb_rmse, xgb_rmse = (
    evaluate_model(candidate, X_train, y_train)
    for candidate in (rf_model, gb_model, xgb_model)
)

print(f'Random Forest RMSE: {rf_rmse}')
print(f'Gradient Boosting RMSE: {gb_rmse}')
print(f'XGBoost RMSE: {xgb_rmse}')

In [None]:
# Hyperparameter tuning for the best model (assuming XGBoost is the best model here).
# 3 x 3 x 3 = 27 parameter combinations, each scored with 3-fold cross-validation.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.3],
)

grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    cv=3,
    scoring='neg_root_mean_squared_error',
)
grid_search.fit(X_train, y_train)

In [None]:
# Best parameters and best score.
# The grid search was configured with scoring='neg_root_mean_squared_error', so the
# reported score is a negated RMSE: values closer to zero are better.
print(f'Best parameters: {grid_search.best_params_}')
print(f'Best score (negative RMSE): {grid_search.best_score_}')

In [None]:
# Use the best estimator for predictions.
# GridSearchCV refits by default (refit=True): best_estimator_ has already been fitted
# on all of X_train with the winning parameters, so calling fit again here would only
# repeat that work.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_val)

In [None]:
# Calculate RMSE for the best model on the validation split:
# the square root of the mean squared error between y_val and y_pred
val_mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(val_mse)
print(f'Optimized Root Mean Squared Error: {rmse}')

In [None]:
# Predict on the test data.
# The model was fitted on X_train's columns. test_df can carry extra columns (for
# example the target column created by the column alignment) or a different column
# order, so select exactly the training feature columns, in training order. Any
# feature absent from test_df is filled with 0, matching how alignment gaps are filled.
test_features = test_df.reindex(columns=X_train.columns, fill_value=0)
test_predictions = best_model.predict(test_features)

In [None]:
# Create a DataFrame for submission with the columns 'Id' and 'Price', in that order
submission = pd.DataFrame({'Price': test_predictions})
# Assuming the index should be used for Id
# NOTE(review): if the test file has its own identifier column, confirm the index is the right Id
submission.insert(0, 'Id', test_df.index)

In [None]:
# Save the submission to a CSV file; index=False keeps the row index out of the file
submission.to_csv(path_or_buf='submission.csv', index=False)