In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer, mean_absolute_error
from scipy.stats import randint, uniform

# Load the datasets
train_data = pd.read_csv('clean_train.csv')

# Separate features and target variable from training data
X = train_data.drop(['id', 'clean_title' ,'price', 'ext_col', 'int_col', 'cylinder_shape', 'gears'], axis=1)
y = train_data['price']   # Only the last column

# Automatically identify numerical and categorical features
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

# Define preprocessing for numerical and categorical features
numeric_transformer = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors=5)),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Define the model pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

# Define the parameter grid
param_grid = {
    'regressor__n_estimators': randint(50, 200),
    'regressor__max_features': ['sqrt', 'log2', randint(1, len(numeric_features) + len(categorical_features))],
    'regressor__max_depth': randint(3, 20),
    'regressor__min_samples_split': randint(2, 20),
    'regressor__min_samples_leaf': randint(1, 20),
    'regressor__bootstrap': [True, False]
}

# Set up K-Fold Cross-Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Define the scorer
scorer = make_scorer(mean_absolute_error, greater_is_better=False)

# Set up the RandomizedSearchCV
random_search = RandomizedSearchCV(
    pipeline, param_distributions=param_grid, n_iter=100,
    scoring=scorer, cv=kf, verbose=1, random_state=42, n_jobs=-1
)

# Fit the model
random_search.fit(X, y)

# Best parameters found
print(f"Best parameters: {random_search.best_params_}")

# Cross-validation scores
cv_results = random_search.cv_results_
mean_test_score = random_search.best_score_
std_test_score = cv_results['std_test_score'][random_search.best_index_]

print(f"Mean Test Score (Negative MAE): {mean_test_score}")
print(f"Standard Deviation of Test Score: {std_test_score}")

# Output predictions for inspection using the best estimator
best_model = random_search.best_estimator_
y_pred = cross_val_score(best_model, X, y, cv=kf, scoring='neg_mean_absolute_error')
print(f"Cross-Validation Scores (Negative MAE): {y_pred}")
print(f"Mean CV Score (Negative MAE): {np.mean(y_pred)}")
print(f"Standard Deviation of CV Score: {np.std(y_pred)}")


Fitting 5 folds for each of 100 candidates, totalling 500 fits


150 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\PTB3KOR\.conda\envs\ml_project\lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\PTB3KOR\.conda\envs\ml_project\lib\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\PTB3KOR\.conda\envs\ml_project\lib\site-packages\sklearn\pipeline.py", line 427, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\PTB3KOR\.conda\envs\ml_project\lib\site-packages\sklearn\base

Best parameters: {'regressor__bootstrap': False, 'regressor__max_depth': 19, 'regressor__max_features': 'sqrt', 'regressor__min_samples_leaf': 9, 'regressor__min_samples_split': 12, 'regressor__n_estimators': 88}
Mean Test Score (Negative MAE): -19023.971174320475
Standard Deviation of Test Score: 673.4465373108344
Cross-Validation Scores (Negative MAE): [-18323.24372839 -18791.99852334 -20312.03951319 -18862.98420129
 -18829.58990539]
Mean CV Score (Negative MAE): -19023.971174320475
Standard Deviation of CV Score: 673.4465373108344


In [2]:
# Step 1: Load the new dataset
test_dataset = pd.read_csv('clean_test.csv')
test_data = test_dataset.drop(['id', 'clean_title', 'ext_col', 'int_col' ,'cylinder_shape', 'gears'], axis=1)

# Ensure the new dataset has the same features as the training data
# If necessary, drop or fill any missing target column

# Step 2: Use the best_model to make predictions
y_pred = best_model.predict(test_data)

In [3]:
data  = pd.read_csv('clean_test.csv')
# Create a DataFrame with IDs and predictions
predictions = pd.DataFrame({
    'id': data.iloc[:, 0],  # Assuming the first column is the ID
    'price': y_pred
})

# Save to CSV
predictions.to_csv('.\\data\\rf_predictions.csv', index=False)