In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold, StratifiedKFold, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer, mean_absolute_error, mean_squared_error
from sklearn.impute import KNNImputer
from xgboost import XGBRegressor
from scipy.stats import uniform, randint

In [2]:
# Load the datasets
train_data = pd.read_csv('clean_train.csv')
test_data  = pd.read_csv('clean_test.csv')

# Separate features and target variable from training data
dfX_train = train_data.drop(['id', 'clean_title' ,'price'], axis=1)
dfX_test  = test_data.drop(['id', 'clean_title'], axis=1)
dfY_train = train_data['price']   # Only the last column
'''
# Check for any categorical data and convert them to dummy variables
X_train = pd.get_dummies(X_train, drop_first=True)
X_test = pd.get_dummies(X_test, drop_first=True)

# Align the test set to the training set
X_test = X_test.reindex(columns = X_train.columns, fill_value=0)

# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the model
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test_scaled)
'''

'\n# Check for any categorical data and convert them to dummy variables\nX_train = pd.get_dummies(X_train, drop_first=True)\nX_test = pd.get_dummies(X_test, drop_first=True)\n\n# Align the test set to the training set\nX_test = X_test.reindex(columns = X_train.columns, fill_value=0)\n\n# Feature scaling\nscaler = StandardScaler()\nX_train_scaled = scaler.fit_transform(X_train)\nX_test_scaled = scaler.transform(X_test)\n\n# Train the model\nmodel = LinearRegression()\nmodel.fit(X_train_scaled, y_train)\n\n# Make predictions on the test data\ny_pred = model.predict(X_test_scaled)\n'

In [3]:
dfX_train.head()

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine_power,engine_volume,engine_cylinder,cylinder_shape,gears,transmission,ext_col,int_col,accident
0,Ford,F-150 Lariat,2018,74349,Gasoline,375.0,3.5,6.0,V,10.0,Automatic,Blue,Gray,0
1,BMW,335 i,2007,80000,Gasoline,300.0,3.0,6.0,Straight,6.0,Manual,Black,Black,0
2,Jaguar,XF Luxury,2009,91491,Gasoline,300.0,4.2,8.0,,6.0,Automatic,Purple,Beige,0
3,BMW,X7 xDrive40i,2022,2437,Hybrid,335.0,3.0,6.0,Straight,7.0,Dual-Clutch,Gray,Brown,0
4,Pontiac,Firebird Base,2001,111000,Gasoline,200.0,3.8,6.0,V,,Automatic,White,Black,0


In [4]:
numeric_features = dfX_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
numeric_transformer = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors=7)),
    ('scaler', StandardScaler())
])

In [5]:
categorical_features = dfX_train.select_dtypes(include=['object', 'category']).columns.tolist()
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [6]:
# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])
'''
# Define the model pipeline
pipeline   = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(n_estimators=100, random_state=42))
])
'''

"\n# Define the model pipeline\npipeline   = Pipeline(steps=[\n    ('preprocessor', preprocessor),\n    ('regressor', XGBRegressor(n_estimators=100, random_state=42))\n])\n"

In [7]:
# Define the model pipeline with hyperparameter tuning
param_grid = {
    'regressor__n_estimators': range(100, 500, 50),
    'regressor__max_depth': range(3, 10),
    'regressor__learning_rate': np.linspace(0.01, 0.2, 20),
    'regressor__gamma': np.linspace(0, 0.5, 10),
    'regressor__reg_alpha': np.linspace(0, 1, 20),
    'regressor__reg_lambda': np.linspace(1, 2, 10)
}

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(
        random_state=42,
        colsample_bytree=0.6860159238890708,  # Fixed best value for demonstration
        subsample=0.8028987933998539  # Fixed best value for demonstration
    ))
])

# Set up K-Fold Cross-Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Define the scorer
scorer = make_scorer(mean_absolute_error, greater_is_better=False)

# Set up the RandomizedSearchCV
random_search = RandomizedSearchCV(
    pipeline, param_distributions=param_grid, n_iter=50,
    scoring=scorer, cv=kf, verbose=1, random_state=42, n_jobs=-1
)

# Fit the model
random_search.fit(dfX_train, dfY_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


In [8]:
# Best parameters found
print(f"Best parameters: {random_search.best_params_}")

# Cross-validation scores
cv_results = random_search.cv_results_
mean_test_score = random_search.best_score_
std_test_score = cv_results['std_test_score'][random_search.best_index_]

print(f"Mean Test Score (Negative MAE): {mean_test_score}")
print(f"Standard Deviation of Test Score: {std_test_score}")

# Output predictions for inspection using the best estimator
best_model = random_search.best_estimator_
y_pred = cross_val_score(best_model, dfX_train, dfY_train, cv=kf, scoring='neg_mean_absolute_error')
print(f"Cross-Validation Scores (Negative MAE): {y_pred}")
print(f"Mean CV Score (Negative MAE): {np.mean(y_pred)}")
print(f"Standard Deviation of CV Score: {np.std(y_pred)}")

Best parameters: {'regressor__reg_lambda': 1.8888888888888888, 'regressor__reg_alpha': 0.5263157894736842, 'regressor__n_estimators': 200, 'regressor__max_depth': 9, 'regressor__learning_rate': 0.03, 'regressor__gamma': 0.05555555555555555}
Mean Test Score (Negative MAE): -17079.133517746013
Standard Deviation of Test Score: 935.2155823965464
Cross-Validation Scores (Negative MAE): [-16251.29278369 -16774.84443265 -18878.57541073 -16486.06952861
 -17004.88543304]
Mean CV Score (Negative MAE): -17079.133517746013
Standard Deviation of CV Score: 935.2155823965464


In [9]:
y_pred = best_model.predict(dfX_test)

In [10]:
test_data  = pd.read_csv('clean_test.csv')
# Create a DataFrame with IDs and predictions
predictions = pd.DataFrame({
    'id': test_data.iloc[:, 0],  # Assuming the first column is the ID
    'price': y_pred
})

# Save to CSV
predictions.to_csv('.\\data\\xgb_predictions.csv', index=False)

In [11]:
len(predictions)

36183

In [12]:
dfX_train.isnull().sum()

brand                  0
model                  0
model_year             0
milage                 0
fuel_type            294
engine_power        4057
engine_volume        529
engine_cylinder      628
cylinder_shape     28207
gears              17625
transmission          12
ext_col               41
int_col             1045
accident               0
dtype: int64