In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
from xgboost import XGBRegressor

# pd.set_option('display.max_columns', 500)
# Load the autos data set.
df = pd.read_csv('data/autos.csv')

# Separate features from the target. `pop` removes 'price' from X so the
# target cannot leak into the feature matrix (or into X_train / X_valid).
X = df.copy()
y = X.pop('price')

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

In [None]:
def score_dataset(X, y, model=None):
    """Return the 5-fold cross-validated RMSLE of `model` on (X, y).

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix. Columns of dtype ``category`` or ``object`` are
        integer-encoded with ``factorize`` on an internal copy, so the
        caller's frame is left unmodified.
    y : array-like
        Target values, passed straight to ``cross_val_score``.
    model : estimator, optional
        Estimator to evaluate. When omitted, a fresh ``XGBRegressor()`` is
        created on each call instead of sharing one built at definition time.

    Returns
    -------
    float
        Square root of the fold-averaged mean squared log error.
    """
    if model is None:
        model = XGBRegressor()

    # Work on a copy: the label encoding below must not overwrite the
    # caller's categorical columns.
    X = X.copy()
    for col in X.select_dtypes(['category', 'object']):
        X[col], _ = X[col].factorize()

    # The scorer returns negated MSLE values; flip the sign before the root.
    scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_log_error')
    return np.sqrt(-1 * scores.mean())

Preprocess

In [None]:
# Categorical columns: one-hot encode every object-dtype column.
object_cols = [col for col in X_train.columns if X_train[col].dtype == 'object']

# Guard the empty case so re-running this cell (after the object columns have
# already been replaced) is a no-op instead of fitting an encoder on zero columns.
if object_cols:
    # Fit on the training split only; with handle_unknown='ignore', categories
    # seen only in the validation split encode as all-zero rows.
    oh_enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    oh_X_train = pd.DataFrame(
        oh_enc.fit_transform(X_train[object_cols]),
        columns=oh_enc.get_feature_names_out(object_cols),  # e.g. '<column>_<category>'
        index=X_train.index,
    )
    oh_X_valid = pd.DataFrame(
        oh_enc.transform(X_valid[object_cols]),
        columns=oh_enc.get_feature_names_out(object_cols),
        index=X_valid.index,
    )

    # Swap the raw object columns for their encoded counterparts.
    X_train = X_train.select_dtypes(exclude='object').join(oh_X_train)
    X_valid = X_valid.select_dtypes(exclude='object').join(oh_X_valid)

# scikit-learn requires column labels to share one type; keep them all strings.
X_train.columns = X_train.columns.astype(str)
X_valid.columns = X_valid.columns.astype(str)

# Numeric columns: fill missing values with the per-column median learned
# from the training split.
my_imp = SimpleImputer(strategy='median')

xtraincol = X_train.columns
xvalidcol = X_valid.columns

# The imputer returns a bare ndarray. Rebuild the frames with the original
# row index as well as the column names; otherwise the rows get a fresh
# RangeIndex and no longer align with y_train / y_valid.
X_train = pd.DataFrame(my_imp.fit_transform(X_train), columns=xtraincol, index=X_train.index)
X_valid = pd.DataFrame(my_imp.transform(X_valid), columns=xvalidcol, index=X_valid.index)

Clustering

In [None]:
# Features used to group the cars.
feature = ['peak_rpm', 'engine_size', 'horsepower', 'length']

# k-means is distance based, so standardise each feature first; otherwise the
# column with the largest numeric range dominates the cluster assignment.
# A zero standard deviation is replaced by 1 to avoid dividing by zero.
X_scaled = X[feature]
feature_std = X_scaled.std(axis=0).replace(0, 1)
X_scaled = (X_scaled - X_scaled.mean(axis=0)) / feature_std

kmeans = KMeans(n_clusters=5, n_init=10, random_state=0)
X['Cluster'] = kmeans.fit_predict(X_scaled)
# Cluster ids are labels, not quantities, so store them as a categorical.
X['Cluster'] = X['Cluster'].astype('category')

# Plot price against each (unscaled) clustering feature, coloured by cluster.
Xy = X.copy()
Xy['price'] = y
sns.relplot(
    x='value', y='price', hue='Cluster', col='variable',
    height=4, aspect=1, facet_kws={'sharex': False}, col_wrap=3,
    data=Xy.melt(value_vars=feature, id_vars=['price', 'Cluster']),
);