In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
from xgboost import XGBRegressor

# pd.set_option('display.max_columns', 500)
df = pd.read_csv('data/autos.csv')

X = df.copy()
y = df['price']

X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

In [10]:
pd.set_option('display.max_columns',  25)
X

Unnamed: 0,symboling,make,fuel_type,aspiration,num_of_doors,body_style,drive_wheels,engine_location,wheel_base,length,width,height,curb_weight,engine_type,num_of_cylinders,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,alfa-romero,gas,std,2,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,4,130,mpfi,3.47,2.68,9,111,5000,21,27,13495
1,3,alfa-romero,gas,std,2,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,4,130,mpfi,3.47,2.68,9,111,5000,21,27,16500
2,1,alfa-romero,gas,std,2,hatchback,rwd,front,94.5,171.2,65.5,52.4,2823,ohcv,6,152,mpfi,2.68,3.47,9,154,5000,19,26,16500
3,2,audi,gas,std,4,sedan,fwd,front,99.8,176.6,66.2,54.3,2337,ohc,4,109,mpfi,3.19,3.40,10,102,5500,24,30,13950
4,2,audi,gas,std,4,sedan,4wd,front,99.4,176.6,66.4,54.3,2824,ohc,5,136,mpfi,3.19,3.40,8,115,5500,18,22,17450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188,-1,volvo,gas,std,4,sedan,rwd,front,109.1,188.8,68.9,55.5,2952,ohc,4,141,mpfi,3.78,3.15,9,114,5400,23,28,16845
189,-1,volvo,gas,turbo,4,sedan,rwd,front,109.1,188.8,68.8,55.5,3049,ohc,4,141,mpfi,3.78,3.15,8,160,5300,19,25,19045
190,-1,volvo,gas,std,4,sedan,rwd,front,109.1,188.8,68.9,55.5,3012,ohcv,6,173,mpfi,3.58,2.87,8,134,5500,18,23,21485
191,-1,volvo,diesel,turbo,4,sedan,rwd,front,109.1,188.8,68.9,55.5,3217,ohc,6,145,idi,3.01,3.40,23,106,4800,26,27,22470


In [None]:
def score_dataset(X, y, model=XGBRegressor()):
    for col in X.select_dtypes(['category', 'object']):
        X[col], _ = X[col].factorize()
    score = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_log_error')
    score = -1*score.mean()
    score = np.sqrt(score)
    return score

Preprocess

In [None]:
#cat cols 
object_cols = [col for col in X_train.columns if X_train[col].dtype=='object']
oh_enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
oh_X_train = pd.DataFrame(oh_enc.fit_transform(X_train[object_cols]))
oh_X_valid = pd.DataFrame(oh_enc.transform(X_valid[object_cols]))

oh_X_train.index = X_train[object_cols].index
oh_X_valid.index = X_valid[object_cols].index

X_train = X_train.select_dtypes(exclude='object')
X_valid = X_valid.select_dtypes(exclude='object')

X_train = X_train.join(oh_X_train)
X_valid = X_valid.join(oh_X_valid)

X_train.columns = X_train.columns.astype(str)
X_valid.columns = X_valid.columns.astype(str)

#num cols
my_imp = SimpleImputer(strategy='median')

xtraincol = X_train.columns
xvalidcol = X_valid.columns

X_train = pd.DataFrame(my_imp.fit_transform(X_train))
X_valid = pd.DataFrame(my_imp.transform(X_valid))

X_train.columns = xtraincol
X_valid.columns = xvalidcol

Clustering

In [None]:
feature = ['peak_rpm', 'engine_size', 'horsepower', 'length']

kmeans = KMeans(n_clusters=5, n_init=10, random_state=0)
X['Cluster'] = kmeans.fit_predict(X[feature])
X['Cluster'] = X['Cluster'].astype('category')

Xy = X.copy()
Xy['price'] = y 
sns.relplot(x='value', y='price', hue='Cluster', col='variable',
            height=4, aspect=1, facet_kws={'sharex':False}, col_wrap=3, data=Xy.melt(value_vars=feature, id_vars=['price', 'Cluster'],))