In [1]:
import numpy as np
import pandas as pd

from warnings import filterwarnings

# Silence FutureWarning messages for the remainder of the notebook.
filterwarnings('ignore', category=FutureWarning)

# Location of the car features/prices CSV inside the Kaggle input directory.
CARS = '/kaggle/input/car-features-and-prices-dataset/data.csv'

# Read the raw data and preview the first five rows.
df = pd.read_csv(CARS)
df.head()

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


In [2]:
# Summarize each column's dtype and non-null count to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11914 entries, 0 to 11913
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Make               11914 non-null  object 
 1   Model              11914 non-null  object 
 2   Year               11914 non-null  int64  
 3   Engine Fuel Type   11911 non-null  object 
 4   Engine HP          11845 non-null  float64
 5   Engine Cylinders   11884 non-null  float64
 6   Transmission Type  11914 non-null  object 
 7   Driven_Wheels      11914 non-null  object 
 8   Number of Doors    11908 non-null  float64
 9   Market Category    8172 non-null   object 
 10  Vehicle Size       11914 non-null  object 
 11  Vehicle Style      11914 non-null  object 
 12  highway MPG        11914 non-null  int64  
 13  city mpg           11914 non-null  int64  
 14  Popularity         11914 non-null  int64  
 15  MSRP               11914 non-null  int64  
dtypes: float64(3), int64(5

In [3]:
# Count the distinct values in every column.
df.nunique()

Make                   48
Model                 915
Year                   28
Engine Fuel Type       10
Engine HP             356
Engine Cylinders        9
Transmission Type       5
Driven_Wheels           4
Number of Doors         3
Market Category        71
Vehicle Size            3
Vehicle Style          16
highway MPG            59
city mpg               69
Popularity             48
MSRP                 6049
dtype: int64

In [4]:
from plotly.express import histogram

# Show a count histogram for every column that has fewer than 50 distinct
# values; columns are visited in their original order.
distinct_counts = df.nunique()
for column in distinct_counts[distinct_counts < 50].index:
    histogram(data_frame=df, x=column).show()

In [5]:
# MSRP distribution over the full dataset, split by vehicle size; counts are
# drawn on a log axis.
histogram(
    data_frame=df,
    x='MSRP',
    color='Vehicle Size',
    log_y=True,
)

We have some MSRP outliers; let's look again, keeping only cars with MSRP below 100k.

In [6]:
# Same histogram as before, restricted to cars with MSRP below 100,000.
under_100k = df[df['MSRP'] < 100000]
histogram(data_frame=under_100k, x='MSRP', color='Vehicle Size', log_y=True)

We have some very cheap cars (MSRP < 10k); above that we see a sensible-looking distribution, where larger vehicles tend to cost more.

In [7]:
# Keep only cars priced strictly between 10,000 and 100,000, then draw one MSRP
# histogram per categorical column used for colouring.
mid_priced = df[(df['MSRP'] > 10000) & (df['MSRP'] < 100000)]
for color in ('Driven_Wheels', 'Vehicle Size', 'Vehicle Style'):
    histogram(data_frame=mid_priced, x='MSRP', color=color, log_y=False).show()

Let's take a look at what we can find if we just look at numerical data.

In [8]:
# List the dtype of every column to see which ones are numeric.
df.dtypes

Make                  object
Model                 object
Year                   int64
Engine Fuel Type      object
Engine HP            float64
Engine Cylinders     float64
Transmission Type     object
Driven_Wheels         object
Number of Doors      float64
Market Category       object
Vehicle Size          object
Vehicle Style         object
highway MPG            int64
city mpg               int64
Popularity             int64
MSRP                   int64
dtype: object

In [9]:
from sklearn.manifold import TSNE
from plotly.express import scatter

# Numeric features fed to t-SNE. MSRP is not part of the embedding input; it is
# only used afterwards to colour the points.
numeric_columns = ['Year', 'Engine HP', 'Engine Cylinders', 'Number of Doors', 'highway MPG', 'city mpg', 'Popularity']

# Drop incomplete rows ONCE, so the features and the target are guaranteed to
# come from exactly the same rows and share one index.
numeric_df = df[numeric_columns + ['MSRP']].dropna()

# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter`;
# confirm against the installed version before upgrading.
tsne = TSNE(n_components=2, n_iter=1000, init='pca', verbose=1, random_state=2024,)

# fit_transform returns a bare array, so a DataFrame built from it would get a
# fresh 0..n-1 index. Reusing numeric_df.index keeps every embedded point
# labelled with its source row; otherwise the label-aligned assignment of
# 'target' below would pair points with the MSRP of the wrong cars (and NaN
# wherever a row had been dropped).
tsne_df = pd.DataFrame(
    data=tsne.fit_transform(X=numeric_df[numeric_columns]),
    columns=['tx', 'ty'],
    index=numeric_df.index,
)
tsne_df['target'] = numeric_df['MSRP']
tsne_df['log target'] = np.log(tsne_df['target'])
scatter(data_frame=tsne_df, x='tx', y='ty', color='log target')

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 11815 samples in 0.011s...
[t-SNE] Computed neighbors for 11815 samples in 0.278s...
[t-SNE] Computed conditional probabilities for sample 1000 / 11815
[t-SNE] Computed conditional probabilities for sample 2000 / 11815
[t-SNE] Computed conditional probabilities for sample 3000 / 11815
[t-SNE] Computed conditional probabilities for sample 4000 / 11815
[t-SNE] Computed conditional probabilities for sample 5000 / 11815
[t-SNE] Computed conditional probabilities for sample 6000 / 11815
[t-SNE] Computed conditional probabilities for sample 7000 / 11815
[t-SNE] Computed conditional probabilities for sample 8000 / 11815
[t-SNE] Computed conditional probabilities for sample 9000 / 11815
[t-SNE] Computed conditional probabilities for sample 10000 / 11815
[t-SNE] Computed conditional probabilities for sample 11000 / 11815
[t-SNE] Computed conditional probabilities for sample 11815 / 11815
[t-SNE] Mean sigma: 1.819441
[t-SNE] KL divergence

Whatever we do, it looks like our data has lots of little islands, and they do not necessarily correspond to the MSRP values.