In [1]:
from pathlib import Path

import pandas as pd
import sklearn
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import validation_curve, learning_curve, ValidationCurveDisplay, LearningCurveDisplay
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Read the competition's train and test splits into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('playground-series-s4e9/train.csv', 'playground-series-s4e9/test.csv')
)

In [3]:
# fuel_type, accident and clean_title are string columns; replace their NaN
# entries with the explicit 'Missing' category so they can be encoded later.
# The same fill is applied to both splits so train and test are preprocessed
# identically.
# NOTE(review): assumes test.csv carries these three columns too - confirm.
na_cols = ['fuel_type', 'accident', 'clean_title']
for frame in (train, test):
    frame[na_cols] = frame[na_cols].fillna('Missing')

In [4]:
# Count the NaN values remaining in each column of `train`.
train.isna().sum()

id              0
brand           0
model           0
model_year      0
milage          0
fuel_type       0
engine          0
transmission    0
ext_col         0
int_col         0
accident        0
clean_title     0
price           0
dtype: int64

In [5]:
# Show the (rows, columns) dimensions of `train`.
train.shape

(188533, 13)

In [6]:
# List the column labels of `train`.
train.columns

Index(['id', 'brand', 'model', 'model_year', 'milage', 'fuel_type', 'engine',
       'transmission', 'ext_col', 'int_col', 'accident', 'clean_title',
       'price'],
      dtype='object')

In [7]:
# Remove the 'id' column from the training frame and preview the result.
# errors='ignore' keeps the cell idempotent: re-running it after 'id' has
# already been dropped is a no-op instead of raising KeyError.
train = train.drop(columns='id', errors='ignore')
train.head()

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,Yellow,Gray,None reported,Yes,4200
1,Lincoln,LS V8,2002,143250,Gasoline,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,A/T,Silver,Beige,At least 1 accident or damage reported,Yes,4999
2,Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,A/T,Blue,Gray,None reported,Yes,13900
3,Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,45000
4,Mercedes-Benz,Metris Base,2021,7388,Gasoline,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Beige,None reported,Yes,97500


In [8]:
# Columns that are expanded into one-hot indicator features.
columns_to_encode = ['brand', 'model', 'model_year', 'fuel_type', 'engine', 'transmission', 'ext_col', 'int_col', 'accident', 'clean_title']
column_transformer = ColumnTransformer(
    transformers=[
        # handle_unknown='ignore' encodes categories that were not seen during
        # fit as all-zero indicators instead of raising, so the fitted
        # transformer can also be applied to other data (e.g. the test split).
        # The output for the data it was fitted on is unchanged.
        # NOTE(review): sparse_output=False materialises a dense array; with
        # ~188k rows and several thousand indicator columns that is a
        # multi-gigabyte allocation - consider sparse output if memory is tight.
        ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), columns_to_encode)
    ],
    remainder='drop'  # Drops every column that is not listed in columns_to_encode
)

In [9]:
# Fit the column transformer on `train` and encode `train` in the same call.
train_ohe = column_transformer.fit_transform(train)

In [10]:
# Wrap the encoded array in a DataFrame labelled with the transformer's output
# feature names. Reusing train's index keeps the rows aligned with `train`, so
# later index-aligned column assignments from `train` match row-for-row even if
# its index is not the default 0..n-1 range.
encoded_feature_names = column_transformer.get_feature_names_out()
train_df = pd.DataFrame(train_ohe, columns=encoded_feature_names, index=train.index)

In [11]:
# Copy the two columns that were not one-hot encoded over from `train`
# (pandas aligns each assignment on the row index).
for passthrough_col in ('milage', 'price'):
    train_df[passthrough_col] = train[passthrough_col]

In [12]:
# Display the assembled frame: one-hot features plus 'milage' and 'price'.
train_df

Unnamed: 0,onehot__brand_Acura,onehot__brand_Alfa,onehot__brand_Aston,onehot__brand_Audi,onehot__brand_BMW,onehot__brand_Bentley,onehot__brand_Bugatti,onehot__brand_Buick,onehot__brand_Cadillac,onehot__brand_Chevrolet,...,onehot__int_col_White / Brown,onehot__int_col_Yellow,onehot__int_col_–,onehot__accident_At least 1 accident or damage reported,onehot__accident_Missing,onehot__accident_None reported,onehot__clean_title_Missing,onehot__clean_title_Yes,milage,price
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,213000,4200
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,143250,4999
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,136731,13900
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,19500,45000
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,7388,97500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188528,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,49000,27500
188529,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,28600,30000
188530,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,13650,86900
188531,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,13895,84900


In [13]:
# List the column labels of the assembled frame.
train_df.columns

Index(['onehot__brand_Acura', 'onehot__brand_Alfa', 'onehot__brand_Aston',
       'onehot__brand_Audi', 'onehot__brand_BMW', 'onehot__brand_Bentley',
       'onehot__brand_Bugatti', 'onehot__brand_Buick',
       'onehot__brand_Cadillac', 'onehot__brand_Chevrolet',
       ...
       'onehot__int_col_White / Brown', 'onehot__int_col_Yellow',
       'onehot__int_col_–',
       'onehot__accident_At least 1 accident or damage reported',
       'onehot__accident_Missing', 'onehot__accident_None reported',
       'onehot__clean_title_Missing', 'onehot__clean_title_Yes', 'milage',
       'price'],
      dtype='object', length=3647)

In [14]:
# Write the assembled training frame to Parquet. The parent directory is
# created first so the cell also succeeds when 'data/' does not exist yet
# (e.g. on a fresh checkout).
output_path = Path('data/train.parquet')
output_path.parent.mkdir(parents=True, exist_ok=True)
train_df.to_parquet(output_path)