In [16]:
import sklearn
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import validation_curve, learning_curve, ValidationCurveDisplay, LearningCurveDisplay
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import pandas as pd

In [17]:
# NOTE: despite its name, `train` holds the rows read from test.csv.
train = pd.read_csv('playground-series-s4e9/test.csv')
# test = pd.read_csv('playground-series-s4e9/test.csv')

In [18]:
# fuel_type, accident and clean_title are string columns; give their missing
# entries an explicit 'Missing' label instead of leaving them as NaN.
na_cols = ['fuel_type', 'accident', 'clean_title']
train[na_cols] = train[na_cols].fillna('Missing')

In [19]:
# Count the NaN values remaining in each column.
train.isna().sum()

id              0
brand           0
model           0
model_year      0
milage          0
fuel_type       0
engine          0
transmission    0
ext_col         0
int_col         0
accident        0
clean_title     0
dtype: int64

In [20]:
# (rows, columns) of the loaded frame.
train.shape

(125690, 12)

In [21]:
# List the column names of the loaded frame.
train.columns

Index(['id', 'brand', 'model', 'model_year', 'milage', 'fuel_type', 'engine',
       'transmission', 'ext_col', 'int_col', 'accident', 'clean_title'],
      dtype='object')

In [22]:
# Remove the `id` column before encoding.
# errors='ignore' keeps this cell re-runnable: executing it a second time,
# when `id` is already gone, is a no-op instead of raising KeyError.
train = train.drop(columns='id', errors='ignore')
train.head()

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title
0,Land,Rover LR2 Base,2015,98000,Gasoline,240.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,6-Speed A/T,White,Beige,None reported,Yes
1,Land,Rover Defender SE,2020,9142,Hybrid,395.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,8-Speed A/T,Silver,Black,None reported,Yes
2,Ford,Expedition Limited,2022,28121,Gasoline,3.5L V6 24V PDI DOHC Twin Turbo,10-Speed Automatic,White,Ebony,None reported,Missing
3,Audi,A6 2.0T Sport,2016,61258,Gasoline,2.0 Liter TFSI,Automatic,Silician Yellow,Black,None reported,Missing
4,Audi,A6 2.0T Premium Plus,2018,59000,Gasoline,252.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,A/T,Gray,Black,None reported,Yes


In [23]:
# Columns to one-hot encode. Of the frame's columns, only `milage` is left out.
columns_to_encode = [
    'brand', 'model', 'model_year', 'fuel_type', 'engine',
    'transmission', 'ext_col', 'int_col', 'accident', 'clean_title',
]

# sparse_output=False makes the encoder return a dense array.
onehot_encoder = OneHotEncoder(sparse_output=False)

column_transformer = ColumnTransformer(
    transformers=[('onehot', onehot_encoder, columns_to_encode)],
    # 'drop' discards every column that is not listed in columns_to_encode,
    # so such columns do not appear in the transformer's output.
    remainder='drop',
)

In [24]:
# Fit the one-hot encoder on this frame and encode it in a single step.
# NOTE(review): the frame was read from test.csv, so the categories (and hence
# the output columns) are learned from that file alone — confirm this matches
# the encoding used wherever these features are consumed.
train_ohe = column_transformer.fit_transform(train)

In [25]:
# Label the dense encoded array with the transformer's output feature names.
encoded_feature_names = column_transformer.get_feature_names_out()
# Reuse train's index so that later column assignments taken from `train`
# line up row-for-row even if `train` does not carry a default RangeIndex.
train_df = pd.DataFrame(train_ohe, columns=encoded_feature_names, index=train.index)

In [26]:
# `milage` is not in columns_to_encode, so the transformer (remainder='drop')
# left it out of its output; copy it over unchanged from the source frame.
# The assignment aligns rows on the index.
train_df['milage'] = train['milage']

In [27]:
# Display the assembled frame (one-hot columns plus `milage`).
train_df

Unnamed: 0,onehot__brand_Acura,onehot__brand_Alfa,onehot__brand_Aston,onehot__brand_Audi,onehot__brand_BMW,onehot__brand_Bentley,onehot__brand_Bugatti,onehot__brand_Buick,onehot__brand_Cadillac,onehot__brand_Chevrolet,...,onehot__int_col_White,onehot__int_col_White / Brown,onehot__int_col_Yellow,onehot__int_col_–,onehot__accident_At least 1 accident or damage reported,onehot__accident_Missing,onehot__accident_None reported,onehot__clean_title_Missing,onehot__clean_title_Yes,milage
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,98000
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,9142
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,28121
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,61258
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,59000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125685,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,83315
125686,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,29336
125687,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,77634
125688,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,112000


In [28]:
# List the output column names.
train_df.columns

Index(['onehot__brand_Acura', 'onehot__brand_Alfa', 'onehot__brand_Aston',
       'onehot__brand_Audi', 'onehot__brand_BMW', 'onehot__brand_Bentley',
       'onehot__brand_Bugatti', 'onehot__brand_Buick',
       'onehot__brand_Cadillac', 'onehot__brand_Chevrolet',
       ...
       'onehot__int_col_White', 'onehot__int_col_White / Brown',
       'onehot__int_col_Yellow', 'onehot__int_col_–',
       'onehot__accident_At least 1 accident or damage reported',
       'onehot__accident_Missing', 'onehot__accident_None reported',
       'onehot__clean_title_Missing', 'onehot__clean_title_Yes', 'milage'],
      dtype='object', length=3638)

In [30]:
# Re-attach the `id` column that was dropped before encoding. Only that column
# is parsed (usecols) instead of the whole CSV; values and row order are the same.
train_df['id'] = pd.read_csv('playground-series-s4e9/test.csv', usecols=['id'])['id']

In [32]:
# Write the encoded frame (one-hot columns, milage, id) to a Parquet file.
# NOTE(review): assumes the data/ directory already exists — TODO confirm.
train_df.to_parquet('data/test.parquet')