In [3]:
import pandas as pd

# Load the raw used-car listings; the path is relative to the notebook's
# working directory.
df = pd.read_csv("used_cars.csv")
# Preview the first rows to see the columns and their raw string formats.
df.head()

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,Ford,Utility Police Interceptor Base,2013,"51,000 mi.",E85 Flex Fuel,300.0HP 3.7L V6 Cylinder Engine Flex Fuel Capa...,6-Speed A/T,Black,Black,At least 1 accident or damage reported,Yes,"$10,300"
1,Hyundai,Palisade SEL,2021,"34,742 mi.",Gasoline,3.8L V6 24V GDI DOHC,8-Speed Automatic,Moonlight Cloud,Gray,At least 1 accident or damage reported,Yes,"$38,005"
2,Lexus,RX 350 RX 350,2022,"22,372 mi.",Gasoline,3.5 Liter DOHC,Automatic,Blue,Black,None reported,,"$54,598"
3,INFINITI,Q50 Hybrid Sport,2015,"88,900 mi.",Hybrid,354.0HP 3.5L V6 Cylinder Engine Gas/Electric H...,7-Speed A/T,Black,Black,None reported,Yes,"$15,500"
4,Audi,Q3 45 S line Premium Plus,2021,"9,835 mi.",Gasoline,2.0L I4 16V GDI DOHC Turbo,8-Speed Automatic,Glacier White Metallic,Black,None reported,,"$34,999"


In [4]:
# Convert price strings such as "$10,300" into floats.
# The dtype guard makes the cell safe to re-run: once the column is numeric the
# `.str` accessor would raise AttributeError, so the conversion is skipped.
if not pd.api.types.is_numeric_dtype(df['price']):
    df['price'] = (
        df['price']
        .str.replace('$', '', regex=False)   # strip the currency symbol
        .str.replace(',', '', regex=False)   # strip thousands separators
        .astype(float)
    )


In [5]:
# Inspect the first values of the price column and its dtype.
df['price'].head()

0    10300.0
1    38005.0
2    54598.0
3    15500.0
4    34999.0
Name: price, dtype: float64

In [6]:
# Convert mileage strings such as "51,000 mi." into floats.
# The dtype guard makes the cell safe to re-run: once the column is numeric the
# `.str` accessor would raise AttributeError, so the conversion is skipped.
if not pd.api.types.is_numeric_dtype(df['milage']):
    df['milage'] = (
        df['milage']
        .str.replace(' mi.', '', regex=False)  # strip the unit suffix
        .str.replace(',', '', regex=False)     # strip thousands separators
        .astype(float)
    )


In [7]:
# Inspect the first values of the milage column and its dtype.
df['milage'].head()


0    51000.0
1    34742.0
2    22372.0
3    88900.0
4     9835.0
Name: milage, dtype: float64

In [8]:
# Replace missing values in these categorical columns with an explicit
# 'Unknown' label, so that missingness is kept as a value of its own.
for col_name in ('fuel_type', 'accident', 'clean_title'):
    df[col_name] = df[col_name].fillna('Unknown')


In [9]:
# Count the remaining missing values per column.
df.isna().sum()


brand           0
model           0
model_year      0
milage          0
fuel_type       0
engine          0
transmission    0
ext_col         0
int_col         0
accident        0
clean_title     0
price           0
dtype: int64

In [10]:
# Target vector: the price column.
y = df['price']
# Feature matrix: every column except the target.
X = df.drop('price', axis=1)


In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [12]:
# Partition the feature columns by dtype so each group gets its own
# preprocessing branch.
# Numeric columns are selected with the generic 'number' selector and every
# other column is treated as categorical. This guarantees each column lands in
# exactly one list; with hard-coded dtype names a column of another dtype
# (e.g. int32, category, a string dtype) would fall into neither list and be
# dropped by ColumnTransformer's default remainder='drop'.
numerical_features = X.select_dtypes(include='number').columns.tolist()
categorical_features = X.select_dtypes(exclude='number').columns.tolist()

categorical_features, numerical_features


(['brand',
  'model',
  'fuel_type',
  'engine',
  'transmission',
  'ext_col',
  'int_col',
  'accident',
  'clean_title'],
 ['model_year', 'milage'])

In [13]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


In [14]:
# Numeric branch: a single standard-scaling step.
numeric_transformer = Pipeline([('scaler', StandardScaler())])


In [15]:
# Categorical branch: one-hot encoding. With handle_unknown='ignore', a
# category that was not seen during fit does not raise at transform time.
categorical_transformer = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])


In [16]:
# Route each column group to its own branch: numeric columns go through the
# scaler pipeline, categorical columns through the one-hot pipeline.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])


In [17]:
# Fit the preprocessor on the training split only and transform it in one step.
X_train_processed = preprocessor.fit_transform(X_train)
# Apply the already-fitted preprocessor to the test split (no refitting here).
X_test_processed = preprocessor.transform(X_test)


In [18]:
# Show the shape of the processed training and test matrices.
tuple(matrix.shape for matrix in (X_train_processed, X_test_processed))


((3207, 3209), (802, 3209))

## Kodiranje i normalizacija podataka

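Numeričke značajke (*model_year*, *milage*) normalizirane su (standardizirane) pomoću
*StandardScaler* transformatora — svaka se značajka svodi na srednju vrijednost 0 i
standardnu devijaciju 1 — kako bi se osigurala usporedivost vrijednosti.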

Kategoričke značajke kodirane su primjenom One-Hot Encoding tehnike.
Za obradu potencijalno nepoznatih kategorija u testnim i produkcijskim
podacima korištena je opcija *handle_unknown='ignore'*.

Za objedinjavanje svih koraka predobrade korišten je *ColumnTransformer*,
čime je osigurana konzistentna primjena predobrade tijekom treniranja
i kasnije prilikom deploya modela.


In [19]:
import joblib

# Persist the fitted preprocessor to disk so it can be loaded again later.
# NOTE(review): joblib files are pickle-based — load them only from trusted
# sources.
joblib.dump(preprocessor, "preprocessor.pkl")


['preprocessor.pkl']