In [96]:
# Importing libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer


In [97]:
# Reading the data
df = pd.read_csv('../data/data.csv')


In [98]:
# Specifying the target and feature values
X = df[['brand', 'model', 'year', 'fuel', 'milage', 'transmissionType', 'engineCapacity', 'enginePower']]
y = df['price']

In [99]:
brand_counts = X['brand'].value_counts()
X.loc[:, 'brand'] = X['brand'].apply(lambda x: x if brand_counts[x] >= 10 else 'other')

In [100]:
# Imputing missing string values
imuter_string = SimpleImputer(strategy='most_frequent')
X.loc[:, ['brand', 'fuel', 'transmissionType']] = imuter_string.fit_transform(X[['brand', 'fuel', 'transmissionType']])

In [101]:
# Encoding the categorical features
encoder = OneHotEncoder(drop='first', sparse_output=False)
X_encoded = pd.DataFrame(encoder.fit_transform(X[['brand', 'fuel', 'transmissionType']]), columns=encoder.get_feature_names_out())

In [102]:
# Imputing missing numeric values
imputer_numeric = SimpleImputer(strategy='mean')
X.loc[:, ['year', 'milage', 'engineCapacity', 'enginePower']] = imputer_numeric.fit_transform(X[['year', 'milage', 'engineCapacity', 'enginePower']])

In [103]:
# Feature engineering
X['power_to_capacity'] = X['enginePower'] / X['engineCapacity']

In [104]:
# Scaling the numerical features
scaler = StandardScaler()

X.loc[:, ['year', 'milage', 'engineCapacity', 'enginePower', 'power_to_capacity']] = scaler.fit_transform(X[['year', 'milage', 'engineCapacity', 'enginePower', 'power_to_capacity']])
X_num = X[['year', 'milage', 'engineCapacity', 'enginePower', 'power_to_capacity']]

 -3.780444  ]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  X.loc[:, ['year', 'milage', 'engineCapacity', 'enginePower', 'power_to_capacity']] = scaler.fit_transform(X[['year', 'milage', 'engineCapacity', 'enginePower', 'power_to_capacity']])
 -0.57594799]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  X.loc[:, ['year', 'milage', 'engineCapacity', 'enginePower', 'power_to_capacity']] = scaler.fit_transform(X[['year', 'milage', 'engineCapacity', 'enginePower', 'power_to_capacity']])
 -0.1794261 ]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  X.loc[:, ['year', 'milage', 'engineCapacity', 'enginePower', 'power_to_capacity']] = scaler.fit_transform(X[['year', 'milage', 'engineCapacity', 'enginePower', 'power_to_capacity']])
 -0.91849027]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  X.loc[:, ['year', 'milage', 'engine

In [105]:
# Saving the preprocessed data
df_final = pd.concat([X_encoded.reset_index(drop=True), X_num.reset_index(drop=True), y.reset_index(drop=True)], axis=1)
df_final.to_csv('../data/data_preprocessed.csv', index=False)