In [1]:
'''
Step 1:
Load 'bmw.csv' and replace the `price` column with its natural log (`log_price`).
'''
import numpy as np
import pandas as pd

data = pd.read_csv('bmw.csv')

# np.log yields -inf for 0 and NaN for negative input (with only a
# RuntimeWarning), and those values would end up in the saved targets.
# Fail loudly here instead. Missing prices (NaN) are not flagged by this check.
non_positive = data['price'] <= 0
if non_positive.any():
    raise ValueError(
        f"Cannot log-transform 'price': {int(non_positive.sum())} row(s) are <= 0"
    )

# np.log returns a new Series, so no defensive copy of the column is needed.
data['log_price'] = np.log(data['price'])
# Drop the raw price so only the transformed target remains in the frame.
data = data.drop(columns='price')

print(data.head())

       model  year transmission  mileage fuelType  tax   mpg  engineSize  \
0   5 Series  2014    Automatic    67068   Diesel  125  57.6         2.0   
1   6 Series  2018    Automatic    14827   Petrol  145  42.8         2.0   
2   5 Series  2016    Automatic    62794   Diesel  160  51.4         3.0   
3   1 Series  2017    Automatic    26676   Diesel  145  72.4         1.5   
4   7 Series  2014    Automatic    39554   Diesel  160  50.4         3.0   

   log_price  
0   9.323669  
1  10.203592  
2   9.680344  
3   9.453287  
4   9.581904  


In [2]:
'''
Step 2
Apply one hot encoding for categorical features
'''
categorical_cols = ['model', 'transmission', 'fuelType']

# get_dummies(columns=...) replaces each listed column with its indicator
# columns, so the originals are already gone from the result and no separate
# drop is required afterwards.
# drop_first=True omits the first level of every feature, which then acts as
# the implicit baseline category.
# NOTE: the `model` values carry a leading space (the resulting columns are
# named e.g. 'model_ 2 Series'); they are left as-is so the feature names
# stay unchanged.
data = pd.get_dummies(data, columns=categorical_cols, drop_first=True)

print(data.head())

   year  mileage  tax   mpg  engineSize  log_price  model_ 2 Series  \
0  2014    67068  125  57.6         2.0   9.323669            False   
1  2018    14827  145  42.8         2.0  10.203592            False   
2  2016    62794  160  51.4         3.0   9.680344            False   
3  2017    26676  145  72.4         1.5   9.453287            False   
4  2014    39554  160  50.4         3.0   9.581904            False   

   model_ 3 Series  model_ 4 Series  model_ 5 Series  ...  model_ Z3  \
0            False            False             True  ...      False   
1            False            False            False  ...      False   
2            False            False             True  ...      False   
3            False            False            False  ...      False   
4            False            False            False  ...      False   

   model_ Z4  model_ i3  model_ i8  transmission_Manual  \
0      False      False      False                False   
1      False      Fals

In [3]:
'''
Step 3
Separate the target from the features and hold out 20% of the rows for testing.
'''
from sklearn.model_selection import train_test_split

# Target is the log-transformed price; every other column is a feature.
y = data['log_price']
X = data.drop(columns='log_price')

# random_state is fixed so the same rows land in each partition on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"X_train shape: {X_train.shape}, X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}, y_test shape: {y_test.shape}")

X_train shape: (8624, 34), X_test shape: (2157, 34)
y_train shape: (8624,), y_test shape: (2157,)


In [4]:
'''
Step 4
Standardize the numerical features
'''
from sklearn.preprocessing import StandardScaler

scaling_cols = ['year', 'mileage', 'tax', 'mpg', 'engineSize']
scaler = StandardScaler()

# The scaler is fitted on the training rows only; the test rows are
# transformed with those same fitted statistics.
train_scaled = scaler.fit_transform(X_train[scaling_cols])
test_scaled = scaler.transform(X_test[scaling_cols])

# These assignments overwrite the columns of X_train / X_test, so re-running
# this cell without re-running the split would scale already-scaled values.
X_train[scaling_cols] = train_scaled
X_test[scaling_cols] = test_scaled

print(X_train[scaling_cols].head())

          year   mileage       tax       mpg  engineSize
3652  0.816915 -0.651978  0.225809 -0.195461   -0.295106
7007  0.816915 -0.751929  0.225809 -0.605836    1.525610
9045 -0.450641  1.536461  0.225809 -0.101753    1.525610
3910  0.816915 -0.904976  0.061840  2.738557   -1.205465
3635 -1.718196  1.510708  1.127640 -0.350563    1.525610


In [5]:
'''
Step 5
Write the preprocessed train and test sets to CSV
'''
# assign() returns a new frame with the target appended as the last column,
# leaving X_train / X_test themselves untouched. Values are aligned on the
# row index shared by each X / y pair.
train_data = X_train.assign(log_price=y_train)
test_data = X_test.assign(log_price=y_test)

# index=False: the row labels are not written to the files.
train_data.to_csv('processed_train_data.csv', index=False)
test_data.to_csv('processed_test_data.csv', index=False)

print("Data Saved.")

Data Saved.


In [6]:
'''
Step 6
Persist the fitted scaler and the ordered feature-column list with pickle
'''
import pickle
import pandas as pd

# Both artefact paths are named once and reused for writing and reporting.
scaler_filename = 'scaler.pkl'
features_filename = 'model_features.pkl'

# Column names of the training features, in their current order.
final_columns = X_train.columns.tolist()

with open(scaler_filename, 'wb') as file:
    pickle.dump(scaler, file)

with open(features_filename, 'wb') as file:
    pickle.dump(final_columns, file)

print(f"Scaler saved to {scaler_filename}")
print(f"Feature list saved to {features_filename}")

Scaler saved to scaler.pkl
Feature list saved to model_features.pkl
