In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

import pickle

In [2]:
df = pd.read_csv("./../../data/car_price_prediction.csv")
df.head()

Unnamed: 0,ID,Price,Levy,Manufacturer,Model,Prod. year,Category,Leather interior,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Doors,Wheel,Color,Airbags
0,45654403,13328,1399,LEXUS,RX 450,2010,Jeep,Yes,Hybrid,3.5,186005 km,6.0,Automatic,4x4,04-May,Left wheel,Silver,12
1,44731507,16621,1018,CHEVROLET,Equinox,2011,Jeep,No,Petrol,3.0,192000 km,6.0,Tiptronic,4x4,04-May,Left wheel,Black,8
2,45774419,8467,-,HONDA,FIT,2006,Hatchback,No,Petrol,1.3,200000 km,4.0,Variator,Front,04-May,Right-hand drive,Black,2
3,45769185,3607,862,FORD,Escape,2011,Jeep,Yes,Hybrid,2.5,168966 km,4.0,Automatic,4x4,04-May,Left wheel,White,0
4,45809263,11726,446,HONDA,FIT,2014,Hatchback,Yes,Petrol,1.3,91901 km,4.0,Automatic,Front,04-May,Left wheel,Silver,4


In [3]:
sel_cols = ['Manufacturer', 'Model', 'Prod. year', 'Mileage', 'Engine volume', 'Price']

In [4]:

car_df = df[sel_cols]
car_df.head()

Unnamed: 0,Manufacturer,Model,Prod. year,Mileage,Engine volume,Price
0,LEXUS,RX 450,2010,186005 km,3.5,13328
1,CHEVROLET,Equinox,2011,192000 km,3.0,16621
2,HONDA,FIT,2006,200000 km,1.3,8467
3,FORD,Escape,2011,168966 km,2.5,3607
4,HONDA,FIT,2014,91901 km,1.3,11726


In [5]:
car_df.shape

(19237, 6)

In [6]:
car_df = car_df.drop_duplicates()
car_df.shape

(15620, 6)

In [7]:
car_df["Mileage"] = car_df["Mileage"].str.replace(' km', '')
car_df.head()

Unnamed: 0,Manufacturer,Model,Prod. year,Mileage,Engine volume,Price
0,LEXUS,RX 450,2010,186005,3.5,13328
1,CHEVROLET,Equinox,2011,192000,3.0,16621
2,HONDA,FIT,2006,200000,1.3,8467
3,FORD,Escape,2011,168966,2.5,3607
4,HONDA,FIT,2014,91901,1.3,11726


In [8]:
car_df["Mileage"] = car_df["Mileage"].astype(int)

In [9]:
car_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15620 entries, 0 to 19236
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Manufacturer   15620 non-null  object
 1   Model          15620 non-null  object
 2   Prod. year     15620 non-null  int64 
 3   Mileage        15620 non-null  int64 
 4   Engine volume  15620 non-null  object
 5   Price          15620 non-null  int64 
dtypes: int64(3), object(3)
memory usage: 854.2+ KB


In [10]:
def detect_outliers(df, features, thold):
    outlier_indices = []
    
    for c in features:
        # 1st quartile
        Q1 = np.percentile(df[c],25)
        # 3rd quartile
        Q3 = np.percentile(df[c],75)
        # IQR
        IQR = Q3 - Q1
        # Outlier step
        outlier_step = IQR * thold
        # Detect outlier and their indeces
        outlier_list_col = df[(df[c] < Q1 - outlier_step) | (df[c] > Q3 + outlier_step)].index
        # Store indeces
        outlier_indices.extend(outlier_list_col)
    
    
    return outlier_indices

In [11]:
features_ol = ['Price','Mileage']
outliers = detect_outliers(car_df,features_ol, 1.5)
car_df = car_df.drop(car_df.loc[outliers].index, axis=0)

In [12]:
car_df['Manufacturer'] = car_df['Manufacturer'].astype('category')
car_df['Model'] = car_df['Model'].astype('category')
car_df['Engine volume'] = car_df['Engine volume'].astype('category')
car_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14182 entries, 0 to 19236
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   Manufacturer   14182 non-null  category
 1   Model          14182 non-null  category
 2   Prod. year     14182 non-null  int64   
 3   Mileage        14182 non-null  int64   
 4   Engine volume  14182 non-null  category
 5   Price          14182 non-null  int64   
dtypes: category(3), int64(3)
memory usage: 549.8 KB


In [13]:
car_df['Engine volume'].unique()

['3.5', '3', '1.3', '2.5', '2', ..., '5.4 Turbo', '0.3 Turbo', '5.2', '5.8', '1.1 Turbo']
Length: 101
Categories (101, object): ['0', '0.1', '0.2', '0.2 Turbo', ..., '6.2', '6.3', '6.4', '7.3']

In [14]:
from sklearn.preprocessing import MinMaxScaler
scale = MinMaxScaler()

X = car_df["Mileage"].values.reshape(-1, 1)
mileage_sclaer = scale.fit(X)
scaledX = mileage_sclaer.transform(X)
print(scaledX)

[[0.53133659]
 [0.54846174]
 [0.57131431]
 ...
 [0.46162196]
 [0.33240495]
 [0.53395892]]


In [15]:
mileage_sclaer_path = "../app/model_objects/mileage_sclaer.pkl"
with open(mileage_sclaer_path,'wb') as f:
    pickle.dump(mileage_sclaer,f)

In [16]:
car_df["Mileage"] = scaledX

In [17]:
categorical_columns = ["Manufacturer" ,"Model", "Engine volume"]
df_encoded = pd.get_dummies(car_df, columns=categorical_columns)
df_encoded.head()

Unnamed: 0,Prod. year,Mileage,Price,Manufacturer_ACURA,Manufacturer_ALFA ROMEO,Manufacturer_AUDI,Manufacturer_BMW,Manufacturer_BUICK,Manufacturer_CADILLAC,Manufacturer_CHEVROLET,...,Engine volume_5.5 Turbo,Engine volume_5.6,Engine volume_5.7,Engine volume_5.8,Engine volume_5.9,Engine volume_6,Engine volume_6.2,Engine volume_6.3,Engine volume_6.4,Engine volume_7.3
0,2010,0.531337,13328,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2011,0.548462,16621,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,2006,0.571314,8467,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2011,0.482663,3607,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2014,0.262522,11726,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
car_df = df_encoded
car_df.shape

(14182, 1647)

In [19]:
all_columns_path = "../app/model_objects/all_columns.pkl"
all_cols = [c for c in car_df.columns if c != "Price"]
with open(all_columns_path, 'wb') as f:
    pickle.dump(all_cols, f)

In [20]:
from sklearn.model_selection import train_test_split

X = car_df.drop('Price', axis=1)
y = car_df['Price']  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [21]:
print(X.shape)
print(y.shape)

(14182, 1646)
(14182,)


In [22]:
from sklearn.linear_model import LinearRegression
linearregression = LinearRegression()
linearregression.fit(X_train, y_train)

In [23]:
MSE=[]
RMSE=[]
MAE=[]
MAPE=[]
Score=[]
R2=[]
evaluation={}

In [24]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

y_pred1 = linearregression.predict(X_test)
LinearRegression_score = linearregression.score(X_test, y_test)

print("LinearRegression score:", LinearRegression_score)

# Calculate evaluation metrics
MAE.append(mean_absolute_error(y_test, y_pred1)) 
MSE.append( mean_squared_error(y_test, y_pred1))
RMSE.append(np.sqrt(MSE[-1]))

accuracy = round(r2_score(y_test,y_pred1),3)*100


R2.append(accuracy)

# Print the evaluation metrics
print("\nLinearRegression model evaluation")
print(f'Mean Absolute Error (MAE): {MAE[-1]:.2f}')
print(f'Mean Squared Error (MSE): {MSE[-1]:.2f}')
print(f'Root Mean Squared Error (RMSE): {RMSE[-1]:.2f}')
print(f'R-squared (R²): {R2[-1]:.2f}\n')
print(f'Accuracy: {accuracy}')


evaluation["LinearRegression"] = {
        'MAE': MAE[-1],
        'MSE': MSE[-1],
        'RMSE': RMSE[-1],
        'R-squared': R2[-1]
    }

# print a sample from the test and prediction
result1=pd.DataFrame()
result1["y_test"]=y_test
result1["y_predicted"]=y_pred1
result1.sample(3)

LinearRegression score: -1.506202447883324e+20

LinearRegression model evaluation
Mean Absolute Error (MAE): 17645200654730.12
Mean Squared Error (MSE): 18347816464602839451206418432.00
Root Mean Squared Error (RMSE): 135454112025448.83
R-squared (R²): -15062024478833239916544.00

Accuracy: -1.506202447883324e+22


Unnamed: 0,y_test,y_predicted
4498,23834,19430.75
4973,24458,23475.5
3333,1882,3626.75


In [25]:
RMSE

[135454112025448.83]

In [26]:
model_path = "../app/model_objects/model.pkl"
with open(model_path,'wb') as f:
    pickle.dump(linearregression,f)

In [27]:
from sklearn.ensemble import GradientBoostingRegressor

In [28]:
gbr = GradientBoostingRegressor(max_depth=7, n_estimators=500, learning_rate =.05)
gbr.fit(X_train, y_train)

In [29]:
y_pred = gbr.predict(X_test)

In [30]:
accuracy = round(r2_score(y_test,y_pred),3)*100
print('{:s} : {:.0f} %'.format("GBR", accuracy))
#accuracy

GBR : 51 %


In [31]:
result2=pd.DataFrame()
result2["y_test"]=y_test
result2["y_predicted"]=y_pred
result2.sample(3)

Unnamed: 0,y_test,y_predicted
18022,2901,19074.308118
16644,19287,16712.966878
2919,21169,9310.671512


In [32]:
model_path = "../app/model_objects/model_gbr.pkl"
with open(model_path,'wb') as f:
    pickle.dump(gbr, f)