In [22]:
# import necessary libraries:
import pandas as pd
import plotly.express as px
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score,mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

In [23]:
# read data file:
df = pd.read_csv('cars.csv')

In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319972 entries, 0 to 319971
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   Unnamed: 0      319972 non-null  int64  
 1   ort             319971 non-null  object 
 2   modell          319972 non-null  object 
 3   marke           319972 non-null  object 
 4   price           319972 non-null  float64
 5   Karosserieform  319972 non-null  object 
 6   Kilometerstand  319941 non-null  float64
 7   Erstzulassung   319972 non-null  float64
 8   Leistung        319807 non-null  float64
 9   Getriebe        319972 non-null  object 
 10  Kraftstoff      304843 non-null  object 
 11  Außenfarbe      315618 non-null  object 
dtypes: float64(4), int64(1), object(7)
memory usage: 29.3+ MB


In [25]:
# show the 5 rows of the data:
df.head()

Unnamed: 0.1,Unnamed: 0,ort,modell,marke,price,Karosserieform,Kilometerstand,Erstzulassung,Leistung,Getriebe,Kraftstoff,Außenfarbe
0,0,Donauwörth,forTwo,smart,32989.0,Kleinwagen,5500.0,2015.0,75.0,Automatik,Super 95,Weiß
1,1,Schwalmtal,A1,Audi,26700.0,Kleinwagen,21000.0,2019.0,85.0,Automatik,Benzin,Blau
2,3,Göttingen,Cuore,Daihatsu,4950.0,Kleinwagen,79100.0,2005.0,43.0,Automatik,Benzin,Blau
3,4,Wuppertal,208,Peugeot,20990.0,Kleinwagen,51.0,2019.0,74.0,Automatik,Super 95 (Partikelfilter),Rot
4,5,Köln,forFour,smart,18950.0,Kleinwagen,22356.0,2019.0,66.0,Automatik,Super E10 95,Grau


In [26]:
# drop unnecessary column:
df = df.drop(columns=['ort','Außenfarbe'])

In [27]:
# Use KNN imputer to impute the missing values:

# imputer = KNNImputer()

# df['Kilometerstand'] = imputer.fit_transform(df[['Kilometerstand']])
# df['Erstzulassung'] = imputer.fit_transform(df[['Erstzulassung']])
# df['Leistung'] = imputer.fit_transform(df[['Leistung']])


In [28]:
df = df.drop(columns= ['Kraftstoff', 'marke']) # to train the model

In [29]:
#split the data:
X, y = df.drop("price",axis=1) , df["price"]


# one hot encode :
encoder= OneHotEncoder()
X = encoder.fit_transform(X)


# use pandas get_dummies for encoding:
# X = pd.get_dummies(X)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [30]:
# train the XGBoost Model:
from xgboost import XGBRegressor
xgb = XGBRegressor()
xgb.fit(X_train, y_train)
y_pred= xgb.predict(X_test)


print(f'the r2 score: {r2_score(y_test, y_pred)}')
print(f'the mean absolute error: {mean_absolute_error(y_test, y_pred)}')
print(f'the mean absolute precentage error: {mean_absolute_percentage_error(y_test, y_pred)}')


  from pandas import MultiIndex, Int64Index


the r2 score: 0.8638455608316453
the mean absolute error: 4241.48887669113
the mean absolute precentage error: 0.4733128374204775


In [31]:
# plot the predicted value comparing to the actual values:
fig = px.scatter(x = y_test, y= y_pred, width= 800, height=800)
fig.show()

In [None]:
# to enhance the performance of the model we can use GridSerachCV to know the best parameters (this method take long time):


# from xgboost import XGBRegressor
# xgb = XGBRegressor()

# parameters = {'learning_rate': [0.03, 0.05, 0.07], 
#   'max_depth': [7, 9, 10],
#   #'min_child_weight': [1, 3, 5, 7],
#   #'subsample': [0.7,0.9,0.8],
#   #'colsample_bytree': [0.7, 0.8],
#   'n_estimators': [200, 500, 700]}

# xgb_grid = GridSearchCV(xgb,
#   parameters,
#   cv = 2)

# xgb_grid.fit(X_train, y_train)

# print("=============================")
# print(xgb_grid.best_score_)
# print(xgb_grid.best_params_)
# print("======================")

# xgb_chosed_model = xgb_grid.best_estimator_
# Y_pred=xgb_chosed_model.predict(X_test)

# xg_mae=mean_absolute_error(y_test,y_pred)
# print("Mean Absolute Error: ",xg_mae)
# print('mape: ' ,mean_absolute_percentage_error(y_test, y_pred))
# print(xg_mae/df['price'].mean())

In [32]:
# save the model:

#import pickle

# with open ('model.pkl','wb') as f:
#   pickle.dump(xgb, f)


# with open ('model.pkl','rb') as f:
#   model = pickle.load(f)

