In [87]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sk


In [None]:
# Read the car listings from the CSV file into a DataFrame.
csv_path = 'car_price_prediction.csv'
df = pd.read_csv(csv_path)

# Show the first rows as a quick sanity check of the load.
df.head()

In [None]:
#clean data
# '-' is treated as a missing-value placeholder: turn it into NaN so that
# dropna() removes every row that has a gap in any column.
df.replace('-', np.nan, inplace=True)
df.dropna(inplace=True)

# Levy is used as a numeric model feature further down. Convert it explicitly
# instead of relying on the estimator to coerce an object column.
# NOTE(review): presumably Levy is the column that held the '-' placeholders
# and therefore was read as strings - confirm against the CSV. If the column
# is already numeric this call is a no-op.
df['Levy'] = pd.to_numeric(df['Levy'])

# 'Mileage' looks like "<number> km", possibly with thousands separators;
# strip both (as literal text, not regex) and store the integer value in a
# new lowercase 'mileage' column, keeping the original column untouched.
df['mileage'] = (
    df['Mileage']
    .str.replace(' km', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(int)
)

df.head()

In [None]:
#remove turbo
# Strip the literal ' Turbo' suffix and store the engine volume as a float.
# The column is cast to str first so this cell can be re-run safely: after the
# first run the column is already float and the .str accessor would raise.
df['Engine volume'] = (
    df['Engine volume']
    .astype(str)
    .str.replace(' Turbo', '', regex=False)
    .astype(float)
)
df['Engine volume'].unique()

In [91]:
# Encode the manufacturer names as integer codes.
from sklearn.preprocessing import LabelEncoder

# fit_transform learns the label -> code mapping and applies it in one call;
# the fitted encoder is kept so the mapping can be inspected through classes_.
manufacturer_encoder = LabelEncoder()
df['manufacturer_encoded'] = manufacturer_encoder.fit_transform(df['Manufacturer'])


In [None]:
# Check the encoded keys: classes_ holds the original manufacturer labels, and
# a label's position in this array is the integer code the encoder assigns.
manufacturer_encoder.classes_

In [None]:
# Preview df again; it now also carries the manufacturer_encoded column.
df.head()

In [94]:
# Feature matrix (x) and regression target (y).
feature_columns = [
    'manufacturer_encoded',
    'Engine volume',
    'Prod. year',
    'mileage',
    'Levy',
]
x = df[feature_columns]
y = df['Price']

In [None]:
# Preview the first rows of the feature matrix x.
x.head()

In [None]:
# Preview the first values of the target y (the Price column at this point).
y.head()

In [None]:
# Print the dtype and non-null count of every feature column.
x.info()

In [98]:
# Linear regression: import the estimator and split the data.
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Inspect the training split: row count, dtypes and non-null counts.
x_train.info()

In [None]:
# Inspect the test split: row count, dtypes and non-null counts.
x_test.info()

In [None]:
# Fit the linear regression on the training split (fit returns the estimator
# itself, so construction and fitting can be chained).
lr_model = LinearRegression().fit(x_train, y_train)
lr_model

In [None]:
#predict
# Build the sample as a one-row DataFrame that reuses the training columns.
# The model was fitted on a DataFrame, so a bare nested list would trigger
# scikit-learn's "X does not have valid feature names" warning and hide the
# column order the values rely on.
# Value order: manufacturer_encoded, Engine volume, Prod. year, mileage, Levy.
sample = pd.DataFrame([[6, 3.0, 2015, 30000, 13000]], columns=x_train.columns)
result = lr_model.predict(sample)
result

In [None]:
# Evaluate the regression on the held-out test split.
from sklearn.metrics import mean_squared_error

y_pred = lr_model.predict(x_test)

mse_value = mean_squared_error(y_test, y_pred)
rmse_value = np.sqrt(mse_value)  # square root brings the error back to the target's units

for label, value in (
    ('Mean Squared Error:', mse_value),
    ('Root Mean Squared Error:', rmse_value),
):
    print(label, value)


In [None]:
# Summary statistics of df's numeric columns.
# NOTE(review): presumably used to pick the price threshold for the
# cheap/expensive split in the next step - confirm.
df.describe()

In [None]:
# Make the price categorical: 'cheap' up to and including 17300, 'expensive' above.
# The outer bin edges are open-ended so no price can fall outside the bins;
# fixed edges would turn any out-of-range price into NaN, which would then
# leak into the encoded classification target.
# NOTE(review): 17300 is a hard-coded threshold, presumably read off
# df.describe() - confirm and consider deriving it from the data.
df['price_type'] = pd.cut(
    df['Price'],
    bins=[-np.inf, 17300, np.inf],
    labels=['cheap', 'expensive'],
)

# Class balance of the new label.
df['price_type'].value_counts()

In [None]:
# Preview df again; it now also carries the price_type column.
df.head()

In [None]:
# Preview the feature matrix x, which has not been reassigned since it was
# selected from df.
x.head()

In [108]:
# Turn the cheap/expensive labels into integer class codes for the classifiers.
price_type_encoder = LabelEncoder()
df['price_type_encoded'] = price_type_encoder.fit_transform(df['price_type'])

# From here on y is the classification target instead of the raw price.
y = df['price_type_encoded']

In [None]:
# Preview the first values of the new target y (encoded price type).
y.head()

In [None]:
# Train a classification model (random forest) on the encoded price type.
from sklearn.ensemble import RandomForestClassifier

# Re-split because y now holds the class labels; the same test_size and
# random_state are used as for the regression split.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=0,
)

rf_params = {'n_estimators': 100, 'max_depth': 4, 'random_state': 42}
rf_model = RandomForestClassifier(**rf_params).fit(x_train, y_train)
rf_model


In [None]:
#predict
# Build the sample as a one-row DataFrame that reuses the training columns.
# The forest was fitted on a DataFrame, so a bare nested list would trigger
# scikit-learn's "X does not have valid feature names" warning and hide the
# column order the values rely on.
# Value order: manufacturer_encoded, Engine volume, Prod. year, mileage, Levy.
sample = pd.DataFrame([[6, 3.0, 2015, 30000, 13000]], columns=x_train.columns)
result = rf_model.predict(sample)
result

In [None]:
# Test the model with accuracy, precision, F1-score and recall.
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score
import math

y_pred = rf_model.predict(x_test)

accuracy_score_value = accuracy_score(y_test, y_pred)
precision_score_value = precision_score(y_test, y_pred)
f1_score_value = f1_score(y_test, y_pred)
recall_score_value = recall_score(y_test, y_pred)

# Report each score as a whole percentage, rounded down.
for label, score in (
    ('Accuracy Score:', accuracy_score_value),
    ('Precision Score:', precision_score_value),
    ('F1 Score:', f1_score_value),
    ('Recall Score:', recall_score_value),
):
    print(label, math.floor(score*100), "%")


In [None]:
# Hyperparameter tuning - GridSearchCV
from sklearn.model_selection import GridSearchCV

# "log_loss" is left out on purpose: for RandomForestClassifier it is an alias
# of "entropy" (both use Shannon information gain), so searching both only
# repeats identical fits. On a tie GridSearchCV keeps the first candidate,
# so the selected parameters are unaffected.
# NOTE(review): 5 is absent from max_depth - confirm whether that is intended.
param_grid = {
    "n_estimators": [50, 100, 150, 200, 250, 300],
    "max_depth": [2, 3, 4, 6, 7, 8, 9, 10],
    "criterion": ["gini", "entropy"],
}

# Use a separate name for the unfitted template estimator so the fitted
# rf_model from the earlier cell is not overwritten; otherwise re-running the
# earlier predict/metric cells would fail on an unfitted model.
rf_base_model = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=rf_base_model,
    param_grid=param_grid,
    cv=5,        # 5-fold cross-validation on the training split
    n_jobs=-1,   # use all available cores
    verbose=2,
)
grid_search.fit(x_train, y_train)


In [None]:
# Parameter combination with the best mean cross-validated score.
grid_search.best_params_

In [None]:
# Evaluate the best estimator found by the grid search on the test split.
tuned_model = grid_search.best_estimator_
y_pred = tuned_model.predict(x_test)

accuracy_score_value = accuracy_score(y_test, y_pred)
precision_score_value = precision_score(y_test, y_pred)
f1_score_value = f1_score(y_test, y_pred)
recall_score_value = recall_score(y_test, y_pred)

# Report each score as a whole percentage, rounded down.
for label, score in (
    ('Accuracy Score:', accuracy_score_value),
    ('Precision Score:', precision_score_value),
    ('F1 Score:', f1_score_value),
    ('Recall Score:', recall_score_value),
):
    print(label, math.floor(score*100), "%")

In [None]:
# Naive Bayes: fit a Gaussian NB classifier and score it on the test split.
from sklearn.naive_bayes import GaussianNB

nb_model = GaussianNB().fit(x_train, y_train)
y_pred = nb_model.predict(x_test)

accuracy_score_value = accuracy_score(y_test, y_pred)
precision_score_value = precision_score(y_test, y_pred)
f1_score_value = f1_score(y_test, y_pred)
recall_score_value = recall_score(y_test, y_pred)

# Report each score as a whole percentage, rounded down.
for label, score in (
    ('Accuracy Score:', accuracy_score_value),
    ('Precision Score:', precision_score_value),
    ('F1 Score:', f1_score_value),
    ('Recall Score:', recall_score_value),
):
    print(label, math.floor(score*100), "%")


In [None]:
#predict
# Build the sample as a one-row DataFrame that reuses the training columns.
# The classifier was fitted on a DataFrame, so a bare nested list would
# trigger scikit-learn's "X does not have valid feature names" warning and
# hide the column order the values rely on.
# Value order: manufacturer_encoded, Engine volume, Prod. year, mileage, Levy.
sample = pd.DataFrame([[6, 3.0, 2015, 30000, 13000]], columns=x_train.columns)
result = nb_model.predict(sample)
result

In [None]:
# Visualise the correlation between the features.
import seaborn as sns

# Pairwise correlation matrix of the feature columns, drawn as an annotated heatmap.
feature_corr = x.corr()
sns.heatmap(feature_corr, annot=True)
