In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import scatter_matrix

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from scipy import stats

import joblib

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the raw dataset; the path is relative, so the CSV must sit in the
# notebook's working directory.
df = pd.read_csv('ford_price_prediction.csv')

## First Glance at the Data 🔎
<hr>

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# Summary of the columns: names, dtypes and non-null counts.
df.info()

In [None]:
# Columns to summarise.
desc = ['price', 'tax', 'mileage', 'year', 'fuelType', 'mpg', 'engineSize']

# Text (object) columns get their own describe() printout; everything else is
# covered by the combined summary shown as the cell output.
for name in desc:
    if df[name].dtypes != 'object':
        continue
    print(f"{name}:\n", df[name].describe(), "\n")

df[desc].describe()

## Exploratory Data Analysis 📊 and Data Cleaning 🧹
<hr>

In [None]:
# For every column, print its name, its distinct values and a separator line.
separator = '*' * 75
for col in df.columns:
    print(col, df[col].unique(), separator, sep='\n')

In [None]:
# Replace zero engine sizes with NaN so they are treated as missing values.
# A single .loc assignment writes to `df` itself; the chained form
# df['engineSize'][mask] = ... assigns through an intermediate object and is
# not guaranteed to update the frame (it never does under pandas copy-on-write).
df.loc[df['engineSize'] == 0, 'engineSize'] = np.nan
df['engineSize'].unique()

In [None]:
# Show the rows whose fuelType is 'Electric'.
df[df['fuelType'] == 'Electric']

In [None]:
# Set engineSize to 0 for every row whose fuelType is 'Electric'.
# .loc performs the write on `df` directly; the chained form
# df['engineSize'][mask] = 0 is not guaranteed to modify the frame.
df.loc[df['fuelType'] == 'Electric', 'engineSize'] = 0
df[df['fuelType'] == 'Electric']


In [None]:
# Rows whose engineSize is missing (NaN).
enginevol_to_fix = df[df['engineSize'].isna()]
enginevol_to_fix

In [None]:
# Show the rows with the impossible registration year 2060.
df[df['year'] == 2060]

In [None]:
# Correct the year 2060 to 2006 (presumably a digit transposition — confirm).
# .loc writes to `df` directly; the chained form df['year'][mask] = 2006 is not
# guaranteed to modify the frame.
df.loc[df['year'] == 2060, 'year'] = 2006
# Show the rows with mileage 54807 — presumably the corrected record; verify.
df[df['mileage']==54807]

In [None]:
# NOTE: `cars` is a second name for the same DataFrame object as `df`, not a
# copy; a change made through either name is visible through the other.
cars = df

In [None]:
# Preview the first rows of the cleaned data.
cars.head()

In [None]:
# Inspect both ends of the price range: the listing(s) with the highest price
# and every listing priced below 1000.
display(
    cars[cars['price'] == cars['price'].max()],
    cars[cars['price'] < 1000],
)

In [None]:
# Inspect both ends of the mileage range: the listing(s) with the highest
# mileage and every listing with fewer than 1000 miles.
display(
    cars[cars['mileage'] == cars['mileage'].max()],
    cars[cars['mileage'] < 1000],
)

As an outlier detection method, we will use the ***IQR rule***: any value lying more than ***1.5 × IQR*** below the first quartile or above the third quartile is treated as an outlier and removed.

In [None]:
def detect_outliers(df, features, thold=1.5):
    """Return the index labels of rows that are IQR outliers in any given column.

    For each column in ``features`` a value is an outlier when it lies below
    ``Q1 - thold * IQR`` or above ``Q3 + thold * IQR``, where Q1 and Q3 are the
    25th and 75th percentiles of that column.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to scan.
    features : iterable of str
        Names of the numeric columns to check.
    thold : float, default 1.5
        Multiplier applied to the IQR to obtain the outlier step.

    Returns
    -------
    list
        Index labels of the outlying rows, column by column. A row that is an
        outlier in several columns appears once per column.
    """
    outlier_indices = []

    for c in features:
        column = df[c]
        # nanpercentile skips missing values. np.percentile returns NaN as soon
        # as the column contains a NaN, which makes both comparisons below
        # False and silently reports no outliers for that column.
        Q1, Q3 = np.nanpercentile(column, [25, 75])
        outlier_step = (Q3 - Q1) * thold
        # NaN entries compare False on both sides, so they are never flagged.
        is_outlier = (column < Q1 - outlier_step) | (column > Q3 + outlier_step)
        outlier_indices.extend(df[is_outlier].index)

    return outlier_indices

In [None]:
# Flag rows that are 1.5 * IQR outliers in price, tax or mileage, then drop
# them to obtain the frame used for the rest of the notebook.
features = ['price', 'tax', 'mileage']
outliers = detect_outliers(cars, features, 1.5)
valid_df = cars.drop(index=outliers)

# Median price of the remaining rows.
valid_df['price'].median()

In [None]:
# Ten most frequent models and, in the same order, the mean price of each.
top10_cars = valid_df['model'].value_counts().sort_values(ascending = False)[:10]
top10_mean_prices = [
    valid_df.loc[valid_df['model'] == model_name, 'price'].mean()
    for model_name in top10_cars.index
]

fig = plt.figure(figsize=(14,6))

# Left panel: number of listings per model.
ax = fig.add_subplot(121)
sns.barplot(x=top10_cars.index, y=top10_cars.values, palette='hot', ax=ax)
plt.xticks(rotation = 90)
ax.set_ylabel('Ammount of cars')
ax.set_title('Top10 The Most Frequent Cars')

# Right panel: mean price of those same models.
ax2 = fig.add_subplot(122)
sns.lineplot(x=top10_cars.index, y=top10_mean_prices, color='r', ax=ax2)
plt.xticks(rotation = 90)
ax2.set_ylabel('Mean Prices')
ax2.set_title("Top10 Cars' Mean Prices")

plt.show()

In [None]:
# Median price per transmission type, highest first.
valid_df.groupby('transmission')['price'].median().sort_values(ascending=False)

In [None]:
# Median price per engine size, highest first.
valid_df.groupby('engineSize')['price'].median().sort_values(ascending=False)

In [None]:
# Median price per fuel type, highest first.
valid_df.groupby('fuelType')['price'].median().sort_values(ascending=False)

In [None]:
# Number of listings per fuel type.
valid_df['fuelType'].value_counts()

In [None]:
# Render matplotlib figures inside the notebook output.
%matplotlib inline

# One 25-bin histogram per column that DataFrame.hist can plot.
valid_df.hist(bins=25, figsize=(20,15))
plt.show()

In [None]:
# Pairwise scatter plots of the price and the main numeric columns.
# NOTE: the name `attributes` is rebound later in this notebook to the list of
# prepared feature names.
attributes = ['price','tax','engineSize','year','mileage']
scatter_matrix(valid_df[attributes], figsize=(12,8))
plt.show()

In [None]:
# Correlation heatmap of the numeric columns.
# valid_df also holds text columns (model, transmission, fuelType). Since
# pandas 2.0 DataFrame.corr() no longer drops them silently and raises instead,
# so restrict the frame to numeric columns explicitly.
plt.figure(figsize=(8,6))
numeric_corr = valid_df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True, cbar=True)
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Column groups fed to the preprocessing steps: numeric columns are imputed and
# scaled, categorical columns are one-hot encoded. 'price' is in neither list
# because it is used as the target.
num_attribs = ['tax', 'year', 'engineSize', 'mileage', 'mpg']
cat_attribs = ['model','transmission', 'fuelType']

Let's split our target value from our dataset.

In [None]:
# Numeric features, target and categorical features as separate objects.
num_cars = valid_df[num_attribs]
y = valid_df['price']
cat_cars = valid_df[cat_attribs]

In [None]:
# Learn the per-column medians (fit returns the fitted imputer itself).
Imputer = SimpleImputer(strategy='median').fit(num_cars)

# Medians learned for each numeric column, in num_attribs order.
display(Imputer.statistics_)

# Fill the missing values; from here on num_cars is a NumPy array.
num_cars = Imputer.transform(num_cars)
num_cars

Time to scale our numerical attributes! Putting the features on a common scale helps many models make better predictions and can reduce computation time.

In [None]:
# Standardise the imputed numeric columns with StandardScaler.
scaler = StandardScaler()
num_cars = scaler.fit_transform(num_cars)

In [None]:
# Show the scaled numeric array.
num_cars

Let's combine these two steps into a single pipeline so that we can reuse them with one call whenever we need them later.

In [None]:
# Numeric preprocessing: median imputation followed by standardisation.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler()),
])

We also need to handle the categorical columns, because the machine learning models we will use expect only numerical values. We will use one-hot encoding, which creates one column per category and sets it to 1 ("hot") when the sample belongs to that category and 0 ("cold") otherwise. We then build a full pipeline that handles the numerical and categorical columns at the same time.

In [None]:
# Full preprocessing: the numeric pipeline on num_attribs and one-hot encoding
# on cat_attribs.
# handle_unknown='ignore' encodes a category that was not seen during fit as
# all zeros instead of raising, so the fitted transformer can be applied to new
# data. It does not change the output for the data it was fitted on.
full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_attribs),
])
# NOTE(review): the transformer is fitted on all of valid_df, before the
# train/test split, so test rows contribute to the imputer medians and scaler
# statistics. Fitting on the training rows only would avoid this leakage.
cars_prepared = full_pipeline.fit_transform(valid_df)

Time to split our data into training and test sets: with `test_size=0.33`, about 67% of the rows are used for training and the remaining 33% for testing.

In [None]:
# Hold out 33% of the prepared rows for testing; random_state fixes the shuffle.
# NOTE(review): cars_prepared was produced by a transformer fitted on the whole
# dataset, so the test rows already influenced the preprocessing statistics.
X_train, X_test, y_train, y_test = train_test_split(cars_prepared, y, test_size=0.33, random_state = 123)

## Model Setup, Hyperparameter Tuning and Model Evaluation 🧱
<hr>

In [None]:
# Baseline model: ordinary linear regression fitted on the training split.
lin_reg = LinearRegression()

lin_reg.fit(X_train, y_train)

In [None]:
# RMSE of the linear model on the training data itself (not a held-out score).
predictions = lin_reg.predict(X_train)
lin_mse = mean_squared_error(y_train, predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

In [None]:
# Actual training prices, for comparison with the predictions printed next.
print(y_train)

In [None]:
# Linear-regression predictions for the training rows.
print(predictions)

In [None]:
# Mean absolute error of the linear model on the training data.
mae = mean_absolute_error(y_train, predictions)
mae

In [None]:
# Side-by-side box plots of the actual and the predicted training prices.
linear_regression_results = [y_train, predictions]
linear_regression_results_plot = sns.boxplot(data=linear_regression_results)

# sns.boxplot returns the Axes it drew on, so label it through that object.
linear_regression_results_plot.set_ylabel('Car price')
linear_regression_results_plot.set_title('Linear Regression results')
linear_regression_results_plot.set_xticks([0, 1], ['dataset_prices', 'predicted_prices'])
plt.show()

In [None]:
# Overlay the distributions of actual and predicted training prices.
# sns.distplot is deprecated and scheduled for removal; histplot with kde=True
# and stat='density' is its documented replacement. Each series carries its own
# label so the legend does not depend on the order in which artists were drawn.
fig, ax = plt.subplots()
sns.histplot(y_train, kde=True, stat='density', color='crimson',
             label='Actual prices', ax=ax)
sns.histplot(predictions, kde=True, stat='density', color='limegreen',
             label='Predicted prices', ax=ax)
ax.set_xlabel('Car price')
ax.set_title('Linear Regression results')
ax.legend()
plt.show()

In [None]:
# Visual separator in the notebook output between model sections.
print("-----------------------------------------------------------------------------------------------------------------------------------")

In [None]:
# Second model: a decision tree with a fixed seed, fitted on the training split.
tree_reg = DecisionTreeRegressor(random_state = 123)

tree_reg.fit(X_train, y_train)

In [None]:
# RMSE of the tree on the training data itself (not a held-out score).
tree_predictions = tree_reg.predict(X_train)
tree_mse = mean_squared_error(y_train, tree_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

In [None]:
# Side-by-side box plots of the actual and the predicted training prices.
tree_results = [y_train, tree_predictions]
tree_results_plot = sns.boxplot(data=tree_results)

# sns.boxplot returns the Axes it drew on, so label it through that object.
tree_results_plot.set_title('Decision Tree results')
tree_results_plot.set_ylabel('Car price')
tree_results_plot.set_xticks([0, 1], ['dataset_prices', 'predicted_prices'])
plt.show()

In [None]:
# Overlay the distributions of actual and predicted training prices.
# sns.distplot is deprecated and scheduled for removal; histplot with kde=True
# and stat='density' is its documented replacement. Each series carries its own
# label so the legend does not depend on the order in which artists were drawn.
fig, ax = plt.subplots()
sns.histplot(y_train, kde=True, stat='density', color='crimson',
             label='Actual prices', ax=ax)
sns.histplot(tree_predictions, kde=True, stat='density', color='limegreen',
             label='Predicted prices', ax=ax)
ax.set_xlabel('Car price')
ax.set_title('Decision Tree results')
ax.legend()
plt.show()

In [None]:
# 5-fold cross-validation of the decision tree on the training split.
scores = cross_val_score(tree_reg, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
print(scores)

# The scorer returns negated MSE values, so flip the sign before the square root.
tree_rmse_scores = np.sqrt(-scores)
tree_rmse_scores

In [None]:
# 5-fold cross-validated RMSE of the linear model (scores are negated MSE).
lin_scores = cross_val_score(lin_reg, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
lin_rmse_scores = np.sqrt(-lin_scores)
lin_rmse_scores

In [None]:
# Visual separator in the notebook output between model sections.
print("-----------------------------------------------------------------------------------------------------------------------------------")

In [None]:
# Third model: a random forest of 100 trees with a fixed seed.
forest_reg = RandomForestRegressor(n_estimators = 100, random_state=123)

forest_reg.fit(X_train, y_train)

In [None]:
# RMSE of the forest on the training data itself (not a held-out score).
forest_predictions = forest_reg.predict(X_train)
forest_mse = mean_squared_error(y_train, forest_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

In [None]:
# Side-by-side box plots of the actual and the predicted training prices.
random__forest_results = [y_train, forest_predictions]
random__forest_results_plot = sns.boxplot(data=random__forest_results)

# sns.boxplot returns the Axes it drew on, so label it through that object.
random__forest_results_plot.set_title('Random Forest results')
random__forest_results_plot.set_ylabel('Car price')
random__forest_results_plot.set_xticks([0, 1], ['dataset_prices', 'predicted_prices'])
plt.show()

In [None]:
# Overlay the distributions of actual and predicted training prices.
# sns.distplot is deprecated and scheduled for removal; histplot with kde=True
# and stat='density' is its documented replacement. Each series carries its own
# label so the legend does not depend on the order in which artists were drawn.
fig, ax = plt.subplots()
sns.histplot(y_train, kde=True, stat='density', color='crimson',
             label='Actual prices', ax=ax)
sns.histplot(forest_predictions, kde=True, stat='density', color='limegreen',
             label='Predicted prices', ax=ax)
ax.set_xlabel('Car price')
ax.set_title('Random forest results')
ax.legend()
plt.show()

In [None]:
# Overlay the distributions of actual and predicted training prices for the
# random forest (same inputs as the preceding plot cell; only the title's
# capitalisation differs).
# sns.distplot is deprecated and scheduled for removal; histplot with kde=True
# and stat='density' is its documented replacement. Each series carries its own
# label so the legend does not depend on the order in which artists were drawn.
fig, ax = plt.subplots()
sns.histplot(y_train, kde=True, stat='density', color='crimson',
             label='Actual prices', ax=ax)
sns.histplot(forest_predictions, kde=True, stat='density', color='limegreen',
             label='Predicted prices', ax=ax)
ax.set_xlabel('Car price')
ax.set_title('Random Forest results')
ax.legend()
plt.show()

In [None]:
# 5-fold cross-validated RMSE of the random forest (scores are negated MSE).
forest_scores = cross_val_score(forest_reg, X_train, y_train,
                                scoring="neg_mean_squared_error", cv=5)
forest_rmse_scores = np.sqrt(-forest_scores)
forest_rmse_scores

In [None]:
# Hyperparameter grid: 2 values of n_estimators x 3 values of max_features.
# NOTE(review): the max_features integers are hard-coded; presumably they were
# chosen relative to the number of columns in the prepared matrix — confirm
# they are still valid if the preprocessing changes.
param_grid = [
    {'n_estimators': [100, 200], 'max_features': [35,33,31]},
  ]

# NOTE: this rebinds `forest_reg` to a fresh, unfitted forest and uses seed 42,
# whereas the other models in this notebook use 123.
forest_reg = RandomForestRegressor(random_state=42)

# Exhaustive search over the grid with 5-fold cross-validation, scored by
# negated MSE; training-fold scores are recorded as well.
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)
grid_search.fit(X_train, y_train)

In [None]:
# Parameter combination with the best mean cross-validation score.
grid_search.best_params_

In [None]:
# Cross-validated RMSE for every parameter combination that was tried.
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    # mean_test_score is a negated MSE, hence the sign flip.
    print(np.sqrt(-mean_score), params)

In [None]:
# Pair each feature importance of the best forest with a readable column name.
feature_importances = grid_search.best_estimator_.feature_importances_
cat_encoder = full_pipeline.named_transformers_["cat"]
# Flatten the per-column category arrays into one list of one-hot column names.
cat_one_hot_attribs = [i  for cat in cat_encoder.categories_ for i in cat]
# Same order as the ColumnTransformer output: numeric columns first, then the
# one-hot columns. NOTE: this rebinds `attributes`, which was used earlier in
# this notebook for the scatter-matrix column list.
attributes = num_attribs +  cat_one_hot_attribs
sorted(zip(feature_importances, attributes), reverse=True)

In [None]:
# Evaluate the best grid-search estimator on the held-out test split.
final_model = grid_search.best_estimator_
final_predictions = final_model.predict(X_test)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse

In [None]:
# 95% confidence interval for the test RMSE: a t-interval around the mean of
# the squared errors, with the square root taken of both bounds.
confidence = 0.95
squared_errors = (final_predictions - y_test) ** 2
np.sqrt(stats.t.interval(confidence, len(squared_errors) - 1,
                         loc=squared_errors.mean(),
                         scale=stats.sem(squared_errors)))

In [None]:
# Save the model we trained
joblib.dump(final_model, "final_model.pkl")

# If you want to use this model all you need to do is:
# joblib.load('final_model.pkl')

## Extra

We checked the feature importances a couple of cells earlier, and some of the features do not seem to add much information to our model. So let's keep only the top k features when training. To do that, we will write a custom transformer and use it in a Pipeline that takes care of data preparation, selection of the top k features, and model training.

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

def indices_of_top_k(arr, k):
    """Return the indices of the ``k`` largest values of ``arr``, in ascending order.

    Parameters
    ----------
    arr : array-like
        One-dimensional sequence of scores.
    k : int
        Number of indices to return; must be non-negative.

    Returns
    -------
    numpy.ndarray
        Sorted integer indices of the ``k`` largest entries (empty when ``k == 0``).

    Raises
    ------
    ValueError
        If ``k`` is negative, or (raised by NumPy) larger than ``len(arr)``.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")
    if k == 0:
        # A slice of [-0:] selects every element, so without this branch
        # k == 0 would return all indices instead of none.
        return np.array([], dtype=np.intp)
    return np.sort(np.argpartition(np.array(arr), -k)[-k:])

class TopFeatureSelector(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the columns with the ``k`` largest importances.

    The importances are supplied at construction time; ``fit`` does not look at
    the data it is given.
    """

    def __init__(self, feature_importances, k):
        self.k = k
        self.feature_importances = feature_importances

    def fit(self, X, y=None):
        """Compute and store the column indices to keep; ``X`` and ``y`` are unused."""
        top_columns = indices_of_top_k(self.feature_importances, self.k)
        self.feature_indices_ = top_columns
        return self

    def transform(self, X):
        """Return ``X`` restricted to the columns selected in ``fit``."""
        keep = self.feature_indices_
        return X[:, keep]

In [None]:
# End-to-end pipeline: preprocessing -> keep the 35 most important prepared
# columns -> model.
# NOTE: `final_model` is the estimator object returned by the grid search, so
# fitting this pipeline refits that same object in place.
# NOTE(review): k=35 is hard-coded — confirm it matches the intended number of
# features for the prepared matrix.
final_pipeline = Pipeline([('full',full_pipeline),
                           ('top_feature_selector',TopFeatureSelector(feature_importances, 35)),
                          ('model', final_model)])

In [None]:
# Fit the whole pipeline on the raw features and the target.
# The target column is named 'price' (lowercase) in this dataset; 'Price'
# does not exist and raises a KeyError.
final_pipeline.fit(valid_df.drop('price', axis=1), valid_df['price'])

In [None]:
# Sanity check on the first four rows: predictions next to the actual prices.
# The target column is named 'price' (lowercase) in this dataset; 'Price'
# does not exist and raises a KeyError.
some_data = valid_df.drop('price', axis=1).iloc[:4]
some_labels = valid_df['price'].iloc[:4]

pred = final_pipeline.predict(some_data)
display(pred)
display(some_labels.values)