In [43]:
import sys
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score
import joblib

current_path = '/Users/ericklopez/Desktop/Developing_and_Deploying_a_Predictive_Analytics_Platform_Using_FastAPI_Streamlit_and_Docker/empirical/notebooks/data_visualizations.ipynb'
sys.path.insert(0, os.path.dirname(os.path.dirname(current_path)))

In [44]:
fpath = os.path.join(os.path.abspath('..'), 'data', 'processed', 'cleaned_car_data.csv')
df = pd.read_csv(fpath)
df_copy = df.copy()
col = ['Unnamed: 0']
df_copy.drop(columns=col, inplace=True)
df_copy.head()

Unnamed: 0,name,year,fuel_type,automaker,miles_driven,price
0,Hyundai Santro Xing,2007,Petrol,Hyundai,27962,952
1,Mahindra Jeep CL550,2006,Diesel,Mahindra,25,5059
2,Maruti Suzuki Alto,2018,Petrol,Maruti,13670,0
3,Hyundai Grand i10,2014,Petrol,Hyundai,17398,3869
4,Ford EcoSport Titanium,2014,Diesel,Ford,22369,6845


In [45]:
values_to_drop = [0]
df_filtered = df_copy[~df_copy['price'].isin(values_to_drop)]

values_to_drop2 = [0]
df_filtered = df_filtered[~df_filtered['miles_driven'].isin(values_to_drop2)]

values_to_drop3 = ['Mahindra', 'Maruti', 'Skoda', 'Renault', 'Datsun', 'Tata', 'Hindustan', 'Force', 'Land', 'Volvo']
df_filtered = df_filtered[~df_filtered['automaker'].isin(values_to_drop3)]

In [46]:
df_filtered.reset_index().drop(columns=['index'], inplace=True)


Extracting Training Data

In [47]:
X = df_filtered[['name', 'automaker', 'year', 'miles_driven', 'fuel_type']]
y = df_filtered['price']

In [48]:
X.head()

Unnamed: 0,name,automaker,year,miles_driven,fuel_type
0,Hyundai Santro Xing,Hyundai,2007,27962,Petrol
3,Hyundai Grand i10,Hyundai,2014,17398,Petrol
4,Ford EcoSport Titanium,Ford,2014,22369,Diesel
6,Ford Figo,Ford,2012,25476,Diesel
7,Hyundai Eon,Hyundai,2013,15534,Petrol


Applying Train Test Split

In [49]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)


Encoding

In [50]:
ohe=OneHotEncoder()
ohe.fit(X[['name','automaker','fuel_type']])

Transform Categorical Columns

In [51]:
column_trans=make_column_transformer((OneHotEncoder(categories=ohe.categories_),['name','automaker','fuel_type']),
                                remainder='passthrough')

Build Linear Regression Model

In [52]:
lr=LinearRegression()

Make Pipeline

In [53]:
pipe=make_pipeline(column_trans,lr)

Fitting the Model

In [54]:
pipe.fit(X_train,y_train)
y_pred=pipe.predict(X_test)

Checking R2 Score

In [55]:
r2_score(y_test,y_pred)


0.552188219555342

Training ML Model: 
:Cross Validation (Technique)(Resampling): 
Training ML model on multiple folds or subsets of data then evaluating them on Test Set. Test Set is set to 10% of total data.

Parameters:
:scores: 
store the R² scores for each iteration.

:lr: 
linearRegression model initialized

:pipe: 
ohe, transform categorical via One Hot Encoder (ohe)

:y_pred: 
prediction

Returns:
The index of the highest R² score in the 'scores' list


Annotations for code logic below:
:X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.1,random_state=i:
In each iteration, the dataset X (features) and y (target) is split into training and test sets. The test_size=0.1 specifies that 10% of the data is used for testing, and the random_state=i ensures a different split each time.

:lr = LinearRegression():
Model is initialized

:pipe = make_pipeline(column_trans, lr):
One Hot Encoding (OHE) preprocessing step

:pipe.fit(X_train, y_train):
Model training, pipeline is fitted (trained) on the training data.


:y_pred = pipe.predict(X_test):
Predictions are made on the test set.

:scores.append(r2_score(y_test, y_pred)):
The R² score (coefficient of determination) is computed for the predictions and actual values. This score is appended to the scores list.

:scores[np.argmax(scores)]:
finds the index of the highest R² score in the scores list. Retrieving the highest score.



In [56]:
scores=[]
for i in range(1000):
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.1,random_state=i)
    lr=LinearRegression()  
    pipe=make_pipeline(column_trans,lr)  
    pipe.fit(X_train,y_train)
    y_pred=pipe.predict(X_test)  
    scores.append(r2_score(y_test,y_pred))  
scores[np.argmax(scores)]

0.9600230798915883

Employing ML Model: Predict Specific Car Price 

In [60]:
pipe.predict(pd.DataFrame(columns=X_test.columns,data=np.array(['BMW 3 Series','BMW',2011,100,'Petrol']).reshape(1,5)))

array([10720.63952395])

Seeking random state associated with best score

In [63]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.1,random_state=np.argmax(scores))
lr=LinearRegression()
pipe=make_pipeline(column_trans,lr)
pipe.fit(X_train,y_train)
y_pred=pipe.predict(X_test)
r2_score(y_test,y_pred)

0.9600230798915883

Annotations for code below:
:import joblib: 
Imports the joblib library, which is used for saving and loading Python objects, such as machine learning models.

:joblib.dump(pipe, open('LinearRegressionModel.pkl', 'wb')): 
Saves the pipe (which is a pipeline containing the preprocessing steps and the linear regression model) to a file named LinearRegressionModel.pkl in binary write mode ('wb'). This allows the model to be loaded later without needing to retrain it.

In [66]:
import joblib
file_name = '/Users/ericklopez/Desktop/Developing_and_Deploying_a_Predictive_Analytics_Platform_Using_FastAPI_Streamlit_and_Docker/empirical/data/final/LinearRegressionModel.pkl'
joblib.dump(pipe,open(file_name,'wb'))
pipe.predict(pd.DataFrame(columns=['name','automaker','year','miles_driven','fuel_type'],data=np.array(['BMW 3 Series','BMW',2011,100,'Petrol']).reshape(1,5)))

array([10467.31056619])