1. Import dropped df
2. Create pipeline consisting of preprocessor and a model
3. Drop some columns
4. Train the pipeline on the data
5. Check results
6. Export pipeline
7. Import pipeline
8. Predict with new data entry

In [31]:
%matplotlib inline 
# Make plots appear inside the notebook

# Standard library
from pathlib import Path

# EDA (exploratory data analysis) and plotting libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Model persistence
import joblib

# Modelling and preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder

# Import Dataframe

In [49]:
# Read the preprocessed data set from CSV. low_memory=False makes pandas parse
# the file in a single pass instead of in chunks.
df_dropped_empty_outliers = pd.read_csv(
    "df_preprocessing.csv",
    low_memory=False,
)

In [50]:
# Preview the first rows of the loaded frame.
df_dropped_empty_outliers.head()

Unnamed: 0,region,price,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color,state
0,auburn,33590,2014,gmc,sierra 1500 crew cab slt,good,8 cylinders,gas,57923.0,clean,other,,,pickup,white,al
1,auburn,22590,2010,chevrolet,silverado 1500,good,8 cylinders,gas,71229.0,clean,other,,,pickup,blue,al
2,auburn,39590,2020,chevrolet,silverado 1500 crew,good,8 cylinders,gas,19160.0,clean,other,,,pickup,red,al
3,auburn,30990,2017,toyota,tundra double cab sr,good,8 cylinders,gas,41124.0,clean,other,,,pickup,red,al
4,auburn,15000,2013,ford,f-150 xlt,excellent,6 cylinders,gas,128000.0,clean,automatic,rwd,full-size,truck,black,al


In [51]:
# Columns that are not used as model inputs.
columns_to_discard = [
    "region", "model", "condition", "cylinders", "fuel",
    "title_status", "drive", "type", "paint_color", "state",
]
df_selected_features = df_dropped_empty_outliers.drop(columns=columns_to_discard)

In [52]:
# Preview the frame after the unused columns were dropped.
df_selected_features.head()

Unnamed: 0,price,year,manufacturer,odometer,transmission,size
0,33590,2014,gmc,57923.0,other,
1,22590,2010,chevrolet,71229.0,other,
2,39590,2020,chevrolet,19160.0,other,
3,30990,2017,toyota,41124.0,other,
4,15000,2013,ford,128000.0,automatic,full-size


# Create pipeline consisting of preprocessor and a model

In [53]:
# Split the feature columns by dtype. The target is dropped first so it can
# never end up in either list. Numeric dtypes are selected explicitly instead
# of testing for 'object': pandas versions that infer a dedicated string dtype
# for text columns would otherwise route those columns to the numeric branch.
feature_frame = df_selected_features.drop(columns='price')
numerical_cols = feature_frame.select_dtypes(include='number').columns.tolist()
categorical_cols = feature_frame.select_dtypes(exclude='number').columns.tolist()

# Categorical preprocessing: missing values become their own 'missing'
# category, then each category is mapped to an integer code. Categories that
# were not seen during fit are encoded as -1 instead of raising at predict time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

# Numerical preprocessing: fill gaps with the training median before scaling,
# so a missing numeric value in new data does not reach the model as NaN.
# When the training data has no missing values this step is a no-op.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Combine preprocessing steps. The output columns are the numerical ones
# followed by the categorical ones (the order of `transformers`).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [54]:
# Show the feature frame again (same preview as the earlier cell).
df_selected_features.head()

Unnamed: 0,price,year,manufacturer,odometer,transmission,size
0,33590,2014,gmc,57923.0,other,
1,22590,2010,chevrolet,71229.0,other,
2,39590,2020,chevrolet,19160.0,other,
3,30990,2017,toyota,41124.0,other,
4,15000,2013,ford,128000.0,automatic,full-size


In [55]:
# Random forest regressor with 100 trees; the fixed random_state keeps
# repeated runs reproducible.
model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)

In [56]:
# Chain preprocessing and the regressor into a single estimator
# (it is written to disk in a later cell).
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('model', model),
]
pipeline = Pipeline(steps=pipeline_steps)

In [57]:
# Separate the target (price) from the input features.
y = df_selected_features['price']
X = df_selected_features.drop(columns=['price'])

In [58]:
# Fit the preprocessor and the model together on the features X and target y.
pipeline.fit(X, y)

In [59]:
# Transform the data with the preprocessor stored inside the fitted pipeline.
# Using the pipeline's own step (rather than the standalone `preprocessor`
# variable) does not depend on Pipeline.fit having fitted that object in place.
fitted_preprocessor = pipeline.named_steps['preprocessor']
transformed_data = fitted_preprocessor.transform(X)

# Convert the transformed data back to a DataFrame. The ColumnTransformer
# emits the numerical columns first, then the categorical ones; X's index is
# kept so each transformed row can be matched to its source row.
transformed_df = pd.DataFrame(
    transformed_data,
    columns=numerical_cols + categorical_cols,
    index=X.index,
)

In [60]:
# Inspect the last rows of the transformed feature frame.
transformed_df.tail()

Unnamed: 0,year,odometer,manufacturer,transmission,size
370457,0.849971,-0.988181,32.0,3.0,3.0
370458,0.956009,-1.318989,42.0,3.0,3.0
370459,0.956009,-1.447647,7.0,3.0,3.0
370460,0.743933,-1.022806,24.0,3.0,3.0
370461,0.849971,-1.143946,5.0,3.0,3.0


In [71]:
# Save the pipeline. joblib.dump does not create missing directories, so make
# sure the target folder exists first (otherwise a fresh checkout fails with
# FileNotFoundError).
pipeline_path = 'models/car_price_prediction_pipeline.pkl'
Path(pipeline_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(pipeline, pipeline_path)

['models/car_price_prediction_pipeline.pkl']

In [72]:
# Reload the pipeline from the file written by the save cell.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load model files that come from a trusted source.
imported_pipeline = joblib.load("models/car_price_prediction_pipeline.pkl")

In [73]:
# One example row, using the same feature columns the pipeline was trained on.
example_listing = {
    'year': 2019,
    'manufacturer': 'chevrolet',
    'odometer': 15000.0,
    'transmission': 'automatic',
    'size': 'full-size',
}
new_data = pd.DataFrame([example_listing])

# Run the reloaded pipeline (preprocessing + model) on the example row.
prediction = imported_pipeline.predict(new_data)

In [74]:
# Show the predicted price for the example row.
prediction

array([17495.61333333])