### Load Dataset

In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn

In [36]:
# Load the prepared Maruti used-car dataset.
# The path is written as a raw string so the Windows backslashes are taken
# literally: "\M", "\p" and "\S" are not valid escape sequences and trigger a
# SyntaxWarning on recent Python versions when used in a normal string.
# NOTE(review): this is an absolute, machine-specific path — consider a path
# relative to the notebook so the load works on other machines.
cars_df = pd.read_csv(r"E:\ML_course\practice\S11_Model_Deployment/final_cars_maruti.csv")

In [37]:
cars_df.sample(5)

Unnamed: 0,Location,Fuel_Type,Transmission,Owner_Type,Seats,Price,Age,Model,Mileage,Power,KM_Driven
44,Hyderabad,Petrol,Manual,First,5,2.1,9,alto,20.92,67.1,70
862,Hyderabad,Diesel,Manual,First,5,5.5,7,dzire,28.4,73.75,68
317,Delhi,Diesel,Manual,First,5,6.4,4,ciaz,28.09,88.5,66
452,Chennai,Petrol,Manual,First,5,6.7,5,ciaz,20.73,91.1,30
747,Delhi,Diesel,Manual,First,5,5.95,5,baleno,27.39,74.0,54


In [38]:
cars_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1010 entries, 0 to 1009
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Location      1010 non-null   object 
 1   Fuel_Type     1010 non-null   object 
 2   Transmission  1010 non-null   object 
 3   Owner_Type    1010 non-null   object 
 4   Seats         1010 non-null   int64  
 5   Price         1010 non-null   float64
 6   Age           1010 non-null   int64  
 7   Model         1010 non-null   object 
 8   Mileage       1010 non-null   float64
 9   Power         1010 non-null   float64
 10  KM_Driven     1010 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 86.9+ KB


### Feature Set Selection

In [39]:
# Columns used as model inputs. Location, Seats, Mileage and Power are present
# in the dataset but are deliberately not part of this feature set.
x_features = [
    'Fuel_Type', 'Transmission', 'Owner_Type',
    'Age', 'Model', 'KM_Driven',
]

In [40]:
x_features

['Fuel_Type', 'Transmission', 'Owner_Type', 'Age', 'Model', 'KM_Driven']

In [41]:
# Subset of the selected features that hold string categories and therefore
# need encoding before they can be fed to a regression model.
cat_vars = ['Fuel_Type', 'Transmission', 'Owner_Type', 'Model']

In [42]:
# Numeric predictors = every selected feature that is not categorical.
# Built with a comprehension instead of a set difference so the column order
# follows x_features and is the same on every run; the iteration order of a
# set of strings can change between interpreter sessions, which would make
# the feature order of the fitted pipeline non-reproducible.
num_vars = [col for col in x_features if col not in cat_vars]

In [43]:
num_vars

['Age', 'KM_Driven']

### Need for Data Transformation

1. Categorical columns
    - One-hot encoding (OHE)
2. Numerical Columns
    - No Transformation Required

### Setting X and y variables

In [44]:
# Predictor frame (the selected feature columns) and regression target (Price).
X, y = cars_df[x_features], cars_df['Price']

### Data Splitting

In [45]:
from sklearn.model_selection import train_test_split

In [46]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=80
)

In [47]:
X_train.shape

(808, 6)

In [48]:
X_test.shape

(202, 6)

## Creating Pipelines

In [49]:
from sklearn.preprocessing import OneHotEncoder

In [50]:
# handle_unknown="ignore": a category that was not seen while fitting does not
# raise an error at transform time; its one-hot columns are all zero instead.
one_hot_encoding = OneHotEncoder(handle_unknown="ignore")

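- `handle_unknown='ignore'` controls what happens when a category appears at prediction time that was not present in the training data. For example, if the training data only contains the fuel types Petrol and Diesel and someone later enters Electric, the encoder does not raise an error; the unknown value is ignored, i.e. every one-hot column for fuel type is set to 0 for that row.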

In [51]:
from sklearn.compose import ColumnTransformer

In [52]:
from sklearn.pipeline import Pipeline

In [53]:
# Single-step pipeline that applies the one-hot encoder to categorical columns.
cat_transform = Pipeline(steps=[('oheencoder', one_hot_encoding)])

In [54]:
# Route each column group to its treatment:
#   - numeric columns are forwarded unchanged ("passthrough"),
#   - categorical columns go through the one-hot encoding pipeline.
preprocesser = ColumnTransformer(transformers=[
    ('numerical', 'passthrough', num_vars),
    ('categorical', cat_transform, cat_vars),
])

### Linear Regression

In [55]:
from sklearn.linear_model import LinearRegression

In [56]:
# Linear regression estimator created with its default settings.
lreg = LinearRegression()

In [57]:
# End-to-end model: column preprocessing followed by the linear regressor, so
# raw feature rows can be passed straight to fit() / predict().
lreg_pipeline = Pipeline(
    steps=[
        ('preprocessing', preprocesser),
        ('regression', lreg),
    ]
)

In [58]:
lreg_pipeline

In [59]:
# Fit the preprocessing steps and the regressor on the training split only.
lreg_pipeline.fit(X_train,y_train)

### Predict on Test Set

In [60]:
# Predicted prices for the held-out test rows.
y_pred = lreg_pipeline.predict(X_test)

In [61]:
from sklearn.metrics import mean_squared_error, r2_score

In [62]:
# R^2 (coefficient of determination) of the predictions on the test split.
r2_score(y_test,y_pred)

0.8746563035294356

## Predicting on New Data

In [63]:
# One hand-written car record carrying exactly the feature columns the
# pipeline was trained on.
data = dict(
    Fuel_Type='Diesel',
    Transmission='Manual',
    Owner_Type='First',
    Age=8,
    Model='ertiga',
    KM_Driven=87,
)

In [64]:
# The dict holds scalar values, so an explicit one-element index is supplied
# to build a single-row frame.
data_df = pd.DataFrame(data, index=[0])

In [65]:
data_df

Unnamed: 0,Fuel_Type,Transmission,Owner_Type,Age,Model,KM_Driven
0,Diesel,Manual,First,8,ertiga,87


In [66]:
# predict() returns one value per input row; data_df has a single row, so take
# element 0 as the predicted price.
# NOTE(review): the unit of Price is not stated in this notebook — confirm
# against the dataset source before presenting the number to users.
price = lreg_pipeline.predict(data_df)[0]
price

6.171483840956446

## Save the Pipeline

In [67]:
from joblib import dump

In [68]:
# Persist the fitted pipeline (preprocessing + regressor) under a relative
# file name, i.e. in the current working directory. The call returns the list
# of file names that were written.
# NOTE(review): the file is pickle-based — presumably it should only be loaded
# from trusted sources and with a compatible scikit-learn version; confirm.
dump(lreg_pipeline,'car_prediction.pkl')

['car_prediction.pkl']