# Creating ML Pipeline



### Load Dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn

In [2]:
# Load the Maruti cars dataset into a DataFrame.
# The path is a raw string so the Windows backslashes (\M, \p, \S) are taken
# literally instead of being parsed as invalid escape sequences, which newer
# Python versions warn about. The resulting path value is unchanged.
# NOTE(review): this is an absolute, machine-specific path — consider a path
# relative to the notebook so the notebook can be re-run elsewhere.
cars_df = pd.read_csv(r"E:\ML_course\practice\S8_Creating_ML_Pipeline/final_cars_maruti.csv")

In [3]:
# Peek at five randomly chosen rows; no random_state is passed, so the rows
# shown differ from run to run.
cars_df.sample(5)

Unnamed: 0,Location,Fuel_Type,Transmission,Owner_Type,Seats,Price,Age,Model,Mileage,Power,KM_Driven
178,Ahmedabad,Diesel,Manual,First,7,5.75,7,ertiga,20.77,88.8,67
902,Mumbai,Petrol,Manual,Second,5,1.1,13,wagon,17.3,64.0,50
7,Delhi,Diesel,Manual,First,5,4.25,6,swift,22.9,74.0,52
436,Coimbatore,Diesel,Manual,First,5,5.17,6,swift,23.4,74.0,67
841,Hyderabad,Petrol,Manual,First,5,2.2,9,alto,20.92,67.1,81


In [4]:
# Summarize the columns: names, non-null counts, and dtypes.
cars_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1010 entries, 0 to 1009
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Location      1010 non-null   object 
 1   Fuel_Type     1010 non-null   object 
 2   Transmission  1010 non-null   object 
 3   Owner_Type    1010 non-null   object 
 4   Seats         1010 non-null   int64  
 5   Price         1010 non-null   float64
 6   Age           1010 non-null   int64  
 7   Model         1010 non-null   object 
 8   Mileage       1010 non-null   float64
 9   Power         1010 non-null   float64
 10  KM_Driven     1010 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 86.9+ KB


### Feature Set Selection

In [5]:
# Predictor columns used to model Price (a subset of the DataFrame's columns).
x_features = [
    'Fuel_Type', 'Transmission', 'Owner_Type',
    'Age', 'Model', 'KM_Driven',
]

In [6]:
# Display the selected feature names.
x_features

['Fuel_Type', 'Transmission', 'Owner_Type', 'Age', 'Model', 'KM_Driven']

In [7]:
# Categorical predictors; these are the columns that get one-hot encoded.
cat_vars = ['Fuel_Type', 'Transmission', 'Owner_Type', 'Model']

In [8]:
# Numerical predictors are the selected features that are not categorical.
# A list comprehension is used instead of a set difference because set
# iteration order is arbitrary and can change between kernel sessions;
# filtering x_features keeps its order, so the column order is reproducible.
num_vars = [feature for feature in x_features if feature not in cat_vars]

In [9]:
# Display the derived numerical feature names.
num_vars

['Age', 'KM_Driven']

### Need for Data Transformation

1. Categorical columns
    - OHE Encoding
2. Numerical Columns
    - No Transformation Required

### Setting X and y variables

In [10]:
# X holds only the selected predictor columns; y is the Price column (the target).
X, y = cars_df[x_features], cars_df['Price']

### Data Splitting

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
# Use 80% of the rows for training and keep the rest for evaluation.
# The fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=80
)

In [13]:
# Check the (rows, columns) size of the training features.
X_train.shape

(808, 6)

In [14]:
# Check the (rows, columns) size of the held-out test features.
X_test.shape

(202, 6)

## Creating Pipelines

In [15]:
from sklearn.preprocessing import OneHotEncoder

In [16]:
# One-hot encoder for the categorical columns. With handle_unknown="ignore",
# a category not seen during fit does not raise an error at transform time.
one_hot_encoding = OneHotEncoder(handle_unknown="ignore")

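- For example, if the training data contains only the fuel types Petrol and Diesel and a new record arrives with the fuel type Electric, the unknown category is ignored (its one-hot columns are all zeros) instead of raising an error.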

In [17]:
from sklearn.compose import ColumnTransformer

In [18]:
from sklearn.pipeline import Pipeline

In [19]:
# Single-step pipeline applied to the categorical columns: one-hot encoding.
cat_transform = Pipeline(
    steps=[
        ("oheencoder", one_hot_encoding),
    ]
)

In [20]:
# Route each column group to its transformer:
#   - numerical columns are passed through unchanged
#   - categorical columns go through the one-hot encoding pipeline
numerical_step = ("numerical", "passthrough", num_vars)
categorical_step = ("categorical", cat_transform, cat_vars)
preprocesser = ColumnTransformer(transformers=[numerical_step, categorical_step])

### Linear Regression

In [21]:
from sklearn.linear_model import LinearRegression

In [22]:
# Linear regression estimator with default settings; used as the final pipeline step.
lreg = LinearRegression()

In [23]:
# End-to-end model: preprocess the raw feature columns, then fit the regressor.
lreg_pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocesser),
        ("regression", lreg),
    ]
)

In [24]:
# Display the pipeline to inspect its steps.
lreg_pipeline

In [25]:
# Fit the preprocessing steps and the regressor on the training split only.
lreg_pipeline.fit(X_train,y_train)

### Predict on Test Set

In [26]:
# Predict prices for the held-out test rows; the pipeline applies the same
# preprocessing before the regression step.
y_pred = lreg_pipeline.predict(X_test)

In [27]:
from sklearn.metrics import mean_squared_error, r2_score

In [28]:
# R^2 of the predictions against the true test-set prices.
r2_score(y_true=y_test, y_pred=y_pred)

0.8746563035294356

## Predicting on New Data

In [29]:
# A single new car record, keyed by the same feature names used for training.
data = dict(
    Fuel_Type='Diesel',
    Transmission='Manual',
    Owner_Type='First',
    Age=8,
    Model='ertiga',
    KM_Driven=87,
)

In [30]:
# Wrap the record in a one-row DataFrame (row label 0) so the pipeline can
# select columns by name.
data_df = pd.DataFrame(data=data, index=[0])

In [31]:
# Display the one-row input frame.
data_df

Unnamed: 0,Fuel_Type,Transmission,Owner_Type,Age,Model,KM_Driven
0,Diesel,Manual,First,8,ertiga,87


In [32]:
# predict() returns one value per input row; the frame has a single row,
# so take the first (only) prediction and display it.
predicted_prices = lreg_pipeline.predict(data_df)
price = predicted_prices[0]
price

6.171483840956446

## Save the Pipeline

In [33]:
from joblib import dump

In [34]:
# Persist the fitted pipeline (preprocessing + regression) to disk with joblib.
# NOTE(review): the file name is spelled 'pridiction' — presumably a typo for
# 'prediction'; left unchanged because other code may load the file by this name.
dump(lreg_pipeline,'cars_pridiction.pkl')

['cars_pridiction.pkl']