In [None]:
# Mount Google Drive at /content/drive. This only works inside Google Colab;
# the CSV loaded further down is read from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

# Regression Example: Used Car Price Prediction

Regression analysis is a set of ML algorithms for estimating the relationships between a dependent (continuous) variable (also called the 'outcome' or 'response' variable) and one or more independent variables (often called 'predictors', or 'features').

Source: https://en.wikipedia.org/wiki/Regression_analysis

Other References:

https://hbr.org/2015/11/a-refresher-on-regression-analysis

### Loading the Dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn

In [None]:
# Load the used-car dataset. The path sits under /content/drive, so the Drive
# mount cell must have been run first.
cars_df = pd.read_csv( "/content/drive/MyDrive/MLDL3Days/notebooks/new_used_car.csv" )

In [None]:
type(cars_df)

In [7]:
cars_df.sample(5)

Unnamed: 0,Location,Fuel_Type,Transmission,Owner_Type,Seats,Price,mlg,age,make,model,mileage_new,engine_new,power_new,KM_Driven
428,Ahmedabad,Petrol,Manual,First,5.0,3.2,18.9,5,maruti,wagon,18.9,998.0,67.1,51
1773,Hyderabad,Diesel,Manual,First,5.0,3.5,21.1,10,maruti,ritz,21.1,1248.0,73.9,110
187,Hyderabad,Diesel,Manual,Second,5.0,3.2,21.1,10,maruti,ritz,21.1,1248.0,73.9,68
584,Pune,Diesel,Manual,Second,5.0,2.25,19.0,10,fiat,linea,19.0,1248.0,93.0,172
28,Hyderabad,Petrol,Manual,First,5.0,2.25,18.2,10,maruti,zen,18.2,998.0,67.1,39


In [10]:
cars_df.sample(5, random_state = 100)

Unnamed: 0,Location,Fuel_Type,Transmission,Owner_Type,Seats,Price,mlg,age,make,model,mileage_new,engine_new,power_new,KM_Driven
1587,Mumbai,Petrol,Manual,First,5.0,1.46,19.0,11,maruti,zen,19.0,998.0,67.1,47
1888,Mumbai,Diesel,Manual,Second,5.0,4.5,24.4,4,hyundai,xcent,24.4,1120.0,71.0,63
845,Chennai,Petrol,Automatic,First,5.0,3.96,16.95,7,hyundai,i10,16.95,1197.0,78.9,21
362,Chennai,Petrol,Manual,First,4.0,1.6,25.4,9,tata,nano,25.4,624.0,37.48,35
2858,Kochi,Diesel,Manual,Second,5.0,7.97,28.09,2,maruti,ciaz,28.09,1248.0,88.5,17


In [8]:
cars_df.head(5)

Unnamed: 0,Location,Fuel_Type,Transmission,Owner_Type,Seats,Price,mlg,age,make,model,mileage_new,engine_new,power_new,KM_Driven
0,Chennai,Petrol,Manual,First,5.0,4.5,18.2,9,honda,jazz,18.2,1199.0,88.7,46
1,Chennai,Diesel,Manual,First,7.0,6.0,20.77,8,maruti,ertiga,20.77,1248.0,88.76,87
2,Jaipur,Diesel,Manual,First,5.0,3.5,23.08,7,nissan,micra,23.08,1461.0,63.1,86
3,Chennai,Diesel,Manual,Second,5.0,1.95,22.3,8,tata,indica,22.3,1248.0,74.0,65
4,Jaipur,Diesel,Manual,First,5.0,5.6,25.2,5,maruti,swift,25.2,1248.0,74.0,64


In [9]:
cars_df.tail(5)

Unnamed: 0,Location,Fuel_Type,Transmission,Owner_Type,Seats,Price,mlg,age,make,model,mileage_new,engine_new,power_new,KM_Driven
3087,Coimbatore,Diesel,Manual,First,5.0,4.83,25.8,5,honda,amaze,25.8,1498.0,98.6,70
3088,Delhi,Diesel,Manual,First,5.0,4.75,28.4,6,maruti,swift,28.4,1248.0,74.0,27
3089,Jaipur,Diesel,Manual,First,5.0,4.0,24.4,5,hyundai,xcent,24.4,1120.0,71.0,100
3090,Kolkata,Petrol,Manual,First,5.0,2.65,18.9,7,maruti,wagon,18.9,998.0,67.1,46
3091,Hyderabad,Diesel,Manual,First,5.0,2.5,25.44,9,chevrolet,beat,25.44,936.0,57.6,47


In [11]:
cars_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3092 entries, 0 to 3091
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Location      3092 non-null   object 
 1   Fuel_Type     3092 non-null   object 
 2   Transmission  3092 non-null   object 
 3   Owner_Type    3092 non-null   object 
 4   Seats         3091 non-null   float64
 5   Price         3092 non-null   float64
 6   mlg           3092 non-null   float64
 7   age           3092 non-null   int64  
 8   make          3092 non-null   object 
 9   model         3092 non-null   object 
 10  mileage_new   3092 non-null   float64
 11  engine_new    3092 non-null   float64
 12  power_new     3092 non-null   float64
 13  KM_Driven     3092 non-null   int64  
dtypes: float64(6), int64(2), object(6)
memory usage: 338.3+ KB


In [12]:
cars_df.Location.unique()

array(['Chennai', 'Jaipur', 'Kochi', 'Bangalore', 'Kolkata', 'Hyderabad',
       'Delhi', 'Coimbatore', 'Mumbai', 'Pune', 'Ahmedabad'], dtype=object)

In [13]:
cars_df.Location.value_counts()

Hyderabad     406
Kolkata       346
Pune          341
Kochi         339
Mumbai        322
Coimbatore    305
Jaipur        275
Chennai       264
Delhi         248
Bangalore     124
Ahmedabad     122
Name: Location, dtype: int64

In [14]:
cars_df.Location.value_counts(normalize = True) * 100

Hyderabad     13.130660
Kolkata       11.190168
Pune          11.028461
Kochi         10.963777
Mumbai        10.413972
Coimbatore     9.864166
Jaipur         8.893920
Chennai        8.538163
Delhi          8.020699
Bangalore      4.010349
Ahmedabad      3.945666
Name: Location, dtype: float64

### Participant Exercise:

#### What are different transmission types and how many cars are there in the dataset from each transmission type?

In [None]:
# Histogram of KM_Driven using matplotlib's default binning.
fig, ax = plt.subplots(figsize=(15, 6))
ax.hist(cars_df.KM_Driven);

In [None]:
# Histogram of KM_Driven with explicit bin edges 0, 10, ..., 190; the same
# edges double as the x-axis tick positions.
# NOTE(review): values above the last edge (190) are not counted by hist --
# compare against cars_df.KM_Driven.max() before trusting the right tail.
km_edges = range(0, 200, 10)
fig, ax = plt.subplots(figsize=(15, 6))
ax.hist(cars_df.KM_Driven, bins=km_edges)
ax.set_xticks(km_edges);

In [None]:
cars_df.KM_Driven.min()

In [None]:
cars_df.KM_Driven.max()

### Participant Exercise:

#### Create a histogram of Price with a bin width of 1 lakh to understand the frequency distribution of sold prices.

In [None]:
# Regression plot of Price against mileage on a 200-row sample.
# random_state is fixed so the sampled rows (and therefore the plot and the
# fitted line) are the same on every run.
sn.lmplot(data = cars_df.sample(200, random_state = 100),
          x = 'mileage_new',
          y = 'Price');

In [None]:
# Regression plot of Price against KM_Driven on a 200-row sample.
# random_state is fixed so the sampled rows (and therefore the plot and the
# fitted line) are the same on every run.
sn.lmplot(data = cars_df.sample(200, random_state = 100),
          x = 'KM_Driven',
          y = 'Price');

## Building a simple linear regression model

Assumes linear relationship between features and outcome variable.

### Setting X and Y Variables

In [None]:
# Single-feature design matrix (kept 2-D as a one-column DataFrame) and the
# target series for the simple linear regression.
X = cars_df[['KM_Driven']]
y = cars_df.Price

### Splitting the dataset

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    train_size = 0.8,
                                                    random_state = 80)

In [None]:
X_train[0:10]

In [None]:
X_test.shape

In [None]:
X_train.shape

### Observing the relationship

In [None]:
# Scatter plot (no fitted line) of Price against KM_Driven on a 100-row sample.
# random_state is fixed so the same rows are plotted on every run.
sn.lmplot( data = cars_df.sample(100, random_state = 100),
           x = 'KM_Driven',
           y = 'Price',
           fit_reg = False);

### Building the model

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
lreg_v1 = LinearRegression()
lreg_v1.fit(X_train, y_train)

#### Finding the model parameters

In [None]:
lreg_v1.intercept_

In [None]:
lreg_v1.coef_

### Predicting on test set and evaluating model performance

In [None]:
y_pred = lreg_v1.predict(X_test)

In [None]:
# Side-by-side view of actual vs. predicted prices on the test set.
# The residual uses the standard definition (observed minus predicted), so a
# positive residual means the model under-predicted the price.
y_df = pd.DataFrame({"actual": y_test,
                     "predicted": y_pred,
                     "residual": y_test - y_pred})

In [None]:
y_df.sample(10, random_state = 100)

#### What is R-squared?
https://www.investopedia.com/terms/r/r-squared.asp

In [None]:
from sklearn.metrics import r2_score

In [None]:
r2_score(y_test, y_pred)

### Participants Exercise: 1

Build a model using the following four features and measure its accuracy:

- mileage_new 
- engine_new 
- power_new
- KM_Driven

## Building model with all required variables (Multiple Linear Regression)

### Feature Set Selection

In [None]:
list(cars_df.columns)

In [None]:
#x_features = ['KM_Driven', 'Fuel_Type', 'age',
#              'Transmission', 'Owner_Type', 'Seats', 
#              'make', 'mileage_new', 'engine_new', 
#              'power_new', 'Location', 'model']

x_features = ['KM_Driven', 'Fuel_Type', 'age',
              'Transmission', 'Owner_Type', 'Seats', 
              'make', 'mileage_new', 'engine_new', 
              'power_new', 'Location']

In [None]:
cat_features = ['Fuel_Type', 
                'Transmission', 'Owner_Type',
                'make', 'Location']

#cat_features = ['Fuel_Type', 
#                'Transmission', 'Owner_Type',
#                'make', 'Location', 'model']

In [None]:
# Numeric features are whatever is left in x_features after removing the
# categorical ones. Filtering the list (rather than subtracting sets) keeps
# the original x_features order, so the result is the same on every run.
num_features = [feature for feature in x_features
                if feature not in cat_features]

In [None]:
num_features

In [None]:
cars_df[x_features].info()

In [None]:
cars_df.isnull().sum()

### Dropping Null Values

In [None]:
# Keep only the modelling columns plus the target, then drop every row that
# has a missing value in any of them.
# NOTE: this rebinds cars_df, so columns outside x_features + ['Price']
# (e.g. 'model') are no longer available in later cells unless the CSV is reloaded.
cars_df = cars_df[x_features + ['Price']].dropna()

In [None]:
cars_df.shape

In [None]:
cars_df.sample(10)

### Encoding Categorical Variables

OHE: One Hot Encoding

https://machinelearningmastery.com/why-one-hot-encode-data-in-machine-learning/

In [None]:
# One-hot encode the columns listed in cat_features; the other columns of
# x_features are carried over unchanged.
encoded_cars_df = pd.get_dummies(cars_df[x_features], 
                                 columns=cat_features)

In [None]:
encoded_cars_df.sample(5)

In [None]:
encoded_cars_df.columns

In [None]:
encoded_cars_df.shape

### Setting X and y variables

In [None]:
X = encoded_cars_df
y = cars_df['Price']

### Data Splitting

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    train_size = 0.8,
                                                    random_state = 80)

In [None]:
X_train.shape

In [None]:
X_test.shape

### Multiple Linear Regression Models

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
lreg_v1 = LinearRegression()

In [None]:
lreg_v1.fit(X_train, y_train)

### Understanding model parameters

In [None]:
lreg_v1.intercept_

In [None]:
lreg_v1.coef_

In [None]:
# Pair each training column with its fitted coefficient, rounded to 3 decimals.
rounded_coefs = np.round(lreg_v1.coef_, 3)
{feature: coef for feature, coef in zip(X_train.columns, rounded_coefs)}

### Predict on test set

In [None]:
y_pred = lreg_v1.predict(X_test)

In [None]:
# Side-by-side view of actual vs. predicted prices on the test set.
# The residual uses the standard definition (observed minus predicted), so a
# positive residual means the model under-predicted the price.
y_df = pd.DataFrame({"actual": y_test,
                     "predicted": y_pred,
                     "residual": y_test - y_pred})

In [None]:
y_df.sample(10, random_state = 100)

In [None]:
r2_score(y_test, y_pred)

### Measuring Accuracy

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
mse_v1 = mean_squared_error(y_test, y_pred)

In [None]:
mse_v1

In [None]:
rmse_v1 = np.sqrt(mse_v1)

In [None]:
rmse_v1

### Participant Exercise: 2

Take a different training set, build a model and measure its accuracy. But how do we sample different training and test sets?
- Change the random_state to a different number when splitting into training and test sets, then measure the r2 value.
- Repeat the above process for 5 different random_states and make a note of the r2 values.

### K-FOLD Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of a fresh LinearRegression on the training split
# only; each fold is scored with R-squared and the mean over the folds is
# displayed as the cell output.
scores = cross_val_score(LinearRegression(),
                         X_train,
                         y_train,
                         cv = 10,
                         scoring = 'r2')
scores.mean()

In [None]:
scores

In [None]:
scores.std()

In [None]:
r2_score(y_test, y_pred)

### What are the reasons for the remaining error

1. More factors 
2. More samples 
3. Complex Models : Try other models
4. Feature Engineering - Derive new features (factors) from existing features (factors)
5. Noise (randomness)
   

### Saving the model

In [None]:
class CarPredictionModel:
    """Container that bundles a model with the metadata stored alongside it.

    The constructor keeps its three arguments unchanged as attributes:

    Attributes:
        model: the model object passed in.
        features: the feature names passed in.
        rmse: the error value passed in.
    """

    def __init__(self, model, features, rmse):
        self.model, self.features, self.rmse = model, features, rmse

In [None]:
my_model = CarPredictionModel(lreg_v1, list(X_train.columns), rmse_v1)

In [None]:
my_model.rmse

In [None]:
# Uncomment this code for older version of sklearn
#from sklearn.externals import joblib
#joblib.dump(my_model, './cars.pkl')

In [None]:
from joblib import dump

In [None]:
dump(my_model, './cars.pkl')

### Participant Exercise: 3

1. Removing all cars prior to 2010
2. Add the car model (categorical variable) to the list of x features.
3. Build a new linear regression model
4. Predict on test set and measure the accuracy (RMSE and R Squared values)
5. Do the cross Validation and find the mean and std of the r2 values

## Building KNN Model

In [None]:
sn.lmplot( data = cars_df.sample(50, random_state = 80),
           x = "mileage_new",
           y = 'KM_Driven',
           fit_reg = False);

In [None]:
cars_df.sample(10)

### Scaling the data

- Min Max Scaler
- Standard Scaler

In [None]:
X.shape

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    train_size = 0.8,
                                                    random_state = 80)

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
scaler = MinMaxScaler()

In [None]:
scaler.fit(X_train)

In [None]:
# Scale both splits with the scaler that was fitted on X_train; transform()
# reuses the fitted statistics and does not refit on the test data.
x_train_scaled = scaler.transform(X_train)
x_test_scaled = scaler.transform(X_test)

In [None]:
x_train_scaled.shape

In [None]:
x_train_scaled[0:10]

### Build the model

In [None]:
from sklearn.neighbors import KNeighborsRegressor

In [None]:
knn_v1 = KNeighborsRegressor(n_neighbors=10,
                             weights='distance')

In [None]:
knn_v1.fit(x_train_scaled, y_train)

In [None]:
x_train_scaled.shape

### Predicting on test data and calculating accuracy

In [None]:
y_knn_pred = knn_v1.predict(x_test_scaled)

In [None]:
mse_knn = mean_squared_error(y_test, y_knn_pred)

In [None]:
np.sqrt(mse_knn)

In [None]:
r2_score(y_test, y_knn_pred)

### Participant Exercise: 4

Finding best params

- Iterate through a list of possible K values. For example: 3 through 15
- Build a model for each k value, predict on the test set and measure its accuracy
- Print the k value for which r2 is maximum

In [None]:
# Sweep k from 3 to 15, score each model on the test set, and report the k
# with the highest r2 (the exercise asks for the best k to be printed).
# NOTE: picking k on the test set makes the reported r2 optimistic; a
# cross-validated search on the training data avoids this.
# After the loop, knn_v1 and y_knn_pred hold the last model tried (k=15),
# not the best one.
best_k, best_r2 = None, float("-inf")
for k in range(3, 16):
    knn_v1 = KNeighborsRegressor(n_neighbors=k, weights='distance')
    knn_v1.fit(x_train_scaled, y_train)
    y_knn_pred = knn_v1.predict(x_test_scaled)
    r2 = r2_score(y_test, y_knn_pred)

    print(f"for k={k} - r2={round(r2, 4)}")

    # Strict '>' keeps the smallest k when several values tie.
    if r2 > best_r2:
        best_k, best_r2 = k, r2

print(f"best k={best_k} - r2={round(best_r2, 4)}")

### Grid Search