<h1 style = "text-align:center; font-size:300%; color:#c91d42">Cars Price Prediction</h1>

# Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures

from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error 
from sklearn.metrics import mean_absolute_error

from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import ElasticNetCV

import warnings
warnings.filterwarnings('ignore')

plt.style.use(style='ggplot')
%matplotlib inline

# Loading Training  Data

In [None]:
# Read the training CSV from the Kaggle input directory (absolute path; adjust it when running elsewhere).
data = pd.read_csv('/kaggle/input/used-cars-price-prediction/train-data.csv')

In [None]:
data.head()

In [None]:
data.info()

# Data Cleaning

### Dropping Unnecessary Columns

In [None]:
# Discard the 'Unnamed: 0' column (presumably the row index saved with the CSV — verify)
data.drop(columns='Unnamed: 0', inplace=True)

### Dropping Missing Values

In [None]:
# Percentage of missing entries in every column, rounded to two decimals
(data.isna().mean() * 100).round(2)

In [None]:
# Drop the rows that miss any of the sparsely-missing spec columns.
# 'Engine' is now listed: the subset used to repeat 'Mileage' and leave 'Engine' out,
# so rows with a missing Engine value were never dropped.
data.dropna(subset = ['Mileage', 'Engine', 'Power', 'Seats'], inplace=True)
# New_Price is removed as a whole column rather than row by row.
data.drop(['New_Price'],axis=1, inplace=True)

**We drop the missing values of Mileage, Engine, Power, and Seats because they are less than 1% of the total data.**

**We drop the column of New_Price because the missing values from it are more than 86% of the total data.**

In [None]:
# Check missing values again after the drops above
data.isnull().sum()

In [None]:
# Rows whose Mileage holds the placeholder string 'null km/kg'
data.loc[data['Mileage'].eq('null km/kg')]

In [None]:
# Rows whose Mileage holds the placeholder string 'null kmpl'
data.loc[data['Mileage'].eq('null kmpl')]

In [None]:
# Rows whose Engine holds the placeholder string 'null CC'
data.loc[data['Engine'].eq('null CC')]

In [None]:
# Rows whose Power holds the placeholder string 'null bhp'
data.loc[data['Power'].eq('null bhp')]

In [None]:
# Replace the 'null bhp' placeholder with the median of Power (a hard-coded value, not recomputed here)
data['Power'] = data['Power'].str.replace('null bhp', '113')   # 113 is the median of the Power feature, per the original author's note

### Removing the non-numeric characters from features values

In [None]:
# Remove the non-numeric characters (the unit suffixes) and convert to float64.
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a literal string
# by default, which would leave the units in place and make astype() fail.
# The raw string avoids invalid-escape warnings for \d and \. .
for spec_col in ['Mileage', 'Engine', 'Power']:
    data[spec_col] = data[spec_col].str.replace(r'[^\d\.]', '', regex=True).astype('float64')

### Removing Duplicate Values

In [None]:
# Count fully duplicated rows (this cell only reports the number; nothing is dropped here)
data.duplicated().sum()

### Detecting and Removing Outliers

In [None]:
# Box plots of the numeric specs, drawn before any outlier filtering
col = ['Kilometers_Driven', 'Mileage', 'Power', 'Engine']
# 2x2 grid: there are four features, so the former 3x2 layout left an empty third row.
# The leftover debug print of the loop counter was removed.
fig, axes = plt.subplots(2, 2, figsize=(18, 12))
for ax, v in zip(axes.flat, col):
    sns.boxplot(data = data, x = v, color = 'red', ax = ax)
plt.show()

In [None]:
# Remove outliers for Kilometers_Driven: discard rows above 100000
high_km_rows = data.index[data['Kilometers_Driven'] > 100000]
data.drop(index=high_km_rows, inplace=True)

In [None]:
# Remove outliers for Power: discard rows above 480
high_power_rows = data.index[data['Power'] > 480]
data.drop(index=high_power_rows, inplace=True)

In [None]:
# Remove outliers for Engine: discard rows above 5000
big_engine_rows = data.index[data['Engine'] > 5000]
data.drop(index=big_engine_rows, inplace=True)

In [None]:
# Same box plots again, now on the filtered data
col = ['Kilometers_Driven', 'Mileage', 'Power', 'Engine']
# 2x2 grid: there are four features, so the former 3x2 layout left an empty third row.
# The leftover debug print of the loop counter was removed.
fig, axes = plt.subplots(2, 2, figsize=(18, 12))
for ax, v in zip(axes.flat, col):
    sns.boxplot(data = data, x = v, color = 'green', ax = ax)
plt.show()

In [None]:
# Display floats with thousands separators and no decimal places
pd.set_option('display.float_format', '{:,.0f}'.format)

In [None]:
data.describe()

In [None]:
data.info()

# Feature Engineering

In [None]:
# Check Name column uniques 
data['Name'].nunique()

In [None]:
print(f"Percentage of uniques { round(data['Name'].nunique() / data.shape[0] * 100)} %")

**The "Name" feature is not useful as it stands because it has too many unique values, so let's turn it into something useful and impactful.**

In [None]:
data['Name']

**We can notice that the first word of the name is (Brand), so let's get it**

In [None]:
# Brand is the first whitespace-separated token of each car name
data['Brand'] = [car_name.split()[0] for car_name in data['Name']]

In [None]:
data['Brand'].nunique()

In [None]:
data['Brand'].unique()

**Now we don't need the 'Name' column, so let's drop it.**

In [None]:
# The Name column has been replaced by Brand, so remove it
data.drop(columns='Name', inplace=True)

In [None]:
# Reorder the columns so that the target (Price) comes last
column_order = ['Location', 'Fuel_Type', 'Transmission', 'Owner_Type', 'Brand',
                'Year', 'Kilometers_Driven', 'Mileage', 'Engine', 'Power', 'Seats', 'Price']
data = data.reindex(columns=column_order)

# Exploratory Data Analysis (EDA)

In [None]:
data.head()

## 1. Univariate Analysis

### Year

In [None]:
data['Year'].min()

In [None]:
data['Year'].max()

In [None]:
# Distribution of Year. This cell sits under the "Year" heading but used to plot
# Kilometers_Driven, which is still shown in the univariate summary grid further down.
fig, ax = plt.subplots(figsize = (12, 6))
sns.histplot(data = data, x = 'Year', kde = True, ax = ax)
plt.show()

### Mileage

In [None]:
data['Mileage'].describe()

In [None]:
# Histogram of Mileage with a KDE overlay
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(data=data, x='Mileage', kde=True, ax=ax)
plt.show()

### Engine

In [None]:
data['Engine'].describe()

In [None]:
# Histogram of Engine (16 bins) with a KDE overlay
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(data=data, x='Engine', bins=16, kde=True, ax=ax)
plt.show()

### Power

In [None]:
data['Power'].describe()

In [None]:
# Histogram of Power with a KDE overlay
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(data=data, x='Power', kde=True, ax=ax)
plt.show()

### Price

In [None]:
data['Price'].describe()

In [None]:
# Distribution of the target. This cell belongs to the "Price" section but used to
# plot Kilometers_Driven (copy-paste slip), so Price itself was never shown here.
fig, ax = plt.subplots(figsize = (12, 6))
sns.histplot(data = data, x = 'Price', kde = True, ax = ax)
plt.show()

### Location

In [None]:
data['Location'].value_counts().sort_values(ascending = False)

In [None]:
# Number of listings per Location
fig, ax = plt.subplots(figsize=(14, 6))
sns.countplot(data=data, x='Location', ax=ax)
plt.show()

### Owner Type

In [None]:
data['Owner_Type'].value_counts().sort_values(ascending = False)

In [None]:
# Number of listings per Owner_Type
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=data, x='Owner_Type', ax=ax)
plt.show()

### Fuel Type

In [None]:
data['Fuel_Type'].value_counts().sort_values(ascending = False)

In [None]:
# Number of listings per Fuel_Type
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=data, x='Fuel_Type', ax=ax)
plt.show()

### Transmission

In [None]:
data['Transmission'].value_counts().sort_values(ascending = False)

In [None]:
# Transmission split shown twice: absolute counts (left) and percentage shares (right)
transmission_counts = data['Transmission'].value_counts()
fig, (count_ax, pie_ax) = plt.subplots(1, 2, figsize=(14, 7))
sns.countplot(data=data, x='Transmission', ax=count_ax)
pie_ax.pie(transmission_counts.values, labels=transmission_counts.index, autopct='%1.1f%%')
plt.show()

### Brand

In [None]:
Brand_data = data['Brand'].value_counts().sort_values(ascending = False)
Brand_data

In [None]:
# Number of listings per Brand
fig, ax = plt.subplots(figsize=(22, 10))
sns.countplot(data=data, x='Brand', ax=ax)
plt.show()

### Seats

In [None]:
data['Seats'].value_counts().sort_values(ascending = False)

In [None]:
# Number of listings per Seats value
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=data, x='Seats', ax=ax)
plt.show()

### Conclusions for Univariate Analysis

In [None]:
# 2x3 grid of histograms (20 bins, KDE overlay), one panel per numeric column, filled row by row
numeric_columns = ['Year', 'Kilometers_Driven', 'Mileage', 'Engine', 'Power', 'Price']
fig, ax = plt.subplots(2, 3, figsize=(20, 14))
for panel, column in zip(ax.flat, numeric_columns):
    sns.histplot(data=data, x=column, bins=20, kde=True, ax=panel)
plt.show()

In [None]:
# 2x3 grid of count plots, one panel per categorical/discrete column, filled row by row
discrete_columns = ['Transmission', 'Fuel_Type', 'Owner_Type', 'Location', 'Brand', 'Seats']
fig, ax = plt.subplots(2, 3, figsize=(24, 18))
for panel, column in zip(ax.flat, discrete_columns):
    sns.countplot(data=data, x=column, ax=panel)
plt.show()

## 2. Bivariate Analysis

In [None]:
# Correlation heatmap restricted to the numeric columns: DataFrame.corr() raises on the
# string columns (Location, Fuel_Type, ...) in pandas >= 2.0, while older versions
# silently skipped them. Selecting the numeric dtypes first works on every version.
numeric_corr = data.select_dtypes('number').corr()
plt.figure(figsize = (14, 8))
sns.heatmap(numeric_corr, annot = True)
plt.show()

### Year & Price

In [None]:
# Price against Year as a line plot
fig, ax = plt.subplots(figsize=(14, 8))
sns.lineplot(data=data, x='Year', y='Price', linewidth=4, color='#f01528', ax=ax)
plt.show()

In [None]:
# Price against Year, one marker per car
fig, ax = plt.subplots(figsize=(14, 8))
sns.scatterplot(data=data, x='Year', y='Price', linewidth=4, color='#f01528', edgecolor="#f01528", ax=ax)
plt.show()

#### Note:
**There is a positive relationship between the price of the car and its year: the more recent the year, the higher the price tends to be.**

### Kilometers_Driven & Price

In [None]:
# Price against Kilometers_Driven as a line plot
fig, ax = plt.subplots(figsize=(14, 8))
sns.lineplot(data=data, x='Kilometers_Driven', y='Price', linewidth=2, color='#0080fc', ax=ax)
plt.show()

In [None]:
# Price against Kilometers_Driven, one marker per car
fig, ax = plt.subplots(figsize=(14, 8))
sns.scatterplot(data=data, x='Kilometers_Driven', y='Price', linewidth=4, color='#0080fc', edgecolor="#0080fc", ax=ax)
plt.show()

#### Note:
**The relation between kilometers driven and car price is a measure of how much a car has been driven. In general, as the number of kilometers driven increases, the price of the car tends to decrease.**

### Mileage & Price

In [None]:
# Price against Mileage as a line plot
fig, ax = plt.subplots(figsize=(14, 8))
sns.lineplot(data=data, x='Mileage', y='Price', linewidth=2, color='#8b21b0', ax=ax)
plt.show()

In [None]:
# Price against Mileage, one marker per car
fig, ax = plt.subplots(figsize=(14, 8))
sns.scatterplot(data=data, x='Mileage', y='Price', linewidth=4, color='#8b21b0', edgecolor="#8b21b0", ax=ax)
plt.show()

#### Note: 
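**The relation between mileage and car price is negative, meaning that as the mileage of a car increases, the price of the car tends to decrease. Note that Mileage in this dataset is fuel economy (the raw values carry the units kmpl and km/kg), not the distance driven, so wear and tear on the engine, transmission, and other components does not explain this trend; a plausible explanation is that cars with larger, more powerful engines are both more expensive and less fuel-efficient.**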

### Engine & Price

In [None]:
# Price against Engine as a line plot
fig, ax = plt.subplots(figsize=(14, 8))
sns.lineplot(data=data, x='Engine', y='Price', linewidth=2, color='#800000', ax=ax)
plt.show()

In [None]:
# Engine vs. Price scatter. The x column used to be 'Mileage' (copy-paste slip), which
# merely repeated the Mileage & Price plot inside the Engine section.
fig, ax = plt.subplots(figsize = (14, 8))
sns.scatterplot(data = data, x = 'Engine', y = 'Price', linewidth = 4, color = '#800000', edgecolor = "#800000", ax = ax)
plt.show()

#### Note:
**The relation between engine and car price is positive, meaning that the car engine is one of the factors that can impact the price of a car. Generally, a more powerful engine will increase the price of the car.**

### Power & Price

In [None]:
# Price against Power as a line plot
fig, ax = plt.subplots(figsize=(14, 8))
sns.lineplot(data=data, x='Power', y='Price', linewidth=2, color='#d13d59', ax=ax)
plt.show()

In [None]:
# Price against Power, one marker per car
fig, ax = plt.subplots(figsize=(14, 8))
sns.scatterplot(data=data, x='Power', y='Price', linewidth=4, color='#d13d59', edgecolor="#d13d59", ax=ax)
plt.show()

#### Note:
**The relation between the power of a car and the price of a car is positive, meaning that cars with more power tend to be more expensive than cars with less power. This is because higher-powered cars often have larger engines and more advanced technology, which can increase their manufacturing costs.**

### Seats & Price

In [None]:
# Price against Seats, one marker per car
fig, ax = plt.subplots(figsize=(14, 8))
sns.scatterplot(data=data, x='Seats', y='Price', linewidth=4, color='#1a0000', edgecolor="#1a0000", ax=ax)
plt.show()

#### Note:
**The number of seats in a car can affect its price, cars that have 5 seats have the highest price, followed by cars that have 4 and 7 seats.**

### Fuel_Type & Price

In [None]:
data_fuel_price = data.groupby('Fuel_Type')['Price'].median().sort_values(ascending = False)
data_fuel_price

In [None]:
# Median Price per Fuel_Type (values computed in the previous cell)
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=data_fuel_price.index, y=data_fuel_price.values, ax=ax)
ax.set_ylabel("Price")
plt.show()

#### Note:
**The cars that use fuel-type diesel have higher prices.**

### Owner_Type & Price

In [None]:
data_owner_price = data.groupby('Owner_Type')['Price'].median().sort_values(ascending = False)
data_owner_price

In [None]:
# Median Price per Owner_Type (values computed in the previous cell)
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=data_owner_price.index, y=data_owner_price.values, ax=ax)
ax.set_ylabel("Price")
plt.show()

#### Note:
**The cars that had only one owner have a higher price than the cars that had multiple owners.**

### Location & Price

In [None]:
data_loca_price = data.groupby('Location')['Price'].median().sort_values(ascending = False)
data_loca_price

In [None]:
# Median Price per Location (values computed in the previous cell)
fig, ax = plt.subplots(figsize=(14, 6))
sns.barplot(x=data_loca_price.index, y=data_loca_price.values, ax=ax)
ax.set_ylabel("Price")
plt.show()

#### Notes:
**Cars in Coimbatore are higher priced than cars in other regions.**

**Cars in Ahmedabad, Hyderabad, Mumbai, and Delhi are the same price.**

### Transmission & Price

In [None]:
data_trans_price = data.groupby('Transmission')['Price'].median().sort_values(ascending = False)
data_trans_price

In [None]:
# Median Price per Transmission (values computed in the previous cell)
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=data_trans_price.index, y=data_trans_price.values, ax=ax)
ax.set_ylabel("Price")
plt.show()

#### Note:
**The cars that have a transmission automatic are higher priced than the cars that have a transmission Manual.**

### Brand & Price

In [None]:
data_brand_price = data.groupby('Brand')['Price'].median().sort_values(ascending = False).nlargest(30)
data_brand_price

In [None]:
# Median Price per Brand for the brands selected in the previous cell
fig, ax = plt.subplots(figsize=(16, 6))
sns.barplot(x=data_brand_price.index, y=data_brand_price.values, ax=ax)
ax.set_ylabel("Price")
plt.show()

#### Note:
**Porsche cars have the highest median price compared to the other brands, followed by the brand 'Land' (presumably Land Rover, since Brand keeps only the first word of the car name).**

### Conclusions for Bivariate Analysis

In [None]:
# 2x3 summary grid: five line plots of Price against a numeric feature, each in the colour
# used in its own section, plus a 2-D histogram of Seats vs. Price in the last panel
line_specs = [
    ('Year', '#f01528'),
    ('Kilometers_Driven', '#0080fc'),
    ('Mileage', '#8b21b0'),
    ('Engine', '#800000'),
    ('Power', '#d13d59'),
]
fig, ax = plt.subplots(2, 3, figsize=(18, 12))
for panel, (feature, shade) in zip(ax.flat, line_specs):
    sns.lineplot(data=data, x=feature, y='Price', linewidth=2, color=shade, ax=panel)
sns.histplot(data=data, x='Seats', y='Price', color='#1a0000', ax=ax[1, 2])
plt.show()

In [None]:
# 2x2 grid of the median-price bar charts (fuel type, owner type, transmission, location),
# filled row by row from the series computed in the sections above
median_price_by = [data_fuel_price, data_owner_price, data_trans_price, data_loca_price]
fig, ax = plt.subplots(2, 2, figsize = (24, 18))
for panel, medians in zip(ax.flat, median_price_by):
    sns.barplot(x = medians.index, y = medians.values, ax = panel)
    # every panel shows a price; the transmission panel used to be labelled 'R&Price'
    panel.set_ylabel('Price', size = 12)
plt.show()

## 3. Multivariate Analysis

In [None]:
sns.pairplot(data)

### Fuel_Type & Transmission & Engine & Power & Price

In [None]:
data.groupby(['Fuel_Type', 'Transmission'])['Engine'].median().sort_values(ascending = False)

In [None]:
data.groupby(['Fuel_Type', 'Transmission'])['Power'].median().sort_values(ascending = False)

In [None]:
data.groupby(['Fuel_Type', 'Transmission'])['Price'].median().sort_values(ascending = False)

In [None]:
# Engine, Power and Price by Fuel_Type, split by Transmission (one panel per target)
fig, ax = plt.subplots(1, 3, figsize=(16, 7))
for panel, target in zip(ax, ['Engine', 'Power', 'Price']):
    sns.barplot(data=data, x='Fuel_Type', y=target, hue='Transmission', ax=panel)
plt.show()

#### Note:
**The cars that use fuel-type diesel and have a transmission automatic have a higher engine equal to 2143CC,  higher power equal to 174bhp, and the price of these cars is very high.**

### Owner_Type & Transmission  & Fuel_Type & Price

In [None]:
data.groupby(['Owner_Type', 'Transmission'])['Price'].median().sort_values(ascending = False)

In [None]:
data.groupby(['Owner_Type', 'Fuel_Type'])['Price'].median().sort_values(ascending = False)

In [None]:
# Price by Owner_Type, split first by Transmission and then by Fuel_Type
fig, ax = plt.subplots(1, 2, figsize=(16, 7))
for panel, split_by in zip(ax, ['Transmission', 'Fuel_Type']):
    sns.barplot(data=data, x='Owner_Type', y='Price', hue=split_by, ax=panel)
plt.show()

#### Note:
**Cars that use diesel, had only one owner, and have an automatic transmission are priced very high compared to cars that had multiple owners, have a manual transmission, and use a different fuel type.**

### Location & Transmission & Fuel_Type & Price

In [None]:
data.groupby(['Location', 'Transmission'])['Price'].median().sort_values(ascending = False)

In [None]:
data.groupby(['Location', 'Fuel_Type'])['Price'].median().sort_values(ascending = False)

In [None]:
# Price by Location, split first by Transmission (top) and then by Fuel_Type (bottom)
fig, ax = plt.subplots(2, 1, figsize=(16, 12))
for panel, split_by in zip(ax, ['Transmission', 'Fuel_Type']):
    sns.barplot(data=data, x='Location', y='Price', hue=split_by, ax=panel)
plt.show()

#### Note:
**The cars in Coimbatore that use fuel-type diesel and have a transmission automatic have higher prices than cars in other regions.**

# Data Pre-processing

## 1. Splitting the Features

In [None]:
# Build the three feature lists fed to the ColumnTransformer.
# Numerical: every numeric column except the target.
num_features = data.select_dtypes('number').columns.tolist()
num_features.remove('Price')

# Ordinal: Owner_Type only; Nominal: every other string column.
ord_features = ['Owner_Type']
nom_features = data.select_dtypes('object').columns.tolist()
nom_features.remove('Owner_Type')

print(f'The Numerical Features: {num_features}')
print(f'The Nominal Features: {nom_features}')
print(f'The Ordinal Features: {ord_features}')

## 2. Splitting the data to training data and testing data

In [None]:
# Work on a copy so the cleaned frame used for the EDA stays untouched
data_modeling = data.copy()

# Target first, then the feature matrix (everything except Price)
y = data_modeling['Price']
X = data_modeling.drop(columns='Price')

# 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
X_train[X_train['Brand'] == 'Isuzu']

In [None]:
X_test[X_test['Brand'] == 'Isuzu']

In [None]:
# Drop the test rows whose Brand never occurs in X_train, together with their targets.
# The rows are selected by brand rather than by the hard-coded index label 2900, so the
# cell stays correct when the cleaning steps or the split change, and it can be re-run
# safely (dropping label 2900 a second time raised a KeyError).
seen_brand = X_test['Brand'].isin(X_train['Brand'].unique())
X_test = X_test[seen_brand]
y_test = y_test[seen_brand]

**The "Isuzu" brand is not in X_train, so it hasn't been encoded because it is not recognized in the transformation pipeline training, so we dropped it from X_test and we dropped its target from y_test.**

## 3. Transformation Pipeline

In [None]:
# numerical pipeline: median imputation, then standardisation
num_pipeline = make_pipeline(
                             SimpleImputer(strategy = 'median'),
                             StandardScaler()
                            )

# nominal pipeline: most-frequent imputation, then one-hot encoding.
# handle_unknown='ignore' encodes a category that was not seen during fit as an all-zero
# block instead of raising, so the saved preprocessor survives a new brand or location.
nom_pipeline = make_pipeline(
                             SimpleImputer(strategy = 'most_frequent'),
                             OneHotEncoder(handle_unknown = 'ignore')
                            )

# ordinal pipeline: most-frequent imputation, then ordinal encoding in ownership order.
# Without explicit categories OrdinalEncoder sorts the labels alphabetically
# ('First' < 'Fourth & Above' < 'Second' < 'Third'), which is not an ordinal scale.
# NOTE(review): assumes Owner_Type holds exactly these four labels — confirm with
# data['Owner_Type'].unique(); any other label makes fit() raise a ValueError.
OWNER_TYPE_ORDER = ['First', 'Second', 'Third', 'Fourth & Above']
ord_pipeline = make_pipeline(
                             SimpleImputer(strategy = 'most_frequent'),
                             OrdinalEncoder(categories = [OWNER_TYPE_ORDER])
                            )

# Combine the numerical, nominal and ordinal pipelines into one transformer
preprocessor = ColumnTransformer([
                                  ('numerical', num_pipeline, num_features),
                                  ('nominal', nom_pipeline, nom_features),
                                  ('ordinal', ord_pipeline, ord_features)
                                ])

# Fit on the training data only
preprocessor.fit(X_train)

## 4. Data Transformation

In [None]:
# Transform the training data
X_train_transformed = preprocessor.transform(X_train)

# Transform the testing data
X_test_transformed = preprocessor.transform(X_test)

# Modeling

## 1. Linear Regression

In [None]:
# Linear Regression Model
lin_reg = LinearRegression()

# Model Fitting
lin_reg.fit(X_train_transformed, y_train)

In [None]:
# Training-set metrics of the Linear Regression model

# Predictions of X_train (the variable keeps its original spelling because the summary cell reads it)
lr_pred_tarin = lin_reg.predict(X_train_transformed)

print("Training Accuracy of Linear Regression: \n")
print(f'Mean Absolute Error      :  {mean_absolute_error(y_train, lr_pred_tarin)}')
print(f'Root Mean Squared  Error :  {np.sqrt(mean_squared_error(y_train, lr_pred_tarin))}')
# label spelling fixed ("R-Squered" -> "R-Squared")
print(f'R-Squared Training       :  {round(r2_score(y_train, lr_pred_tarin) * 100, 2)} %')

In [None]:
# Cross Validation for Linear Regression Model
lin_reg_scores = cross_val_score(lin_reg, X_train_transformed, y_train, scoring = 'r2', cv = 8)

print(f'Linear Regression Accuracy          :  {lin_reg_scores}')
print(f'Linear Regression Standard Deviation:  {round(lin_reg_scores.std(), 2)}')
print(f'Linear Regression Accuracy Mean     :  {round(lin_reg_scores.mean() * 100, 2)} %')

In [None]:
# Test-set metrics of the Linear Regression model

# Predictions of X_test
lr_pred_test = lin_reg.predict(X_test_transformed)

# header and label spelling fixed ("Tasting" -> "Testing", "R-Squered" -> "R-Squared")
print("Testing Accuracy for Linear Regression: \n")
print(f'Mean Absolute Error      :  {mean_absolute_error(y_test, lr_pred_test)}')
print(f'Root Mean Squared  Error :  {np.sqrt(mean_squared_error(y_test, lr_pred_test))}')
print(f'R-Squared Testing        :  {round(r2_score(y_test, lr_pred_test) * 100, 2)} %')

## 2. Polynomial Regression

In [None]:
# Linear Regression Model with Polynomial Features
poly_reg = Pipeline([
    
                      ('poly_features', PolynomialFeatures(degree = 2)),
                      ('lin_reg', LinearRegression())
    
                    ])

# Model Fitting
poly_reg.fit(X_train_transformed, y_train)

In [None]:
# Training-set metrics of the Polynomial Regression model

# Predictions of X_train
poly_pred_train = poly_reg.predict(X_train_transformed)

print("Training Accuracy for Polynomial Regression: \n")
print(f'Mean Absolute Error      :  {mean_absolute_error(y_train, poly_pred_train)}')
print(f'Root Mean Squared  Error :  {np.sqrt(mean_squared_error(y_train, poly_pred_train))}')
# label spelling fixed ("R-Squered" -> "R-Squared")
print(f'R-Squared Training       :  {round(r2_score(y_train, poly_pred_train) * 100, 2)} %')

In [None]:
# Cross Validation for Polynomial Regression Model
poly_reg_scores = cross_val_score(poly_reg, X_train_transformed, y_train, scoring = 'r2', cv = 8)

print(f'Polynomial Regression Accuracy          :  {poly_reg_scores}')
print(f'Polynomial Regression Standard Deviation:  {round(poly_reg_scores.std(), 2)}')
print(f'Polynomial Regression Accuracy Mean     :  {round(poly_reg_scores.mean() * 100, 2)} %')

In [None]:
# Test-set metrics of the Polynomial Regression model

# Predictions of X_test
poly_pred_test = poly_reg.predict(X_test_transformed)

# header and label spelling fixed ("Tasting" -> "Testing", "R-Squered" -> "R-Squared")
print("Testing Accuracy for Polynomial Regression: \n")
print(f'Mean Absolute Error      :  {mean_absolute_error(y_test, poly_pred_test)}')
print(f'Root Mean Squared  Error :  {np.sqrt(mean_squared_error(y_test, poly_pred_test))}')
print(f'R-Squared Testing        :  {round(r2_score(y_test, poly_pred_test) * 100, 2)} %')

## 3. Ridge Regression

In [None]:
# Ridge Regression Model with Polynomial Features
ridge_reg = Pipeline([
    
                            ('poly_features', PolynomialFeatures(degree = 2)),
                            ('ridge', Ridge(alpha = 0.5))
                        ])

# Model Fitting
ridge_reg.fit(X_train_transformed, y_train)

In [None]:
# Training-set metrics of the Ridge Regression model

# Predictions of X_train
ridge_pred_train = ridge_reg.predict(X_train_transformed)

print("Training Accuracy for Ridge Regression: \n")
print(f'Mean Absolute Error      :  {mean_absolute_error(y_train, ridge_pred_train)}')
print(f'Root Mean Squared  Error :  {np.sqrt(mean_squared_error(y_train, ridge_pred_train))}')
# label spelling and column alignment fixed ("R-Squered" -> "R-Squared")
print(f'R-Squared Training       :  {round(r2_score(y_train, ridge_pred_train) * 100, 2)} %')

In [None]:
# Cross Validation for Ridge Regression Model
ridge_reg_scores = cross_val_score(ridge_reg, X_train_transformed, y_train, scoring = 'r2', cv = 8)

print(f'Ridge Regression Accuracy          :  {ridge_reg_scores}')
print(f'Ridge Regression Standard Deviation:  {round(ridge_reg_scores.std(), 2)}')
print(f'Ridge Regression Accuracy Mean     :  {round(ridge_reg_scores.mean() * 100, 2)} %')

In [None]:
# Test-set metrics of the Ridge Regression model

# Predictions of X_test
ridge_pred_test = ridge_reg.predict(X_test_transformed)

# header and label spelling fixed ("Tasting" -> "Testing", "R-Squered" -> "R-Squared")
print("Testing Accuracy for Ridge Regression: \n")
print(f'Mean Absolute Error      :  {mean_absolute_error(y_test, ridge_pred_test)}')
print(f'Root Mean Squared  Error :  {np.sqrt(mean_squared_error(y_test, ridge_pred_test))}')
print(f'R-Squared Testing        :  {round(r2_score(y_test, ridge_pred_test) * 100, 2)} %')

## 4. RidgeCV Regression

In [None]:
# RidgeCV Regression Model with Polynomial Features
ridgecv_reg = Pipeline([
    
                              ('poly_features', PolynomialFeatures(degree = 2)),
                              ('ridgecv', RidgeCV(alphas=[0.1, 0.25, 0.5, 0.75, 1.0, 2.0, 5.0]))
    
                            ])

# Model Fitting
ridgecv_reg.fit(X_train_transformed, y_train)

In [None]:
# Training-set metrics of the RidgeCV Regression model

# Predictions of X_train
ridgecv_pred_train = ridgecv_reg.predict(X_train_transformed)

print("Training Accuracy for RidgeCV Regression: \n")
print(f'Mean Absolute Error      :  {mean_absolute_error(y_train, ridgecv_pred_train)}')
print(f'Root Mean Squared  Error :  {np.sqrt(mean_squared_error(y_train, ridgecv_pred_train))}')
# label spelling and column alignment fixed ("R-Squered" -> "R-Squared")
print(f'R-Squared Training       :  {round(r2_score(y_train, ridgecv_pred_train) * 100, 2)} %')

In [None]:
# Cross Validation for RidgeCV Regression Model
ridgecv_reg_scores = cross_val_score(ridgecv_reg, X_train_transformed, y_train, scoring = 'r2', cv = 8)

print(f'RidgeCV Regression Accuracy          :  {ridgecv_reg_scores}')
print(f'RidgeCV Regression Standard Deviation:  {round(ridgecv_reg_scores.std(), 2)}')
print(f'RidgeCV Regression Accuracy Mean     :  {round(ridgecv_reg_scores.mean() * 100, 2)} %')

In [None]:
# Test-set metrics of the RidgeCV Regression model

# Predictions of X_test
ridgecv_pred_test = ridgecv_reg.predict(X_test_transformed)

# header and label spelling fixed ("Tasting" -> "Testing", "R-Squered" -> "R-Squared")
print("Testing Accuracy for RidgeCV Regression: \n")
print(f'Mean Absolute Error      :  {mean_absolute_error(y_test, ridgecv_pred_test)}')
print(f'Root Mean Squared  Error :  {np.sqrt(mean_squared_error(y_test, ridgecv_pred_test))}')
print(f'R-Squared Testing        :  {round(r2_score(y_test, ridgecv_pred_test) * 100, 2)} %')

## 5. Lasso Regression

In [None]:
# Lasso Regression Model with Polynomial Features
lasso_reg = Pipeline([
    
                            ('poly_features', PolynomialFeatures(degree = 2)),
                            ('lasso', Lasso(alpha = 0.5))
    
                        ])

# Model Fitting
lasso_reg.fit(X_train_transformed, y_train)

In [None]:
# Training-set metrics of the Lasso Regression model

# Predictions of X_train
lasso_pred_train = lasso_reg.predict(X_train_transformed)

print("Training Accuracy for Lasso Regression: \n")
print(f'Mean Absolute Error      :  {mean_absolute_error(y_train, lasso_pred_train)}')
print(f'Root Mean Squared  Error :  {np.sqrt(mean_squared_error(y_train, lasso_pred_train))}')
# the value is an R-squared score, not an error; label aligned with the other models
print(f'R-Squared Training       :  {round(r2_score(y_train, lasso_pred_train) * 100, 2)} %')

In [None]:
# Cross Validation for Lasso Regression Model.
# The Lasso pipeline itself is scored here; the call used to pass ridgecv_reg, so the
# "Lasso" figures were in fact a second run of the RidgeCV cross-validation.
lasso_reg_scores = cross_val_score(lasso_reg, X_train_transformed, y_train, scoring = 'r2', cv = 8)

print(f'Lasso Regression Accuracy          :  {lasso_reg_scores}')
print(f'Lasso Regression Standard Deviation:  {round(lasso_reg_scores.std(), 2)}')
print(f'Lasso Regression Accuracy Mean     :  {round(lasso_reg_scores.mean() * 100, 2)} %')

In [None]:
# Test-set metrics of the Lasso Regression model

# Predictions of X_test
lasso_pred_test = lasso_reg.predict(X_test_transformed)

# header and label spelling fixed ("Tasting" -> "Testing", "R-Squered" -> "R-Squared")
print("Testing Accuracy for Lasso Regression: \n")
print(f'Mean Absolute Error      :  {mean_absolute_error(y_test, lasso_pred_test)}')
print(f'Root Mean Squared  Error :  {np.sqrt(mean_squared_error(y_test, lasso_pred_test))}')
print(f'R-Squared Testing        :  {round(r2_score(y_test, lasso_pred_test) * 100, 2)} %')

## 6. LassoCV Regression

In [None]:
# LassoCV Regression Model with Polynomial Features
lassocv_reg = Pipeline([
    
                            ('poly_features', PolynomialFeatures(degree = 2)),
                            ('lassocv', LassoCV(alphas=[0.1, 0.25, 0.5, 0.75, 1.0, 2.0, 5.0]))
                          
                            ])

# Model Fitting
lassocv_reg.fit(X_train_transformed, y_train)

In [None]:
# Training-set metrics of the LassoCV Regression model

# Predictions of X_train
lassocv_pred_train = lassocv_reg.predict(X_train_transformed)

print("Training Accuracy for LassoCV Regression: \n")
print(f'Mean Absolute Error      :  {mean_absolute_error(y_train, lassocv_pred_train)}')
print(f'Root Mean Squared  Error :  {np.sqrt(mean_squared_error(y_train, lassocv_pred_train))}')
# label spelling fixed and aligned with the other models ("R-Squered Train" -> "R-Squared Training")
print(f'R-Squared Training       :  {round(r2_score(y_train, lassocv_pred_train) * 100, 2)} %')

In [None]:
# Cross Validation for LassoCV Regression Model
lassocv_reg_scores = cross_val_score(lassocv_reg, X_train_transformed, y_train, scoring = 'r2', cv = 8)

print(f'LassoCV Regression Accuracy          :  {lassocv_reg_scores}')
print(f'LassoCV Regression Standard Deviation:  {round(lassocv_reg_scores.std(), 2)}')
print(f'LassoCV Regression Accuracy Mean     :  {round(lassocv_reg_scores.mean() * 100, 2)} %')

In [None]:
# Test-set metrics of the LassoCV Regression model

# Predictions of X_test
lassocv_pred_test = lassocv_reg.predict(X_test_transformed)

# header and label spelling fixed ("Tasting" -> "Testing", "R-Squered" -> "R-Squared")
print("Testing Accuracy for LassoCV Regression: \n")
print(f'Mean Absolute Error      :  {mean_absolute_error(y_test, lassocv_pred_test)}')
print(f'Root Mean Squared  Error :  {np.sqrt(mean_squared_error(y_test, lassocv_pred_test))}')
print(f'R-Squared Testing        :  {round(r2_score(y_test, lassocv_pred_test) * 100, 2)} %')

## 7. ElasticNet Regression

In [None]:
# ElasticNet Regression Model with Polynomial Features
elasticnet_reg = Pipeline([
    
                                ('poly_features', PolynomialFeatures(degree = 2)),
                                ('elasticnt', ElasticNet(alpha = 0.5))
    
                              ])

# Model Fitting
elasticnet_reg.fit(X_train_transformed, y_train)

In [None]:
# Training-set metrics of the ElasticNet Regression model

# Predictions of X_train
elasticnet_pred_train = elasticnet_reg.predict(X_train_transformed)

print("Training Accuracy for ElasticNet Regression: \n")
print(f'Mean Absolute Error      :  {mean_absolute_error(y_train, elasticnet_pred_train)}')
print(f'Root Mean Squared  Error :  {np.sqrt(mean_squared_error(y_train, elasticnet_pred_train))}')
# label spelling fixed and aligned with the other models ("R-Squered Train" -> "R-Squared Training")
print(f'R-Squared Training       :  {round(r2_score(y_train, elasticnet_pred_train) * 100, 2)} %')

In [None]:
# Cross Validation for ElasticNet Regression Model
elasticnet_reg_scores = cross_val_score(elasticnet_reg, X_train_transformed, y_train, scoring = 'r2', cv = 8)

print(f'ElasticNet Regression Accuracy          :  {elasticnet_reg_scores}')
print(f'ElasticNet Regression Standard Deviation:  {round(elasticnet_reg_scores.std(), 2)}')
print(f'ElasticNet Regression Accuracy Mean     :  {round(elasticnet_reg_scores.mean() * 100, 2)} %')

In [None]:
# Test-set metrics of the ElasticNet Regression model

# Predictions of X_test
elasticnet_pred_test = elasticnet_reg.predict(X_test_transformed)

# The header used to read "Tasting Accuracy for LassoCV Regression": wrong model name
# (copy-paste from the previous section) and a typo. Label spelling fixed as well.
print("Testing Accuracy for ElasticNet Regression: \n")
print(f'Mean Absolute Error      :  {mean_absolute_error(y_test, elasticnet_pred_test)}')
print(f'Root Mean Squared  Error :  {np.sqrt(mean_squared_error(y_test, elasticnet_pred_test))}')
print(f'R-Squared Testing        :  {round(r2_score(y_test, elasticnet_pred_test) * 100, 2)} %')

## 8. ElasticNetCV Regression

In [None]:
# ElasticNetCV Regression Model with Polynomial Features
elasticnetcv_reg = Pipeline([
    
                                  ('poly_features', PolynomialFeatures(degree = 2)),
                                  ('elasticntcv', ElasticNetCV())
                               
                                ])
 
# Model Fitting
elasticnetcv_reg.fit(X_train_transformed, y_train)

In [None]:
# Training-set metrics of the ElasticNetCV Regression model

# Predictions of X_train
elasticnetcv_pred_train = elasticnetcv_reg.predict(X_train_transformed)

print("Training Accuracy for ElasticNetCV Regression: \n")
print(f'Mean Absolute Error      :  {mean_absolute_error(y_train, elasticnetcv_pred_train)}')
print(f'Root Mean Squared  Error :  {np.sqrt(mean_squared_error(y_train, elasticnetcv_pred_train))}')
# label spelling fixed and aligned with the other models ("R-Squered Train" -> "R-Squared Training")
print(f'R-Squared Training       :  {round(r2_score(y_train, elasticnetcv_pred_train) * 100, 2)} %')

In [None]:
# Cross Validation for ElasticNetCV Regression Model (8 folds, R-squared scoring)
elasticnetcv_reg_scores = cross_val_score(elasticnetcv_reg, X_train_transformed, y_train, scoring = 'r2', cv = 8)

print(f'ElasticNetCV Regression Accuracy          :  {elasticnetcv_reg_scores}')
print(f'ElasticNetCV Regression Standard Deviation:  {round(elasticnetcv_reg_scores.std(), 2)}')
print(f'ElasticNetCV Regression Accuracy Mean     :  {round(elasticnetcv_reg_scores.mean() * 100, 2)} %')

In [None]:
# Test-set metrics of the ElasticNetCV Regression model

# Predictions of X_test
elasticnetcv_pred_test = elasticnetcv_reg.predict(X_test_transformed)

# header and label spelling fixed ("Tasting" -> "Testing", "R-Squered" -> "R-Squared")
print("Testing Accuracy for ElasticNetCV Regression: \n")
print(f'Mean Absolute Error      :  {mean_absolute_error(y_test, elasticnetcv_pred_test)}')
print(f'Root Mean Squared  Error :  {np.sqrt(mean_squared_error(y_test, elasticnetcv_pred_test))}')
print(f'R-Squared Testing        :  {round(r2_score(y_test, elasticnetcv_pred_test) * 100, 2)} %')

# Summarize for Models

In [None]:
# Training vs. testing R-squared (in %) of every fitted regression model.
# (The old header mentioned a Decision Tree, which this notebook never trains, and the
# second column was misspelt 'Teasting Accuracy'.)
# Each entry maps a model label to its (train predictions, test predictions) pair.
model_predictions = {
    'Linear Regression': (lr_pred_tarin, lr_pred_test),
    'Polynomial Regression': (poly_pred_train, poly_pred_test),
    'Ridge Regression': (ridge_pred_train, ridge_pred_test),
    'RidgeCV Regression': (ridgecv_pred_train, ridgecv_pred_test),
    'Lasso Regression': (lasso_pred_train, lasso_pred_test),
    'LassoCV Regression': (lassocv_pred_train, lassocv_pred_test),
    'ElasticNet Regression': (elasticnet_pred_train, elasticnet_pred_test),
    'ElasticNetCV Regression': (elasticnetcv_pred_train, elasticnetcv_pred_test),
}

Models_Summarization = pd.DataFrame(
                {
                    'Training Accuracy': [round(r2_score(y_train, train_pred) * 100, 2)
                                          for train_pred, _ in model_predictions.values()],

                    'Testing Accuracy': [round(r2_score(y_test, test_pred) * 100, 2)
                                         for _, test_pred in model_predictions.values()],
                },

                index = list(model_predictions)
            )

In [None]:
Models_Summarization

**The regularization method that fits the data best is RidgeCV.**

# Save Model

In [None]:
# the library of save models
import joblib

In [None]:
# Save RidgeCV Regression Model
joblib.dump(ridgecv_reg, 'RidgeCV_Model.pkl')

In [None]:
# Save Transformation Pipeline
joblib.dump(preprocessor, 'preprocessor.pkl')