<a href="https://colab.research.google.com/github/pkant-0/Kegal_Notebooks/blob/main/Fuel_Efficiency_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'auto-mpg:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F5349799%2F8898225%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240708%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240708T152334Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D33fdfc0bb5d7a1ea650fd441613cacc01652eee1509b0d7a18b643e53411f249a3252fcb1427e95fc732f09f16b4753fa4239f1c2a798cf7c877c20a4b9fde9bc8c39f6458279470f67142c13fdf2eae1863b7806fc9cdc30a1111378d50ea718f1b241ef92419bef21dfa8da381a37fe13133c3877dd1328ebfde024351f02ce3daf64307bb8b6e2bc5e464a6833a09312b212dee8757291c10d2bd0419f4bdb9fe49411611a9554e0a585882d7ad8303a30581e4799adbf328de6252b7f073dd5a51821cdc11f0e7406bc129aab60dc83428ad5cc60a239130149c96e1582f1ee5fbe94cc524bb900d6e004ce4995f562ee9c3d09c1f9ff903159e0196964a'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<percent-encoded URL>" entry listed in
# DATA_SOURCE_MAPPING and unpack the archive into
# KAGGLE_INPUT_PATH/<directory>.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only; everything after it is the encoded URL.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            # The header can be absent; without it the progress bar stays
            # empty instead of crashing on int(None).
            total_bytes = int(total_length) if total_length else 0
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_bytes)) if total_bytes > 0 else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to the file and rewind so the archive
            # readers below see the complete download.
            tfile.flush()
            tfile.seek(0)
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name: reusing `tarfile` here
                # would overwrite the module and break the next iteration.
                with tarfile.open(fileobj=tfile) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # Checked before OSError because HTTPError is a subclass of it.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


# **Fuel Efficiency Prediction - Regression Using the Auto MPG dataset**



In [None]:
# Use Seaborn for the pair plots below; install it in case the runtime lacks it.
!pip install seaborn

In [None]:
from __future__ import absolute_import, division, print_function
import pathlib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers
print(tf.__version__)

In [None]:
# Fetch the raw Auto MPG data file from the UCI repository through Keras'
# download helper and store the value it returns in `dataset_path`.
dataset_path = keras.utils.get_file(
    fname="auto-mpg.data",
    origin="https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data",
)
dataset_path

In [None]:
import os

In [None]:
# Directory holding the 'auto-mpg' Kaggle dataset. A single leading slash is
# used on purpose: a path that starts with exactly two slashes is
# implementation-defined under POSIX.
dataset_path = '/kaggle/input/auto-mpg/'
files = os.listdir(dataset_path)
print(files)

In [None]:
dataset_name = 'auto-mpg.data.txt'
# Append the file name only once, so re-running this cell does not produce
# '<dir>/auto-mpg.data.txt/auto-mpg.data.txt'.
if os.path.basename(dataset_path) != dataset_name:
    dataset_path = os.path.join(dataset_path, dataset_name)

In [None]:
# Read the file into a DataFrame using pandas' default CSV settings
# (no column names or separator are specified in this call).
dataset = pd.read_csv(dataset_path)

In [None]:
# Display the DataFrame currently bound to `dataset`.
dataset

In [None]:
# Column labels passed to read_csv via `names`, so no header row is read
# from the file.
column_names = ['MPG', 'Cylinders', 'Displacement', 'Horsepower',
                'Weight', 'Acceleration', 'Model Year', 'Origin']

raw_dataset = pd.read_csv(
    dataset_path,
    names=column_names,
    # The UCI Auto MPG file marks missing values with "?". "*" is still
    # accepted so files using the previous marker keep working.
    na_values=["?", "*"],
    # Everything after a tab is ignored (presumably the trailing car-name
    # field -- verify against the data file).
    comment='\t',
    sep=" ",
    skipinitialspace=True,
)

In [None]:
# Work on a copy so `raw_dataset` stays untouched, then show the first rows.
dataset = raw_dataset.copy()
dataset.head()

In [None]:
# Show the last rows of the DataFrame.
dataset.tail()

# Data Cleaning
Inspect the data to understand its patterns and to find missing or invalid values that must be removed or corrected before modeling.

In [None]:
# Count the missing (NaN) entries in each column.
dataset.isnull().sum()

In [None]:
# Discard every row that contains at least one missing value.
dataset = dataset.dropna(axis=0, how="any")

In [None]:
import pandas as pd

# One-hot encode the numeric 'Origin' code into 'USA', 'Europe' and 'Japan'
# columns. This expects `dataset` to be the DataFrame prepared by the
# previous cells.

# Ensure that the 'Origin' column is present in the dataset
if 'Origin' in dataset.columns:
    # Translate the codes to names first, so the columns are labelled by
    # value rather than by position.
    origin_names = dataset['Origin'].map({1: 'USA', 2: 'Europe', 3: 'Japan'})

    # Request float dummies explicitly: the default dtype varies between
    # pandas versions (bool/uint8), and non-numeric columns can be left out
    # of describe(), which would break the later normalization.
    origin_dummies = pd.get_dummies(origin_names, dtype=float)

    # Always expose all three columns in a fixed order, even when one
    # origin does not occur in the data.
    origin_dummies = origin_dummies.reindex(
        columns=['USA', 'Europe', 'Japan'], fill_value=0.0
    )

    # Replace the original 'Origin' column with the dummy columns
    dataset = pd.concat([dataset.drop(columns='Origin'), origin_dummies], axis=1)

    # Display the first few rows of the dataset to verify the new columns
    print(dataset.head())
else:
    print("The 'Origin' column is not present in the dataset.")



In [None]:
# Manual one-hot encoding of the numeric 'Origin' code. The column is
# removed once it has been encoded, so this cell is guarded: when 'Origin'
# is already gone (encoded earlier, or the cell was re-run) it does nothing
# instead of raising KeyError.
if 'Origin' in dataset.columns:
    origin = dataset.pop('Origin')
    dataset['USA'] = (origin == 1).astype(float)
    dataset['Europe'] = (origin == 2).astype(float)
    dataset['Japan'] = (origin == 3).astype(float)

# Model training and evaluation

We need to split the data into a training set and a testing set.

In [None]:
# Draw a reproducible 80% sample of the rows for training; every row whose
# index label was not sampled forms the test set.
train_dataset = dataset.sample(random_state=0, frac=0.8)
train_index = train_dataset.index
test_dataset = dataset.drop(index=train_index)

# Understand the data:
Let's inspect the joint distribution of a few pairs of columns from the training set.

In [None]:
# Pairwise plots of a few training columns, with a KDE on the diagonal.
# The DataFrame and `diag_kind` are separate arguments of pairplot; wrapping
# both in one extra pair of parentheses is a syntax error.
sns.pairplot(train_dataset[["MPG", "Cylinders", "Displacement", "Weight"]], diag_kind="kde")

In [None]:
# Plot the joint distribution of selected training columns; the diagonal
# panels use kernel density estimates.
pairplot_columns = ["MPG", "Cylinders", "Displacement", "Weight"]
sns.pairplot(train_dataset[pairplot_columns], diag_kind="kde")

In [None]:
# Summary statistics of the training columns with 'MPG' removed, transposed
# so that each remaining column becomes one row.
train_stats = train_dataset.describe().drop(columns="MPG").transpose()
train_stats

# Inspect the features and split them from the labels

We need to separate the target value (the label) from the features. The label is the value the model is trained to predict.

In [None]:
# Remove the 'MPG' column from the training frame and keep it as the labels.
# pop() mutates train_dataset in place, so this cell can only run once per
# freshly built train_dataset.
train_labels = train_dataset.pop('MPG')

In [None]:
# Remove the 'MPG' column from the test frame and keep it as the labels.
# pop() mutates test_dataset in place, so this cell can only run once per
# freshly built test_dataset.
test_labels = test_dataset.pop('MPG')

# Data Normalization:

Normalizing data is a common preprocessing step in data analysis and machine learning that involves transforming data to a common scale without distorting differences in the ranges of values. Normalization ensures that each feature contributes equally to the analysis, preventing any single feature from dominating due to its scale. Here are several reasons why normalizing data is considered a good practice:
1. Improves Convergence in Gradient Descent

    Reason: Gradient descent algorithms, commonly used in training machine learning models, converge faster when the data is normalized.
    Explanation: When features are on different scales, the cost function contours can be elongated, causing the algorithm to oscillate and take longer to find the minimum. Normalized data typically results in more circular contours, leading to faster convergence.

2. Ensures Fair Comparison of Features

    Reason: Normalization ensures that features with larger magnitudes do not dominate the learning process.
    Explanation: Without normalization, features with larger ranges can disproportionately influence the model's predictions, leading to biased results. Normalizing allows each feature to contribute equally.

3. Improves Model Performance

    Reason: Many machine learning algorithms, such as k-nearest neighbors (KNN) and support vector machines (SVM), perform better with normalized data.
    Explanation: These algorithms rely on distance metrics. If features are on different scales, the distance calculations may become skewed, leading to inaccurate predictions.

4. Stabilizes Numerical Computations

    Reason: Normalizing data can prevent numerical instability in algorithms that involve matrix operations.
    Explanation: Large ranges in data can cause numerical instability, leading to overflow or underflow issues in computations. Normalized data typically results in more stable numerical operations.

5. Enhances Interpretability

    Reason: Normalized data makes it easier to interpret model coefficients and feature importance.
    Explanation: When data is normalized, the model coefficients can be directly compared to understand the relative importance of each feature.

Common Normalization Techniques

    Min-Max Scaling:
        Rescales the data to a fixed range, usually 0 to 1.
        Formula: $X' = \frac{X - X_{\min}}{X_{\max} - X_{\min}}$

    Standardization (Z-score normalization):
        Rescales data to have a mean of 0 and a standard deviation of 1.
        Formula: $X' = \frac{X - \mu}{\sigma}$

    Max Abs Scaling:
        Scales each feature by its maximum absolute value.
        Formula: $X' = \frac{X}{\max(|X|)}$

    Robust Scaling:
        Scales data using statistics that are robust to outliers, such as median and interquartile range.
        Formula: $X' = \frac{X - \mathrm{median}(X)}{\mathrm{IQR}(X)}$

In [None]:
def norm(X, stats=None):
    """Standardize *X* column-wise as ``(X - mean) / std``.

    Args:
        X: Data to normalize, e.g. a DataFrame of feature columns.
        stats: Optional table providing ``'mean'`` and ``'std'`` entries
            indexed by feature name. When omitted, the notebook-level
            ``train_stats`` is used.

    Returns:
        The standardized data.
    """
    if stats is None:
        stats = train_stats
    return (X - stats['mean']) / stats['std']

#normed_train_data = norm(train_dataset)
#normed_test_data = norm(test_dataset)

In [None]:
# Normalize the training features with norm().
normed_train_data = norm(train_dataset)
#normed_test_data = norm(test_dataset)