In [1]:
#!pip3 install -U ucimlrepo 

#### **Setup**: Load the Dataset
The helper modules for data loading and exploration are already imported. The **data_loader** module loads the "auto_mpg" dataset, and **data_explorer** extracts and prints the dataset's metadata.

In [None]:
from utils import data_loader as dl
from utils import data_explorer as de
import numpy as np

# Instantiate the project's loader helper and fetch the dataset registered as "auto_mpg".
data_loader = dl.DataLoader()
auto_mpg_data = data_loader.get_dataset("auto_mpg")
# Wrap the dataset in the explorer helper and print whatever describe_data() returns.
# NOTE(review): presumably this is the dataset's metadata summary -- confirm in utils.data_explorer.
data_explorer = de.DataExplorer(auto_mpg_data)
metadata = data_explorer.describe_data()
print(metadata)


#### **Step 1**: Check for Missing Data

Check whether there are any missing values in the dataset, and remove the rows that contain missing data.

**Task:**

- Check for columns (Features) with missing values.  **Hint:** Use the `.isna()` method to identify missing values in the DataFrame, and then use `.sum()` to count them. 
- Remove the rows with missing values. **Hint:** Use the `.dropna()` method to remove rows with missing data.
- Print out the number of rows before and after cleaning. Expected answer: `398` and `392`


In [None]:
## Check missing data: count the NaN entries in every column
missing_per_column = auto_mpg_data.isna().sum()
print("Missing values per column:")
# Only list the columns that actually contain missing values.
print(missing_per_column[missing_per_column > 0])

# Remove every row that contains at least one missing value.
# dropna() returns a new DataFrame, so the raw dataset stays available for comparison.
df = auto_mpg_data.dropna()

print(f"Number of samples Raw dataset: {auto_mpg_data.shape[0]}")
print(f"Number of samples cleaned dataset: {df.shape[0]}")

#### **Step 2**: Define your target variable (mpg) and features (all other numerical columns), and then split the data into training and testing sets.

**Task**:

- Define the target (mpg) and select the numerical columns as features.
- Split the data into training and testing sets (80% training, 20% testing). **Hint:** Use the `train_test_split` function from the `sklearn.model_selection` module.  

In [4]:
from sklearn.model_selection import train_test_split

# The regression target; every other numeric column is used as a predictor.
target = 'mpg'
numerical_cols = df.select_dtypes(include=[np.number]).columns
is_predictor = numerical_cols != target
features = numerical_cols[is_predictor]

# Plain NumPy arrays: the feature matrix and the target vector.
X = df.loc[:, features].values
y = df[target].values

# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

#### **Step 3**: Preprocess the Data (Scaling)
Preprocess the data by normalizing the features using StandardScaler. This is an important step to ensure that the features are on the same scale.

**Task**:

- Normalize the training and testing features using the StandardScaler. **Hint:** Use the `StandardScaler` from the `sklearn.preprocessing` module to scale the features.  
- Convert the data to PyTorch tensors. **Hint:** Use the `torch.tensor()` function from the PyTorch library to convert `NumPy` arrays to tensors.

In [5]:
### Pre-processing step. 

from sklearn.preprocessing import StandardScaler
import torch


# Complete the code here: Normalize the features (fit the scaler on the training data only)




# Complete the code here: Convert the data into PyTorch tensors




#### **Step 4**:  Build the Neural Network Model
Define and build the neural network model. For this task, you will use `PyTorch` to define a simple feedforward neural network with one hidden layer. The model will take the normalized features as input and output the predicted fuel efficiency (MPG).


**Task**:

- Define a neural network model using nn.Sequential() or by creating a custom class that inherits from nn.Module.
- The model should include:
    - An input layer that matches the number of features.
    - At least one hidden layer with a ReLU activation function.
    - An output layer with one neuron (since the target is a single continuous value).

In [None]:
import torch.nn as nn
import torch.optim as optim
import pandas as pd

class SimpleNN(nn.Module):
    """Feedforward regressor with a single ReLU-activated hidden layer.

    Args:
        input_size: Number of input features per sample.
        hidden_size: Number of units in the hidden layer. Defaults to 64 so
            the model can be built from the feature count alone.
        output_size: Number of output values per sample. Defaults to 1
            (a single continuous target).
    """

    def __init__(self, input_size, hidden_size=64, output_size=1):
        super().__init__()
        # input -> hidden -> ReLU -> output
        self.hidden = nn.Linear(input_size, hidden_size)
        self.activation = nn.ReLU()
        self.output = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return predictions of shape (batch, output_size) for inputs of shape (batch, input_size)."""
        return self.output(self.activation(self.hidden(x)))
    
    


# Size the network from the data: one input per feature column.
input_size = X_train.shape[1]
hidden_size = 64   # width of the hidden layer; a tunable hyperparameter
output_size = 1    # the target is a single continuous value
# SimpleNN's constructor takes all three sizes, so pass each one explicitly.
model = SimpleNN(input_size, hidden_size, output_size)

#### **Step 5**:  Train the Neural Network Model
Train the neural network model. This involves defining the loss function, specifying the optimizer, and running the training loop. The goal is to minimize the loss (difference between the predicted and actual MPG values) using gradient descent.

**Task**:

- Define a loss function (e.g., Mean Squared Error for regression tasks).
- Choose an optimizer (e.g., Adam, which is commonly used for neural network training).
- Write the training loop where the model learns from the training data, calculates the loss, and updates the weights through backpropagation.

In [None]:
# Mean squared error is the regression loss; Adam updates the model's parameters.
criterion = nn.MSELoss()          
optimizer = optim.Adam(model.parameters(), lr=0.001)  

# Loss histories consumed by the loss-curve plot below;
# append one value to each list per epoch.
loss_values_train = []
loss_values_test = []
epochs = 1000
for epoch in range(epochs):
    model.train()  # put the model in training mode before the update step
    # Complete the code: Forward pass, compute loss, backward pass, and optimizer step
    pass
    
    # Complete the code: Testing phase (no gradient computation)
    pass



import matplotlib.pyplot as plt

# Plot each loss history against its own length rather than range(epochs):
# matplotlib raises a ValueError when x and y differ in length, which happens
# whenever a list does not hold exactly one value per epoch.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(range(len(loss_values_train)), loss_values_train, label="Training Loss", color="blue")
ax.plot(range(len(loss_values_test)), loss_values_test, label="Testing Loss", color="orange")
ax.set_title("Loss Curve Over Training Epochs")
ax.set_xlabel("Epochs")
ax.set_ylabel("Loss")
ax.legend()
ax.grid(True)
plt.show()

#### **Step 6**:  Evaluate the Model's Performance
Evaluate how well the model performs on both the training data and the held-out test data, which it has not seen during training. Comparing the two helps determine whether the model generalizes well to new data or overfits the training data.

**Task**:

- Use the trained model to make predictions on the train/test data.
- Calculate the evaluation metric: Mean Squared Error (MSE).
- Visualize the results using plots like predicted vs. actual values.

In [None]:
# Put the model in evaluation mode before computing predictions.
model.eval()

# Complete the code: Make predictions on the test set (turn off gradients)
y_pred_test = []  # placeholder; replace with the model's test-set predictions
pass

# Complete the code: Compute and print the Mean Squared Error for the test set
pass

# Complete the code: Make predictions on the training set (turn off gradients)
y_pred_train = []  # placeholder; replace with the model's training-set predictions
pass

# Complete the code: Compute and print the Mean Squared Error for the training set
pass

# Convert predictions to NumPy arrays: the plotting cell below calls
# y_pred_test.flatten() and y_pred_train.flatten(), which plain Python lists
# do not provide.
# y_pred_test = ...
# y_pred_train = ...


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


# Pair the ground-truth targets with the model's predictions, one frame per split.
df_plot_train = pd.DataFrame(
    {'Actual_train': y_train, 'Predicted_train': y_pred_train.flatten()}
)
df_plot_test = pd.DataFrame(
    {'Actual_test': y_test, 'Predicted_test': y_pred_test.flatten()}
)

# Range of the actual values in each split.
actual_train = df_plot_train['Actual_train']
actual_test = df_plot_test['Actual_test']
min_train, max_train = actual_train.min(), actual_train.max()
min_test, max_test = actual_test.min(), actual_test.max()

# End points of the reference diagonal, spanning both splits.
diag_lo = min(min_train, min_test)
diag_hi = max(max_train, max_test)

plt.figure(figsize=(8, 8))
sns.scatterplot(
    data=df_plot_train, x='Actual_train', y='Predicted_train',
    color='blue', marker='o', label='Training data',
)
sns.scatterplot(
    data=df_plot_test, x='Actual_test', y='Predicted_test',
    color='green', marker='^', label='Testing data',
)
# Points on this line are perfect predictions.
plt.plot([diag_lo, diag_hi], [diag_lo, diag_hi], color='red', linestyle='--', label='y_pred = y_actual')
plt.title('Actual vs Predicted values after Optimization')
plt.xlabel('Actual values (y)')
plt.ylabel('Predicted values (y_pred)')
plt.legend()
plt.show()

#### **Step 7**: Save the Model

**Task:**
- Save the trained model using PyTorch's `torch.save()` function.
- Ensure the model state dictionary (**model.state_dict()**) is saved, as it contains the model parameters.