<a href="https://colab.research.google.com/github/mcmejiag14/Deep-Learning/blob/main/CodePart3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
## First we are going to import the packages that we are going to use
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [2]:
# Generate random data
np.random.seed(123)  # Seed NumPy's global generator so every run draws the same numbers

N_SAMPLES = 1000  # number of observations drawn for the noise and for each feature
NOISE_STD = 25    # np.random.normal takes the standard deviation (scale), not the variance

# Gaussian noise: mean 0, standard deviation 25 (i.e. variance 25**2 = 625)
N = np.random.normal(0, NOISE_STD, N_SAMPLES)

# Five features, each drawn independently from a uniform distribution on [0, 1).
# The draw order (N first, then X1..X5) is kept as-is because it determines which
# values each variable receives from the seeded generator.
X1 = np.random.uniform(0, 1, N_SAMPLES)
X2 = np.random.uniform(0, 1, N_SAMPLES)
X3 = np.random.uniform(0, 1, N_SAMPLES)
X4 = np.random.uniform(0, 1, N_SAMPLES)
X5 = np.random.uniform(0, 1, N_SAMPLES)

In [3]:
# Collect the five feature vectors into one table, one named column per feature
feature_arrays = [X1, X2, X3, X4, X5]
feature_labels = ['X1', 'X2', 'X3', 'X4', 'X5']
data1 = pd.DataFrame(dict(zip(feature_labels, feature_arrays)))

In [4]:
# Build the non-linear response term by term.
# The terms are added in the same left-to-right order as a single-line formula
# would use, so the floating-point result is unchanged.
interaction_term = 10 * np.sin(np.pi * X1 * X2)  # sinusoidal interaction of X1 and X2
quadratic_term = 20 * (X3 - 0.5) ** 2            # quadratic effect of X3 around 0.5
linear_x4_term = 10 * X4                         # linear effect of X4
linear_x5_term = 5 * X5                          # linear effect of X5
y = interaction_term + quadratic_term + linear_x4_term + linear_x5_term + N  # plus noise N

In [5]:
# Attach the target as the last column, named 'y'.
# assign() returns a new frame that is rebound to data1, so re-running this cell
# produces the same six-column table.
data1 = data1.assign(y=y)

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column
summary_stats = data1.describe()
summary_stats

Unnamed: 0,X1,X2,X3,X4,X5,y
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,0.501913,0.500498,0.500356,0.49672,0.503592,13.405709
std,0.292654,0.289558,0.289439,0.285333,0.283864,25.571996
min,0.000136,6.8e-05,0.000557,0.003233,0.000125,-74.586975
25%,0.24454,0.241449,0.249461,0.24169,0.260899,-2.988751
50%,0.501205,0.511871,0.518273,0.502519,0.52489,13.344152
75%,0.762591,0.748016,0.746952,0.735394,0.743764,30.55895
max,0.999644,0.997651,0.998999,0.99989,0.997312,104.756169


In [None]:
# Display the structure of the DataFrame.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() emits a stray "None" line after the report.
data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   X1      1000 non-null   float64
 1   X2      1000 non-null   float64
 2   X3      1000 non-null   float64
 3   X4      1000 non-null   float64
 4   X5      1000 non-null   float64
 5   y       1000 non-null   float64
dtypes: float64(6)
memory usage: 47.0 KB
None


In [7]:
# Before splitting into training and testing sets, separate the feature columns
# from the target column: every column except the last is a feature, and the
# last column is the target.
*feature_names, target_name = data1.columns
X = data1[feature_names]  # Features (X1 to X5)
y = data1[target_name]    # Target variable (y)

In [9]:
# Hold out 20% of the rows for testing and keep the remaining 80% for training.
# random_state=123 fixes the shuffle so the split is reproducible between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=123)
X_train, X_test, y_train, y_test = split_parts

In [10]:
# Standardize the features.
# The scaler is fitted on the training split only, and the same fitted scaler is
# then applied to both splits, so the test rows do not influence the scaling.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [11]:
# Candidate neighbourhood sizes: every odd k from 5 up to and including 21
k_values = list(range(5, 22, 2))

# Result containers, filled in the same order as k_values:
# fitted models, RMSE per k, and MAE per k
trained_kNN_models, rmse_scores, mae_scores = [], [], []

In [12]:
# Train and evaluate one k-nearest-neighbours regressor per candidate k.
# The result lists are reset at the top of this cell so that re-running it on its
# own does not append a second set of entries; the lists must stay the same length
# as k_values because the best k is later looked up by position.
trained_kNN_models = []
rmse_scores = []  # RMSE on the test split, one entry per k
mae_scores = []   # MAE on the test split, one entry per k

for k in k_values:
    # Fit on the standardized training features
    knn_model = KNeighborsRegressor(n_neighbors=k)
    knn_model.fit(X_train_scaled, y_train)
    trained_kNN_models.append(knn_model)  # keep the fitted model, in k_values order

    # Make predictions on the testing data
    y_pred = knn_model.predict(X_test_scaled)

    # RMSE is the square root of the mean squared error; MAE is the mean absolute error
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)

    # Store the scores at the same position as k in k_values
    rmse_scores.append(rmse)
    mae_scores.append(mae)

    print(f"For k = {k}:")
    print("RMSE:", rmse)
    print("MAE:", mae)
    print("\n")
# Now trained_kNN_models contains one trained k-nearest neighbors regression model
# for every value in k_values (5, 7, 9, 11, 13, 15, 17, 19 and 21), in that order.
# NOTE(review): the scores are computed on the test split, so choosing k from them
# gives an optimistic estimate; consider cross-validating on the training split.

For k = 5:
RMSE: 28.249072695124998
MAE: 22.76088258942403


For k = 7:
RMSE: 27.066752958690813
MAE: 21.583864056947505


For k = 9:
RMSE: 26.45675524893546
MAE: 20.96105046376455


For k = 11:
RMSE: 25.95859068386559
MAE: 20.427914933041553


For k = 13:
RMSE: 25.940778931660837
MAE: 20.60494855276136


For k = 15:
RMSE: 25.887023869151907
MAE: 20.45293167614352


For k = 17:
RMSE: 25.597877334457788
MAE: 20.074642730177164


For k = 19:
RMSE: 25.671095869477412
MAE: 20.206375450127


For k = 21:
RMSE: 25.543502040571244
MAE: 20.108435871362524




In [15]:
# The score lists are index-aligned with k_values, so the position of the lowest
# score in each list identifies the k that minimizes that metric.
best_rmse_position = np.argmin(rmse_scores)
best_mae_position = np.argmin(mae_scores)

optimal_k_rmse = k_values[best_rmse_position]
optimal_k_mae = k_values[best_mae_position]

print(f"Optimal k based on RMSE: {optimal_k_rmse}")
print(f"Optimal k based on MAE: {optimal_k_mae}")


Optimal k based on RMSE: 21
Optimal k based on MAE: 17
