In [126]:
# Sample dataset to use
from sklearn.datasets import load_diabetes
import pandas as pd

# Load the dataset
diabetes = load_diabetes()

# Create a DataFrame from the dataset
data = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
data['target'] = diabetes.target  # Add the target variable

In [127]:
# Example to use as frame of reference for accuracy

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
from sklearn.preprocessing import StandardScaler


# Split the data into features (X) and target variable (y)
X = data.drop('target', axis=1)
y = data['target']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit the scaler to the training data and transform it
X_train_scaled = scaler.fit_transform(X_train)
# Use the same scaler to transform the testing data
X_test_scaled = scaler.transform(X_test)

# Initialize and train the linear regression model
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Predict on the scaled testing set
y_pred_scaled = model.predict(X_test_scaled)

# Calculate RMSE
rmse_scaled = np.sqrt(mean_squared_error(y_test, y_pred_scaled))
print("Root Mean Squared Error (Scaled):", rmse_scaled)

Root Mean Squared Error (Scaled): 53.85344583676593


In [128]:
# Custom model
import pandas as pd
import numpy as np
class CustomModel:
    def __init__(self):
        self.model = None
    
    def fit(self, X_train, y_train):
        w = []
        # Convert DataFrames to NumPy arrays
        X_train_array = X_train.to_numpy()
        y_train_array = y_train.to_numpy().transpose()

        # To initialize w: For each column of X, I want to find the average. 
        # Then, find the average for y (1 dimmensional). For each x, divide 
        # y by x, and make that a matrix. 
        
        feature_means = np.mean(X_train_array, axis=0)
        y_mean = np.mean(y_train_array)
        
        # Calculate the ratio of each feature to the mean of y
        w = np.divide(feature_means, y_mean)

        # TODO: Add code to optimize w
        
        self.model = w

    def predict(self, X_test):
        # Convert DataFrame to NumPy array
        X_test_array = X_test.to_numpy()
        
        # Perform prediction: X_test * w
        y_pred = np.dot(X_test_array, self.model)
        
        return y_pred


In [129]:
# Call model and train. Obtain accuracy
X = data.drop('target', axis=1)
y = data['target']

# Assuming you want to split into 80% training and 20% testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

custom_model = CustomModel()
custom_model.fit(X_train, y_train)
y_pred = custom_model.predict(X_test)

In [130]:
rmse_scaled = np.sqrt(mean_squared_error(y_test, y_pred))

In [131]:
rmse_scaled

162.9373628985285