In [1]:
import numpy as np
import pandas as pd
import os

# Function to perform gradient descent for linear regression using matrix operations with tolerance
def gradient_descent_matrix_method(X, y, learning_rate=0.0001, iterations=100000, tolerance=1e-7):
    """Fit linear-regression parameters with batch gradient descent.

    Each iteration computes the gradient ``(1/m) * X.T @ (X @ theta - y)`` and
    moves ``theta`` against it by ``learning_rate``. The loop stops early once
    the L1 norm of a parameter update drops below ``tolerance``.

    Parameters
    ----------
    X : array-like of shape (m, n)
        Design matrix. No intercept column is added here; the caller must
        include one if an intercept is wanted.
    y : array-like of shape (m,) or (m, 1)
        Target values. A column vector is flattened so that the residual
        ``X @ theta - y`` stays one-dimensional instead of broadcasting to
        an (m, m) matrix.
    learning_rate : float
        Step size applied to the gradient.
    iterations : int
        Maximum number of updates to perform.
    tolerance : float
        Convergence threshold on the L1 norm of ``new_theta - theta``.

    Returns
    -------
    numpy.ndarray of shape (n,)
        The most recently computed parameter vector.

    Raises
    ------
    ValueError
        If ``X`` is not two-dimensional, contains no rows, or ``y`` does not
        supply exactly one value per row of ``X``.
    """
    import warnings  # local import: only needed for the divergence warning

    # Coerce to float arrays so lists and object-dtype arrays behave the same
    # as plain float ndarrays.
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float).reshape(-1)

    if X.ndim != 2:
        raise ValueError(f"X must be 2-dimensional, got {X.ndim} dimension(s)")

    # Number of data points (m) and number of features (n)
    m, n = X.shape
    if m == 0:
        raise ValueError("X must contain at least one row")
    if y.shape[0] != m:
        raise ValueError(
            f"y must provide one value per row of X: expected {m}, got {y.shape[0]}"
        )

    # Initialize parameters (theta) to zeros (one for each column of X)
    theta = np.zeros(n)

    for i in range(iterations):
        # Residuals of the current fit, then the gradient of the squared-error cost
        residual = X.dot(theta) - y
        gradient = (1/m) * X.T.dot(residual)

        new_theta = theta - learning_rate * gradient

        # A non-finite update means the step size is too large for this data;
        # further iterations would only propagate inf/NaN, so stop and say so.
        if not np.all(np.isfinite(new_theta)):
            warnings.warn(
                f"Gradient descent diverged at iteration {i}; "
                "try a smaller learning_rate or rescale the features.",
                RuntimeWarning,
                stacklevel=2,
            )
            theta = new_theta
            break

        step_size = np.linalg.norm(new_theta - theta, ord=1)

        # Keep the newest estimate before testing for convergence so the final
        # update is not thrown away when the loop exits.
        theta = new_theta

        if step_size < tolerance:
            print(f"Convergence reached at iteration {i}")
            break

    # Return the final parameters (theta)
    return theta

# Function to read data from a file (CSV, Excel, JSON, etc.)
def read_data_from_file(file_path, target_column):
    """Load a tabular file and split it into a design matrix and a target vector.

    The reader is chosen from the file extension, compared case-insensitively:
    ``.csv`` uses ``pd.read_csv``, ``.xls``/``.xlsx`` use ``pd.read_excel`` and
    ``.json`` uses ``pd.read_json``. The first rows of the loaded frame are
    printed as a quick sanity check.

    Parameters
    ----------
    file_path : str or path-like
        Location of the data file.
    target_column : str
        Name of the column holding the dependent variable.

    Returns
    -------
    X : numpy.ndarray
        Every column except ``target_column``, with a leading column of ones
        for the intercept term.
    y : numpy.ndarray
        Values of ``target_column``.
    feature_names : pandas.Index
        Names of the feature columns, in the same order as columns 1.. of ``X``.

    Raises
    ------
    ValueError
        If the file extension is not one of the supported formats.
    KeyError
        If ``target_column`` is not a column of the loaded data.
    """
    # Get the file extension; lower-case it only for matching so that names
    # such as "DATA.CSV" or "Book.XLSX" are recognised.
    raw_extension = os.path.splitext(file_path)[1]
    file_extension = raw_extension.lower()

    # Read the file based on the extension
    if file_extension == '.csv':
        df = pd.read_csv(file_path)
    elif file_extension in ('.xls', '.xlsx'):
        df = pd.read_excel(file_path)
    elif file_extension == '.json':
        df = pd.read_json(file_path)
    else:
        raise ValueError(f"Unsupported file format: {raw_extension}")

    # Display the first few rows of the dataset
    print("First 10 rows of the dataset:")
    print(df.head(10))

    # Fail with an actionable message rather than a bare KeyError.
    if target_column not in df.columns:
        raise KeyError(
            f"Target column {target_column!r} not found; "
            f"available columns: {list(df.columns)}"
        )

    # Target variable (dependent)
    y = df[target_column].values

    # All remaining columns are features (independent variables); compute the
    # reduced frame once and reuse it for both the values and the names.
    features = df.drop(columns=[target_column])

    # Prepend a column of ones for the intercept term
    intercept_column = np.ones((features.shape[0], 1))
    X = np.hstack((intercept_column, features.values))

    # Return features, target, and feature names
    return X, y, features.columns

# Main function to run the regression
def run_regression(file_path, target_column, learning_rate=0.001, iterations=100000, tolerance=1e-7):
    """Load a dataset, fit a linear regression by gradient descent, and print the result.

    Parameters
    ----------
    file_path : str
        Path of the data file, passed to ``read_data_from_file``.
    target_column : str
        Name of the column that holds the dependent variable.
    learning_rate : float
        Step size forwarded to ``gradient_descent_matrix_method``.
    iterations : int
        Maximum number of gradient-descent updates, forwarded likewise.
    tolerance : float
        Convergence threshold, forwarded likewise.

    Returns
    -------
    None
        The fitted coefficients and the resulting equation are printed.
    """
    # Step 1: Read data from the file
    X, y, feature_names = read_data_from_file(file_path, target_column)

    # Step 2: Run gradient descent with the matrix method and tolerance
    theta = gradient_descent_matrix_method(
        X,
        y,
        learning_rate=learning_rate,
        iterations=iterations,
        tolerance=tolerance,
    )

    # Step 3: Display the theta values with names. theta[0] belongs to the
    # intercept column, so feature i-1 pairs with theta[i].
    print("Theta values (including intercept):")
    for i, coefficient in enumerate(theta):
        label = "Intercept" if i == 0 else feature_names[i - 1]
        print(f"theta{i} ({label}): {coefficient}")

    # Step 4: Print the optimized equation with actual feature and target names.
    # Negative coefficients are rendered as "- value" rather than "+ -value".
    equation = f"{target_column} = {theta[0]}"  # Start with the intercept
    for i in range(1, len(theta)):
        coefficient = theta[i]
        if coefficient < 0:
            equation += f" - {abs(coefficient)}*{feature_names[i - 1]}"
        else:
            equation += f" + {coefficient}*{feature_names[i - 1]}"

    print(f"The optimized equation is: {equation}")

# Example usage
# Name of the column that contains the dependent variable (y).
target_column = 'target'
# Path of the input workbook. This is an absolute, machine-specific location:
# paste your own path here and replace every backslash (\) with a forward slash (/).
file_path = 'C:/Users/19874/OneDrive/桌面/NCSU FM/Coding Files/Machine Learning/Sample Data/Matrix GDM General.xlsx'
run_regression(file_path=file_path, target_column=target_column)


First 10 rows of the dataset:
   feature_1  feature_2  target
0          3          1       2
1          7          9       6
2         20         14      16
3         13          6      12
4          9         10       7
Convergence reached at iteration 25074
Theta values (including intercept):
theta0 (Intercept): 0.10910437420417896
theta1 (feature_1): 0.899307650991062
theta2 (feature_2): -0.10774947322960175
The optimized equation is: target = 0.10910437420417896 + 0.899307650991062*feature_1 + -0.10774947322960175*feature_2
