In [2]:
import pandas as pd
import numpy as np

# Create a sample dataset of synthetic student marks
np.random.seed(42)  # Fixed seed so every run draws the same marks

# The three randint calls consume the seeded random stream in this order, so
# reordering the columns would change the generated values.
data = pd.DataFrame({
    'Math': np.random.randint(50, 100, 50),      # 50 students, integer marks in [50, 100)
    'Reading': np.random.randint(50, 100, 50),   # Reading marks
    'Writing': np.random.randint(50, 100, 50)    # Writing marks
})

# Save to CSV (optional)
# data.to_csv('student.csv', index=False)

# Preview the dataset
print("Top 5 rows:\n", data.head())
print("\nBottom 5 rows:\n", data.tail())

# DataFrame.info() writes its report directly to stdout and returns None, so
# passing it to print() emitted the report first and then "Info:\n None".
# Print the heading, then let info() write the report underneath it.
print("\nInfo:")
data.info()

print("\nDescriptive Statistics:\n", data.describe())


Top 5 rows:
    Math  Reading  Writing
0    88       51       58
1    78       69       73
2    64       77       50
3    92       96       93
4    57       56       57

Bottom 5 rows:
     Math  Reading  Writing
45    74       64       58
46    63       94       64
47    99       50       64
48    58       74       75
49    75       56       91
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   Math     50 non-null     int64
 1   Reading  50 non-null     int64
 2   Writing  50 non-null     int64
dtypes: int64(3)
memory usage: 1.3 KB

Info:
 None

Descriptive Statistics:
             Math    Reading    Writing
count  50.000000  50.000000  50.000000
mean   73.680000  74.460000  72.960000
std    13.890887  15.114812  14.289942
min    51.000000  50.000000  50.000000
25%    63.250000  63.000000  58.500000
50%    73.000000  74.000000  74.000000
75%    86.750000  89

In [3]:
# Target vector: the Writing marks, one entry per student
Y = data['Writing'].to_numpy()

# Feature matrix: one row per student, columns are the Math and Reading marks
X = data[['Math', 'Reading']].to_numpy()


In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)


In [5]:
def cost_function(X, Y, W):
    """Return the half mean squared error of the linear predictions ``X . W``.

    Computes ``(1 / (2 * n)) * sum((X . W - Y) ** 2)`` where ``n = len(Y)``.

    Parameters
    ----------
    X : feature matrix passed as the first operand of ``np.dot``.
    Y : target values; its length is the number of samples ``n``.
    W : weight vector passed as the second operand of ``np.dot``.

    Returns
    -------
    The scalar cost for the given weights.
    """
    sample_count = len(Y)
    residuals = np.dot(X, W) - Y
    squared_error_total = np.sum(np.square(residuals))
    return (1 / (2 * sample_count)) * squared_error_total


In [6]:
def gradient_descent(X, Y, W, alpha, iterations):
    """Fit linear-regression weights with batch gradient descent.

    Each iteration moves ``W`` against the gradient of the half mean squared
    error ``(1 / (2 * m)) * sum((X . W - Y) ** 2)`` (the same quantity that
    ``cost_function`` computes) and records the cost of the updated weights.

    Parameters
    ----------
    X : array-like
        Feature matrix; in this notebook a 2-D array with one row per sample.
    Y : array-like
        Target values; ``len(Y)`` is taken as the number of samples ``m``.
    W : array-like
        Initial weights. The caller's object is not modified; every update
        builds a new array.
    alpha : float
        Learning rate (step size).
    iterations : int
        Number of update steps to perform.

    Returns
    -------
    W : numpy.ndarray
        Weights after the final update.
    cost_history : list
        Cost measured after each update, one entry per iteration.
    """
    # Accept plain Python lists as well as arrays (X.T below needs an ndarray).
    X = np.asarray(X)
    Y = np.asarray(Y)
    W = np.asarray(W)

    m = len(Y)
    cost_history = []

    # The residuals of the freshly updated weights are exactly what the next
    # iteration needs for its gradient, so X . W is computed once per step
    # instead of twice (once for the gradient and once more for the cost).
    residuals = np.dot(X, W) - Y
    for _ in range(iterations):
        gradient = (1 / m) * np.dot(X.T, residuals)
        W = W - alpha * gradient
        residuals = np.dot(X, W) - Y
        cost_history.append((1 / (2 * m)) * np.sum(residuals ** 2))

    return W, cost_history

# Hyperparameters for the training run
iterations = 1000
alpha = 0.0001

# Start from an all-zero weight vector with one weight per feature column
W = np.zeros((X_train.shape[1],))

# Fit the weights on the training split
W_opt, cost_hist = gradient_descent(
    X_train,
    Y_train,
    W,
    alpha=alpha,
    iterations=iterations,
)


In [7]:
def rmse(Y, Y_pred):
    """Return the root mean squared error between targets and predictions.

    Parameters
    ----------
    Y : array-like
        True target values.
    Y_pred : array-like
        Predicted values, broadcastable against ``Y``.

    Returns
    -------
    The square root of the mean squared difference ``Y - Y_pred``.
    """
    # Convert to float arrays first: plain lists cannot be subtracted, and
    # small or unsigned integer dtypes overflow silently when the difference
    # is squared.
    Y = np.asarray(Y, dtype=float)
    Y_pred = np.asarray(Y_pred, dtype=float)
    return np.sqrt(np.mean((Y - Y_pred) ** 2))

def r2(Y, Y_pred):
    """Return the coefficient of determination (R squared) of the predictions.

    Computed as ``1 - ss_res / ss_tot`` where ``ss_res`` is the sum of squared
    residuals and ``ss_tot`` is the sum of squared deviations of ``Y`` from
    its mean.

    Parameters
    ----------
    Y : array-like
        True target values.
    Y_pred : array-like
        Predicted values, broadcastable against ``Y``.

    Returns
    -------
    The R squared score. When ``Y`` is constant (``ss_tot == 0``) the ratio is
    undefined, so the finite convention also used by scikit-learn's
    ``r2_score`` applies: 1.0 for perfect predictions, 0.0 otherwise.
    Empty input yields ``nan``.
    """
    # Float conversion lets plain lists through and avoids integer overflow
    # when differences are squared.
    Y = np.asarray(Y, dtype=float)
    Y_pred = np.asarray(Y_pred, dtype=float)
    if Y.size == 0:
        return np.nan

    ss_tot = np.sum((Y - np.mean(Y)) ** 2)
    ss_res = np.sum((Y - Y_pred) ** 2)
    if ss_tot == 0:
        # Constant target: avoid dividing by zero (which gave -inf or nan).
        return 1.0 if ss_res == 0 else 0.0
    return 1 - (ss_res / ss_tot)

# Apply the trained weights to the held-out test features
Y_pred = X_test.dot(W_opt)

# Report the fitted weights and the two test-set metrics
print("Final Weights:", W_opt)
print("RMSE:", rmse(Y=Y_test, Y_pred=Y_pred))
print("R2:", r2(Y=Y_test, Y_pred=Y_pred))


Final Weights: [0.57823869 0.39406809]
RMSE: 13.443168063626988
R2: 0.01617525403168063


In [8]:
# Run gradient descent once more on the synthetic training split, starting
# from an all-zero weight vector with the same hyperparameters as before.
iterations = 1000
alpha = 0.0001
W_init = np.zeros((X_train.shape[1],))

W_opt, cost_hist = gradient_descent(
    X_train,
    Y_train,
    W_init,
    alpha=alpha,
    iterations=iterations,
)

# Show the fitted weights and how the cost moved over the first ten updates
print("Optimized Weights:", W_opt)
print("Cost History (first 10 iterations):", cost_hist[:10])


Optimized Weights: [0.57823869 0.39406809]
Cost History (first 10 iterations): [np.float64(192.26242912023713), np.float64(148.0507828187602), np.float64(147.22881707799706), np.float64(147.1404341053017), np.float64(147.06770690417866), np.float64(146.99840781697708), np.float64(146.9321905057287), np.float64(146.86891480422273), np.float64(146.80844998317335), np.float64(146.75067117466747)]


In [9]:
# Linear predictions for the held-out test rows using the trained weights
Y_pred = X_test.dot(W_opt)

# RMSE
def rmse(Y, Y_pred):
    """Return the root mean squared error between targets and predictions.

    NOTE(review): an earlier cell of this notebook already defines ``rmse``;
    this later definition is the one in effect for the cells that follow.

    Parameters
    ----------
    Y : array-like
        True target values.
    Y_pred : array-like
        Predicted values, broadcastable against ``Y``.

    Returns
    -------
    The square root of the mean squared difference ``Y - Y_pred``.
    """
    # Convert to float arrays first: plain lists cannot be subtracted, and
    # small or unsigned integer dtypes overflow silently when the difference
    # is squared.
    Y = np.asarray(Y, dtype=float)
    Y_pred = np.asarray(Y_pred, dtype=float)
    return np.sqrt(np.mean((Y - Y_pred) ** 2))

# R-squared
def r2(Y, Y_pred):
    """Return the coefficient of determination (R squared) of the predictions.

    Computed as ``1 - ss_res / ss_tot`` where ``ss_res`` is the sum of squared
    residuals and ``ss_tot`` is the sum of squared deviations of ``Y`` from
    its mean.

    NOTE(review): an earlier cell of this notebook already defines ``r2``;
    this later definition is the one in effect for the cells that follow.

    Parameters
    ----------
    Y : array-like
        True target values.
    Y_pred : array-like
        Predicted values, broadcastable against ``Y``.

    Returns
    -------
    The R squared score. When ``Y`` is constant (``ss_tot == 0``) the ratio is
    undefined, so the finite convention also used by scikit-learn's
    ``r2_score`` applies: 1.0 for perfect predictions, 0.0 otherwise.
    Empty input yields ``nan``.
    """
    # Float conversion lets plain lists through and avoids integer overflow
    # when differences are squared.
    Y = np.asarray(Y, dtype=float)
    Y_pred = np.asarray(Y_pred, dtype=float)
    if Y.size == 0:
        return np.nan

    ss_tot = np.sum((Y - np.mean(Y)) ** 2)
    ss_res = np.sum((Y - Y_pred) ** 2)
    if ss_tot == 0:
        # Constant target: avoid dividing by zero (which gave -inf or nan).
        return 1.0 if ss_res == 0 else 0.0
    return 1 - (ss_res / ss_tot)

# Report both metrics for the held-out test split
print("RMSE on Test Set:", rmse(Y=Y_test, Y_pred=Y_pred))
print("R² on Test Set:", r2(Y=Y_test, Y_pred=Y_pred))


RMSE on Test Set: 13.443168063626988
R² on Test Set: 0.01617525403168063


In [10]:
def main(seed=42, n_students=50, test_size=0.2, alpha=0.0001, iterations=1000):
    """Run the whole experiment end to end and print the results.

    Builds the synthetic marks dataset, splits it into train and test sets,
    fits the weights with ``gradient_descent`` and prints the fitted weights,
    the first ten cost values, and the test-set RMSE and R squared.

    Relies on ``gradient_descent``, ``rmse``, ``r2`` and ``train_test_split``
    being defined or imported by earlier cells. Note that this reseeds
    NumPy's global random state.

    Parameters
    ----------
    seed : int
        Seed for the synthetic data and for the train/test split.
    n_students : int
        Number of rows (students) to generate.
    test_size : float
        Fraction of rows held out for evaluation.
    alpha : float
        Learning rate passed to ``gradient_descent``.
    iterations : int
        Number of gradient-descent updates.

    The defaults reproduce the values that were previously hard-coded.
    """
    # Step 1: Create synthetic dataset
    # The columns are generated in this order, so the random stream is
    # consumed the same way as in the step-by-step cells above.
    np.random.seed(seed)
    data = pd.DataFrame({
        'Math': np.random.randint(50, 100, n_students),
        'Reading': np.random.randint(50, 100, n_students),
        'Writing': np.random.randint(50, 100, n_students)
    })

    # Step 2: Split into features and target
    X = data[['Math', 'Reading']].values
    Y = data['Writing'].values

    # Step 3: Split into training and test sets
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_size, random_state=seed
    )

    # Step 4: Initialize weights (one per feature column)
    # NOTE(review): there is no intercept term, so the fitted model is forced
    # through the origin - confirm this is intended.
    W = np.zeros(X_train.shape[1])

    # Step 5: Train using Gradient Descent
    W_opt, cost_hist = gradient_descent(X_train, Y_train, W, alpha, iterations)

    # Step 6: Predictions
    Y_pred = np.dot(X_test, W_opt)

    # Step 7: Evaluate
    model_rmse = rmse(Y_test, Y_pred)
    model_r2 = r2(Y_test, Y_pred)

    # Step 8: Print results
    print("Final Weights:", W_opt)
    print("Cost History (first 10 iterations):", cost_hist[:10])
    print("RMSE on Test Set:", model_rmse)
    print("R² on Test Set:", model_r2)

if __name__ == "__main__":
    main()


Final Weights: [0.57823869 0.39406809]
Cost History (first 10 iterations): [np.float64(192.26242912023713), np.float64(148.0507828187602), np.float64(147.22881707799706), np.float64(147.1404341053017), np.float64(147.06770690417866), np.float64(146.99840781697708), np.float64(146.9321905057287), np.float64(146.86891480422273), np.float64(146.80844998317335), np.float64(146.75067117466747)]
RMSE on Test Set: 13.443168063626988
R² on Test Set: 0.01617525403168063
