In [23]:
import numpy as np
import pandas as pd
import seaborn as sns

In [24]:
from sklearn.linear_model import LinearRegression

In [25]:
# Load the 'tips' example dataset through seaborn and display the resulting frame.
df = sns.load_dataset('tips')
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [26]:
# Narrow the frame to the single predictor and the target used by the regression.
df = df.loc[:, ['total_bill', 'tip']]
df.head(n=3)

Unnamed: 0,total_bill,tip
0,16.99,1.01
1,10.34,1.66
2,21.01,3.5


- The EDA step is skipped for now (still left to do); this notebook only demonstrates the end-to-end ML workflow.

In [27]:
from sklearn.model_selection import train_test_split

# Separate the predictors (everything except 'tip') from the target column.
X = df.drop(columns='tip')
y = df.loc[:, 'tip']
(type(X), type(y))

(pandas.core.frame.DataFrame, pandas.core.series.Series)

In [28]:
# Keep 80% of the rows for training and hold out the rest for evaluation;
# random_state pins the shuffle so the same split is produced on every run.
# train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=42
)
# Display the four shapes to confirm the split sizes.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((195, 1), (49, 1), (195,), (49,))

In [29]:
# Reference model: scikit-learn's linear regression fitted on the training split.
lr_model = LinearRegression()

lr_model.fit(X=X_train, y=y_train)

In [30]:
# Pull one held-out row by its index label (24) to use as a prediction example.
X_test.loc[24]

total_bill    19.82
Name: 24, dtype: float64

In [31]:
# Select with a list of labels so the row stays a one-row DataFrame: this keeps
# the 'total_bill' column name the model was fitted with, whereas a list of
# Series is converted to a nameless array and triggers sklearn's
# feature-name warning.
lr_model.predict(X_test.loc[[24]])



array([3.04525623])

In [32]:
# Predict for a hand-typed bill amount. The value is wrapped in a DataFrame that
# carries the training column name, so the input matches what the model was
# fitted on instead of a nameless nested list.
lr_model.predict(pd.DataFrame({'total_bill': [19.82]}))



array([3.04525623])

In [33]:
# Score every held-out row with the scikit-learn model and show the raw array.
predictions = lr_model.predict(X=X_test)
predictions

array([3.04525623, 1.86330727, 3.55119456, 3.69452593, 2.31576375,
       2.83881627, 3.96728338, 2.26014262, 2.50615915, 2.57033737,
       2.88160176, 2.07723468, 2.06439904, 2.47407003, 2.00236009,
       2.91903905, 2.92652651, 3.23351235, 2.68478854, 5.33107064,
       3.13831465, 3.13403611, 2.4558862 , 1.94673896, 3.16077703,
       2.17564129, 2.02375283, 3.62927807, 2.68906708, 6.07767732,
       4.99734388, 1.75313465, 2.83025918, 3.09552917, 2.74040966,
       3.50092162, 2.21200895, 5.53644096, 2.33287794, 3.35010279,
       2.04942412, 2.47834858, 3.48701634, 2.03017065, 2.03124029,
       1.25361414, 2.05798121, 2.92438724, 1.73388118])

In [34]:
# Line up the input, the true tip and the model's prediction for each test row.
test = pd.DataFrame(
    data=dict(
        total_bill=X_test.loc[:, 'total_bill'],
        tip=y_test,
        predictions=predictions,
    )
)
test.head(n=5)

Unnamed: 0,total_bill,tip,predictions
24,19.82,3.18,3.045256
6,8.77,2.0,1.863307
153,24.55,2.0,3.551195
211,25.89,5.16,3.694526
198,13.0,2.0,2.315764


In [35]:
# Signed error per row: positive means the model over-predicted the tip.
test['diff'] = test['predictions'].sub(test['tip'])
test.head(n=5)

Unnamed: 0,total_bill,tip,predictions,diff
24,19.82,3.18,3.045256,-0.134744
6,8.77,2.0,1.863307,-0.136693
153,24.55,2.0,3.551195,1.551195
211,25.89,5.16,3.694526,-1.465474
198,13.0,2.0,2.315764,0.315764


In [36]:
from sklearn.metrics import root_mean_squared_error, r2_score

# RMSE of the scikit-learn model on the held-out rows (same units as the tip).
root_mean_squared_error(y_true=y_test, y_pred=predictions)

0.7541977545199626

In [37]:
import math
# NOTE(review): stand-alone square-root check on the constant 500; its relation
# to the metrics around it is not evident from this notebook — confirm or remove.
math.sqrt(500)

22.360679774997898

In [38]:
# Coefficient of determination of the scikit-learn model on the held-out rows.
r2_score(y_true=y_test, y_pred=predictions)

0.5449381659234663

In [41]:
import numpy as np
import pandas as pd

# Seed NumPy's global RNG so the random weight initialisation in the class
# below produces the same starting point on every run.
np.random.seed(42)

class LinearRegressionCustom:
    """Linear regression fitted with full-batch gradient descent on the MSE loss.

    Parameters
    ----------
    learning_rate : float, default 0.001
        Step size applied to every weight and bias update.
    epoch : int, default 100
        Number of gradient-descent passes over the training data.
    bias : float, default 0.0
        Starting value of the intercept; ``train`` updates it in place.

    Attributes
    ----------
    w : numpy.ndarray or None
        Learned weights of shape ``(n_features,)``; ``None`` until ``train``
        has been called.
    """

    def __init__(self, learning_rate=0.001, epoch=100, bias=0.0) -> None:
        self.learning_rate = learning_rate
        self.epoch = epoch
        self.bias = bias
        # Defined up front so an untrained model can be detected explicitly.
        self.w = None

    def train(self, X: pd.DataFrame, y: pd.Series, verbose: bool = True):
        """Fit weights and bias by gradient descent.

        Parameters
        ----------
        X : pandas.DataFrame or numpy.ndarray
            Feature matrix of shape ``(n_records, n_features)``.
        y : pandas.Series or numpy.ndarray
            Target vector of shape ``(n_records,)``.
        verbose : bool, default True
            When true, print the loss after every epoch.

        Raises
        ------
        ValueError
            If ``X``/``y`` have an unsupported type, the wrong number of
            dimensions, or a different number of rows.
        """
        if isinstance(X, pd.DataFrame):
            _X = X.to_numpy()
        elif isinstance(X, np.ndarray):
            # Arrays are used as-is; without this branch `_X` would be unbound.
            _X = X
        else:
            raise ValueError("X must be a pandas DataFrame or a 2D numpy array")
        if _X.ndim != 2:
            raise ValueError("X must be a pandas DataFrame or a 2D numpy array")

        if isinstance(y, pd.Series):
            _y = y.to_numpy()
        elif isinstance(y, np.ndarray):
            _y = y
        else:
            raise ValueError("y must be a pandas Series or a 1D numpy array")
        # A 2-D y would broadcast against the 1-D predictions into an (n, n)
        # matrix and silently corrupt the gradients, so reject it early.
        if _y.ndim != 1:
            raise ValueError("y must be a pandas Series or a 1D numpy array")

        # find num records and num features
        n_recs, n_features = _X.shape
        if _y.shape[0] != n_recs:
            raise ValueError(
                f"X and y must have the same number of rows, got {n_recs} and {_y.shape[0]}"
            )

        # initialize the weights (drawn from NumPy's global RNG)
        self.w = np.random.random(n_features)

        for epoch in range(1, self.epoch + 1):
            y_hat = self.predict(_X)

            diff = _y - y_hat

            # Loss is computed before the update, i.e. it describes the
            # parameters the epoch started with.
            loss = self.loss_mse(diff, n_recs)

            # Gradients of mean((y - (Xw + b))^2) with respect to w and b.
            grad_w = -(2 / n_recs) * np.dot(_X.T, diff)
            grad_b = -(2 / n_recs) * np.sum(diff)

            self.w = self.w - self.learning_rate * grad_w
            self.bias = self.bias - self.learning_rate * grad_b

            if verbose:
                print(f"Epoch: {epoch}, Loss: {loss}")

    def predict(self, X):
        """Return ``X @ w + bias`` for a DataFrame or numpy array ``X``.

        Raises
        ------
        ValueError
            If ``X`` is neither a pandas DataFrame nor a numpy array.
        RuntimeError
            If the model has not been trained yet.
        """
        if isinstance(X, pd.DataFrame):
            X = X.to_numpy()
        elif not isinstance(X, np.ndarray):
            raise ValueError("X must be a pandas DataFrame or a 2D numpy array")
        if self.w is None:
            raise RuntimeError("Model has not been trained yet; call train() first")
        return np.dot(X, self.w) + self.bias

    def loss_mse(self, diff, n_recs):
        """Mean squared error given the residuals ``diff`` and the row count."""
        return (1 / n_recs) * np.sum(np.square(diff))

        

In [42]:
# Fit the hand-written gradient-descent model with its default hyper-parameters.
lr_model_custom = LinearRegressionCustom()
lr_model_custom.train(X=X_train, y=y_train)

Epoch: 1, Loss: 26.746392016059623
Epoch: 2, Loss: 1.3134467127995642
Epoch: 3, Loss: 1.2937749527989693
Epoch: 4, Loss: 1.2936732269785145
Epoch: 5, Loss: 1.2935866259473556
Epoch: 6, Loss: 1.2935000910374277
Epoch: 7, Loss: 1.2934136106182175
Epoch: 8, Loss: 1.293327184646494
Epoch: 9, Loss: 1.2932408130879702
Epoch: 10, Loss: 1.2931544959083883
Epoch: 11, Loss: 1.2930682330735113
Epoch: 12, Loss: 1.2929820245491244
Epoch: 13, Loss: 1.2928958703010343
Epoch: 14, Loss: 1.2928097702950685
Epoch: 15, Loss: 1.2927237244970768
Epoch: 16, Loss: 1.2926377328729302
Epoch: 17, Loss: 1.2925517953885215
Epoch: 18, Loss: 1.2924659120097644
Epoch: 19, Loss: 1.292380082702594
Epoch: 20, Loss: 1.2922943074329678
Epoch: 21, Loss: 1.292208586166864
Epoch: 22, Loss: 1.2921229188702823
Epoch: 23, Loss: 1.2920373055092438
Epoch: 24, Loss: 1.291951746049791
Epoch: 25, Loss: 1.2918662404579877
Epoch: 26, Loss: 1.2917807886999197
Epoch: 27, Loss: 1.2916953907416933
Epoch: 28, Loss: 1.2916100465494365
Epoch

In [43]:
# Score the held-out rows with the hand-written model and show the raw array.
custom_preds = lr_model_custom.predict(X=X_test)
custom_preds

array([2.88765208, 1.28857815, 3.57214254, 3.76605739, 1.90071233,
       2.60835681, 4.13507445, 1.82546179, 2.15830071, 2.24512825,
       2.66624184, 1.57800329, 1.56063778, 2.11488694, 1.47670449,
       2.71689124, 2.72702112, 3.1423462 , 2.39997071, 5.98015976,
       3.01355201, 3.00776351, 2.0902858 , 1.40145395, 3.04394165,
       1.71113886, 1.50564701, 3.67778272, 2.40575921, 6.99025351,
       5.52865653, 1.1395242 , 2.5967798 , 2.95566698, 2.47522124,
       3.50412764, 1.76034113, 6.2580079 , 1.92386634, 3.30008291,
       1.54037802, 2.12067544, 3.485315  , 1.51432976, 1.51577689,
       0.46371648, 1.55195503, 2.72412687, 1.11347593])

In [44]:
# Side-by-side view: true tip, scikit-learn prediction and custom-model prediction.
pd.DataFrame(
    dict(
        total_bill=X_test['total_bill'],
        tip=y_test,
        predictions=predictions,
        custom_preds=custom_preds,
    )
)

Unnamed: 0,total_bill,tip,predictions,custom_preds
24,19.82,3.18,3.045256,2.887652
6,8.77,2.0,1.863307,1.288578
153,24.55,2.0,3.551195,3.572143
211,25.89,5.16,3.694526,3.766057
198,13.0,2.0,2.315764,1.900712
176,17.89,2.0,2.838816,2.608357
192,28.44,2.56,3.967283,4.135074
124,12.48,2.52,2.260143,1.825462
9,14.78,3.23,2.506159,2.158301
101,15.38,3.0,2.570337,2.245128


In [45]:
# Coefficient of determination of the hand-written model on the held-out rows.
r2_score(y_true=y_test, y_pred=custom_preds)

0.4170523952724915

In [None]:
import numpy as np
import pandas as pd


class CustomLinearRegression:
    """Gradient-descent linear regression with optional feature standardisation.

    Parameters
    ----------
    alpha : float, default 0.001
        Learning rate applied to every weight and bias update.
    n_iter : int, default 100
        Number of gradient-descent iterations.
    scale_features : bool, default True
        When true, features are standardised with the training means and
        standard deviations before fitting and before predicting.

    Attributes
    ----------
    w : numpy.ndarray or None
        Learned weights of shape ``(n_features,)``; ``None`` until trained.
    b : float
        Intercept, initialised from NumPy's global RNG.
    feature_means, feature_stds : numpy.ndarray or None
        Per-feature scaling statistics learned in ``train``.
    """

    def __init__(self, alpha=0.001, n_iter=100, scale_features=True):
        self.alpha = alpha
        self.n_iter = n_iter
        self.scale_features = scale_features
        self.w = None
        self.b = np.random.random()
        # Scaling statistics are learned in train(); None marks "not fitted".
        self.feature_means = None
        self.feature_stds = None

    def _feature_scaling(self, X):
        """Scale features to have zero mean and unit variance."""
        self.feature_means = np.mean(X, axis=0)
        stds = np.std(X, axis=0)
        # Prevent division by zero for constant features
        self.feature_stds = np.where(stds == 0, 1.0, stds)
        return (X - self.feature_means) / self.feature_stds

    def train(self, X, y, verbose=True):
        """Train the linear regression model using gradient descent.

        Parameters
        ----------
        X : numpy.ndarray, pandas.DataFrame or pandas.Series
            Features of shape ``(n_records, n_features)``. A 1-D input is
            treated as a single feature observed ``n_records`` times.
        y : numpy.ndarray, pandas.Series or single-column pandas.DataFrame
            Targets; flattened to shape ``(n_records,)``.
        verbose : bool, default True
            When true, print the loss after every iteration.

        Raises
        ------
        ValueError
            On unsupported input types, wrong dimensionality, or when ``X``
            and ``y`` disagree on the number of rows.
        """
        # Ensure input is numpy array
        if isinstance(X, (pd.DataFrame, pd.Series)):
            X = X.to_numpy()
        elif not isinstance(X, np.ndarray):
            raise ValueError("X must be a numpy array or pandas DataFrame")

        if isinstance(y, (pd.DataFrame, pd.Series)):
            y = y.to_numpy()
        elif not isinstance(y, np.ndarray):
            raise ValueError("y must be a numpy array or pandas DataFrame")

        # A Series (or any 1-D array) is one feature column, not one row.
        if X.ndim == 1:
            X = X.reshape(-1, 1)
        elif X.ndim != 2:
            raise ValueError("X must be one- or two-dimensional")

        # A single-column DataFrame arrives as (n, 1); subtracting (n,)
        # predictions from it would broadcast to (n, n), so flatten it first.
        if y.ndim == 2 and y.shape[1] == 1:
            y = y.reshape(-1)
        if y.ndim != 1:
            raise ValueError("y must be one-dimensional or a single column")
        if y.shape[0] != X.shape[0]:
            raise ValueError(
                f"X and y must have the same number of rows, got {X.shape[0]} and {y.shape[0]}"
            )

        # Feature scaling
        scaled = False
        if self.scale_features:
            X = self._feature_scaling(X)
            scaled = True

        self.n_rec, self.n_features = X.shape

        # Initialize weights
        self.w = np.random.random(self.n_features)

        for i in range(self.n_iter):
            # X is already standardised here, so tell predict() not to rescale.
            y_hat = self.predict(X, scaled=scaled)
            diff = y - y_hat
            loss = self.loss_mse(y, y_hat)

            # Gradient calculation
            grad_w = -(2 / self.n_rec) * np.dot(X.T, diff)
            grad_b = -(2 / self.n_rec) * np.sum(diff)

            self.w = self.w - self.alpha * grad_w
            self.b = self.b - self.alpha * grad_b

            if verbose:
                print(f"Epoch {i + 1}/{self.n_iter}: Loss = {loss:.6f}")

    def predict(self, X, scaled=False):
        """Predict values using the trained model.

        Parameters
        ----------
        X : numpy.ndarray, pandas.DataFrame or pandas.Series
            Features in the original (unscaled) units unless ``scaled`` is true.
        scaled : bool, default False
            Set to true when ``X`` has already been standardised.

        Raises
        ------
        ValueError
            If ``X`` has an unsupported type.
        RuntimeError
            If the model has not been trained yet.
        """
        if isinstance(X, (pd.DataFrame, pd.Series)):
            X = X.to_numpy()
        elif not isinstance(X, np.ndarray):
            raise ValueError("X must be a numpy array or pandas DataFrame")

        if self.w is None:
            raise RuntimeError("Model has not been trained yet; call train() first")

        # Scale features if scaling is enabled
        if self.scale_features and not scaled:
            X = (X - self.feature_means) / self.feature_stds

        return np.dot(X, self.w) + self.b

    def loss_mse(self, y, y_hat):
        """Calculate Mean Squared Error."""
        return np.mean((y - y_hat) ** 2)


# Re-seed right before building the model: the constructor (bias) and train()
# (weights) both draw from NumPy's global RNG, and 100 steps at alpha=0.02 do
# not fully converge, so without this the reported RMSE depends on how many
# random draws earlier cells have already consumed.
np.random.seed(42)
reg_model = CustomLinearRegression(alpha=0.02, n_iter=100)
reg_model.train(X_train, y_train)
preds = reg_model.predict(X_test)
# RMSE on the held-out rows, comparable with the scikit-learn figure above.
root_mean_squared_error(y_test, preds)