In [44]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')


In [45]:
df = pd.read_csv('SOCR-HeightWeight.csv')

In [46]:
df.head()

Unnamed: 0,Index,Height(Inches),Weight(Pounds)
0,1,65.78331,112.9925
1,2,71.51521,136.4873
2,3,69.39874,153.0269
3,4,68.2166,142.3354
4,5,67.78781,144.2971


In [47]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Index           25000 non-null  int64  
 1   Height(Inches)  25000 non-null  float64
 2   Weight(Pounds)  25000 non-null  float64
dtypes: float64(2), int64(1)
memory usage: 586.1 KB


In [48]:
df.describe()

Unnamed: 0,Index,Height(Inches),Weight(Pounds)
count,25000.0,25000.0,25000.0
mean,12500.5,67.993114,127.079421
std,7217.022701,1.901679,11.660898
min,1.0,60.27836,78.01476
25%,6250.75,66.704397,119.308675
50%,12500.5,67.9957,127.15775
75%,18750.25,69.272958,134.89285
max,25000.0,75.1528,170.924


In [49]:
df.shape

(25000, 3)

In [50]:
 #Separating the independent and dependent features
#Dependent feature
y = np.asarray(df['Weight(Pounds)'].values.tolist()) 

# Independent Feature
X = np.asarray(df['Height(Inches)'].values.tolist())

In [51]:
# Reshaping the independent feature
X = X.reshape(-1,1)

In [52]:
#Reshaping the Dependent features
y = y.reshape(len(y),1 ) # Changing the shape from (50,) to (50,1)

In [53]:
#Get the shapes of X and y
print("The shape of the independent fatures are ",X.shape)
print("The shape of the dependent fatures are ",y.shape)

The shape of the independent fatures are  (25000, 1)
The shape of the dependent fatures are  (25000, 1)


In [54]:
 #The method "poly_features" concatenates polynomials of independent feature to X
# This is similar to PolynomialFeatures class from sklearn.preprocessing
def poly_features(features, X):
  data = pd.DataFrame(np.zeros((X.shape[0],features)))
  for i in range(1,features+1):
    data.iloc[:,i-1] = (X**i).reshape(-1,1)
  X_poly = np.array(data.values.tolist())
  return X_poly

In [55]:
# The method "split_data" splits the given dataset into trainset and testset
# This is similar to the method "train_test_split" from "sklearn.model_selection"
def split_data(X,y,test_size=0.2,random_state=0):
    np.random.seed(random_state)                  #set the seed for reproducible results
    indices = np.random.permutation(len(X))       #shuffling the indices
    data_test_size = int(X.shape[0] * test_size)  #Get the test size

    #Separating the Independent and Dependent features into the Train and Test Set
    train_indices = indices[data_test_size:]
    test_indices = indices[:data_test_size]
    X_train = X[train_indices]
    y_train = y[train_indices]
    X_test = X[test_indices]
    y_test = y[test_indices]
    return X_train, y_train, X_test, y_test

In [56]:
class polynomialRegression():

  def __init__(self):
    #No instance Variables required
    pass

  def forward(self,X,y,W):
    """
    Parameters:
    X (array) : Independent Features
    y (array) : Dependent Features/ Target Variable
    W (array) : Weights 

    Returns:
    loss (float) : Calculated Sqaured Error Loss for y and y_pred
    y_pred (array) : Predicted Target Variable
    """
    y_pred = sum(W * X)
    loss = ((y_pred-y)**2)/2    #Loss = Squared Error, we introduce 1/2 for ease in the calculation
    return loss, y_pred

  def updateWeights(self,X,y_pred,y_true,W,alpha,index):
    """
    Parameters:
    X (array) : Independent Features
    y_pred (array) : Predicted Target Variable
    y_true (array) : Dependent Features/ Target Variable
    W (array) : Weights
    alpha (float) : learning rate
    index (int) : Index to fetch the corresponding values of W, X and y 

    Returns:
    W (array) : Update Values of Weight
    """
    for i in range(X.shape[1]):
      #alpha = learning rate, rest of the RHS is derivative of loss function
      W[i] -= (alpha * (y_pred-y_true[index])*X[index][i]) 
    return W

  def train(self, X, y, epochs=10, alpha=0.001, random_state=0):
    """
    Parameters:
    X (array) : Independent Feature
    y (array) : Dependent Features/ Target Variable
    epochs (int) : Number of epochs for training, default value is 10
    alpha (float) : learning rate, default value is 0.001

    Returns:
    y_pred (array) : Predicted Target Variable
    loss (float) : Calculated Sqaured Error Loss for y and y_pred
    """

    num_rows = X.shape[0] #Number of Rows 
    num_cols = X.shape[1] #Number of Columns 
    W = np.random.randn(1,num_cols) / np.sqrt(num_rows) #Weight Initialization

    #Calculating Loss and Updating Weights
    train_loss = []
    num_epochs = []
    train_indices = [i for i in range(X.shape[0])]
    for j in range(epochs):
      cost=0
      np.random.seed(random_state)
      np.random.shuffle(train_indices)
      for i in train_indices:
        loss, y_pred = self.forward(X[i],y[i],W[0])
        cost+=loss
        W[0] = self.updateWeights(X,y_pred,y,W[0],alpha,i)
      train_loss.append(cost)
      num_epochs.append(j)
    return W[0], train_loss, num_epochs

  def test(self, X_test, y_test, W_trained):
    """
    Parameters:
    X_test (array) : Independent Features from the Test Set
    y_test (array) : Dependent Features/ Target Variable from the Test Set
    W_trained (array) : Trained Weights
    test_indices (list) : Index to fetch the corresponding values of W_trained,
                          X_test and y_test 

    Returns:
    test_pred (list) : Predicted Target Variable
    test_loss (list) : Calculated Sqaured Error Loss for y and y_pred
    """
    test_pred = []
    test_loss = []
    test_indices = [i for i in range(X_test.shape[0])]
    for i in test_indices:
        loss, y_test_pred = self.forward(X_test[i], W_trained, y_test[i])
        test_pred.append(y_test_pred)
        test_loss.append(loss)
    return test_pred, test_loss
    

  def predict(self, W_trained, X_sample):
    prediction = sum(W_trained * X_sample)
    return prediction

In [57]:
# Independent Feature
X = np.asarray(df['Height(Inches)'].values.tolist())


In [58]:
y = np.asarray(df['Weight(Pounds)'].values.tolist())

In [59]:
# Reshaping the independent feature
X = X.reshape(-1,1)

In [60]:
X

array([[65.78331],
       [71.51521],
       [69.39874],
       ...,
       [64.69855],
       [67.52918],
       [68.87761]])

In [61]:
#Adding the feature X0 = 1, so we have the equation: y =  W0 + (W1 * X1) + (W2 * (X1**2))
X = np.concatenate((X,np.ones((25000,1))), axis = 1)

In [62]:
X

array([[65.78331,  1.     ],
       [71.51521,  1.     ],
       [69.39874,  1.     ],
       ...,
       [64.69855,  1.     ],
       [67.52918,  1.     ],
       [68.87761,  1.     ]])

In [63]:
y

array([112.9925, 136.4873, 153.0269, ..., 118.2655, 132.2682, 124.8742])

In [64]:
#Splitting the dataset
X_train, y_train, X_test, y_test = split_data(X,y)

In [65]:
#declaring the "regressor" as an object of the class polynomialRegression
regressor = polynomialRegression()

In [66]:
#Training 
W_trained,train_loss, num_epochs = regressor.train(X_train, y_train, epochs=100, alpha=0.00001)

In [67]:
#Testing on the Test Dataset
test_pred, test_loss = regressor.test(X_test, y_test, W_trained)

In [68]:
y_pred = regressor.train(X_test,y)

In [69]:
def mse(X ,y):
    return np.mean((X - y) ** 2)

In [70]:
y_test.shape
y_pred = np.array(y_pred)


In [71]:
y_pred = regressor.predict(W_trained,X_sample=5)

In [72]:
# compute the mean squared error between the predicted and true values

mse_score = np.sum((y_test-y_pred)**2)/len(y_test)
print('MSE:', mse_score)

MSE: 15426.975092188271


In [73]:
# Calculate the mean of the actual values
y_mean = np.mean(y_test)

# Calculate the total sum of squares (TSS)

TSS = np.sum(y_test - y_pred)
# Calculate the residual sum of squares (RSS)
RSS = np.sum((y_mean - y_pred) ** 2)

# Calculate the R-squared (coefficient of determination)
R2 = 1 - RSS / TSS

In [74]:
print("R-squared:", R2)

R-squared: 0.9752733058292795


In [75]:
def poly_reg_grid_search(X_train, y_train, X_val, y_val, degrees, alpha_vals, learning_rates):
    best_params = {}
    best_score = float('inf')
    
    for degree in degrees:
        for alpha in alpha_vals:
            for learning_rate in learning_rates:
                # Train polynomial regression model with current hyperparameters
                X_train_poly = poly_features(degree, X_train)
                X_val_poly = poly_features(degree, X_val)
                model = polynomialRegression(X_train_poly, y_train, alpha, learning_rate)
                
                # Evaluate model on validation set
                val_pred = model.predict(X_val_poly)
                val_mse = mse(y_val, val_pred)
                
                # Store best hyperparameters and validation score
                if val_mse < best_score:
                    best_score = val_mse
                    best_params = {'degree': degree, 'alpha': alpha, 'learning_rate': learning_rate}
    
    # Retrain model using best hyperparameters on full training set
    X_train_poly = poly_features(best_params['degree'], X_train)
    model = polynomialRegression(X_train_poly, y_train, best_params['alpha'], best_params['learning_rate'])
    
    return model, best_params, best_score


In [76]:
# compute the mean squared error between the predicted and true values
mse_score = np.sum((y_test-y_pred)**2)/len(y_test) 
print('MSE:', mse_score)


MSE: 15426.975092188271


In [77]:
# Calculate the mean of the actual values
y_mean = np.mean(y_test)

# Calculate the total sum of squares (TSS)

TSS = np.sum(y_test - y_pred)
# Calculate the residual su m of squares (RSS)
RSS = np.sum((y_mean - y_pred) ** 2)

# Calculate the R-squared (coefficient of determination)
R2 = 1 - RSS / TSS

In [78]:
print("R-squared:", R2)

R-squared: 0.9752733058292795


In [79]:



def poly_reg_random_search(X_train, y_train, X_val, y_val, degrees, alpha_vals, learning_rates, num_iterations):
    best_params = {}
    best_score = float('inf')
    
    for i in range(num_iterations):
        degree = random.choice(degrees)
        alpha = random.choice(alpha_vals)
        learning_rate = random.choice(learning_rates)
        
        # Train polynomial regression model with current hyperparameters
        X_train_poly = poly_features(degree, X_train)
        X_val_poly = poly_features(degree, X_val)
        model = polynomialRegression(X_train_poly, y_train, alpha, learning_rate)
        
        # Evaluate model on validation set
        val_pred = model.predict(X_val_poly)
        val_mse = mean_squared_error(y_val, val_pred)
        
        # Store best hyperparameters and validation score
        if val_mse < best_score:
            best_score = val_mse
            best_params = {'degree': degree, 'alpha': alpha, 'learning_rate': learning_rate}
    
    # Retrain model using best hyperparameters on full training set
    X_train_poly = poly_features(best_params['degree'], X_train)
    model = polynomialRegression(X_train_poly, y_train, best_params['alpha'], best_params['learning_rate'])
    
    return model, best_params, best_score


In [80]:
# compute the mean squared error between the predicted and true values

mse_score = np.sum((y_test-y_pred)**2)/len(y_test)
print('MSE:', mse_score)

MSE: 15426.975092188271


In [81]:
# Calculate the mean of the actual values
y_mean = np.mean(y_test)

# Calculate the total sum of squares (TSS)

TSS = np.sum(y_test - y_pred)
# Calculate the residual su m of squares (RSS)
RSS = np.sum((y_mean - y_pred) ** 2)

# Calculate the R-squared (coefficient of determination)
R2 = 1 - RSS / TSS

In [82]:
print("R-squared:", R2)

R-squared: 0.9752733058292795
