In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('SOCR-HeightWeight.csv')

In [3]:
df.head()

Unnamed: 0,Index,Height(Inches),Weight(Pounds)
0,1,65.78331,112.9925
1,2,71.51521,136.4873
2,3,69.39874,153.0269
3,4,68.2166,142.3354
4,5,67.78781,144.2971


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Index           25000 non-null  int64  
 1   Height(Inches)  25000 non-null  float64
 2   Weight(Pounds)  25000 non-null  float64
dtypes: float64(2), int64(1)
memory usage: 586.1 KB


In [5]:
df.isnull().sum()

Index             0
Height(Inches)    0
Weight(Pounds)    0
dtype: int64

In [6]:
#Separating the independent and dependent features
#Dependent feature
y = np.asarray(df['Weight(Pounds)'].values.tolist()) 

# Independent Feature
X = np.asarray(df['Height(Inches)'].values.tolist())

In [7]:
# Reshaping the independent feature
X = X.reshape(-1,1)

In [8]:
#Reshaping the Dependent features
y = y.reshape(len(y),1 ) # Changing the shape from (50,) to (50,1)

In [9]:
#Get the shapes of X and y
print("The shape of the independent fatures are ",X.shape)
print("The shape of the dependent fatures are ",y.shape)

The shape of the independent fatures are  (25000, 1)
The shape of the dependent fatures are  (25000, 1)


In [10]:
#The method "poly_features" concatenates polynomials of independent feature to X
# This is similar to PolynomialFeatures class from sklearn.preprocessing
def poly_features(features, X):
  data = pd.DataFrame(np.zeros((X.shape[0],features)))
  for i in range(1,features+1):
    data.iloc[:,i-1] = (X**i).reshape(-1,1)
  X_poly = np.array(data.values.tolist())
  return X_poly

In [11]:
# The method "split_data" splits the given dataset into trainset and testset
# This is similar to the method "train_test_split" from "sklearn.model_selection"
def split_data(X,y,test_size=0.2,random_state=0):
    np.random.seed(random_state)                  #set the seed for reproducible results
    indices = np.random.permutation(len(X))       #shuffling the indices
    data_test_size = int(X.shape[0] * test_size)  #Get the test size

    #Separating the Independent and Dependent features into the Train and Test Set
    train_indices = indices[data_test_size:]
    test_indices = indices[:data_test_size]
    X_train = X[train_indices]
    y_train = y[train_indices]
    X_test = X[test_indices]
    y_test = y[test_indices]
    return X_train, y_train, X_test, y_test

In [12]:
class polynomialRegression():

  def __init__(self):
    #No instance Variables required
    pass

  def forward(self,X,y,W):
    """
    Parameters:
    X (array) : Independent Features
    y (array) : Dependent Features/ Target Variable
    W (array) : Weights 

    Returns:
    loss (float) : Calculated Sqaured Error Loss for y and y_pred
    y_pred (array) : Predicted Target Variable
    """
    y_pred = sum(W * X)
    loss = ((y_pred-y)**2)/2    #Loss = Squared Error, we introduce 1/2 for ease in the calculation
    return loss, y_pred

  def updateWeights(self,X,y_pred,y_true,W,alpha,index):
    """
    Parameters:
    X (array) : Independent Features
    y_pred (array) : Predicted Target Variable
    y_true (array) : Dependent Features/ Target Variable
    W (array) : Weights
    alpha (float) : learning rate
    index (int) : Index to fetch the corresponding values of W, X and y 

    Returns:
    W (array) : Update Values of Weight
    """
    for i in range(X.shape[1]):
      #alpha = learning rate, rest of the RHS is derivative of loss function
      W[i] -= (alpha * (y_pred-y_true[index])*X[index][i]) 
    return W

  def train(self, X, y, epochs=10, alpha=0.001, random_state=0):
    """
    Parameters:
    X (array) : Independent Feature
    y (array) : Dependent Features/ Target Variable
    epochs (int) : Number of epochs for training, default value is 10
    alpha (float) : learning rate, default value is 0.001

    Returns:
    y_pred (array) : Predicted Target Variable
    loss (float) : Calculated Sqaured Error Loss for y and y_pred
    """

    num_rows = X.shape[0] #Number of Rows
    num_cols = X.shape[1] #Number of Columns 
    W = np.random.randn(1,num_cols) / np.sqrt(num_rows) #Weight Initialization

    #Calculating Loss and Updating Weights
    train_loss = []
    num_epochs = []
    train_indices = [i for i in range(X.shape[0])]
    for j in range(epochs):
      cost=0
      np.random.seed(random_state)
      np.random.shuffle(train_indices)
      for i in train_indices:
        loss, y_pred = self.forward(X[i],y[i],W[0])
        cost+=loss
        W[0] = self.updateWeights(X,y_pred,y,W[0],alpha,i)
      train_loss.append(cost)
      num_epochs.append(j)
    return W[0], train_loss, num_epochs

  def test(self, X_test, y_test, W_trained):
    """
    Parameters:
    X_test (array) : Independent Features from the Test Set
    y_test (array) : Dependent Features/ Target Variable from the Test Set
    W_trained (array) : Trained Weights
    test_indices (list) : Index to fetch the corresponding values of W_trained,
                          X_test and y_test 

    Returns:
    test_pred (list) : Predicted Target Variable
    test_loss (list) : Calculated Sqaured Error Loss for y and y_pred
    """
    test_pred = []
    test_loss = []
    test_indices = [i for i in range(X_test.shape[0])]
    for i in test_indices:
        loss, y_test_pred = self.forward(X_test[i], W_trained, y_test[i])
        test_pred.append(y_test_pred)
        test_loss.append(loss)
    return test_pred, test_loss
    

  def predict(self, W_trained, X_sample):
    prediction = sum(W_trained * X_sample)
    return prediction

 


In [13]:
# Independent Feature
X = np.asarray(df['Height(Inches)'].values.tolist())


In [14]:
y = np.asarray(df['Weight(Pounds)'].values.tolist())

In [15]:
# Reshaping the independent feature
X = X.reshape(-1,1)

In [16]:
X

array([[65.78331],
       [71.51521],
       [69.39874],
       ...,
       [64.69855],
       [67.52918],
       [68.87761]])

In [17]:
#Adding the feature X0 = 1, so we have the equation: y =  W0 + (W1 * X1) + (W2 * (X1**2))
X = np.concatenate((X,np.ones((25000,1))), axis = 1)


In [18]:
X

array([[65.78331,  1.     ],
       [71.51521,  1.     ],
       [69.39874,  1.     ],
       ...,
       [64.69855,  1.     ],
       [67.52918,  1.     ],
       [68.87761,  1.     ]])

In [19]:
y

array([112.9925, 136.4873, 153.0269, ..., 118.2655, 132.2682, 124.8742])

In [20]:
#Splitting the dataset
X_train, y_train, X_test, y_test = split_data(X,y)

In [21]:
#declaring the "regressor" as an object of the class polynomialRegression
regressor = polynomialRegression()

In [22]:
#Training 
W_trained,train_loss, num_epochs = regressor.train(X_train, y_train, epochs=200, alpha=0.00001)

In [23]:
#Testing on the Test Dataset
test_pred, test_loss = regressor.test(X_test, y_test, W_trained)

In [24]:
y_pred = regressor.train(X_test,y)

In [25]:
def mse(X ,y):
    return np.mean((X - y) ** 2)

In [26]:
y_test.shape
y_pred = np.array(y_pred)


In [27]:
y_pred = regressor.predict(W_trained,X_sample=5)

In [28]:
# compute the mean squared error between the predicted and true values

mse_score = np.sum((y_test-y_pred)**2)/len(y_test)
print('MSE:', mse_score)

MSE: 16957.89656694449


In [29]:
import pickle

In [30]:
model = open('poly.pkl','wb')
model = pickle.dump(regressor,model)