In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split 
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
#import matplotlib.pyplot as plt
#from mpl_toolkits import mplot3d

In [None]:
# Download the semicolon-delimited CSV from GitHub into a DataFrame and preview
# its first rows.
url = 'https://raw.githubusercontent.com/ncmerem/LinearRegression/main/wqW.csv'
data = pd.read_csv(url, delimiter=';')
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column of the raw data.
data.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0
mean,6.854788,0.278241,0.334192,6.391415,0.045772,35.308085,138.360657,0.994027,3.188267,0.489847,10.514267,5.877909
std,0.843868,0.100795,0.12102,5.072058,0.021848,17.007137,42.498065,0.002991,0.151001,0.114126,1.230621,0.885639
min,3.8,0.08,0.0,0.6,0.009,2.0,9.0,0.98711,2.72,0.22,8.0,3.0
25%,6.3,0.21,0.27,1.7,0.036,23.0,108.0,0.991723,3.09,0.41,9.5,5.0
50%,6.8,0.26,0.32,5.2,0.043,34.0,134.0,0.99374,3.18,0.47,10.4,6.0
75%,7.3,0.32,0.39,9.9,0.05,46.0,167.0,0.9961,3.28,0.55,11.4,6.0
max,14.2,1.1,1.66,65.8,0.346,289.0,440.0,1.03898,3.82,1.08,14.2,9.0


In [None]:
# Standardise every column (the `quality` target included) to z-scores:
# subtract the column mean, then divide by the column standard deviation.
data_z = data.sub(data.mean(), axis='columns').div(data.std(), axis='columns')
data_z.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0
mean,2.653755e-14,-1.053431e-14,5.34461e-14,-2.538326e-15,-1.419036e-15,6.210721e-18,-1.387439e-16,2.148461e-12,1.316599e-14,-1.280696e-14,-2.846868e-14,3.622675e-16
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-3.619982,-1.966784,-2.761461,-1.141827,-1.683102,-1.958477,-3.043919,-2.312802,-3.101091,-2.364468,-2.043089,-3.24953
25%,-0.657434,-0.6770318,-0.5304215,-0.924953,-0.447289,-0.7237012,-0.7144009,-0.770628,-0.6507699,-0.6996389,-0.8241915,-0.9912727
50%,-0.06492444,-0.1809733,-0.117266,-0.2348977,-0.1268931,-0.07691388,-0.1026084,-0.09608339,-0.05474574,-0.1739035,-0.09285319,0.1378561
75%,0.5275851,0.414297,0.4611517,0.6917479,0.1935028,0.6286722,0.6738976,0.6929749,0.6075033,0.5270772,0.719745,0.1378561
max,8.704217,8.152811,10.9553,11.71292,13.74167,14.91679,7.09772,15.02976,4.183648,5.171074,2.99502,3.525242


In [None]:
# Per-column mean and standard deviation of the raw (unscaled) data,
# printed for reference.
mu = data.mean()
sigma = data.std()
print("Standard Deviations\n", sigma)
print("\nAverages\n", mu)

Standard Deviations
 fixed acidity            0.843868
volatile acidity         0.100795
citric acid              0.121020
residual sugar           5.072058
chlorides                0.021848
free sulfur dioxide     17.007137
total sulfur dioxide    42.498065
density                  0.002991
pH                       0.151001
sulphates                0.114126
alcohol                  1.230621
quality                  0.885639
dtype: float64

Averages
 fixed acidity             6.854788
volatile acidity          0.278241
citric acid               0.334192
residual sugar            6.391415
chlorides                 0.045772
free sulfur dioxide      35.308085
total sulfur dioxide    138.360657
density                   0.994027
pH                        3.188267
sulphates                 0.489847
alcohol                  10.514267
quality                   5.877909
dtype: float64


In [None]:
def mse_cost(feature, target, weight):
  """Return the halved mean squared error of a linear model.

  The value is ``sum((feature @ weight.T - target) ** 2) / (2 * m)`` where
  ``m`` is the number of rows of ``feature``.

  Args:
    feature: 2-D design matrix of shape (m, n); ``np.matrix`` or array-like.
    target: the m observed values, shape (m, 1) or (m,).
    weight: the n model coefficients, shape (1, n) or (n,).

  Returns:
    The cost as a scalar float.
  """
  # Normalise everything to plain 2-D float arrays so that `@` is always a
  # matrix product. The previous `feature * weight.T` was only a matrix
  # product for np.matrix inputs; with ndarrays it broadcast element-wise.
  X = np.asarray(feature, dtype=float)
  y = np.asarray(target, dtype=float).reshape(-1, 1)
  w = np.asarray(weight, dtype=float).reshape(1, -1)

  residual = X @ w.T - y
  return np.sum(np.square(residual)) / (2 * len(X))

In [None]:
def train(features, target, weights, step, iters, log_every=10):
  """Fit linear-regression weights with batch gradient descent.

  Every iteration applies the simultaneous update
  ``w <- w - (step / m) * (X w^T - y)^T X`` and records the halved mean
  squared error ``sum((X w^T - y) ** 2) / (2 * m)``.

  Args:
    features: 2-D design matrix of shape (m, n); ``np.matrix`` or array-like.
    target: the m observed values, shape (m, 1) or (m,).
    weights: initial coefficients, shape (1, n) or (n,). Never modified.
    step: learning rate.
    iters: number of gradient-descent updates to perform.
    log_every: print the cost every ``log_every`` iterations; pass 0 or
      None to disable printing. Defaults to 10.

  Returns:
    A ``(weights, cost)`` tuple. ``weights`` holds the fitted coefficients
    (an ``np.matrix`` of shape (1, n) when ``weights`` was an ``np.matrix``,
    otherwise an ndarray with the shape of the input). ``cost`` is an ndarray
    of length ``iters + 1``: ``cost[0]`` is the cost of the initial weights
    and ``cost[k]`` the cost after ``k`` updates.

  Raises:
    ValueError: if ``features`` has no rows.
  """
  # Plain 2-D float arrays make `@` an unambiguous matrix product for both
  # np.matrix and ndarray callers.
  X = np.asarray(features, dtype=float)
  y = np.asarray(target, dtype=float).reshape(-1, 1)
  # np.array copies, so the caller's initial weights stay untouched.
  w = np.array(weights, dtype=float).reshape(1, -1)

  m = len(X)
  if m == 0:
    raise ValueError("features must contain at least one row")
  scale = step / m  # loop-invariant factor of the update

  cost = np.zeros(iters + 1)

  for i in range(iters):
    error = X @ w.T - y
    # Same halved-MSE formula as mse_cost, evaluated on the residual that is
    # needed for the gradient anyway (avoids a second X @ w.T per iteration).
    cost[i] = np.sum(np.square(error)) / (2 * m)

    if log_every and i % log_every == 0:
      print("iter={}    cost={}".format(i, cost[i]))

    # Vectorised gradient: one (1, n) row holding every partial derivative.
    w = w - scale * (error.T @ X)

  # Cost of the final weights.
  cost[iters] = np.sum(np.square(X @ w.T - y)) / (2 * m)

  # Hand back the same kind of container the caller supplied.
  if isinstance(weights, np.matrix):
    w = np.asmatrix(w)
  else:
    w = w.reshape(np.shape(weights))

  return w, cost

In [None]:
# Disabled helper: the text below is a bare string literal, so no `test`
# function is defined; the cell only evaluates (and echoes) the string.
# The sketched function would return the predictions `features * weights.T`
# together with mse_cost for the same features, target and weights.
'''
def test(features, target, weights):
  predictions = features * weights.T
  test_mse = mse_cost(features, target, weights)
  return predictions, test_mse
'''

'\ndef test(features, target, weights):\n  predictions = features * weights.T\n  test_mse = mse_cost(features, target, weights)\n  return predictions, test_mse\n'

In [None]:
# Add a constant column of ones (x_0) to the standardised frame, then separate
# the standardised `quality` target from the remaining predictor columns.
data_z['x_0'] = 1
y = data_z['quality']
X = data_z.drop(columns=['quality'])

In [None]:
# Wrap the data as np.matrix objects: X keeps one row per sample, y becomes a
# column vector, and W starts as a row of zeros with one weight per column of X.
X = np.matrix(X.to_numpy())
y = np.matrix(y.to_numpy()).transpose()
W = np.matrix(np.zeros((1, X.shape[1])))

In [None]:
# Sanity check: shapes of the design matrix, the target column and the weight row.
print(X.shape, y.shape, W.shape)

(4898, 12) (4898, 1) (1, 12)


In [None]:
# Hold out 20% of the rows as a test set; random_state pins the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=3
)
print(X_train.shape, y_train.shape, W.shape)
print(X_test.shape, y_test.shape)

(3918, 12) (3918, 1) (1, 12)
(980, 12) (980, 1)


In [None]:
# Baseline: cost on the training split with the initial all-zero weights W.
mse_cost(X_train, y_train, W)

0.4930676279622134

In [None]:
# Gradient-descent settings (step size, number of updates), followed by the
# training run on the training split starting from W.
lr, iters = 0.01, 1000
W_final, cost = train(features=X_train, target=y_train, weights=W, step=lr, iters=iters)

iter=0    cost=0.4930676279622134
iter=10    cost=0.4596196454418549
iter=20    cost=0.4380588459991032
iter=30    cost=0.4233332282474974
iter=40    cost=0.41270161589102144
iter=50    cost=0.404644485231683
iter=60    cost=0.3982946878451881
iter=70    cost=0.39313854180688074
iter=80    cost=0.388857818216767
iter=90    cost=0.38524537908105955
iter=100    cost=0.3821594930083651
iter=110    cost=0.37949860058520146
iter=120    cost=0.3771869990717154
iter=130    cost=0.3751664403389812
iter=140    cost=0.37339099442568374
iter=150    cost=0.3718237644756719
iter=160    cost=0.37043468689305564
iter=170    cost=0.36919899345030843
iter=180    cost=0.3680960953433389
iter=190    cost=0.3671087485318325
iter=200    cost=0.36622241462552596
iter=210    cost=0.3654247627427333
iter=220    cost=0.36470527604373304
iter=230    cost=0.36405493778486925
iter=240    cost=0.3634659788302682
iter=250    cost=0.3629316732701768
iter=260    cost=0.3624461720551968
iter=270    cost=0.362004366891

In [None]:
# Cost of the fitted weights on the training split.
mse_cost(X_train, y_train, W_final)

0.3567031944254385

In [None]:
# Cost of the fitted weights on the held-out test split.
mse_cost(X_test, y_test, W_final)

0.37922630809035096