In [45]:
import pandas as pd 
import numpy as np

In [46]:
# Location of the white-wine quality CSV read in the next cell.
# NOTE(review): absolute, machine-specific path — the notebook only runs where
# this exact file exists; consider a path relative to the notebook instead.
wine_white_file = "C:/Anaconda3/Notebooks/linear-regression/winequality-white.csv"

In [47]:
# Load the comma-separated file into a DataFrame; header=0 takes the column
# names from the first row of the file.
data = pd.read_csv(wine_white_file,sep=",",header=0)

In [48]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

(4898, 12)

In [49]:
# Column labels available in the loaded frame.
data.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')

In [50]:
# Feature frame: the eleven input columns, i.e. everything except the
# 'quality' target.
feature_columns = [
    'fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
    'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
    'pH', 'sulphates', 'alcohol',
]
# .copy() gives X its own data, so later column assignments on X neither
# trigger SettingWithCopyWarning nor risk writing through to `data`.
X = data.loc[:, feature_columns].copy()

In [51]:
# Preview the first five rows of the feature frame.
X.head(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9


In [52]:
# Target frame: the double brackets keep 'quality' as a one-column DataFrame
# (not a Series), so it converts to an (n, 1) matrix later on.
Y = data[['quality']]

In [53]:
# Preview the first rows of the target frame.
Y.head()

Unnamed: 0,quality
0,6
1,6
2,6
3,6
4,6


In [54]:
# Peek at the first label as a 1x1 matrix. Built directly from Y so this cell
# does not rely on `labels`, which is only assigned further down the notebook
# and would raise NameError here on a fresh kernel.
np.matrix(Y)[0]

matrix([[6]], dtype=int64)

In [55]:
# Min-max normalise every feature column to the [0, 1] range.
# Done column-wise in one vectorised step; unlike a loop using locals named
# `min`/`max`, this leaves the Python builtins intact for later cells.
col_min = X.min()
col_range = X.max() - col_min
# A constant column has zero range; divide by 1 instead so it maps to 0.0
# rather than filling the column with NaN.
X = (X - col_min) / col_range.replace(0, 1)

In [56]:
# Preview the first five rows again, now that the columns have been rescaled.
X.head(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,0.307692,0.186275,0.216867,0.308282,0.106825,0.149826,0.37355,0.267785,0.254545,0.267442,0.129032
1,0.240385,0.215686,0.204819,0.015337,0.118694,0.041812,0.285383,0.132832,0.527273,0.313953,0.241935
2,0.413462,0.196078,0.240964,0.096626,0.121662,0.097561,0.204176,0.154039,0.490909,0.255814,0.33871
3,0.326923,0.147059,0.192771,0.121166,0.145401,0.156794,0.410673,0.163678,0.427273,0.209302,0.306452
4,0.326923,0.147059,0.192771,0.121166,0.145401,0.156794,0.410673,0.163678,0.427273,0.209302,0.306452


In [57]:
# Convert both DataFrames to 2-D numpy matrices; with np.matrix operands the
# `*` operator used by the cost/gradient code means matrix multiplication.
features, labels = np.matrix(X), np.matrix(Y)

In [64]:
# Prepend a column of ones (the intercept / bias term) to the feature matrix.
# The matrix is rebuilt from X every time, so re-running this cell cannot
# stack additional bias columns onto an already-augmented `features`.
bias = np.ones((len(X), 1))
features = np.append(bias, np.matrix(X), axis=1)
features[0]

matrix([[ 1.        ,  0.30769231,  0.18627451,  0.21686747,  0.30828221,
          0.10682493,  0.14982578,  0.37354988,  0.26778485,  0.25454545,
          0.26744186,  0.12903226]])

In [70]:
#Cost function: half the mean squared error, J(theta) = sum((X*theta' - y)^2) / (2m)
def costfunction(featureValues, actualValues, theta):
    """Return the half mean-squared-error cost of a linear model.

    Parameters
    ----------
    featureValues : np.matrix, shape (m, n)
        One row per sample (including the bias column, if any).
    actualValues : np.matrix, shape (m, 1)
        Observed target value for each sample.
    theta : np.matrix, shape (1, n)
        Model weights as a row vector.

    Returns
    -------
    float
        sum((featureValues * theta.T - actualValues) ** 2) / (2 * m), where m
        is the number of samples in ``actualValues``.
    """
    residuals = featureValues * theta.T - actualValues
    # Normalise by the size of the data actually passed in, not by a notebook
    # global, so the function is correct for any subset (e.g. a test split).
    num_samples = len(actualValues)
    return np.sum(np.power(residuals, 2)) / (2 * num_samples)

In [81]:
def gradientDescent(featureValues, actualValues, learning_rate, num_iterations):
    """Fit linear-regression weights with batch gradient descent.

    Parameters
    ----------
    featureValues : np.matrix, shape (m, n)
        One row per sample (including the bias column, if any).
    actualValues : np.matrix, shape (m, 1)
        Observed target value for each sample.
    learning_rate : float
        Step size applied to the averaged gradient on every iteration.
    num_iterations : int
        Number of full-batch updates to perform.

    Returns
    -------
    theta : np.matrix, shape (1, n)
        Weights after the final update.
    cost : list of float
        Value of ``costfunction`` after each update (length ``num_iterations``).
    """
    num_features = featureValues.shape[1]  # one weight per feature column
    theta = np.matrix(np.zeros((1, num_features)))  # start from all-zero weights

    # Loop-invariant step factor: learning rate averaged over the sample count.
    step = learning_rate / len(actualValues)

    cost = []
    for _ in range(num_iterations):
        residuals = featureValues * theta.T - actualValues  # shape (m, 1)
        # Gradient as a single (1, m) x (m, n) product; this equals summing
        # feature * residual over the samples without materialising an (m, n)
        # copy of the residuals on every iteration.
        gradient = residuals.T * featureValues
        theta = theta - step * gradient

        cost.append(costfunction(featureValues, actualValues, theta))
    return theta, cost

In [136]:
# Hyper-parameters for the training run, followed by the fit itself.
learning_rate = 0.09
num_iterations = 50000
theta, cost = gradientDescent(
    features,
    labels,
    learning_rate=learning_rate,
    num_iterations=num_iterations,
)

In [137]:
# Inspect the weights returned by gradient descent.
theta

matrix([[ 5.29640923, -0.17417767, -1.9638404 , -0.01607314,  2.7233268 ,
         -0.25940851,  1.29356319, -0.29101152, -2.37055054,  0.34550015,
          0.41518398,  1.92332786]])

In [138]:
# Cost after the first update versus cost after the last one.
cost[0], cost[-1]

(12.632113399825249, 0.28331680105599644)

In [139]:
# Spot-check one sample: model prediction next to its true label.
test_index = 1336
sample = features[test_index]  # 1 x n row of the feature matrix

# theta (1 x n) times the sample as a column (n x 1) is a 1x1 matrix;
# np.sum collapses it to a scalar.
predictedValue = np.sum(theta * sample.T)
actualValue = labels[test_index]
predictedValue, actualValue

(6.0384136738184591, matrix([[8]], dtype=int64))