# Linear Regression of Wine Quality on Wine Characteristics

In [1]:
import pandas as pd
from numpy import cov
# Read the white-wine data set from the working directory and preview the first rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, presumably the row
# index that was saved with the CSV -- consider index_col=0 here; confirm first.
wine_quality = pd.read_csv(filepath_or_buffer='wine_quality_white.csv')
wine_quality.head(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


## Slope

In the regression equation $y = \beta_0 + \beta_1 x$, the slope $\beta_1$ is the covariance of $x$ and $y$ divided by the variance of $x$:<br>
$\beta_1 = \frac{\text{cov}\left(x,y\right)}{\sigma_x^2}$

In [2]:
# Slope of the least-squares line: cov(density, quality) / var(density).
# Both statistics use the n - 1 (sample) normalisation, spelled out with ddof=1
# so the numerator and the denominator are on the same footing.
# cov() returns the 2x2 covariance matrix; the off-diagonal entry is cov(x, y).
cov_density_quality = cov(wine_quality['density'], wine_quality['quality'], ddof=1)[0][1]
var_density = wine_quality['density'].var(ddof=1)
slope_density = cov_density_quality / var_density
print(slope_density)

-90.9423999421


## Intercept

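The intercept, $\beta_0$, is chosen so that the fitted line passes through the point of means $(\bar{x}, \bar{y})$:<br>
$\beta_0 = \bar{y} - \beta_1\bar{x}$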

In [4]:
# Intercept of the least-squares line: mean(quality) - slope * mean(density).
mean_quality = wine_quality['quality'].mean()
mean_density = wine_quality['density'].mean()
intercept_density = mean_quality - slope_density * mean_density
print(intercept_density)

96.2771445761


In [6]:
def calc_y(x):
    """Predict quality from density with the hand-computed regression line.

    Evaluates ``intercept_density + slope_density * x`` using the
    notebook-level coefficients. Because it is plain arithmetic, ``x`` may be
    a single number or a whole pandas Series.
    """
    return intercept_density + slope_density * x

# Evaluate the line on the whole column at once instead of calling calc_y once
# per row through Series.apply; the element-wise result is the same.
predicted_quality = calc_y(wine_quality['density'])

## Linear Regression Made Easy Using SciPy

In [7]:
from scipy.stats import linregress

# Fit quality against density with SciPy. The result unpacks into
# (slope, intercept, r-value, p-value, standard error of the slope).
fit = linregress(x=wine_quality['density'], y=wine_quality['quality'])
slope, intercept, r_value, p_value, stderr_slope = fit
print('Slope: ', slope)
print('Intercept: ', intercept)

Slope:  -90.9423999421
Intercept:  96.2771445761


In [9]:
# Predictions from the SciPy fit, evaluated for every wine at once.
predicted_quality = wine_quality['density'] * slope + intercept
# Residuals are taken as predicted minus observed; the sign vanishes once squared.
residuals = predicted_quality - wine_quality['quality']
# Residual sum of squares. Series.sum() performs the reduction in vectorised
# code instead of looping over the elements with the built-in sum();
# skipna=False keeps a NaN residual visible in the total, as the loop did.
rss = (residuals ** 2).sum(skipna=False)
print(rss)

3478.689469688176


In [11]:
# Standard error of the estimate: sqrt(RSS / (n - 2)). Two is subtracted
# because the fitted line has two estimated coefficients (intercept and slope).
n = len(wine_quality.index)
residual_dof = n - 2
stderr = (rss / residual_dof) ** 0.5

def within_count(k, predicted=None, observed=None, std_error=None):
    """Return the fraction of observations whose residual is within ``k`` standard errors.

    Parameters
    ----------
    k : number
        Half-width of the band, in multiples of the standard error.
    predicted : pandas.Series, optional
        Fitted values. Defaults to the notebook-level ``predicted_quality``.
    observed : pandas.Series, optional
        Actual values. Defaults to ``wine_quality['quality']``.
    std_error : float, optional
        Standard error of the estimate. Defaults to the notebook-level ``stderr``.

    Returns
    -------
    float
        Share of observations with ``|predicted - observed| / std_error <= k``.
    """
    # Fall back to the notebook variables so existing calls, within_count(k),
    # keep working unchanged.
    if predicted is None:
        predicted = predicted_quality
    if observed is None:
        observed = wine_quality['quality']
    if std_error is None:
        std_error = stderr

    standardized = (predicted - observed) / std_error
    # The mean of a boolean Series is the proportion of True values, so there
    # is no need to filter the DataFrame and count rows by hand.
    return float((standardized.abs() <= k).mean())

# Share of wines whose prediction error is within 1, 2 and 3 standard errors.
within_one, within_two, within_three = (within_count(k) for k in (1, 2, 3))

In [12]:
# Share of wines whose prediction error is within one standard error
# (roughly 0.68 would be expected if the residuals were normally distributed).
print(within_one)

0.6845651286239282


In [13]:
# Share of wines whose prediction error is within two standard errors
# (roughly 0.95 would be expected if the residuals were normally distributed).
print(within_two)

0.9356880359330338


In [14]:
# Share of wines whose prediction error is within three standard errors
# (roughly 0.997 would be expected if the residuals were normally distributed).
print(within_three)

0.9936708860759493
