In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, mean_absolute_error, mean_squared_error, r2_score

from sklearn.linear_model import LinearRegression

LOAD DATASET

In [2]:
# Location of the red-wine quality CSV that the notebook reads.
url = (
    "https://raw.githubusercontent.com/"
    "jbrownlee/Datasets/master/winequality-red.csv"
)

In [3]:
# The CSV has no header row, so the column names are supplied explicitly below.

In [4]:
# Column labels to apply when reading the file; the last entry is the
# prediction target.
col_names = [
    "fixed_acidity",
    "volatile_acidity",
    "citric_acid",
    "residual_sugar",
    "chlorides",
    "free_sulfur_dioxide",
    "total_sulfur_dioxide",
    "density",
    "pH",
    "sulphates",
    "alcohol",
    "quality",
]

In [5]:
# Read the CSV into a DataFrame. Passing `names` already makes pandas treat the
# first line as data; `header=None` just states that explicitly.
data = pd.read_csv(url, header=None, names=col_names)

In [6]:
# Show the first five rows as a quick sanity check of the load.
print(data.head(5))

   fixed_acidity  volatile_acidity  citric_acid  residual_sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free_sulfur_dioxide  total_sulfur_dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0                  54.0   0.9970  3.26       0.65   
3                 17.0                  60.0   0.9980  3.16       0.58   
4                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality  
0      9.4        5  
1      9.8        5  
2      9.8        5 

SPLIT FEATURES AND LABEL (quality)

In [7]:
# Label: the "quality" column. Features: every remaining column.
y = data["quality"]
x = data.drop(columns="quality")

SPLIT TRAIN AND TEST DATA (test size 20%; fixed random_state so the split is the same on every run)

In [8]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

TRAIN MODEL

In [None]:
# Ordinary least-squares linear regression; the intercept term is fitted
# (scikit-learn's default, spelled out here).
model = LinearRegression(fit_intercept=True)


In [9]:
# Fit the regression on the training split only.
model.fit(X=x_train, y=y_train)

PREDICT USING MODEL

In [10]:
# Predicted quality for the held-out test rows.
y_pred = model.predict(X=x_test)

EVALUATE

In [11]:
# Mean absolute error between the test labels and the predictions.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)

In [12]:
# Mean squared error between the test labels and the predictions.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)

In [13]:
# Root mean squared error: the square root of the MSE computed above.
rmse = mse ** (1 / 2)

In [14]:
# R² score of the predictions against the test labels.
r2 = r2_score(y_true=y_test, y_pred=y_pred)

In [21]:
# Gather the four metrics into one table and round the values to two
# decimals for display.
evaluation_chart = pd.DataFrame(
    {
        "Metric": ["MAE", "MSE", "RMSE", "R2 Score"],
        "Value": [mae, mse, rmse, r2],
    }
).round({"Value": 2})

print(evaluation_chart)

     Metric  Value
0       MAE   0.50
1       MSE   0.39
2      RMSE   0.62
3  R2 Score   0.40


In [34]:
# Basic statistics of the target column.
# NOTE(review): `max` and `min` shadow Python's built-in functions for the rest
# of the session. The names are kept because later cells read them.
max, min = data["quality"].max(), data["quality"].min()
mean, median = data["quality"].mean(), data["quality"].median()

In [35]:
# Tabulate the target-column statistics computed above, rounded to two
# decimals for display.
data_summary = pd.DataFrame(
    {
        "Metric": ["Max", "Min", "Mean", "Median"],
        "Value": [max, min, mean, median],
    }
).assign(Value=lambda frame: frame["Value"].round(2))

print(data_summary)

   Metric  Value
0     Max   8.00
1     Min   3.00
2    Mean   5.64
3  Median   6.00


#### **Evaluating MAE**
An MAE of 0.5 means the model's predictions are off by 0.5 quality points on average, on a scale that runs from 3 to 8 in this dataset.
To put that in perspective, we express the error as a percentage of that range.

In [41]:
# Express the MAE as a percentage of the observed quality range (max - min).
# The range is taken straight from the data rather than from the notebook-level
# `max`/`min` variables: those shadow Python's built-ins, so if the summary cell
# has not been run, `max - min` would subtract two built-in functions and raise
# a TypeError.
quality_range = data["quality"].max() - data["quality"].min()
percent_mae = mae / quality_range * 100
print(round(percent_mae,2),"% of error")

10.07 % of error


In [43]:
# Fitted coefficients of the linear model (presumably one per feature column,
# in the column order of `x` — confirm against `x.columns`).
coefficients = model.coef_
print(coefficients)

[ 2.30853339e-02 -1.00130443e+00 -1.40821461e-01  6.56431104e-03
 -1.80650315e+00  5.62733439e-03 -3.64444893e-03 -1.03515936e+01
 -3.93687732e-01  8.41171623e-01  2.81889567e-01]
