In [1]:
# 9/17/24

# Let my try to make a ML model to predict the quality of wine based on the features of the wine.
# I will use the wine quality dataset from UCI machine learning repository. 
# Specifically this notebook will be using the red wine dataset.
# A linear regression ML model will be my goal here.


In [3]:
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import csv
import json
import os
from pandas import DataFrame, Series

import seaborn as sns

from itertools import combinations


from sklearn import datasets
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LinearRegression
lr = LinearRegression()

from sklearn.tree import DecisionTreeRegressor
dtr = DecisionTreeRegressor()

from sklearn.neighbors import KNeighborsRegressor
knr = KNeighborsRegressor()

from sklearn.svm import SVR
svr = SVR()


from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import VotingRegressor


from scipy.stats import ttest_rel


In [None]:
# I don't think I will need any second order interaction terms here, like density * alcohol.
# I will just use the columns as is and have multiple features in the model.


In [4]:
lr = LinearRegression()
dtr = DecisionTreeRegressor()
knr = KNeighborsRegressor()
svr = SVR()


In [5]:
# Let me show just an example of the fiat dataset how we will prepare the actual dataset.

fiat = datasets.fetch_openml(data_id=43828, as_frame=True)
fiat.data.info()


  warn(


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1538 entries, 0 to 1537
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   model            1538 non-null   object 
 1   engine_power     1538 non-null   int64  
 2   age_in_days      1538 non-null   int64  
 3   km               1538 non-null   int64  
 4   previous_owners  1538 non-null   int64  
 5   lat              1538 non-null   float64
 6   lon              1538 non-null   float64
dtypes: float64(2), int64(4), object(1)
memory usage: 84.2+ KB


In [7]:
# I will transform the columns so that any categorical columns are one hot encoded.
# That means that the model column will have each of its unique values turned into a column itself.
# For example, if the model column has values of 'a', 'b', and 'c', then the model column 
# will be turned into 3 separate columns, where the columns indicate if the value is 'a', 'b', or 'c'.

# So the fiat dataset will go from having 7 columns to 9 columns.
# See below.

# This is what OneHot encoding does.

# My wine dataset actually won't need this, though, because all the columns are numerical.
# I will still show how to do this, though.



In [8]:
ct = ColumnTransformer([("encoder",
OneHotEncoder(sparse=False), [0])],  # [0] is the column index of the column to be one hot encoded. In our case, it is the model column.
remainder="passthrough")


new_data = ct.fit_transform(fiat.data)
        
type(ct)

ct.get_feature_names_out()



fiat_new_data = pd.DataFrame(new_data, columns =
ct.get_feature_names_out(), index = fiat.data.index)


fiat_new_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1538 entries, 0 to 1537
Data columns (total 9 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   encoder__model_lounge       1538 non-null   float64
 1   encoder__model_pop          1538 non-null   float64
 2   encoder__model_sport        1538 non-null   float64
 3   remainder__engine_power     1538 non-null   float64
 4   remainder__age_in_days      1538 non-null   float64
 5   remainder__km               1538 non-null   float64
 6   remainder__previous_owners  1538 non-null   float64
 7   remainder__lat              1538 non-null   float64
 8   remainder__lon              1538 non-null   float64
dtypes: float64(9)
memory usage: 108.3 KB




In [44]:
# Now looking at the red wine data, let me make a linear regression model to predict quality.

# Read the Red Wine data
df = pd.read_csv('winequality-red.csv')

# Display the first 5 rows of the data
df


Unnamed: 0,"fixed acidity;""volatile acidity"";""citric acid"";""residual sugar"";""chlorides"";""free sulfur dioxide"";""total sulfur dioxide"";""density"";""pH"";""sulphates"";""alcohol"";""quality"""
0,7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5
1,7.8;0.88;0;2.6;0.098;25;67;0.9968;3.2;0.68;9.8;5
2,7.8;0.76;0.04;2.3;0.092;15;54;0.997;3.26;0.65;...
3,11.2;0.28;0.56;1.9;0.075;17;60;0.998;3.16;0.58...
4,7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5
...,...
1594,6.2;0.6;0.08;2;0.09;32;44;0.9949;3.45;0.58;10.5;5
1595,5.9;0.55;0.1;2.2;0.062;39;51;0.99512;3.52;0.76...
1596,6.3;0.51;0.13;2.3;0.076;29;40;0.99574;3.42;0.7...
1597,5.9;0.645;0.12;2;0.075;32;44;0.99547;3.57;0.71...


In [45]:
# Split the values in the column into separate columns
df_cleaned = df['fixed acidity;"volatile acidity";"citric acid";"residual sugar";"chlorides";"free sulfur dioxide";"total sulfur dioxide";"density";"pH";"sulphates";"alcohol";"quality"'].str.split(';', expand=True)

# Rename the columns
df_cleaned.columns = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'quality']

# Convert the columns to appropriate data types
df_cleaned = df_cleaned.astype({'fixed acidity': float, 'volatile acidity': float, 'citric acid': float, 'residual sugar': float, 'chlorides': float, 'free sulfur dioxide': float, 'total sulfur dioxide': float, 'density': float, 'pH': float, 'sulphates': float, 'alcohol': float, 'quality': int})

# Display the cleaned dataframe
df_cleaned


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [46]:
# Let me save a subset of the data to a test dataframe.
# I will use the last 5 rows of the data for testing.
df_test = df_cleaned.tail(5)

# And also remove the test data from the original dataframe.
df_cleaned = df_cleaned.drop(df_test.index)
df_cleaned


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1589,6.6,0.725,0.20,7.8,0.073,29.0,79.0,0.99770,3.29,0.54,9.2,5
1590,6.3,0.550,0.15,1.8,0.077,26.0,35.0,0.99314,3.32,0.82,11.6,6
1591,5.4,0.740,0.09,1.7,0.089,16.0,26.0,0.99402,3.67,0.56,11.6,6
1592,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6


In [47]:
# Do n number of cross validation folds on the model.

scores = cross_validate(lr, df_cleaned.drop('quality', axis=1), df_cleaned['quality'], 
cv=10, scoring='neg_root_mean_squared_error')

scores


{'fit_time': array([0.08214116, 0.00560308, 0.00389409, 0.00467467, 0.00438094,
        0.00432897, 0.00295186, 0.00422215, 0.00274014, 0.00622916]),
 'score_time': array([0.00184488, 0.00193501, 0.0057528 , 0.00132227, 0.00597501,
        0.0014708 , 0.00539708, 0.00144291, 0.0018611 , 0.00252771]),
 'test_score': array([-0.68614001, -0.63254008, -0.68513029, -0.65684812, -0.61503238,
        -0.71946656, -0.63532199, -0.62844202, -0.63685563, -0.70212002])}

In [48]:
# RMSE values of the 10 cross validation folds.

scores['test_score']


array([-0.68614001, -0.63254008, -0.68513029, -0.65684812, -0.61503238,
       -0.71946656, -0.63532199, -0.62844202, -0.63685563, -0.70212002])

In [49]:
scores['test_score'].mean()


-0.6597897085384468

In [50]:
# This cell will give the model RMSE.
# Smaller RMSE is better. RMSE of 0 is perfect.

lr_rmse = rmse = 0-scores["test_score"]
rmse.mean()
lr_rmse
print("lr_rmse = " + str(lr_rmse.mean()))


lr_rmse = 0.6597897085384468


In [51]:
# Now let's fit the model to the data.
lr.fit(df_cleaned.drop('quality', axis=1), df_cleaned['quality'])


In [52]:
# And now use the model to predict the test data.

lr.predict(df_test.drop('quality', axis=1))

array([5.53385924, 5.96646765, 5.94623399, 5.475676  , 6.00997871])