## Predict wine quality using Random Forest

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

### Data collection & preprocessing

In [2]:
# Load the red-wine quality table from the CSV under dataset/.
df = pd.read_csv(filepath_or_buffer="dataset/winequality-red.csv")

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [4]:
# Dimensions of the frame as (rows, columns).
df.shape

(1599, 12)

#### Statistical measures

In [5]:
# Summary statistics per column (count, mean, std, min, quartiles, max).
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


#### Check for missing values

In [6]:
# Number of missing entries in each column.
df.isna().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

### Data visualization

In [None]:
# Number of samples for each quality score.
# NOTE(review): left disabled. The call below needs seaborn (`sns`), which the
# import cell at the top of this notebook does not load. Add
# `import seaborn as sns` there before uncommenting.

# sns.catplot(x="quality", data=df, kind="count")

##### Volatile acidity vs. quality

In [None]:
# Bar plot of volatile acidity for each quality score.
# NOTE(review): left disabled. `sns` (seaborn) is not imported in the import
# cell at the top of this notebook; import it there before uncommenting.
# plot = plt.figure(figsize=(7, 8))
# sns.barplot(x="quality", y="volatile acidity", data=df)

##### Citric acid vs. Quality

In [None]:
# Bar plot of citric acid for each quality score.
# NOTE(review): left disabled. `sns` (seaborn) is not imported in the import
# cell at the top of this notebook; import it there before uncommenting.
# plot = plt.figure(figsize=(7, 8))
# sns.barplot(x="quality", y="citric acid", data=df)

##### Correlation

In [7]:
# Pairwise Pearson correlation between all columns of the frame.
correlation = df.corr(method="pearson")

In [None]:
# Heatmap of the correlation matrix computed in the previous cell.
# NOTE(review): left disabled. `sns` (seaborn) is not imported in the import
# cell at the top of this notebook; import it there before uncommenting.
# NOTE(review): `annot_kws={"size": 0}` looks like it would draw the cell
# annotations at font size 0. Probably meant a readable size such as 8; confirm.
# plt.figure(figsize=(10, 10))
# sns.heatmap(correlation, cbar=True, square=True, fmt=".1f", annot=True, annot_kws={"size": 0}, cmap="Blues")

In [12]:
# Feature matrix: every column except the target.
X = df.drop(columns=["quality"])

### Label encoding

In [10]:
# Binarise the target: a quality score of 7 or higher becomes class 1,
# anything lower becomes class 0. The comparison is vectorised, so no per-row
# Python lambda is needed; the result is still an int64 Series named "quality".
y = (df["quality"] >= 7).astype("int64")

### Split dataset into training and testing data

In [13]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

In [14]:
# Shapes of the full target and of its train / test partitions.
print(*(labels.shape for labels in (y, y_train, y_test)))

(1599,) (1279,) (320,)


### Training model using Random Forest

In [15]:
# Seed the forest so its randomised training is repeatable across
# "Restart & Run All"; 2 matches the seed used for the train/test split.
model = RandomForestClassifier(random_state=2)

In [16]:
# Fit the classifier on the training partition.
model.fit(X=X_train, y=y_train)

#### Model evaluation

In [18]:
# 1. Accuracy score - testing data

# Predict labels for the held-out rows, then compare them with the true labels.
test_data_prediction = model.predict(X=X_test)
test_data_accuracy = accuracy_score(y_true=y_test, y_pred=test_data_prediction)

In [19]:
# test_data_accuracy is computed from X_test / y_test, so label it as the
# testing-data score rather than the training-data score.
print(f"Accuracy score (testing data): {test_data_accuracy}")

Accuracy score (training data): 0.925


In [21]:
# Cross-check: the classifier's built-in score on the same test partition.
model.score(X=X_test, y=y_test)

0.925

### Build a predictive system

This system predicts the quality class of a single wine from its 11 measurements. **1** represents good quality (an original quality score of 7 or higher) and **0** represents poor quality (a score below 7).

In [27]:
# Measurements for one wine, listed in the same order as the columns of X
# (fixed acidity ... alcohol).
input_data = (7.5, 0.5, 0.36, 6.1, 0.071, 17.0, 102.0, 0.9978, 3.35, 0.8, 10.5)

# convert data to a numpy array
input_data_array = np.asarray(input_data)

# reshape array to a single row (1 sample, n features)
reshaped_data_array = input_data_array.reshape(1, -1)

# The model was fitted on a DataFrame, so give it the same column names at
# prediction time. A bare array makes scikit-learn warn that X has no valid
# feature names and loses the name-based alignment of the inputs.
input_frame = pd.DataFrame(reshaped_data_array, columns=X.columns)

prediction = model.predict(input_frame)

# 1 = good quality, 0 = poor quality.
if prediction[0] == 1:
    print("Good quality! 😋")
else:
    print("Poor quality. 🤢")

Poor quality. 🤢


