# **Red Wine Quality Prediction using Random Forest with Hyperparameter Tuning**

In [29]:
import warnings
warnings.filterwarnings('ignore')

> **1. Data Pre-processing :**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [4]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fixed acidity,1599.0,8.319637,1.741096,4.6,7.1,7.9,9.2,15.9
volatile acidity,1599.0,0.527821,0.17906,0.12,0.39,0.52,0.64,1.58
citric acid,1599.0,0.270976,0.194801,0.0,0.09,0.26,0.42,1.0
residual sugar,1599.0,2.538806,1.409928,0.9,1.9,2.2,2.6,15.5
chlorides,1599.0,0.087467,0.047065,0.012,0.07,0.079,0.09,0.611
free sulfur dioxide,1599.0,15.874922,10.460157,1.0,7.0,14.0,21.0,72.0
total sulfur dioxide,1599.0,46.467792,32.895324,6.0,22.0,38.0,62.0,289.0
density,1599.0,0.996747,0.001887,0.99007,0.9956,0.99675,0.997835,1.00369
pH,1599.0,3.311113,0.154386,2.74,3.21,3.31,3.4,4.01
sulphates,1599.0,0.658149,0.169507,0.33,0.55,0.62,0.73,2.0


In [5]:
X = df.drop('quality', axis=1)
Y = df['quality']

In [7]:
# Train test split
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(X, Y, test_size=0.20, random_state=42)

> **2. Fitting Random Forest Algorithm to Training Data :**

In [8]:
from sklearn.ensemble import RandomForestClassifier

In [10]:
classifier = RandomForestClassifier(n_estimators=10, criterion='entropy')
classifier.fit(x_train, y_train)

> **3. Predicting Test Set Result :**

In [11]:
y_pred = classifier.predict(x_test)

> **4. Evaluate Performance of Model :**

In [12]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [13]:
mat = confusion_matrix(y_test, y_pred)
mat

array([[ 0,  0,  1,  0,  0,  0],
       [ 0,  1,  7,  2,  0,  0],
       [ 0,  1, 99, 28,  2,  0],
       [ 0,  0, 39, 82, 11,  0],
       [ 0,  0,  0, 22, 20,  0],
       [ 0,  0,  0,  0,  5,  0]])

In [16]:
score = accuracy_score(y_test, y_pred)
score

# The score is : 0.63125, which not so good

0.63125

In [15]:
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.50      0.10      0.17        10
           5       0.68      0.76      0.72       130
           6       0.61      0.62      0.62       132
           7       0.53      0.48      0.50        42
           8       0.00      0.00      0.00         5

    accuracy                           0.63       320
   macro avg       0.39      0.33      0.33       320
weighted avg       0.61      0.63      0.62       320



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# **Fine Tuning using RandomSearchCV**

In [24]:
from sklearn.ensemble import RandomForestRegressor

regressor = RandomForestRegressor(random_state = 42)

d = regressor.get_params()

for i in d:
    print(i, ':', d[i])

bootstrap : True
ccp_alpha : 0.0
criterion : squared_error
max_depth : None
max_features : 1.0
max_leaf_nodes : None
max_samples : None
min_impurity_decrease : 0.0
min_samples_leaf : 1
min_samples_split : 2
min_weight_fraction_leaf : 0.0
n_estimators : 100
n_jobs : None
oob_score : False
random_state : 42
verbose : 0
warm_start : False


> **1. Create  :**

In [None]:
boot = [True, False]
n_estim = [int(i) for i in np.linspace(start = 200, stop = 2000, num = 10)]
max_features = ['sqrt']
max_depth = [int(x) for x in np.linspace(2, 14, num = 7)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]

random_grid = {'n_estimators': n_estim,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': boot}

for i in random_grid:
    print(i, ':', random_grid[i])

> **2. Create RandomizedSearchCV object :**

In [27]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor

In [31]:
regressor = RandomForestRegressor()

rscv = RandomizedSearchCV(estimator=regressor, param_distributions=random_grid, n_iter=10, cv=5, random_state=42, n_jobs=-1)

# Fit the random search model
rscv.fit(x_train, y_train)

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


> **4. Tuned Model, Parameters and Score :**

In [32]:
best_params = rscv.best_params_
print(best_params)

{'n_estimators': 1000, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': None, 'bootstrap': True}


In [33]:
best_score = rscv.best_score_
print(best_score)

0.4297532585688636


In [34]:
best_model = rscv.best_estimator_