# Random Forest Classifier

**Import Lib**

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from pandas.plotting import parallel_coordinates
import matplotlib.pyplot as plt
import seaborn as sns

**Load the dataset**

In [2]:
data = pd.read_csv("./winequality-white.csv")

**Overview**

In [3]:
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [4]:
data.value_counts(['quality'])

quality
6          2198
5          1457
7           880
8           175
4           163
3            20
9             5
Name: count, dtype: int64

**Train And Evaluate Model**

In [5]:

# Define features and target
X = data.drop(columns=["quality"])
y = data["quality"]
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,stratify=y, random_state=42)


# Initialize and train the model
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("Accuracy:", accuracy)
print(report)

Accuracy: 0.6642857142857143
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         4
           4       0.75      0.18      0.29        33
           5       0.69      0.66      0.67       291
           6       0.63      0.80      0.70       440
           7       0.71      0.48      0.57       176
           8       1.00      0.51      0.68        35
           9       0.00      0.00      0.00         1

    accuracy                           0.66       980
   macro avg       0.54      0.38      0.42       980
weighted avg       0.68      0.66      0.65       980



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**Train And Evaluate Model Using Repeated hold-out**

In [6]:
n_repeats = 10
test_size = 0.3
# Define features and target
X = data.drop(columns=["quality"])
y = data["quality"]
# Initialize a list to store performance metrics
accuracies = []
for _ in range(n_repeats):
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size,stratify=y, random_state=None)
    
    # Create and train your machine learning model
    model = RandomForestClassifier()
    model.fit(X_train, y_train)
    
    # Make predictions on the test set
    y_pred = model.predict(X_test)
    
    # Calculate and store the accuracy
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)
    print(f"Accuracy: {accuracy:.2f}")

# Calculate and print the average accuracy and its standard deviation
avg_accuracy = sum(accuracies) / n_repeats
std_accuracy = np.std(accuracies)
print(f"Average Accuracy: {avg_accuracy:.2f}")
print(f"Standard Deviation: {std_accuracy:.2f}")



Accuracy: 0.66
Accuracy: 0.68
Accuracy: 0.67
Accuracy: 0.67
Accuracy: 0.67
Accuracy: 0.69
Accuracy: 0.65
Accuracy: 0.65
Accuracy: 0.67
Accuracy: 0.67
Average Accuracy: 0.67
Standard Deviation: 0.01
