# Random Forest Classifier

**Import Lib**

In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler,MinMaxScaler,RobustScaler
from pandas.plotting import parallel_coordinates


**Load the dataset**

In [12]:
data = pd.read_csv("./winequality-white.csv")

**Overview**

In [13]:
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [14]:

data.value_counts(['quality'])


quality
6          2198
5          1457
7           880
8           175
4           163
3            20
9             5
Name: count, dtype: int64

**Split Data**

In [15]:
# Define features and target
X = data.drop(columns=["quality"])
y = data["quality"]
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,stratify=y, random_state=42)
# Standardize the features
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [16]:
pd.DataFrame(X_train).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.173077,0.127451,0.144578,0.058282,0.053412,0.121951,0.327146,0.074995,0.527273,0.104651,0.564516
1,0.230769,0.215686,0.186747,0.015337,0.077151,0.132404,0.225058,0.039136,0.490909,0.197674,0.790323
2,0.25,0.186275,0.192771,0.059816,0.68546,0.205575,0.382831,0.148255,0.363636,0.302326,0.225806
3,0.240385,0.137255,0.204819,0.009202,0.080119,0.10453,0.201856,0.048197,0.309091,0.604651,0.580645
4,0.144231,0.215686,0.096386,0.055215,0.059347,0.121951,0.211137,0.065356,0.527273,0.162791,0.612903


**Train And Evaluate Model**

In [17]:
# Initialize and train the model
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
# report = classification_report(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.6561224489795918


**Train And Evaluate Model Using Repeated hold-out**

In [18]:
n_repeats = 10
test_size = 0.2
# Define features and target
X = data.drop(columns=["quality"])
y = data["quality"]
# Initialize a list to store performance metrics
accuracies = []
model = RandomForestClassifier()
for _ in range(n_repeats):
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size,stratify=y, random_state=42)
    
    # Create and train your machine learning model
   
    model.fit(X_train, y_train)
    
    # Make predictions on the test set
    y_pred = model.predict(X_test)
    
    # Calculate and store the accuracy
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)
    print(f"Accuracy: {accuracy:.2f}")

# Calculate and print the average accuracy and its standard deviation
avg_accuracy = sum(accuracies) / n_repeats
std_accuracy = np.std(accuracies)
print(f"Average Accuracy: {avg_accuracy:.2f}")
print(f"Standard Deviation: {std_accuracy:.2f}")



Accuracy: 0.67
Accuracy: 0.67
Accuracy: 0.67
Accuracy: 0.67
Accuracy: 0.66
Accuracy: 0.66
Accuracy: 0.66
Accuracy: 0.67
Accuracy: 0.65
Accuracy: 0.66
Average Accuracy: 0.66
Standard Deviation: 0.00
