# Random Forest Classifier

**Import Lib**

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler,MinMaxScaler,RobustScaler
from pandas.plotting import parallel_coordinates


**Load the dataset**

In [7]:
data = pd.read_csv("./removed_outlier.csv")

**Overview**

In [8]:
data.head()

Unnamed: 0.1,Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [9]:

data.value_counts(['quality'])


quality
6          1860
5          1101
7           792
8           145
4            88
3             8
9             4
Name: count, dtype: int64

**Split Data**

In [10]:
# Define features and target
X = data.drop(columns=["quality"])
y = data["quality"]
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,stratify=y, random_state=42)
# Standardize the features
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [11]:
pd.DataFrame(X_train).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,0.81601,0.473684,0.493827,0.510638,0.38785,0.589286,0.384615,0.384615,0.50101,0.581081,0.584906,0.464286
1,0.291811,0.315789,0.345679,0.829787,0.042056,0.642857,0.205128,0.282051,0.383165,0.648649,0.433962,0.214286
2,0.001429,0.552632,0.469136,0.553191,0.939252,0.535714,0.551282,0.636752,0.935354,0.243243,0.415094,0.071429
3,0.706963,0.5,0.222222,0.510638,0.065421,0.446429,0.461538,0.384615,0.194613,0.567568,0.415094,0.803571
4,0.224219,0.421053,0.518519,0.319149,0.46729,0.428571,0.384615,0.423077,0.612121,0.662162,0.207547,0.303571


**Train And Evaluate Model**

In [12]:
# Initialize and train the model
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
# report = classification_report(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.70625


**Train And Evaluate Model Using Repeated hold-out**

In [13]:
n_repeats = 10
test_size = 0.2
# Define features and target
X = data.drop(columns=["quality"])
y = data["quality"]
# Initialize a list to store performance metrics
accuracies = []
model = RandomForestClassifier()
for _ in range(n_repeats):
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size,stratify=y, random_state=42)
    
    # Create and train your machine learning model
   
    model.fit(X_train, y_train)
    
    # Make predictions on the test set
    y_pred = model.predict(X_test)
    
    # Calculate and store the accuracy
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)
    print(f"Accuracy: {accuracy:.2f}")

# Calculate and print the average accuracy and its standard deviation
avg_accuracy = sum(accuracies) / n_repeats
std_accuracy = np.std(accuracies)
print(f"Average Accuracy: {avg_accuracy:.2f}")
print(f"Standard Deviation: {std_accuracy:.2f}")



Accuracy: 0.72
Accuracy: 0.70
Accuracy: 0.71
Accuracy: 0.70
Accuracy: 0.72
Accuracy: 0.71
Accuracy: 0.72
Accuracy: 0.70
Accuracy: 0.72
Accuracy: 0.71
Average Accuracy: 0.71
Standard Deviation: 0.01
