In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, r2_score, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
#from google.colab import drive
#drive.mount('/content/gdrive')

# Load data
# Both wine-quality files are semicolon-delimited and are read via paths
# relative to the working directory.
#data = pd.read_csv("/content/gdrive/MyDrive/winequality-white.csv", sep=";")
data1 = pd.read_csv("winequality-white.csv", sep=";")
data2 = pd.read_csv("winequality-red.csv", sep=";")
# ignore_index=True gives the stacked frame a fresh 0..n-1 index. Without it
# each file keeps its own 0-based row labels, so the labels overlap and a
# label lookup such as data.loc[0] would return more than one row.
data = pd.concat([data1, data2], axis=0, ignore_index=True)

# Exploratory Data Analysis (EDA)
print(f"Shape of  data: {data.shape}")
print(data.head())
column_names =  data.columns.tolist()
print(column_names)
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() would append a stray "None" line to the output.
data.info()
print("Data Summary:")
print(data.describe())  # Print summary statistics
print("\nCorrelation Matrix:")
correlation = data.corr()  # pairwise correlation between all columns
print(correlation)


Shape of  data: (6497, 12)
   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.0              0.27         0.36            20.7      0.045   
1            6.3              0.30         0.34             1.6      0.049   
2            8.1              0.28         0.40             6.9      0.050   
3            7.2              0.23         0.32             8.5      0.058   
4            7.2              0.23         0.32             8.5      0.058   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 45.0                 170.0   1.0010  3.00       0.45   
1                 14.0                 132.0   0.9940  3.30       0.49   
2                 30.0                  97.0   0.9951  3.26       0.44   
3                 47.0                 186.0   0.9956  3.19       0.40   
4                 47.0                 186.0   0.9956  3.19       0.40   

   alcohol  quality  
0      8.8        6  
1      9.5     

In [3]:
# Data Preprocessing
# - Handle missing values (if any)
# - Encode categorical features (if any)
# - Scale numerical features (if needed)

# Separate the predictor columns from the target column.
features = data.drop("quality", axis=1)
y = data["quality"]  # Target variable

# Split data into training and testing sets BEFORE scaling, so the held-out
# rows contribute nothing to the scaler's mean/variance estimates (fitting the
# scaler on the full dataset would leak test-set statistics into training).
features_train, features_test, y_train, y_test = train_test_split(
    features, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply that same transform
# to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(features_train)
X_test = scaler.transform(features_test)
# Full scaled feature matrix, kept under its original name in case a later
# cell uses it; it is scaled with the training-set statistics.
X = scaler.transform(features)

# Define Random Forest Regression model
model = RandomForestRegressor(n_estimators=100, random_state=42)  # Set number of trees (n_estimators)

# Train the model
model.fit(X_train, y_train)

# Make predictions on test set. The regressor outputs continuous values, so
# they are rounded to the nearest whole quality score and cast to int to match
# the integer labels in y_test when computing accuracy.
y_pred = np.around(model.predict(X_test)).astype(int)

# Model evaluation
# NOTE: MAE and R^2 are computed on the rounded predictions, not on the raw
# regression output.
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print("\nModel Evaluation (Random Forest Regression):")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"R-squared (R^2): {r2:.2f}")
print(f"Accuracy: {accuracy:.2f}")


Model Evaluation (Random Forest Regression):
Mean Absolute Error (MAE): 0.32
R-squared (R^2): 0.48
Accuracy: 0.70
