In [None]:
# read winequality-red.csv into a pandas dataframe
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the red-wine quality data; the file is parsed with ';' as the field
# separator rather than the default comma.
df = pd.read_csv('winequality-red.csv', delimiter=';')

# Preview the first rows to confirm the columns were parsed as expected.
df.head()

In [None]:
# Count the missing entries in every column (isna is the canonical name of isnull).
df.isna().sum()


: 

In [None]:
# Remove the features that are not used in the rest of this analysis.
unwanted_features = [
    'citric acid',
    'residual sugar',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'pH',
    'sulphates',
]

# This cell overwrites `df`, so a second execution would look for columns that
# are already gone. errors='ignore' skips labels that are no longer present,
# turning a re-run into a no-op instead of a KeyError.
df = df.drop(columns=unwanted_features, errors='ignore')
df.head()

In [None]:
# Check for outliers in the columns with box plots.
# Each column gets its own axes: the columns live on very different scales
# (the cut-offs used for them range from 0.6 to 15), so a single shared y-axis
# would squash the small-range features into flat lines and hide their outliers.
columns_to_inspect = ['fixed acidity', 'volatile acidity', 'chlorides', 'density', 'alcohol', 'quality']

fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(12, 6))
for ax, column in zip(axes.ravel(), columns_to_inspect):
    df.boxplot(column=column, ax=ax)
fig.suptitle('Box plots per column (outlier check)')
fig.tight_layout()
plt.show()

# Remove outliers: keep only the rows that are strictly below every upper limit.
# NOTE(review): the limits are hand-picked constants; confirm against the box
# plots that each one (the density limit in particular) trims only outliers.
upper_limits = {
    'fixed acidity': 15,
    'volatile acidity': 1.5,
    'chlorides': 0.6,
    'density': 1.0,
    'alcohol': 15,
}

# Compare each column with its own limit (aligned on column label) and require
# all comparisons to hold. Missing values compare as False, so such rows are
# dropped, exactly as with one filter per column.
limits = pd.Series(upper_limits)
within_limits = df[list(upper_limits)].lt(limits, axis='columns').all(axis='columns')

# .copy() detaches the filtered rows from the original frame so that later
# column assignments on `df` do not raise SettingWithCopyWarning.
df = df[within_limits].copy()

# apply standardization on the feature columns of the dataset
from sklearn.preprocessing import StandardScaler

# 'quality' is the target, not a predictor, so it must not be standardized
# together with the features; only the remaining columns go through the scaler.
feature_columns = [column for column in df.columns if column != 'quality']

scaler = StandardScaler()

# df_scaled keeps the same rows, columns and column order as df: the feature
# columns hold the standardized values, 'quality' is carried over unchanged.
# NOTE(review): the modelling code further down builds X and y from `df`, not
# from df_scaled — confirm whether the model is meant to use the scaled features.
df_scaled = df.copy()
df_scaled[feature_columns] = scaler.fit_transform(df[feature_columns])


: 

In [None]:
# Handle the target column: bucket the numeric quality score into three classes.
#   3 (best quality)   : score > 8
#   2 (medium quality) : 6 < score <= 8
#   1 (worst quality)  : score <= 6
# NOTE(review): if the scores in this file never exceed 8, class 3 is never
# assigned and the task is effectively binary — confirm the intended cut-offs.
def quality_class(score):
    """Return the quality class (3, 2 or 1) for a raw quality score."""
    if score > 8:
        return 3
    if score > 6:
        return 2
    return 1


# Split the dataset into training and testing dataset with 80:20 ratio
from sklearn.model_selection import train_test_split

# The labels are derived into `y` and `df` itself is left untouched. Writing the
# classes back into df['quality'] would make this cell unsafe to re-run (already
# mapped labels 1-3 would all be re-mapped to 1) and would assign into a
# filtered frame, which raises SettingWithCopyWarning.
X = df.drop(columns='quality')
y = df['quality'].map(quality_class)

# random_state pins the shuffle so the 80:20 split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build and train the Gaussian Naive Bayes classifier.
from sklearn.naive_bayes import GaussianNB

# fit() returns the estimator itself, so construction and training are chained;
# `gnb` is the fitted classifier.
gnb = GaussianNB().fit(X_train, y_train)

# Predict a quality class for every row of the held-out test set.
y_pred = gnb.predict(X_test)

# Evaluate the predictions against the true labels of the test set.
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score

# Fraction of test rows whose predicted class matches the true class.
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

# average='macro': unweighted mean of the per-class recall values.
macro_recall = recall_score(y_test, y_pred, average='macro')
print("Recall:", macro_recall)

# Confusion matrix: rows are true classes, columns are predicted classes.
test_confusion = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:", test_confusion)
