In [None]:
import pandas as pd

# Load the dataset from 'WineQT.csv' (path is relative to the working directory).
data = pd.read_csv('WineQT.csv')

# Quick structural overview: dimensions, first rows, dtypes, missing values.
print(data.shape)
print(data.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
data.info()
print(data.isna().sum())

In [None]:
# Summary statistics with one row per column, rounded to two decimals.
data.describe().T.round(2)

In [15]:
# Remove the 'Id' column before analysis.
# Rebinding (instead of inplace=True) together with errors='ignore' makes this
# cell safe to re-run: a second execution no longer raises KeyError because
# 'Id' has already been removed.
data = data.drop(columns='Id', errors='ignore')

In [None]:
# Show which distinct values occur in the target column.
target_classes = data['quality'].unique()
print("Unique target classes:", target_classes)

In [None]:
# Line chart of every column against the row index.
data.plot.line(figsize=(15, 7))

In [None]:
# Average of every column within each quality group, rounded to two decimals.
data.groupby('quality').agg('mean').round(2)

In [None]:
# Bar chart of the per-quality column averages (one bar group per quality value).
data.groupby('quality').mean().plot.bar(figsize=(20, 10))

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Columns to overlay on a single axis, mapped to their legend labels.
# Dict order is the drawing order, which also determines line colours.
legend_labels = {
    'volatile acidity': "Volatile Acidity",
    'citric acid': "Citric Acid",
    'chlorides': "Chlorides",
    'pH': "pH",
    'sulphates': "Sulphates",
    'alcohol': "Alcohol",
    'total sulfur dioxide': "Total Sulfur Dioxide",
    'free sulfur dioxide': "Free Sulfur Dioxide",
}

plt.figure(figsize=(20, 10))

# One line per column, all plotted against the quality score.
for column, label in legend_labels.items():
    sns.lineplot(data=data, x='quality', y=column, label=label)

# Set the shared labels last: each lineplot call labels the axes itself.
plt.title("Impact On Quality")
plt.xlabel('Quality')
plt.ylabel('Quantity')
plt.legend()
plt.show()


In [None]:
# Names of every column except the target; the count is shown as the cell output.
feature_list = data.columns.drop('quality').tolist()
len(feature_list)

In [None]:
# One panel per feature, each showing the feature against the quality score.
# The row count is derived from the number of features rather than fixed, so
# the grid cannot run out of axes (IndexError) when the feature list grows.
cols = 4
rows = max(1, -(-len(feature_list) // cols))  # ceiling division, at least one row

# 5 inches of height per row reproduces the original 20x15 figure for 3 rows.
# squeeze=False guarantees a 2-D axes array regardless of the grid shape.
fig, axes = plt.subplots(rows, cols, figsize=(20, 5 * rows), squeeze=False)

axes = axes.ravel()

for ax, feature in zip(axes, feature_list):
    sns.lineplot(data=data, x='quality', y=feature, ax=ax)
    ax.set_title(f"Impact of {feature} on quality")
    ax.set_xlabel("Quality")
    ax.set_ylabel("Quantity")

# Hide panels left over when the feature count is not a multiple of `cols`;
# otherwise they render as empty axes in the figure.
for unused_ax in axes[len(feature_list):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()


In [None]:
import plotly.express as px

# Animated scatter of the two sulfur-dioxide columns, one frame per quality value.
x_col, y_col = 'free sulfur dioxide', 'total sulfur dioxide'

# Pin both axis ranges to the full data extent (plus a 5% margin). Animated
# Plotly figures do not rescale their axes between frames, so without fixed
# ranges points in later frames can fall outside the visible area.
axis_ranges = {}
for range_arg, col in (('range_x', x_col), ('range_y', y_col)):
    lo, hi = float(data[col].min()), float(data[col].max())
    margin = 0.05 * (hi - lo)
    axis_ranges[range_arg] = [lo - margin, hi + margin]

# Sorting by quality makes the frames (and the slider) step through the scores
# in ascending order instead of following the row order of the frame.
px.scatter(
    data.sort_values('quality'),
    x=x_col,
    y=y_col,
    animation_frame='quality',
    **axis_ranges,
)

In [57]:
from sklearn.model_selection import train_test_split

# Separate the target column from the predictor columns.
y = data['quality']
X = data.drop(columns='quality')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score

# Fit a linear regression on the training split (fit() returns the estimator).
model = LinearRegression().fit(X_train, y_train)

# Round the continuous predictions to whole numbers so they can be scored
# against the quality labels as if they were class predictions.
y_pred = model.predict(X_test).round()

accuracy_score(y_test, y_pred)

In [None]:
from sklearn.linear_model import LogisticRegression

from sklearn.multiclass import OneVsRestClassifier

# liblinear is a binary solver. The quality target presumably has more than two
# classes (see the unique-values cell), and the scikit-learn documentation
# recommends wrapping liblinear in OneVsRestClassifier for that setting rather
# than relying on LogisticRegression's implicit one-vs-rest fallback.
model = OneVsRestClassifier(
    LogisticRegression(max_iter=1000, solver='liblinear')
)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy_score(y_test, y_pred)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Seed the tree so the reported accuracy and feature importances are the same
# on every run; an unseeded tree can differ between executions.
model = DecisionTreeClassifier(random_state=42)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(accuracy_score(y_test, y_pred))

# Pair each column name with its importance and list the largest first.
feature_importances = sorted(
    zip(X.columns, model.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)

for feature, importance in feature_importances:
    print(f"{feature}: {importance.round(2)}")

In [None]:
from sklearn.svm import SVC

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# SVMs are not scale invariant: with an RBF kernel, columns with large numeric
# ranges dominate the distance computation. Standardising inside a pipeline
# fixes that and learns the scaling statistics from the training split only,
# so nothing leaks from the test split.
model = make_pipeline(StandardScaler(), SVC(C=50, kernel='rbf'))

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy_score(y_test, y_pred)



In [None]:
from sklearn.svm import SVR

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# SVMs are not scale invariant: with an RBF kernel, columns with large numeric
# ranges dominate the distance computation. Standardising inside a pipeline
# fixes that and learns the scaling statistics from the training split only.
model = make_pipeline(StandardScaler(), SVR(C=50, kernel='rbf'))

model.fit(X_train, y_train)

# Round the continuous predictions to whole numbers so they can be scored
# against the quality labels with accuracy.
y_pred = model.predict(X_test).round()

accuracy_score(y_test, y_pred)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with k = 5, fitted on the training split
# (fit() returns the estimator itself).
model = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Score the held-out predictions by plain accuracy.
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate neighbourhood sizes: every integer from 2 through 10.
param_grid = {'n_neighbors': list(range(2, 11))}

# Exhaustive search over the candidates with 5-fold cross-validation on the
# training split only.
grid_search = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
    cv=5,
)
grid_search.fit(X_train, y_train)

# Best-scoring parameter combination, shown as the cell output.
grid_search.best_params_