# Red Wine Quality Prediction
This notebook demonstrates the end-to-end process of predicting red wine quality using machine learning.

### Project Workflow:
1. **Data Collection**: Sourced from Kaggle/UCI.
2. **EDA**: Data cleaning, duplicate removal, and outlier detection.
3. **Feature Engineering**: Feature selection (informed by correlation analysis), binary target encoding, min-max scaling, and SMOTE balancing.
4. **Model Development**: XGBoost classifier with hyperparameter tuning.
5. **Inference**: Interactive prediction.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import warnings

# Suppress every warning category for the rest of the session so that
# library notices do not clutter the cell outputs.
warnings.simplefilter("ignore")

# Use seaborn's "whitegrid" look for all plots drawn below
sns.set_theme(style="whitegrid")

## 1. Data Collection

In [None]:
# Red-wine file of the Wine Quality dataset, read directly from the
# UCI Machine Learning Repository; fields in the file are ';'-separated.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
df = pd.read_csv(url, delimiter=";")

print(f"Dataset Shape: {df.shape}")
df.head()

## 2. EDA & Data Preparation

In [None]:
# Count of missing entries in each column
print("Missing Values:\n", df.isna().sum())

# Report fully duplicated rows, then keep only the first copy of each
print(f"\nDuplicates found: {df.duplicated().sum()}")
df = df.drop_duplicates()
print(f"Shape after duplicate removal: {df.shape}")

# Outlier Handling (IQR)
def remove_outliers(df, columns, iqr_factor=1.5):
    """Drop rows whose value in any of ``columns`` falls outside the IQR fences.

    For each column the fences are ``Q1 - iqr_factor * IQR`` and
    ``Q3 + iqr_factor * IQR`` (both inclusive). Columns are processed one
    after another, and the quartiles of each column are computed on the rows
    that survived the previous columns, so the result can depend on the
    order of ``columns``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified; a filtered frame is returned.
    columns : iterable of column labels
        Columns to screen for outliers. An empty iterable returns ``df`` as is.
    iqr_factor : float, default 1.5
        Multiple of the inter-quartile range used to place the fences.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that lie within the fences of every listed column.
        Rows holding NaN in a screened column are dropped as well, because
        NaN never satisfies the range test.
    """
    filtered = df
    for col in columns:
        q1, q3 = filtered[col].quantile([0.25, 0.75])
        spread = q3 - q1
        lower = q1 - iqr_factor * spread
        upper = q3 + iqr_factor * spread
        # Series.between is inclusive on both ends by default (same as >= / <=)
        filtered = filtered[filtered[col].between(lower, upper)]
    return filtered

# Screen every predictor column; the 'quality' target is excluded from the check
predictor_columns = df.columns.drop('quality')
df = remove_outliers(df, predictor_columns)
print(f"Shape after outlier removal: {df.shape}")

## 3. Feature Engineering

In [None]:
# Drop low impact features as per project logic
features_to_drop = ['pH', 'fixed acidity', 'citric acid', 'free sulfur dioxide']

# Build the modelling frame as a NEW object instead of mutating `df` in place.
# This keeps the cell re-runnable: an in-place drop raises KeyError on the
# second run, and re-encoding an already binary target would turn every
# label into 0.
# Target Encoding (Good: >= 7, Bad: < 7)
df_model = (
    df.drop(columns=features_to_drop)
      .assign(quality=lambda frame: (frame['quality'] >= 7).astype(int))
)
print("Target Distribution:\n", df_model['quality'].value_counts())

# Feature matrix / target vector
X = df_model.drop(columns='quality')
y = df_model['quality']
assert y.nunique() == 2, "expected both quality classes to be present"

# Scaling: min-max scale every feature to [0, 1]
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# SMOTE Balancing
# Caution: X_res / y_res contain synthetic rows interpolated from ALL samples,
# so evaluating a model on a split of these arrays would leak information.
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_scaled, y)

## 4. Model Training (XGBoost)

In [None]:
# Split the REAL samples first so the test set never contains (or influences)
# synthetic SMOTE rows or the scaler's min/max. Splitting data that was already
# scaled and resampled leaks test information into training and inflates the
# reported accuracy. `stratify=y` keeps the good/bad ratio equal in both parts.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Re-fit the shared `scaler` on the training rows only; the prediction
# dashboard below reuses this same object for new inputs.
X_train_scaled = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Oversample the minority class in the training portion only
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train_scaled, y_train_raw)

xgb = XGBClassifier(eval_metric='logloss', random_state=42)
param_grid = {
    'learning_rate': [0.1, 0.2],
    'max_depth': [5, 7],
    'n_estimators': [100, 200]
}

# NOTE: the cross-validation folds are cut from the already resampled training
# data, so the CV scores are optimistic. They are used only to rank
# hyperparameters; the held-out test set below is the unbiased estimate.
grid = GridSearchCV(xgb, param_grid, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)

final_model = grid.best_estimator_
y_pred = final_model.predict(X_test)

print(f"Best Parameters: {grid.best_params_}")
print(f"Test Accuracy: {accuracy_score(y_test, y_pred)*100:.2f}%")

# Accuracy alone is misleading on an imbalanced test set, so also report
# per-class precision / recall / F1.
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
ax = sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d', cmap='Reds')
ax.set(xlabel="Predicted label", ylabel="True label")
plt.show()

## 5. Interactive Prediction Dashboard

In [None]:
import ipywidgets as widgets
from IPython.display import display, clear_output

print("Enter Wine Parameters:")

# One numeric input per model feature, pre-filled with that feature's mean
inputs = {
    col: widgets.FloatText(description=col, value=df[col].mean())
    for col in X.columns
}
for field in inputs.values():
    display(field)

# Trigger button plus an output area that receives the prediction text
output = widgets.Output()
button = widgets.Button(description="Predict Quality", button_style='danger')
display(button, output)

def on_predict_clicked(b):
    """Button callback: classify the values currently entered in the widgets.

    Reads one value per feature from ``inputs`` (in ``X.columns`` order),
    scales the row with the fitted ``scaler``, predicts with ``final_model``
    and prints the label together with the predicted class probability into
    the ``output`` widget.

    Parameters
    ----------
    b : object
        The argument supplied by the button's click event; it is not used.
    """
    with output:
        clear_output()
        # One-row frame carrying the same column labels and order as `X`, the
        # frame the scaler was fitted on, instead of an unlabeled list.
        row = pd.DataFrame(
            [[inputs[col].value for col in X.columns]], columns=X.columns
        )
        scaled_data = scaler.transform(row)
        pred = int(final_model.predict(scaled_data)[0])
        # Probability assigned to the predicted class
        prob = final_model.predict_proba(scaled_data)[0][pred]

        # Emoji restored from mis-decoded text in the original literals
        result = "🍷 GOOD QUALITY" if pred == 1 else "💧 BAD QUALITY"
        print(f"Result: {result} (Confidence: {prob*100:.1f}%)")

button.on_click(on_predict_clicked)