In [142]:
"""
SIMPLE ML LIFECYCLE - WINE QUALITY PREDICTION
Data loaded directly from web URL
"""

# Import packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

print("üç∑ WINE QUALITY PREDICTION - ML LIFECYCLE")
print("=" * 70)

üç∑ WINE QUALITY PREDICTION - ML LIFECYCLE


In [140]:
# ============================================================
# STEP 1: LOAD DATA FROM WEBSITE
# ============================================================
print("\nSTEP 1: LOAD DATA FROM WEBSITE")
print("-" * 70)


def _read_wine_csv(source_url, sep=','):
    """Read a wine-quality CSV and check it has the column later steps rely on.

    Args:
        source_url: Location passed straight to ``pd.read_csv``.
        sep: Field separator of the file.

    Returns:
        The parsed DataFrame.

    Raises:
        ValueError: If the parsed table has no 'quality' column (for example
            because the wrong separator collapsed every field into one column).
    """
    frame = pd.read_csv(source_url, sep=sep)
    if 'quality' not in frame.columns:
        raise ValueError(
            f"'quality' column not found in {source_url}; "
            f"got columns {frame.columns.tolist()}"
        )
    return frame


# URL to wine quality dataset (red wine)
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'

print(f"Loading data from: {url}")

try:
    # Load data directly from URL (this file is semicolon-delimited)
    df = _read_wine_csv(url, sep=';')
    print(f"‚úì Successfully loaded {len(df)} wine samples from web!")

except Exception as e:
    # Deliberately broad: any failure of the primary source triggers the fallback.
    print(f"‚ùå Error loading data: {e}")
    print("\nUsing backup URL...")
    # Backup: GitHub mirror, read with the default comma separator.
    # There is no further fallback, so a failure here propagates to the reader
    # instead of surfacing later as a confusing KeyError on df['quality'].
    url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/WineQualityRed.csv'
    df = _read_wine_csv(url)
    print(f"‚úì Loaded {len(df)} samples from backup source")

print(f"\nDataset shape: {df.shape}")
print(f"Features: {df.shape[1] - 1}")
# Report the rating range actually present in the loaded data rather than a fixed "3-8".
print(f"Target: quality (wine rating {df['quality'].min()}-{df['quality'].max()})")

print("\nFirst 5 samples:")
print(df.head())

print("\nColumn names:")
print(df.columns.tolist())


STEP 1: LOAD DATA FROM WEBSITE
----------------------------------------------------------------------
Loading data from: https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv
‚úì Successfully loaded 1599 wine samples from web!

Dataset shape: (1599, 12)
Features: 11
Target: quality (wine rating 3-8)

First 5 samples:
   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0    

In [102]:
# ============================================================
# STEP 2: EXPLORE DATA
# ============================================================
print("\n\n STEP 2: EXPLORE DATA")
print("-" * 70)

print("\n Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would emit a stray "None" line after the report.
df.info()

print("\n Statistical Summary:")
print(df.describe())

print("\n Target Distribution (Quality Scores):")
# sort_index() orders the counts by quality score instead of by frequency.
print(df['quality'].value_counts().sort_index())

print("\n Missing Values:")
missing = df.isnull().sum()
if missing.sum() == 0:
    print("  ‚úì No missing values!")
else:
    # Show only the columns that actually contain missing values.
    print(missing[missing > 0])

print("\n Correlation with Quality:")
correlations = df.corr()['quality'].sort_values(ascending=False)
print(correlations)



 STEP 2: EXPLORE DATA
----------------------------------------------------------------------

 Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB
None

 Statisti

In [108]:
# ============================================================
# STEP 3: PREPARE DATA
# ============================================================
print("\n\nüßπ STEP 3: PREPARE DATA")
print("-" * 70)

# Collapse the rating into two classes:
#   1 = good wine (quality >= 6), 0 = bad wine (quality < 6)
df['quality_binary'] = df['quality'].ge(6).astype(int)

print("Converting to binary classification:")
print(f"  Bad wine (quality < 6):  {df['quality_binary'].eq(0).sum()} samples")
print(f"  Good wine (quality >= 6): {df['quality_binary'].eq(1).sum()} samples")

# Everything except the two label columns is a feature; the binary label is the target
y = df['quality_binary']
X = df.drop(columns=['quality', 'quality_binary'])

print(f"\n‚úì Features: {X.shape[1]}")
print("‚úì Target: quality_binary (0=bad, 1=good)")

# 80/20 split, stratified on the target so both splits keep the class balance
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

print(f"\n‚úì Training set: {len(X_train)} samples ({len(X_train)/len(X)*100:.1f}%)")
print(f"‚úì Testing set:  {len(X_test)} samples ({len(X_test)/len(X)*100:.1f}%)")

# Learn the scaling statistics from the training split only, then apply them to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("‚úì Features scaled (normalized)")



üßπ STEP 3: PREPARE DATA
----------------------------------------------------------------------
Converting to binary classification:
  Bad wine (quality < 6):  744 samples
  Good wine (quality >= 6): 855 samples

‚úì Features: 11
‚úì Target: quality_binary (0=bad, 1=good)

‚úì Training set: 1279 samples (80.0%)
‚úì Testing set:  320 samples (20.0%)
‚úì Features scaled (normalized)


In [136]:
# ============================================================
# STEP 4: TRAIN MODEL
# ============================================================
print("\n\nSTEP 4: TRAIN MODEL")
print("-" * 70)

# Random Forest Classifier
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    max_depth=10
)

print("Training Random Forest model...")
print("  Parameters:")
# Read the values back from the estimator so the log cannot drift from the
# constructor arguments above when one of them is edited.
print(f"    - Trees: {model.n_estimators}")
print(f"    - Max depth: {model.max_depth}")
print(f"    - Random state: {model.random_state}")

model.fit(X_train_scaled, y_train)

print("\n‚úì Model trained successfully!")

# Show feature importance.
# X_train_scaled is a plain array, so the names come from X.columns.
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 5 Most Important Features:")
print(feature_importance.head())



STEP 4: TRAIN MODEL
----------------------------------------------------------------------
Training Random Forest model...
  Parameters:
    - Trees: 100
    - Max depth: 10
    - Random state: 42

‚úì Model trained successfully!

Top 5 Most Important Features:
                 feature  importance
10               alcohol    0.199976
9              sulphates    0.142437
1       volatile acidity    0.112184
6   total sulfur dioxide    0.099197
7                density    0.091480


In [134]:
# ============================================================
# STEP 5: EVALUATE MODEL
# ============================================================
print("\n\nSTEP 5: EVALUATE MODEL")
print("-" * 70)

# Class labels and their display names, in the order used by every report below.
class_labels = [0, 1]
class_names = ['Bad Wine', 'Good Wine']

# Make predictions
y_train_pred = model.predict(X_train_scaled)
y_test_pred = model.predict(X_test_scaled)

# Calculate accuracy
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"\nModel Performance:")
print(f"  Training Accuracy: {train_accuracy*100:.2f}%")
print(f"  Testing Accuracy:  {test_accuracy*100:.2f}%")

# Check for overfitting: flag a train/test accuracy gap above 10 percentage points
if train_accuracy - test_accuracy > 0.1:
    print(f"  ‚ö† Warning: Possible overfitting (gap: {(train_accuracy-test_accuracy)*100:.2f}%)")
else:
    print(f"  ‚úì Good fit (gap: {(train_accuracy-test_accuracy)*100:.2f}%)")

# Detailed report.
# labels= is passed explicitly so the two target_names always line up with
# classes 0 and 1, even if one class is absent from y_test or the predictions.
print("\nDetailed Classification Report:")
print(classification_report(y_test, y_test_pred,
                            labels=class_labels,
                            target_names=class_names))

# Confusion Matrix.
# labels= guarantees a 2x2 matrix, so the fixed indexing below cannot go out of range.
print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_test_pred, labels=class_labels)
print(f"                Predicted")
print(f"                Bad   Good")
print(f"Actual  Bad     {cm[0][0]:3d}   {cm[0][1]:3d}")
print(f"        Good    {cm[1][0]:3d}   {cm[1][1]:3d}")

# Sample predictions
print("\nSample Predictions:")
print("-" * 70)
print(f"{'Actual':<12} {'Predicted':<12} {'Correct?':<10} {'Confidence'}")
print("-" * 70)

# Get probabilities for confidence
y_test_proba = model.predict_proba(X_test_scaled)

# Show at most the first 10 test rows
for i in range(min(10, len(y_test))):
    actual = "Good Wine" if y_test.iloc[i] == 1 else "Bad Wine"
    predicted = "Good Wine" if y_test_pred[i] == 1 else "Bad Wine"
    correct = "‚úì" if y_test.iloc[i] == y_test_pred[i] else "‚úó"
    # The predicted class is the one with the highest probability in the row
    confidence = y_test_proba[i].max()

    print(f"{actual:<12} {predicted:<12} {correct:<10} {confidence*100:.1f}%")



STEP 5: EVALUATE MODEL
----------------------------------------------------------------------

Model Performance:
  Training Accuracy: 97.50%
  Testing Accuracy:  79.69%

Detailed Classification Report:
              precision    recall  f1-score   support

    Bad Wine       0.77      0.80      0.79       149
   Good Wine       0.82      0.80      0.81       171

    accuracy                           0.80       320
   macro avg       0.80      0.80      0.80       320
weighted avg       0.80      0.80      0.80       320


Confusion Matrix:
                Predicted
                Bad   Good
Actual  Bad     119    30
        Good     35   136

Sample Predictions:
----------------------------------------------------------------------
Actual       Predicted    Correct?   Confidence
----------------------------------------------------------------------
Good Wine    Bad Wine     ‚úó          57.2%
Bad Wine     Good Wine    ‚úó          52.0%
Bad Wine     Bad Wine     ‚úì          76.1

In [132]:
# ============================================================
# STEP 6: MAKE PREDICTIONS ON NEW DATA
# ============================================================
print("\n\nSTEP 6: PREDICT NEW WINE QUALITY")
print("-" * 70)

# Example: New wine sample
new_wine = pd.DataFrame({
    'fixed acidity': [7.4],
    'volatile acidity': [0.7],
    'citric acid': [0.0],
    'residual sugar': [1.9],
    'chlorides': [0.076],
    'free sulfur dioxide': [11.0],
    'total sulfur dioxide': [34.0],
    'density': [0.9978],
    'pH': [3.51],
    'sulphates': [0.56],
    'alcohol': [9.4]
})

# Put the columns in the training order instead of relying on the dict literal
# above being typed in that order; a missing feature raises KeyError here.
new_wine = new_wine[X.columns]

print("üç∑ New wine measurements:")
print(new_wine.T)

# Scale and predict
new_wine_scaled = scaler.transform(new_wine)
prediction = model.predict(new_wine_scaled)[0]
probabilities = model.predict_proba(new_wine_scaled)[0]

# predict_proba columns follow model.classes_, so look probabilities up by
# class label rather than using the label itself as a positional index.
proba_by_class = dict(zip(model.classes_, probabilities))

quality = "GOOD" if prediction == 1 else "BAD"
confidence = proba_by_class[prediction]

print(f"\nPrediction: {quality} WINE")
print(f"Confidence: {confidence*100:.2f}%")

print("\nProbabilities:")
print(f"  Bad Wine:  {proba_by_class.get(0, 0.0)*100:.2f}%")
print(f"  Good Wine: {proba_by_class.get(1, 0.0)*100:.2f}%")



STEP 6: PREDICT NEW WINE QUALITY
----------------------------------------------------------------------
üç∑ New wine measurements:
                            0
fixed acidity          7.4000
volatile acidity       0.7000
citric acid            0.0000
residual sugar         1.9000
chlorides              0.0760
free sulfur dioxide   11.0000
total sulfur dioxide  34.0000
density                0.9978
pH                     3.5100
sulphates              0.5600
alcohol                9.4000

Prediction: BAD WINE
Confidence: 86.54%

Probabilities:
  Bad Wine:  86.54%
  Good Wine: 13.46%


In [128]:
# ============================================================
# STEP 7: SAVE MODEL
# ============================================================
print("\n\nSTEP 7: SAVE MODEL FOR DEPLOYMENT")
print("-" * 70)

# Everything needed at inference time: (label for the log line, object, target file)
deployment_artifacts = [
    ("Model", model, 'wine_quality_model.pkl'),
    ("Scaler", scaler, 'wine_scaler.pkl'),
    ("Feature names", X.columns.tolist(), 'wine_features.pkl'),
]

# Persist each artifact and confirm it on stdout
for artifact_label, artifact, artifact_path in deployment_artifacts:
    joblib.dump(artifact, artifact_path)
    print(f"‚úì {artifact_label} saved: {artifact_path}")

print("\n Model package ready for deployment!")



STEP 7: SAVE MODEL FOR DEPLOYMENT
----------------------------------------------------------------------
‚úì Model saved: wine_quality_model.pkl
‚úì Scaler saved: wine_scaler.pkl
‚úì Feature names saved: wine_features.pkl

 Model package ready for deployment!


In [116]:
# STEP 6: FEATURE SCALING (NORMALIZATION)
from sklearn.preprocessing import StandardScaler

def scale_features(X_train, X_test):
    """Standardize features using statistics learned from the training split.

    Args:
        X_train: Training features as a pandas DataFrame (``.columns`` and
            ``.iloc`` are used for the example printout).
        X_test: Test features, transformed with the scaler fitted on X_train.

    Returns:
        Tuple ``(X_train_scaled, X_test_scaled, scaler)``: the two transformed
        feature sets and the fitted ``StandardScaler``.
    """
    print("\n" + "=" * 50)
    print("STEP 6: FEATURE SCALING")
    print("=" * 50)

    scaler = StandardScaler()

    # Fit on training data only
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    print("‚úì Features standardized (mean=0, std=1)")
    # Label the example with the real name of the first column instead of a
    # hard-coded feature name that only exists in a different dataset.
    first_feature = X_train.columns[0]
    print(f"Example: Original range of {first_feature}: {X_train.iloc[:, 0].min():.2f} to {X_train.iloc[:, 0].max():.2f}")
    print(f"         Scaled range: {X_train_scaled[:, 0].min():.2f} to {X_train_scaled[:, 0].max():.2f}")

    return X_train_scaled, X_test_scaled, scaler

# Run it. This rebinds X_train_scaled, X_test_scaled and scaler, the same names
# the STEP 3 cell assigns, so whichever of the two cells ran last decides their values.
X_train_scaled, X_test_scaled, scaler = scale_features(X_train, X_test)

# Scaling puts all features on the same scale, which matters for many algorithms.


STEP 6: FEATURE SCALING
‚úì Features standardized (mean=0, std=1)
Example: Original range of MedInc: 4.60 to 15.90
         Scaled range: -2.13 to 4.30


In [126]:
# ============================================================
# STEP 8: LOAD AND USE SAVED MODEL
# ============================================================
print("\n\n STEP 8: LOAD AND USE DEPLOYED MODEL")
print("-" * 70)

# Load everything.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load artifacts that you created yourself or otherwise trust.
loaded_model = joblib.load('wine_quality_model.pkl')
loaded_scaler = joblib.load('wine_scaler.pkl')
loaded_features = joblib.load('wine_features.pkl')

print("‚úì Model loaded successfully")
print("‚úì Scaler loaded successfully")
print("‚úì Features loaded successfully")

# Test with another wine
test_wine = pd.DataFrame({
    'fixed acidity': [8.5],
    'volatile acidity': [0.5],
    'citric acid': [0.3],
    'residual sugar': [2.5],
    'chlorides': [0.08],
    'free sulfur dioxide': [15.0],
    'total sulfur dioxide': [40.0],
    'density': [0.997],
    'pH': [3.3],
    'sulphates': [0.65],
    'alcohol': [11.5]
})

# Use the saved feature list to put the columns in the training order;
# a feature missing from test_wine raises KeyError here.
test_wine = test_wine[loaded_features]

print("\nTesting loaded model with new wine:")
test_wine_scaled = loaded_scaler.transform(test_wine)
prediction = loaded_model.predict(test_wine_scaled)[0]
proba = loaded_model.predict_proba(test_wine_scaled)[0]

# predict_proba columns follow loaded_model.classes_, so map them by class
# label rather than using the label itself as a positional index.
proba_by_class = dict(zip(loaded_model.classes_, proba))

result = "GOOD" if prediction == 1 else "BAD"
print(f"Prediction: {result} WINE ({proba_by_class[prediction]*100:.1f}% confidence)")



 STEP 8: LOAD AND USE DEPLOYED MODEL
----------------------------------------------------------------------
‚úì Model loaded successfully
‚úì Scaler loaded successfully
‚úì Features loaded successfully

Testing loaded model with new wine:
Prediction: GOOD WINE (87.5% confidence)


In [120]:
# ============================================================
# SUMMARY
# ============================================================
# Recap of the pipeline, emitted one entry per line by a single print call.
print(
    "\n\n‚úÖ ML LIFECYCLE COMPLETE!",
    "=" * 70,
    "\n What We Did:",
    "  1. ‚úì Loaded data from web (UCI Repository)",
    "  2. ‚úì Explored data (1599 wine samples, 11 features)",
    "  3. ‚úì Prepared data (binary classification, scaled features)",
    "  4. ‚úì Trained model (Random Forest with 100 trees)",
    # Only this entry is computed: it reports the accuracy measured in STEP 5
    f"  5. ‚úì Evaluated model ({test_accuracy*100:.2f}% test accuracy)",
    "  6. ‚úì Made predictions (classified wine quality)",
    "  7. ‚úì Saved model (ready for deployment)",
    "  8. ‚úì Loaded model (deployed and working!)",
    "\n You just built an ML system with real web data!",
    "=" * 70,
    sep="\n",
)




‚úÖ ML LIFECYCLE COMPLETE!

 What We Did:
  1. ‚úì Loaded data from web (UCI Repository)
  2. ‚úì Explored data (1599 wine samples, 11 features)
  3. ‚úì Prepared data (binary classification, scaled features)
  4. ‚úì Trained model (Random Forest with 100 trees)
  5. ‚úì Evaluated model (79.69% test accuracy)
  6. ‚úì Made predictions (classified wine quality)
  7. ‚úì Saved model (ready for deployment)
  8. ‚úì Loaded model (deployed and working!)

 You just built an ML system with real web data!
