# Wine Cultivar Origin Prediction System
## Model Development and Training

This notebook trains a machine learning model to predict wine cultivar based on chemical properties.

**Features Used:** alcohol, malic_acid, total_phenols, flavanoids, color_intensity, proline

### Step 1: Import Required Libraries

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_recall_fscore_support
import joblib
import warnings
# Silence every warning category so later cell output stays uncluttered.
# NOTE(review): this blanket filter also hides deprecation/future warnings
# raised by the libraries used in later cells — consider narrowing it to
# specific categories or modules once the notebook is stable.
warnings.filterwarnings('ignore')

# Reaching this line means none of the imports above raised.
print("✓ All libraries imported successfully!")

### Step 2: Load the Wine Dataset

In [None]:
# Fetch the Wine dataset bundled with scikit-learn
wine_data = load_wine()

# Assemble a single frame: one column per measurement, plus the class label
# appended as a 'cultivar' column.
df = pd.DataFrame(wine_data.data, columns=wine_data.feature_names).assign(
    cultivar=wine_data.target
)

print(f"Dataset Shape: {df.shape}")
print("\nFirst 5 rows:")
df.head(5)

In [None]:
# Dataset information: column dtypes, null counts, and class balance
print("Dataset Information:")
# DataFrame.info() writes its summary to stdout itself and returns None,
# so it is called directly; wrapping it in print() would emit a stray "None".
df.info()
print("\nMissing Values:")
print(df.isnull().sum())
print("\nTarget Variable Distribution:")
print(df['cultivar'].value_counts())

### Step 3: Feature Selection

We select 6 features as required:
1. alcohol
2. malic_acid
3. total_phenols
4. flavanoids
5. color_intensity
6. proline

In [None]:
# The six chemical measurements used as model inputs
selected_features = [
    'alcohol', 'malic_acid', 'total_phenols',
    'flavanoids', 'color_intensity', 'proline',
]

# Feature matrix (X) holds the chosen columns; target vector (y) holds the label
X = df[selected_features]
y = df['cultivar']

print(f"Selected {len(selected_features)} features:")
# One numbered line per feature, numbering from 1
print("\n".join(
    f"{position}. {name}" for position, name in enumerate(selected_features, start=1)
))

print(f"\nFeature Matrix Shape: {X.shape}")
print(f"Target Vector Shape: {y.shape}")

In [None]:
# Summary statistics for the selected features. describe() is the cell's
# last expression, so the notebook renders its result as the cell output.
print("Feature Statistics:")
X.describe()

### Step 4: Data Preprocessing

In [None]:
# Count missing values per selected feature
missing_per_feature = X.isnull().sum()
print("Missing values in features:")
print(missing_per_feature)

# Report success only when the check actually passed. If gaps ever appear
# (e.g. after swapping in a different dataset), stop here instead of claiming
# the data is clean; a simple remedy would be X = X.fillna(X.mean()).
total_missing = int(missing_per_feature.sum())
if total_missing:
    raise ValueError(
        f"Found {total_missing} missing value(s) in the selected features; "
        "impute or drop them before splitting and scaling."
    )

print("\n✓ No missing values found!")

In [None]:
# Hold out 20% of the rows for testing. The split is stratified on y and
# uses a fixed random_state so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(f"Training Set: {len(X_train)} samples")
print(f"Testing Set: {len(X_test)} samples")

# Per-class counts for each partition, ordered by class label
for split_name, split_labels in (("Training", y_train), ("Testing", y_test)):
    print(f"\n{split_name} Set Class Distribution:")
    print(split_labels.value_counts().sort_index())

### Step 5: Feature Scaling (Mandatory)

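The selected features span very different numeric ranges, so we standardize them to zero mean and unit variance. Tree-based models such as Random Forest are largely insensitive to feature scale, but scaling is a project requirement and keeps the saved pipeline usable with scale-sensitive models.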

In [None]:
# Standardizer whose statistics are learned from the training split only
scaler = StandardScaler()

# Fit on train, then reuse the fitted scaler on test so that no information
# from the held-out rows influences the transformation.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("✓ Feature scaling completed!")
print(f"\nScaled Training Data Shape: {X_train_scaled.shape}")
print(f"Scaled Testing Data Shape: {X_test_scaled.shape}")

# Sanity check: per-feature mean should be ~0 and std ~1 after scaling
scaled_mean = X_train_scaled.mean(axis=0).round(4)
scaled_std = X_train_scaled.std(axis=0).round(4)
print(f"\nMean of scaled training data: {scaled_mean}")
print(f"Std of scaled training data: {scaled_std}")

### Step 6: Model Training

Using **Random Forest Classifier** for this multiclass classification task.

In [None]:
# Forest hyperparameters gathered in one place so they are easy to tune
rf_params = dict(
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1,
)
model = RandomForestClassifier(**rf_params)

# Fit on the scaled training split
print("Training Random Forest Classifier...")
model.fit(X_train_scaled, y_train)
print("✓ Model training completed!")

### Step 7: Model Evaluation

In [None]:
# Predict the held-out samples and score them against the true labels
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)

# Reported as a percentage with two decimals
print("Model Accuracy: {:.2f}%".format(accuracy * 100))

In [None]:
# Per-class precision / recall / F1 table for the test-set predictions
report_rule = "=" * 60
class_labels = ['Cultivar 0', 'Cultivar 1', 'Cultivar 2']

print("\n" + report_rule)
print("CLASSIFICATION REPORT")
print(report_rule)
print(classification_report(y_test, y_pred, target_names=class_labels))

In [None]:
# Weighted-average precision, recall and F1; the fourth returned value is discarded
precision, recall, f1, _ = precision_recall_fscore_support(
    y_test, y_pred, average='weighted'
)

metrics_rule = "=" * 60
print(f"\n{metrics_rule}")
print("WEIGHTED METRICS")
print(metrics_rule)

# Labels are left-padded to a common width so the values line up in a column
for metric_label, metric_value in (
    ("Precision (Weighted):", precision),
    ("Recall (Weighted):", recall),
    ("F1-Score (Weighted):", f1),
):
    print(f"{metric_label:<21} {metric_value:.4f}")

In [None]:
# Confusion matrix of true vs. predicted classes on the test set
cm = confusion_matrix(y_test, y_pred)
class_labels = ['Cultivar 0', 'Cultivar 1', 'Cultivar 2']

# Draw on an explicit Axes rather than the implicit pyplot state
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_title('Confusion Matrix - Wine Cultivar Prediction', fontsize=14, fontweight='bold')
ax.set_ylabel('True Label', fontsize=12)
ax.set_xlabel('Predicted Label', fontsize=12)
fig.tight_layout()
plt.show()

# Also print the raw counts so the numbers survive without the figure
print("Confusion Matrix:")
print(cm)

In [None]:
# Rank the selected features by the fitted forest's importance scores
feature_importance = pd.DataFrame({
    'Feature': selected_features,
    'Importance': model.feature_importances_
}).sort_values(by='Importance', ascending=False)

fig, ax = plt.subplots(figsize=(10, 6))
# seaborn 0.13 deprecated passing `palette` without `hue` (removal planned for
# 0.14). Assigning the categorical variable to `hue` gives the same per-bar
# colouring; dodge=False keeps each bar full-width on seaborn versions that
# would otherwise offset bars per hue level.
sns.barplot(
    data=feature_importance,
    x='Importance',
    y='Feature',
    hue='Feature',
    dodge=False,
    palette='viridis',
    ax=ax,
)
# Some seaborn versions add a legend whenever hue is set; it would only
# repeat the y-axis labels, so drop it if present.
redundant_legend = ax.get_legend()
if redundant_legend is not None:
    redundant_legend.remove()

ax.set_title('Feature Importance - Random Forest', fontsize=14, fontweight='bold')
ax.set_xlabel('Importance Score', fontsize=12)
ax.set_ylabel('Features', fontsize=12)
fig.tight_layout()
plt.show()

print("\nFeature Importance:")
print(feature_importance)

### Step 8: Save the Trained Model

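We bundle the trained model, the fitted scaler, the feature names, and the class labels into a single dictionary and serialize it with joblib to `wine_cultivar_model.pkl`.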

In [None]:
# Bundle everything needed at inference time into one object so a consumer
# only has to load a single file.
model_package = dict(
    model=model,
    scaler=scaler,
    feature_names=selected_features,
    target_names=['Cultivar 0', 'Cultivar 1', 'Cultivar 2'],
)

# Serialize the bundle with joblib
joblib.dump(model_package, 'wine_cultivar_model.pkl')

print("✓ Model saved successfully as 'wine_cultivar_model.pkl'")
print("\nModel package contains:")
for package_item in (
    "Trained Random Forest model",
    "StandardScaler (fitted)",
    "Feature names",
    "Target class names",
):
    print(f"  - {package_item}")

In [None]:
# Round-trip check: reload the file just written and push one sample through it
loaded_model_package = joblib.load('wine_cultivar_model.pkl')
loaded_scaler = loaded_model_package['scaler']
loaded_model = loaded_model_package['model']
loaded_labels = loaded_model_package['target_names']

# First test row; slicing (rather than integer indexing) keeps it a one-row frame
sample_data = X_test.iloc[:1]
sample_scaled = loaded_scaler.transform(sample_data)
sample_prediction = loaded_model.predict(sample_scaled)

verify_rule = "=" * 60
print(f"\n{verify_rule}")
print("MODEL VERIFICATION")
print(verify_rule)
print(f"Sample Input: {sample_data.values[0]}")
print(f"Predicted Cultivar: {loaded_labels[sample_prediction[0]]}")
print(f"Actual Cultivar: {loaded_labels[y_test.iloc[0]]}")
print("\n✓ Model loaded and verified successfully!")

### Step 9: Model Summary

In [None]:
# Recap of the run, assembled line by line and emitted with a single print.
# Empty strings stand in for the blank lines between sections.
summary_rule = "=" * 60
summary_lines = [
    "",
    summary_rule,
    "MODEL SUMMARY",
    summary_rule,
    "Algorithm Used: Random Forest Classifier",
    f"Number of Features: {len(selected_features)}",
    f"Features: {', '.join(selected_features)}",
    "Number of Classes: 3 (Cultivar 0, 1, 2)",
    f"Training Samples: {X_train.shape[0]}",
    f"Testing Samples: {X_test.shape[0]}",
    f"Model Accuracy: {accuracy * 100:.2f}%",
    f"Weighted Precision: {precision:.4f}",
    f"Weighted Recall: {recall:.4f}",
    f"Weighted F1-Score: {f1:.4f}",
    "Model Persistence: Joblib",
    "Model File: wine_cultivar_model.pkl",
    summary_rule,
    "",
    "✓ Model development completed successfully!",
    "",
    "Next Steps:",
    "1. Download 'wine_cultivar_model.pkl' from Colab",
    "2. Place it in the 'model/' folder of your project",
    "3. Run the web application (app.py)",
]
print("\n".join(summary_lines))

### Download Model File (Google Colab)

If you're running this in Google Colab, run the cell below to download the model file.

In [None]:
# Uncomment and run this cell if you're in Google Colab
# from google.colab import files
# files.download('wine_cultivar_model.pkl')