# Wine Cultivar Origin Prediction - Model Development

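This notebook develops a Random Forest classifier that predicts a wine's cultivar of origin from the UCI Wine dataset. It loads the data, selects the six most important features, trains and evaluates the model, and saves the model and scaler with joblib.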

## 1. Import Libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import joblib
import warnings
warnings.filterwarnings('ignore')

## 2. Load Wine Dataset

In [None]:
# Load the Wine dataset via sklearn's load_wine and report its basic layout
wine = load_wine()
print("Wine Dataset loaded successfully!")
print(
    f"Features: {wine.feature_names}",
    f"Target Classes: {wine.target_names}",
    f"Data shape: {wine.data.shape}",
    sep="\n",
)

## 3. Data Preprocessing

In [None]:
# Build one frame holding the feature columns plus the class label in 'target'
df = pd.DataFrame(wine.data, columns=wine.feature_names).assign(target=wine.target)

# Count the missing entries in each column
missing_per_column = df.isnull().sum()
print("\nMissing values:")
print(missing_per_column)

# Preview the first rows of the frame
preview_rows = df.head()
print("\nDataset Info:")
print(preview_rows)

In [None]:
# Select the 6 most important features, ranked by the feature importances of a
# quick Random Forest (not by correlation with the target).
#
# The ranking model is fitted on training rows only. Fitting it on the full
# dataset would let the held-out test rows influence which features are kept,
# leaking test information into the model that is evaluated later.
# The split below uses the same test_size, random_state and stratify labels as
# the train/test split performed later in this notebook, so it is intended to
# produce the same partition.
# NOTE(review): keep the two train_test_split calls in sync.
selection_train_idx, _ = train_test_split(
    np.arange(wine.data.shape[0]),
    test_size=0.2,
    random_state=42,
    stratify=wine.target,
)

# Quick feature selection (RandomForestClassifier is imported in the imports cell)
temp_model = RandomForestClassifier(n_estimators=100, random_state=42)
temp_model.fit(wine.data[selection_train_idx], wine.target[selection_train_idx])

# Get feature importances, most important first
feature_importance = pd.DataFrame({
    'feature': wine.feature_names,
    'importance': temp_model.feature_importances_
}).sort_values('importance', ascending=False)

print("Feature Importances:")
print(feature_importance)

# Select top 6 features
selected_features = feature_importance.head(6)['feature'].tolist()
print(f"\nSelected 6 features: {selected_features}")

In [None]:
# Labels come from the 'target' column; features are the selected columns only
y = df.loc[:, 'target']
X = df.loc[:, selected_features]

print(
    f"Feature matrix shape: {X.shape}",
    f"Target vector shape: {y.shape}",
    sep="\n",
)

## 4. Split Data into Training and Testing Sets

In [None]:
# Hold out 20% of the rows for testing and train on the remaining 80%;
# the split is stratified on y and seeded for repeatability
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(
    f"Training set size: {X_train.shape[0]}",
    f"Testing set size: {X_test.shape[0]}",
    sep="\n",
)

## 5. Feature Scaling

In [None]:
# Fit the scaler on the training rows only, then apply that same fitted
# transform to both the training and the test features
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = (
    scaler.transform(part) for part in (X_train, X_test)
)

print("Features scaled successfully!")
print(f"Scaled training data shape: {X_train_scaled.shape}")

## 6. Train Random Forest Classifier

In [None]:
# Hyperparameters of the final forest, collected in one place
rf_settings = dict(
    n_estimators=100,
    max_depth=10,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
)
model = RandomForestClassifier(**rf_settings)

# Fit on the scaled training features
print("Training Random Forest Classifier...")
model.fit(X_train_scaled, y_train)
print("Model training completed!")

## 7. Model Evaluation

In [None]:
# Predict labels for the scaled test features
y_pred = model.predict(X_test_scaled)

# Score the predictions; precision, recall and F1 all use average='weighted'
averaging = {'average': 'weighted'}
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, **averaging)
    for scorer in (precision_score, recall_score, f1_score)
)

print("\n=== Model Performance Metrics ===")
print(
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1-Score: {f1:.4f}",
    sep="\n",
)

In [None]:
# Per-class report for the test predictions, labelled with the dataset's class names
print("\n=== Classification Report ===")
report_text = classification_report(
    y_true=y_test,
    y_pred=y_pred,
    target_names=wine.target_names,
)
print(report_text)

In [None]:
# Confusion matrix of the true test labels against the predictions
print("\n=== Confusion Matrix ===")
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

## 8. Save the Model and Scaler

In [None]:
# Bundle the model, the fitted scaler, the feature list and the class names
# into one dictionary and persist it with joblib
class_names = wine.target_names.tolist()
model_data = dict(
    model=model,
    scaler=scaler,
    selected_features=selected_features,
    target_names=class_names,
)
joblib.dump(model_data, 'wine_cultivar_model.pkl')

print("\nModel saved successfully as 'wine_cultivar_model.pkl'")
print(
    f"Selected features: {selected_features}",
    f"Target classes: {wine.target_names.tolist()}",
    sep="\n",
)

## 9. Test the Saved Model

In [None]:
# Reload the saved artifact and verify that it round-trips correctly.
# joblib.load unpickles the file, so only load files from a trusted source;
# here it is the file written by the save cell of this notebook.
loaded_model_data = joblib.load('wine_cultivar_model.pkl')
loaded_model = loaded_model_data['model']
loaded_scaler = loaded_model_data['scaler']
loaded_features = loaded_model_data['selected_features']
loaded_target_names = loaded_model_data['target_names']

print("Model loaded successfully!")
print(f"Model type: {type(loaded_model).__name__}")
print(f"Features: {loaded_features}")

# The reloaded scaler + model must reproduce the in-memory model's predictions
# on the whole test set; otherwise the saved file is not a faithful copy.
reloaded_pred = loaded_model.predict(loaded_scaler.transform(X_test[loaded_features]))
assert np.array_equal(reloaded_pred, model.predict(X_test_scaled)), \
    "Reloaded model predictions differ from the in-memory model"

# Test prediction on a sample, using the feature list and class names stored
# in the file so that those saved entries are exercised as well
sample = X_test[loaded_features].iloc[0:1]
sample_scaled = loaded_scaler.transform(sample)
prediction = loaded_model.predict(sample_scaled)
print(f"\nSample prediction: Cultivar {prediction[0]} ({loaded_target_names[prediction[0]]})")
print(f"Actual: Cultivar {y_test.iloc[0]} ({loaded_target_names[y_test.iloc[0]]})")