# Classification Challenge

Wine experts can identify wines from specific vineyards through smell and taste, but the factors that give different wines their individual characteristics are actually based on their chemical composition.

In this challenge, you must train a classification model to analyze the chemical and visual features of wine samples and classify them based on their cultivar (grape variety).

> **Citation**: The data used in this exercise was originally collected by Forina, M. et al.
>
> PARVUS - An Extendible Package for Data Exploration, Classification and Correlation.
Institute of Pharmaceutical and Food Analysis and Technologies, Via Brigata Salerno,
16147 Genoa, Italy.
>
> It can be downloaded from the UCI dataset repository (Dua, D. and Graff, C. (2019). [UCI Machine Learning Repository](http://archive.ics.uci.edu/ml). Irvine, CA: University of California, School of Information and Computer Science).

## Explore the data

Run the following cell to load a CSV file of wine data, which consists of 13 numeric features and a classification label with the following classes:

- **0** (*variety A*)
- **1** (*variety B*)
- **2** (*variety C*)

In [None]:
import pandas as pd

# Read the wine dataset from the CSV file into a DataFrame
data = pd.read_csv('data/wine.csv')

# Draw 10 random rows to use as a quick preview of the data
sample = data.sample(n=10)

Your challenge is to explore the data and train a classification model that achieves an overall *Recall* metric of over 0.95 (95%).

> **Note**: There is no single "correct" solution. A sample solution is provided in [03 - Wine Classification Solution.ipynb](03%20-%20Wine%20Classification%20Solution.ipynb).

## Train and evaluate a model

Add markdown and code cells as required to explore the data, train a model, and evaluate the model's predictive performance.

In [None]:
# Display names for the class labels 0, 1 and 2
wine_classes = ["Variety A", "Variety B", "Variety C"]

# Header row: every column except the last one (the class label) is a feature.
# Slicing with [:-1] keeps all feature names; the earlier [0:12] slice left
# the final feature name out of the header.
print(sample.columns[:-1].values, 'Variety')

# Print the rows of the sample that was drawn when the data was loaded, so the
# preview matches `sample` instead of coming from a second random draw.
for _, row in sample.iterrows():
    # .iloc gives explicit positional access; plain integer keys on a
    # label-indexed Series are deprecated in recent pandas versions.
    features = [row.iloc[pos] for pos in range(len(row) - 1)]
    # The value at position 4 is shown as a whole number (presumably Magnesium,
    # if the CSV column order matches the feature list used later on).
    features[4] = int(features[4])
    label = int(row.iloc[-1])
    print('[', *features, label, ']', wine_classes[label])

In [None]:
from matplotlib import pyplot as plt
%matplotlib inline

# Names of the feature columns and of the label column in the DataFrame
data_features = [
    'Alcohol', 'Malic_acid', 'Ash', 'Alcalinity', 'Magnesium',
    'Phenols', 'Flavanoids', 'Nonflavanoids', 'Proanthocyanins',
    'Color_intensity', 'Hue', 'OD280_315_of_diluted_wines', 'Proline',
]
data_label = 'WineVariety'

# One box plot per feature, grouped by the label, to compare the
# distribution of each feature across the classes
for feature in data_features:
    data.boxplot(column=feature, by=data_label, figsize=(6, 6))
    plt.title(feature)

# Render all of the figures created above
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# Pull the feature matrix (X) and the label vector (y) out of the DataFrame
X = data[data_features].values
y = data[data_label].values

# Preview the first four observations with their labels
for n in range(4):
    features, label = X[n], y[n]
    print("Wine", str(n + 1), "\n Features:", list(features), "\n Label:", label)

# Hold out 30% of the rows for testing; stratify on y so both subsets
# keep the same class proportions, and fix the seed for repeatability
x_data_train, x_data_test, y_data_train, y_data_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=0,
    stratify=y,
)

print('Training Set: %d\nTest Set: %d \n' % (len(x_data_train), len(x_data_test)))

### The following code can be skipped

In [None]:
from sklearn.linear_model import LogisticRegression

# Set regularization rate (passed to the model below as C = 1/reg)
reg = 0.1

# Train a logistic regression model on the training set.
# `multi_class` is intentionally not passed: 'auto' is already the default,
# and the parameter is deprecated in recent scikit-learn releases, where
# passing it only produces a FutureWarning.
multi_model = LogisticRegression(C=1/reg, solver='lbfgs', max_iter=10000).fit(x_data_train, y_data_train)
print(multi_model)

In [None]:
# Predict a class label for every observation in the test set
data_predictions = multi_model.predict(x_data_test)

# Show the first 15 predictions next to the first 15 known labels
for caption, labels in (
    ('Predicted labels: ', data_predictions),
    ('Actual labels:    ', y_data_test),
):
    print(caption, labels[:15])

In [None]:
from sklearn.metrics import classification_report

# Text summary of the classification metrics for the test-set predictions
report = classification_report(y_data_test, data_predictions)
print(report)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Accuracy, plus precision and recall macro-averaged over the classes
for caption, score in (
    ("Overall Accuracy:", accuracy_score(y_data_test, data_predictions)),
    ("Overall Precision:", precision_score(y_data_test, data_predictions, average='macro')),
    ("Overall Recall:", recall_score(y_data_test, data_predictions, average='macro')),
):
    print(caption, score)

In [None]:
from sklearn.metrics import confusion_matrix

# Build the confusion matrix from the actual and predicted test labels, then print it
mcm = confusion_matrix(y_true=y_data_test, y_pred=data_predictions)
print(mcm)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# One tick per class, labelled with the variety names
tick_marks = np.arange(len(wine_classes))

# Draw the confusion matrix as a heat map with a colour scale
plt.imshow(mcm, interpolation="nearest", cmap=plt.cm.Blues)
plt.colorbar()

# Label both axes with the class names
plt.xticks(ticks=tick_marks, labels=wine_classes, rotation=45)
plt.yticks(ticks=tick_marks, labels=wine_classes)
plt.xlabel("Predicted Wine Varieties")
plt.ylabel("Actual Wine Varieties")

plt.show()

In [None]:
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

# Class probability scores for the test set (one column per class)
data_prob = multi_model.predict_proba(x_data_test)

# ROC metrics per class, treating class i as the positive label
fpr, tpr, thresh = {}, {}, {}
for i in range(len(wine_classes)):
    fpr[i], tpr[i], thresh[i] = roc_curve(y_data_test, data_prob[:, i], pos_label=i)

# Plot one dashed curve per class, each in its own colour
for i, colour in enumerate(('orange', 'green', 'blue')):
    plt.plot(fpr[i], tpr[i], linestyle='--', color=colour, label=wine_classes[i] + ' vs Rest')

plt.title('Multiclass ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='best')
plt.show()

In [None]:
# Area under the ROC curve, computed one-vs-rest across the classes
auc = roc_auc_score(y_true=y_data_test, y_score=data_prob, multi_class='ovr')
print('Average AUC:', auc)

### Continue here

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# Positions of the 13 feature columns in the training array
feature_columns = list(range(13))

# Preprocessing for the numeric columns: scale them with StandardScaler
feature_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# Apply that preprocessing to every feature column
preprocessor = ColumnTransformer(
    transformers=[('preprocess', feature_transformer, feature_columns)],
)

# Training pipeline: preprocessing followed by a support vector classifier.
# The final step is named 'regressor' although it holds an SVC classifier;
# probability=True is what makes predict_proba available on the fitted model.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', SVC(probability=True)),
    ],
)

# Fit the pipeline on the training set to train the classifier
multi_model = pipeline.fit(x_data_train, y_data_train)
print(multi_model)

In [None]:
# Import everything this cell needs so that it also runs when the optional
# logistic regression section has not been executed.
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    precision_score,
    recall_score,
    roc_auc_score,
)

# Get predictions from test data
data_predictions = multi_model.predict(x_data_test)
data_prob = multi_model.predict_proba(x_data_test)

# Overall metrics
print("Overall Accuracy:", accuracy_score(y_data_test, data_predictions))
print("Overall Precision:", precision_score(y_data_test, data_predictions, average='macro'))
print("Overall Recall:", recall_score(y_data_test, data_predictions, average='macro'))
print("Average AUC:", roc_auc_score(y_data_test, data_prob, multi_class='ovr'))

# Confusion matrix: recompute it from this model's predictions. Reusing an
# `mcm` left over from an earlier cell would plot the previous model's
# results (or fail if that cell was never run).
mcm = confusion_matrix(y_data_test, data_predictions)
plt.imshow(mcm, interpolation="nearest", cmap=plt.cm.Blues)
plt.colorbar()
tick_marks = np.arange(len(wine_classes))
plt.xticks(tick_marks, wine_classes, rotation=45)
plt.yticks(tick_marks, wine_classes)
plt.xlabel("Predicted Wine Varieties")
plt.ylabel("Actual Wine Varieties")
plt.show()


## Use the model with new data observation

When you're happy with your model's predictive performance, save it and then use it to predict classes for the following two new wine samples:

- \[13.72,1.43,2.5,16.7,108,3.4,3.67,0.19,2.04,6.8,0.89,2.87,1285\]
- \[12.37,0.94,1.36,10.6,88,1.98,0.57,0.28,0.42,1.95,1.05,1.82,520\]


In [None]:
import joblib

# Persist the trained model to a pickle file with joblib
filename = './wine_model.pkl'
joblib.dump(value=multi_model, filename=filename)

In [None]:
# Restore the model from the pickle file written earlier
multi_model = joblib.load(filename=filename)

In [None]:
# Imported here so this cell also runs when the optional section that
# imports numpy has been skipped.
import numpy as np

# Array of two feature arrays (one row per new wine sample)
x_new = np.array([
    [13.72,1.43,2.5,16.7,108,3.4,3.67,0.19,2.04,6.8,0.89,2.87,1285],
    [12.37,0.94,1.36,10.6,88,1.98,0.57,0.28,0.42,1.95,1.05,1.82,520]])

# Use the loaded model to predict a class for each new sample
predictions = multi_model.predict(x_new)

# Print each predicted class label together with its variety name
for prediction in predictions:
    print(prediction, '(' + wine_classes[prediction] + ')')