In [17]:
#morgan prukop
# Import Dependencies
import plotly.express as px
import pandas as pd
import os
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

## Wine Quality

## Description
The combined dataset includes red and white variants of the Portuguese "Vinho Verde" wine. Only physicochemical (inputs) and sensory (the output) variables are available (e.g. there is no data about grape types, wine brand, wine selling price, etc.).

#### Attribute Information:

Continuous input variables (based on physicochemical tests):
- fixed acidity
- volatile acidity
- citric acid
- residual sugar
- chlorides
- free sulfur dioxide
- total sulfur dioxide
- density
- pH
- sulphates
- alcohol

Labeled input variables (based on sensory and category):
- quality (score between 0 and 10)
- color (White: 0; Red: 1)



## Source
https://archive.ics.uci.edu/ml/datasets/wine+quality

In [19]:
# Read the csv file into a pandas DataFrame called `wine_df`

wine = os.path.join('../','Resources', 'wine.csv')
wine_df = pd.read_csv(wine)
wine_df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,color
0,7.0,0.270,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6,0
1,6.3,0.300,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6,0
2,8.1,0.280,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6,0
3,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6,0
4,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5,1
6493,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6,1
6494,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6,1
6495,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5,1


## Use K-Means to predict clusters

## Find the best value for _k_ using an Elbow Curve

In [22]:
# Setup for loop
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of k values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(wine_df)
    inertia.append(km.inertia_)

# Create the Elbow Curve using plotly
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.head()

Unnamed: 0,k,inertia
0,1,22973410.0
1,2,8595660.0
2,3,4336875.0
3,4,3044144.0
4,5,2399310.0


In [26]:
# Look for the best k


In [27]:
# Plot the Elbow Curve using plotly

px.scatter(x=k,y=inertia,labels={"x":"Number of clusters","y":"Inertia"})

In [35]:
# Initialize a model with K based on the Elbow plot above
model = KMeans(n_clusters=3, random_state=5)

In [36]:
# Fit the model

model.fit(wine_df)

KMeans(n_clusters=3, random_state=5)

In [37]:
# Add a new `class` column to wine_df

wine_df["class"] = model.labels_
wine_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,color,class
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6,0,2
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6,0,0
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6,0,0
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6,0,2
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6,0,2


In [38]:
# Create a 3D scatter plot to visualize our clustering 
# usings `total sulfur dioxide` and any two other features

px.scatter_3d(wine_df,x="free sulfur dioxide",y="density",z="pH",color="class")

Notice how `total sulfur dioxide` basically drives the decision of `class`. Why might this be? Do you think the scale of values compared to all other features matters? Confirm this hypothesis by next performing scaling.  

## Standard Scalar

Documentation: [StandarScaler()](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html)

In [39]:
# Drop the class from our wine data
wine_df = wine_df.drop("class",axis=1)

In [43]:
# Scale all fields with the StandardScaler() function based on the documentation above.
# Initialize a `StandardScaler` object
scaler = StandardScaler().fit(wine_df)
scaler

StandardScaler()

In [46]:
# Fit and Transform the `wine_df` with the StandardScaler 
# Store the results in the variable `scaled_features`
scaled_features = scaler.fit_transform(wine_df)


In [47]:
# Creates a new DataFrame based on the `scaled_featuers` and the original index and column values
wine_df_scaled = pd.DataFrame(scaled_features, index=wine_df.index, columns=wine_df.columns)
wine_df_scaled.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,color
0,-0.166089,-0.423183,0.284686,3.206929,-0.314975,0.815565,0.959976,2.102214,-1.359049,-0.546178,-1.418558,0.207999,-0.571367
1,-0.706073,-0.240949,0.147046,-0.807837,-0.20079,-0.931107,0.287618,-0.232332,0.506915,-0.277351,-0.831615,0.207999,-0.571367
2,0.682458,-0.362438,0.559966,0.306208,-0.172244,-0.029599,-0.33166,0.134525,0.25812,-0.613385,-0.328521,0.207999,-0.571367
3,-0.011808,-0.666161,0.009406,0.642523,0.056126,0.928254,1.243074,0.301278,-0.177272,-0.882212,-0.496219,0.207999,-0.571367
4,-0.011808,-0.666161,0.009406,0.642523,0.056126,0.928254,1.243074,0.301278,-0.177272,-0.882212,-0.496219,0.207999,-0.571367


## Find a new best value for _k_ of scaled data using an Elbow Curve

In [48]:
# Setup for loop
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of k values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(wine_df)
    inertia.append(km.inertia_)

# Create the Elbow Curve using plotly
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.head()

Unnamed: 0,k,inertia
0,1,22973410.0
1,2,8595660.0
2,3,4336875.0
3,4,3044144.0
4,5,2399310.0


In [None]:
# Look for the new best k

### YOUR CODE HERE

In [49]:
# Plot the Elbow Curve using plotly

elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)

px.scatter(x=k,y=inertia,labels={"x":"Number of clusters","y":"Inertia"})

In [50]:
# Initialize a mew model with K based on the Elbow plot above

model = KMeans(n_clusters=3, random_state=5)

In [51]:
# Fit the new model

model.fit(wine_df_scaled)

KMeans(n_clusters=3, random_state=5)

In [52]:
# Add a new class column to wine_df_scaled

wine_df_scaled["class"] = model.labels_
wine_df_scaled.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,color,class
0,-0.166089,-0.423183,0.284686,3.206929,-0.314975,0.815565,0.959976,2.102214,-1.359049,-0.546178,-1.418558,0.207999,-0.571367,2
1,-0.706073,-0.240949,0.147046,-0.807837,-0.20079,-0.931107,0.287618,-0.232332,0.506915,-0.277351,-0.831615,0.207999,-0.571367,0
2,0.682458,-0.362438,0.559966,0.306208,-0.172244,-0.029599,-0.33166,0.134525,0.25812,-0.613385,-0.328521,0.207999,-0.571367,0
3,-0.011808,-0.666161,0.009406,0.642523,0.056126,0.928254,1.243074,0.301278,-0.177272,-0.882212,-0.496219,0.207999,-0.571367,2
4,-0.011808,-0.666161,0.009406,0.642523,0.056126,0.928254,1.243074,0.301278,-0.177272,-0.882212,-0.496219,0.207999,-0.571367,2


In [54]:
# Create a 3D scatter plot to visualize our clustering 
# usings the same features as before

px.scatter_3d(wine_df_scaled,x="free sulfur dioxide",y="density",z="pH",color="class")

Does `total sulfur dioxide` still drive the decision of `class` or do new features have an impact?