In [53]:
# Import the required libraries and dependencies
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [54]:
 # Read in the CSV file as a Pandas DataFrame
cereal_df = pd.read_csv(
    Path("../Data/cereal.csv"),
    index_col="name", 
)

# Review the DataFrame
cereal_df.head()

Unnamed: 0_level_0,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
100% Bran,N,C,70,4,1,130,10.0,5.0,6,280,25,3,1.0,0.33,68.402973
100% Natural Bran,Q,C,120,3,5,15,2.0,8.0,8,135,0,3,1.0,1.0,33.983679
All-Bran,K,C,70,4,1,260,9.0,7.0,5,320,25,3,1.0,0.33,59.425505
All-Bran with Extra Fiber,K,C,50,4,0,140,14.0,8.0,0,330,25,3,1.0,0.5,93.704912
Almond Delight,R,C,110,2,2,200,1.0,14.0,8,-1,25,3,1.0,0.75,34.384843


In [55]:
# review shape of the dataframe
cereal_df.shape

(77, 15)

In [56]:
# Drop the non-nutritional value columns
cereal_df = cereal_df.drop(columns=['vitamins', 'shelf', 'weight', 'cups', 'rating'])

# Review the DataFrame
cereal_df.head()

Unnamed: 0_level_0,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
100% Bran,N,C,70,4,1,130,10.0,5.0,6,280
100% Natural Bran,Q,C,120,3,5,15,2.0,8.0,8,135
All-Bran,K,C,70,4,1,260,9.0,7.0,5,320
All-Bran with Extra Fiber,K,C,50,4,0,140,14.0,8.0,0,330
Almond Delight,R,C,110,2,2,200,1.0,14.0,8,-1


In [57]:
# review the data
cereal_df.describe()

Unnamed: 0,calories,protein,fat,sodium,fiber,carbo,sugars,potass
count,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0
mean,106.883117,2.545455,1.012987,159.675325,2.151948,14.597403,6.922078,96.077922
std,19.484119,1.09479,1.006473,83.832295,2.383364,4.278956,4.444885,71.286813
min,50.0,1.0,0.0,0.0,0.0,-1.0,-1.0,-1.0
25%,100.0,2.0,0.0,130.0,1.0,12.0,3.0,40.0
50%,110.0,3.0,1.0,180.0,2.0,14.0,7.0,90.0
75%,110.0,3.0,2.0,210.0,3.0,17.0,11.0,120.0
max,160.0,6.0,5.0,320.0,14.0,23.0,15.0,330.0


In [58]:
# Scale numerical data
cereal_data_scaled = StandardScaler().fit_transform(
    cereal_df[["calories", "protein", "fat", "sodium", "fiber", "carbo", "sugars", "potass"]]
)

In [59]:
# Create a DataFrame with the scaled data
df_cereal_data = pd.DataFrame(
    cereal_data_scaled,
    columns=["calories", "protein", "fat", "sodium", "fiber", "carbo", "sugars", "potass"]
)

# Copy the tickers names from the original data
df_cereal_data.index = cereal_df.index

# Display sample data
df_cereal_data.head()

Unnamed: 0_level_0,calories,protein,fat,sodium,fiber,carbo,sugars,potass
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
100% Bran,-1.905397,1.337319,-0.012988,-0.356306,3.314439,-2.257639,-0.208807,2.596948
100% Natural Bran,0.677623,0.417912,3.987349,-1.737087,-0.064172,-1.551936,0.244099,0.549573
All-Bran,-1.905397,1.337319,-0.012988,1.204578,2.892113,-1.78717,-0.43526,3.161741
All-Bran with Extra Fiber,-2.938605,1.337319,-1.013072,-0.236238,5.003745,-1.551936,-1.567525,3.302939
Almond Delight,0.161019,-0.501495,0.987096,0.48417,-0.486498,-0.14053,0.244099,-1.370723


In [60]:
 # Encode the "EnergyType" column to variables to categorize oil versus non-oil firms. 
cereal_dummies = pd.get_dummies(cereal_df[["mfr", "type"]], drop_first=True, dtype=int)
cereal_dummies.head()

Unnamed: 0_level_0,mfr_G,mfr_K,mfr_N,mfr_P,mfr_Q,mfr_R,type_H
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
100% Bran,0,0,1,0,0,0,0
100% Natural Bran,0,0,0,0,1,0,0
All-Bran,0,1,0,0,0,0,0
All-Bran with Extra Fiber,0,1,0,0,0,0,0
Almond Delight,0,0,0,0,0,1,0


In [61]:
 # Concatenate the encoded variables with the scaled data DataFrame.
df_cereal_concat = pd.concat([df_cereal_data, cereal_dummies], axis=1)

# Display the sample data
df_cereal_concat.head()

Unnamed: 0_level_0,calories,protein,fat,sodium,fiber,carbo,sugars,potass,mfr_G,mfr_K,mfr_N,mfr_P,mfr_Q,mfr_R,type_H
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
100% Bran,-1.905397,1.337319,-0.012988,-0.356306,3.314439,-2.257639,-0.208807,2.596948,0,0,1,0,0,0,0
100% Natural Bran,0.677623,0.417912,3.987349,-1.737087,-0.064172,-1.551936,0.244099,0.549573,0,0,0,0,1,0,0
All-Bran,-1.905397,1.337319,-0.012988,1.204578,2.892113,-1.78717,-0.43526,3.161741,0,1,0,0,0,0,0
All-Bran with Extra Fiber,-2.938605,1.337319,-1.013072,-0.236238,5.003745,-1.551936,-1.567525,3.302939,0,1,0,0,0,0,0
Almond Delight,0.161019,-0.501495,0.987096,0.48417,-0.486498,-0.14053,0.244099,-1.370723,0,0,0,0,0,1,0


In [62]:
# Create a list with the number of k-values to try
# Use a range from 1 to 11
k = list(range(1, 11))

In [63]:
 # Create an empy list to store the inertia values
inertia = []

In [64]:
# Create a for loop to compute the inertia with each possible value of k
# Inside the loop:
# 1. Create a KMeans model using the loop counter for the n_clusters
# 2. Fit the model to the data using `df_stocks_pca`
# 3. Append the model.inertia_ to the inertia list
for i in k:
    model = KMeans(n_clusters=i, random_state=0)
    model.fit(df_cereal_concat)
    inertia.append(model.inertia_)

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


In [65]:
 # Create a dictionary with the data to plot the Elbow curve
elbow_data = {
    "k": k,
    "inertia": inertia
}

# Create a DataFrame with the data to plot the Elbow curve
df_elbow = pd.DataFrame(elbow_data)

In [66]:
# Plot a line chart with all the inertia values computed with 
# the different values of k to visually identify the optimal value for k.
elbow_plot = df_elbow.hvplot.line(x="k", y="inertia", title="Elbow Curve Using PCA Data", xticks=k)
elbow_plot

In [67]:
 # Initialize the K-Means model with n_clusters=3
model = KMeans(n_clusters=5)

In [68]:
 # Fit the model for the df_stocks_scaled DataFrame
model.fit(df_cereal_concat)

  super()._check_params_vs_input(X, default_n_init=10)


In [69]:
 # Predict the model segments (clusters)
cereal_clusters = model.predict(df_cereal_concat)

# View the stock segments
print(cereal_clusters)

[3 0 3 3 4 4 4 0 1 2 4 1 4 0 4 1 1 4 4 0 1 1 4 1 4 4 2 0 0 4 4 4 2 1 0 4 4
 4 1 0 1 0 4 2 0 0 0 1 4 0 1 0 0 1 2 2 2 2 0 0 2 1 1 2 2 2 4 1 2 1 0 1 1 4
 1 1 4]


In [70]:
# Create a copy of the cereal concat df
df_creal_concat_kmeans = df_cereal_concat.copy()

# Create a new column in the DataFrame with the predicted clusters
df_creal_concat_kmeans["KmeansCluster"] = cereal_clusters

# Review the DataFrame
df_creal_concat_kmeans.head()

Unnamed: 0_level_0,calories,protein,fat,sodium,fiber,carbo,sugars,potass,mfr_G,mfr_K,mfr_N,mfr_P,mfr_Q,mfr_R,type_H,KmeansCluster
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
100% Bran,-1.905397,1.337319,-0.012988,-0.356306,3.314439,-2.257639,-0.208807,2.596948,0,0,1,0,0,0,0,3
100% Natural Bran,0.677623,0.417912,3.987349,-1.737087,-0.064172,-1.551936,0.244099,0.549573,0,0,0,0,1,0,0,0
All-Bran,-1.905397,1.337319,-0.012988,1.204578,2.892113,-1.78717,-0.43526,3.161741,0,1,0,0,0,0,0,3
All-Bran with Extra Fiber,-2.938605,1.337319,-1.013072,-0.236238,5.003745,-1.551936,-1.567525,3.302939,0,1,0,0,0,0,0,3
Almond Delight,0.161019,-0.501495,0.987096,0.48417,-0.486498,-0.14053,0.244099,-1.370723,0,0,0,0,0,1,0,4


In [71]:
 # Create a scatter plot
df_creal_concat_kmeans.hvplot.scatter(
    x="calories",
    y="sugars",
    by="KmeansCluster",
    hover_cols = ["name"], 
    title = "Scatter Plot by Caloreis and Sugar Content - k=5"
)

In [72]:
 # Create the PCA model instance where n_components=2
pca = PCA(n_components=2)

In [73]:
# Fit the df_stocks_scaled data to the PCA
cereal_pca_data = pca.fit_transform(df_cereal_concat)

# Review the first five rose of the PCA data
# using bracket notation ([0:5])
cereal_pca_data[:5]

array([[ 5.11495326,  0.34117053],
       [ 1.47046834, -2.81946238],
       [ 4.75398122,  0.17181737],
       [ 6.474524  ,  2.1426905 ],
       [-1.24696503, -0.59951793]])

In [74]:
# Calculate the explained variance
pca.explained_variance_ratio_

array([0.30100478, 0.22883116])

In [75]:
# Creating a DataFrame with the PCA data
df_cereals_pca = pd.DataFrame(cereal_pca_data, columns=["PC1", "PC2"])

# Copy the tickers names from the original data
df_cereals_pca.index = cereal_df.index

# Review the DataFrame
df_cereals_pca.head()

Unnamed: 0_level_0,PC1,PC2
name,Unnamed: 1_level_1,Unnamed: 2_level_1
100% Bran,5.114953,0.341171
100% Natural Bran,1.470468,-2.819462
All-Bran,4.753981,0.171817
All-Bran with Extra Fiber,6.474524,2.14269
Almond Delight,-1.246965,-0.599518


In [76]:
# Create a list with the number of k-values from 1 to 11
k = list(range(1, 11))

In [77]:
# Create an empty list to store the inertia values
inertia = []

# Create a for loop to compute the inertia with each possible value of k
# Inside the loop:
# 1. Create a KMeans model using the loop counter for the n_clusters
# 2. Fit the model to the data using `df_market_data_pca`
# 3. Append the model.inertia_ to the inertia list
for i in k:
    k_model = KMeans(n_clusters=i, random_state=1, n_init=10)
    k_model.fit(df_cereals_pca)
    inertia.append(k_model.inertia_)



In [78]:
# Create a dictionary with the data to plot the Elbow curve
elbow_data_pca = {"k": k, "inertia": inertia}

# Create a DataFrame with the data to plot the Elbow curve
df_elbow_pca = pd.DataFrame(elbow_data_pca)
df_elbow_pca.head()

Unnamed: 0,k,inertia
0,1,359.517769
1,2,232.649064
2,3,154.597373
3,4,93.897513
4,5,69.322759


In [79]:
# Plot a line chart with all the inertia values computed with 
# the different values of k to visually identify the optimal value for k.
df_elbow_pca_graph = df_elbow_pca.hvplot.line(
    x="k",
    y="inertia",
    title="Elbow Curve with PCA Data",
    xticks=k,
    width=500
)

df_elbow_pca_graph

In [80]:
 # Initialize the K-Means model with n_clusters=5
model = KMeans(n_clusters=4)

# Fit the model for the df_stocks_pca DataFrame
model.fit(df_cereals_pca)

# Predict the model segments (clusters)
cereal_clusters = model.predict(df_cereals_pca)

# Print the stock segments
print(cereal_clusters)

  super()._check_params_vs_input(X, default_n_init=10)


[3 2 3 3 1 2 1 2 0 0 1 2 1 2 1 1 1 1 1 2 0 1 2 1 1 1 0 2 2 1 1 1 0 0 2 1 2
 1 1 1 1 2 1 0 2 2 2 1 1 2 0 2 2 1 0 0 0 3 2 2 0 1 1 0 0 0 1 0 0 1 2 0 1 1
 0 0 1]


In [81]:
 # Create a copy of the df_stocks_pca DataFrame and name it as df_stocks_pca_predictions
df_cereals_pca_predictions = df_cereals_pca.copy()

# Create a new column in the DataFrame with the predicted clusters
df_cereals_pca_predictions["PCACluster"] = cereal_clusters

# Review the DataFrame
df_cereals_pca_predictions.head()

Unnamed: 0_level_0,PC1,PC2,PCACluster
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100% Bran,5.114953,0.341171,3
100% Natural Bran,1.470468,-2.819462,2
All-Bran,4.753981,0.171817,3
All-Bran with Extra Fiber,6.474524,2.14269,3
Almond Delight,-1.246965,-0.599518,1


In [82]:
 # Create the scatter plot with x="PC1" and y="PC2"
df_cereals_pca_predictions.hvplot.scatter(
    x="PC1",
    y="PC2",
    by="PCACluster",
    hover_cols = ["name"], 
    title = "Scatter Plot by Cereal - PCA=2"
)