In [None]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import pandas as pd
import tensorflow as tf
import plotly.express as px
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering


In [None]:
# Load the red-wine quality dataset into a DataFrame.
# The path is relative, so it resolves against the kernel's working directory.
# NOTE(review): if this file is the upstream UCI copy it is ';'-delimited and
# would need sep=';' — confirm this copy is comma-separated.
file_path = "resources/winequality-red.csv"
red_df = pd.read_csv(file_path)
# Preview the first rows to check the file parsed as expected.
red_df.head()

In [None]:
# Keep only the first occurrence of each fully-identical row.
# The surviving rows keep their original index labels (no reset).
is_repeat = red_df.duplicated()
red_df = red_df[~is_repeat]
red_df.head()

In [None]:
# Standardise every column of red_df with StandardScaler.
# NOTE(review): red_df still contains the `quality` column at this point, so
# it is scaled and fed into PCA/clustering along with the other columns —
# confirm that is intended.
scaler = StandardScaler()
scaler.fit(red_df)
X_scaled = scaler.transform(red_df)
print(X_scaled[:5])

In [None]:
# Reduce the scaled data to three principal components.
n_principal_components = 3

# Build the PCA estimator; `pca` stays available for later inspection.
pca = PCA(n_components=n_principal_components)

# Fit on the scaled matrix and project it in one step.
red_pca = pca.fit_transform(X_scaled)

In [None]:
# Wrap the PCA output in a DataFrame, one column per principal component.
# Reusing red_df's index keeps each projected row aligned with its source row.
component_names = ["PC 1", "PC 2", "PC 3"]
pca_df = pd.DataFrame(red_pca, index=red_df.index, columns=component_names)
pca_df.head(10)

In [None]:
# Build an elbow curve to choose the number of clusters (k) for K-Means.
# Candidate cluster counts: 1 through 11.
k = list(range(1, 12))

# Inertia of a K-Means fit on the PCA data for each candidate k.
# n_init is pinned explicitly: scikit-learn's default changed between releases
# (10 restarts -> "auto"), so leaving it implicit makes the curve depend on
# the installed version and raises a FutureWarning on 1.2/1.3.
inertia = [
    KMeans(n_clusters=n_clusters, n_init=10, random_state=0).fit(pca_df).inertia_
    for n_clusters in k
]

# Tabulate k against inertia for plotting.
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)

# Plot the curve with Plotly Express, with one axis tick per integer k.
fig = px.line(df_elbow, x="k", y="inertia", title='Elbow Curve')
fig.update_xaxes(dtick=1)
fig.show()

In [None]:
# Fit the final K-Means model on the three principal components.
# NOTE(review): n_clusters=5 is presumably the value read off the elbow
# curve — confirm against the plot.
# n_init is pinned explicitly because scikit-learn's default changed between
# releases; leaving it implicit makes the clustering version-dependent.
model = KMeans(n_clusters=5, n_init=10, random_state=0)

# Fit and label the training rows in one call instead of a fit followed by a
# predict on the same data; `model` is left fitted for later cells.
predictions = model.fit_predict(pca_df)

In [None]:
# Build one DataFrame holding the original columns, the three principal
# components, and the cluster label of every row.
# The two frames are placed side by side (column-wise), aligned on their
# shared index; the fitted model's labels are then attached as "Clusters".
clustered_df = pd.concat([red_df, pca_df], axis="columns").assign(
    Clusters=model.labels_
)

# Report the combined frame's dimensions and show a random sample of rows.
print(clustered_df.shape)
clustered_df.sample(n=10)

In [None]:
# 3D scatter of the principal components, coloured by K-Means cluster.

# Cluster ids are labels, not quantities. Left as integers, Plotly Express
# maps them onto a continuous colour bar and draws no legend, so the legend
# positioning below would have no effect. Casting to str gives one discrete
# colour and one legend entry per cluster. A plot-only copy is used so
# clustered_df itself is not modified.
scatter_df = clustered_df.assign(Clusters=clustered_df["Clusters"].astype(str))
cluster_order = sorted(scatter_df["Clusters"].unique(), key=int)

fig = px.scatter_3d(
    scatter_df,
    x="PC 1",
    y="PC 2",
    z="PC 3",
    color="Clusters",
    # Keep legend entries in numeric cluster order.
    category_orders={"Clusters": cluster_order},
)
# Pin the legend to the top-left corner of the figure.
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

In [None]:
# Bar chart of wine quality per cluster.
# The original cell built a sort/groupby expression and discarded the result,
# then stacked one bar segment per row, so each bar's height was the *sum* of
# quality scores and mostly reflected cluster size. Aggregate explicitly and
# plot the mean quality of each cluster instead.
# NOTE(review): mean is assumed to be the intended statistic — confirm.
quality_by_cluster = clustered_df.groupby("Clusters", as_index=False)["quality"].mean()

# Treat cluster ids as categories so each cluster gets its own discrete colour.
quality_by_cluster["Clusters"] = quality_by_cluster["Clusters"].astype(str)

fig2 = px.bar(
    quality_by_cluster,
    x='Clusters',
    y='quality',
    color='Clusters',
    title='Mean quality per cluster',
)
fig2.show()