In [0]:
# Count how many rows carry each quality score, listed in ascending score order.
quality_counts_query = """
    SELECT quality, COUNT(*) as count
    FROM jpmc_group_catalog.mlops.wine_data_poc_dabs
    GROUP BY quality
    ORDER BY quality
"""
quality_counts_df = spark.sql(quality_counts_query)
display(quality_counts_df)

In [0]:

# Import required libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the wine table into a Spark DataFrame.
wine_table_name = "jpmc_group_catalog.mlops.wine_data_poc_dabs"
wine_df = spark.table(wine_table_name)

# Materialise the Spark DataFrame as a pandas DataFrame; the plotting and
# pandas-based EDA cells read from `pdf`.
# NOTE(review): presumably the table is small enough to fit in driver memory
# when converted in full — confirm before pointing this at a larger table.
pdf = wine_df.toPandas()

In [0]:
# Show only the first few rows. The original passed the whole DataFrame to
# display(), which contradicts the stated intent of a short preview.
PREVIEW_ROWS = 5
display(wine_df.limit(PREVIEW_ROWS))

# Print the column names and Spark data types.
wine_df.printSchema()

# Summary statistics computed by Spark's describe().
display(wine_df.describe())

In [0]:

# Count null entries per column: build the boolean null mask, then sum it
# column-wise.
null_mask = pdf.isnull()
missing = null_mask.sum()
print("Missing values per column:\n", missing)

# Report the pandas dtype of every column.
column_dtypes = pdf.dtypes
print("Data types: \n", column_dtypes)


In [0]:
# Bar chart of the number of rows for each quality score.
quality_fig, quality_ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=pdf, x="quality", palette="viridis", ax=quality_ax)
quality_ax.set_title("Distribution of wine Quality")
quality_ax.set_xlabel("Quality")
quality_ax.set_ylabel("Count")
plt.show()


In [0]:
# Feature columns to analyse; the correlation and boxplot cells below reuse
# this list.
features = [
    "fixed_acidity",
    "volatile_acidity",
    "citric_acid",
    "residual_sugar",
    "chlorides",
    "free_sulfur_dioxide",
    "total_sulfur_dioxide",
    "density",
    "pH",
    "sulphates",
    "alcohol",
]

# One histogram with a KDE overlay per feature, placed on a 4x3 grid.
# Subplot positions are 1-based, hence start=1.
hist_fig = plt.figure(figsize=(16, 12))
for position, column in enumerate(features, start=1):
    hist_ax = hist_fig.add_subplot(4, 3, position)
    sns.histplot(pdf[column], kde=True, color="skyblue", ax=hist_ax)
    hist_ax.set_title(f"{column} Distribution")
plt.tight_layout()
plt.show()

In [0]:
# Pairwise correlations across all feature columns plus the quality score.
corr_columns = [*features, "quality"]
corr = pdf[corr_columns].corr()

# Annotated heatmap of the correlation matrix.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, cmap="coolwarm", ax=heatmap_ax)
heatmap_ax.set_title("Feature Correlation Matrix")
plt.show()

In [0]:
# For each feature, draw its distribution grouped by quality score as a
# boxplot, one panel per feature on a 4x3 grid.
box_fig = plt.figure(figsize=(16, 12))
for position, column in enumerate(features, start=1):
    box_ax = box_fig.add_subplot(4, 3, position)
    sns.boxplot(data=pdf, x="quality", y=column, palette="Set2", ax=box_ax)
    box_ax.set_title(f"{column} vs Quality")
plt.tight_layout()
plt.show()


In [0]:
# Outlier detection using boxplots: one single-variable boxplot per feature,
# laid out on a 4x3 grid.
plt.figure(figsize=(16,12))
for i, feature in enumerate(features):
    plt.subplot(4, 3, i+1)  # subplot positions are 1-based
    sns.boxplot(y=pdf[feature], color="orange")
    plt.title(f"Outliers in {feature}")

# Adjust the layout once, after every subplot exists. The original called
# tight_layout() inside the loop, repeating the same work on every iteration.
plt.tight_layout()
plt.show()
