In [5]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Configuration: input CSV and the directory where EDA figures are written.
CSV_PATH = "winequality-red.csv"
FIG_DIR = "figures_eda"
os.makedirs(FIG_DIR, exist_ok=True)

# --- Fix header parsing ---
# The file is read as raw text and split on ';' by hand rather than via
# pd.read_csv.
# NOTE(review): presumably the quoting of the header line in this copy of the
# file tripped the default CSV parser -- confirm before replacing with read_csv.
#
# An explicit encoding keeps the result independent of the platform locale;
# "utf-8-sig" also drops a leading byte-order mark so it cannot end up inside
# the first column name.
with open(CSV_PATH, "r", encoding="utf-8-sig") as f:
    lines = f.readlines()

if not lines:
    raise ValueError(f"{CSV_PATH} is empty; expected a ';'-separated header line")

# Remove every double quote from the header line, then split it into names.
header = [name.strip() for name in lines[0].replace('"', '').split(';')]

# Skip blank lines (e.g. a trailing empty line at end of file): they would
# otherwise become one-field rows and break the float conversion below.
data_lines = [row for row in lines[1:] if row.strip()]

# Every field is converted to float; column names are lower-cased.
df = pd.DataFrame(
    [row.strip().split(';') for row in data_lines],
    columns=[h.lower() for h in header]
).astype(float)

target_col = "quality"
assert target_col in df.columns, (
    f"expected a '{target_col}' column after header parsing, got {df.columns.tolist()}"
)
feature_cols = [c for c in df.columns if c != target_col]

# Quick sanity summary. sep="\n" puts each table on its own lines without the
# stray leading space that print() inserts after an embedded "\n".
print("Shape:", df.shape)
print("Columns:", df.columns.tolist())
print("Missing values:", df.isna().sum(), sep="\n")
print("Quality counts:", df[target_col].value_counts().sort_index(), sep="\n")


Shape: (1599, 12)
Columns: ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'ph', 'sulphates', 'alcohol', 'quality']
Missing values:
 fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
ph                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64
Quality counts:
 3.0     10
4.0     53
5.0    681
6.0    638
7.0    199
8.0     18
Name: quality, dtype: int64


In [6]:
# Quality distribution
# Bar chart of the number of rows per quality score, saved into FIG_DIR.
counts = df[target_col].value_counts().sort_index()

# The frame is stored as float, so str() would label the bars "3.0", "4.0", ...
# The "g" format prints whole-number scores as "3", "4", ... and still keeps
# any fractional score intact.
score_labels = [f"{score:g}" for score in counts.index]

fig, ax = plt.subplots()
ax.bar(score_labels, counts.values)
ax.set_xlabel("Quality score")
ax.set_ylabel("Count")
ax.set_title("Wine Quality Score Distribution (Red Wine)")
fig.tight_layout()
fig.savefig(os.path.join(FIG_DIR, "01_quality_distribution.png"), dpi=200)
# Close explicitly so the figure is only written to disk and does not linger in memory.
plt.close(fig)

# Feature histograms
# One 30-bin histogram per feature, laid out on a grid that is `ncols` wide.
ncols = 3
nrows = (len(feature_cols) + ncols - 1) // ncols  # ceiling division
fig, axes = plt.subplots(nrows, ncols, figsize=(12, 3.5 * nrows))
axes = axes.ravel()

for ax, col in zip(axes, feature_cols):
    ax.hist(df[col], bins=30)
    ax.set_title(col)
    ax.set_xlabel(col)
    ax.set_ylabel("Count")

# Blank out the grid cells left over after the last feature.
for ax in axes[len(feature_cols):]:
    ax.axis("off")

fig.tight_layout()
fig.savefig(os.path.join(FIG_DIR, "02_feature_histograms.png"), dpi=200)
plt.close(fig)

# Boxplots
# One boxplot per feature on a grid of the same shape as the histogram grid
# (`nrows` and `ncols` are defined in the histogram section).
fig, axes = plt.subplots(nrows, ncols, figsize=(12, 3.5 * nrows))
axes = axes.ravel()

for ax, col in zip(axes, feature_cols):
    ax.boxplot(df[col])
    ax.set_title(col)
    # Each panel holds a single box, so the x tick carries no information.
    ax.set_xticks([])

# Blank out the grid cells left over after the last feature.
for ax in axes[len(feature_cols):]:
    ax.axis("off")

fig.tight_layout()
fig.savefig(os.path.join(FIG_DIR, "03_feature_boxplots.png"), dpi=200)
plt.close(fig)

# Correlation heatmap
# Pairwise correlation of every column (features and target); `corr` is reused
# by the "Correlation with quality" section below.
corr = df.corr()

fig, ax = plt.subplots(figsize=(10, 8))
# Pin the colour range to [-1, 1] and use a diverging colormap so that zero
# correlation always maps to the neutral midpoint and sign is readable at a
# glance; with the default auto-scaling the colours depend on the data range.
im = ax.imshow(corr.to_numpy(), aspect="auto", cmap="coolwarm", vmin=-1, vmax=1)
fig.colorbar(im, ax=ax, label="Correlation coefficient")
ax.set_xticks(range(len(corr.columns)))
ax.set_xticklabels(corr.columns, rotation=90)
ax.set_yticks(range(len(corr.columns)))
ax.set_yticklabels(corr.columns)
ax.set_title("Correlation Heatmap")
fig.tight_layout()
fig.savefig(os.path.join(FIG_DIR, "04_correlation_heatmap.png"), dpi=200)
plt.close(fig)

# Correlation with quality
# Take the target's column of the correlation matrix, remove the target's own
# entry, and order the remaining features from lowest to highest correlation.
corr_q = corr.loc[:, target_col].drop(index=target_col).sort_values()

fig, ax = plt.subplots(figsize=(8, 6))
ax.barh(corr_q.index, corr_q.values)
ax.set_xlabel("Correlation coefficient")
ax.set_title("Feature Correlation with Quality")
fig.tight_layout()
fig.savefig(os.path.join(FIG_DIR, "05_corr_with_quality.png"), dpi=200)
plt.close(fig)

# Scatter plots
# Quality plotted against a hand-picked set of features, one PNG per feature.
scatter_features = ["alcohol", "volatile acidity", "sulphates", "citric acid"]

for col in scatter_features:
    # Features missing from the frame are skipped silently.
    if col not in df.columns:
        continue

    fig, ax = plt.subplots()
    ax.scatter(df[col], df[target_col], s=12)
    ax.set_xlabel(col)
    ax.set_ylabel("quality")
    ax.set_title(f"Quality vs {col}")
    fig.tight_layout()

    # Spaces in the column name become underscores in the file name.
    out_name = f"06_scatter_quality_vs_{col.replace(' ', '_')}.png"
    fig.savefig(os.path.join(FIG_DIR, out_name), dpi=200)
    plt.close(fig)

print("EDA figures saved to:", os.path.abspath(FIG_DIR))


EDA figures saved to: c:\Users\Maxld\Downloads\ese417\machinelearningfinalproject\figures_eda
