In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Loading and Combining the datasets

In [None]:
# Read the red and white wine tables (semicolon-delimited) and tag each
# with a 'type' column so the rows stay distinguishable after merging.
red = pd.read_csv('Data/Real_Datasets/winequality-red.csv', sep=';').assign(type='red')
white = pd.read_csv('Data/Real_Datasets/winequality-white.csv', sep=';').assign(type='white')

# Stack red on top of white with a fresh 0..n-1 index.
wine_df = pd.concat([red, white], ignore_index=True)

# Sanity check on the combined size.
print(wine_df.shape)  # Output: (6497, 13)

# Persist the combined, still-unprocessed table (comma-separated, no index column).
wine_df.to_csv("Data/Real_Datasets/wine_unprocessed.csv", index=False)

In [None]:
# Data exploration

In [None]:
# Quick look at the combined frame, printed in this order:
# first rows, column dtypes, summary statistics, missing values per column.
for report in (
    wine_df.head(),
    wine_df.dtypes,
    wine_df.describe(),
    wine_df.isnull().sum(),
):
    print(report)

In [None]:
# One histogram per numeric column, to eyeball how close each
# distribution is to a bell shape.
# NOTE(review): the title says "Cleaned", but wine_df at this point is the
# plain concatenation of the two source files - confirm the wording.
hist_options = dict(bins=30, figsize=(15, 10))
wine_df.hist(**hist_options)
plt.suptitle("Histogram of Each Feature (Wine Quality - Cleaned)", fontsize=16)
plt.show()

Scaling and Normalizing
As the histograms show, not all of the features are normally distributed. The only feature that is close to bell-shaped is pH. Quality, citric acid, alcohol, fixed acidity, density and sulphates are moderately skewed. The remaining features are heavily right-skewed and clearly not normal: residual sugar (the most skewed), chlorides, free sulfur dioxide, total sulfur dioxide and volatile acidity. Because Pearson correlation measures linear association and is sensitive to heavy skew and outliers (and its significance tests assume approximately normal data), the heavily skewed columns are transformed with np.log1p(), which reduces the right skew. We then apply standardization (zero mean, unit variance) to all of the numeric features, which works best for roughly normal data.

In [None]:
# Columns whose raw histograms show a heavy right tail.
skewed_cols = [
    'residual sugar',
    'chlorides',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'volatile acidity',
]

# log1p(x) = log(1 + x) compresses the long right tail.
# The columns are overwritten in place, so re-running this cell without
# reloading wine_df applies the transform a second time.
wine_df[skewed_cols] = np.log1p(wine_df[skewed_cols])

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize every float64/int64 column in place; the string 'type'
# column is not selected and is left untouched.
numeric_cols = wine_df.select_dtypes(include=['float64', 'int64']).columns
scaler = StandardScaler()
scaler.fit(wine_df[numeric_cols])
wine_df[numeric_cols] = scaler.transform(wine_df[numeric_cols])

# Per-column mean and std after scaling, one row per column.
print(wine_df.describe().loc[['mean', 'std']].T)


In [None]:
# Histogram after scaling

In [None]:
# Grid of histograms (with KDE overlay) for every numeric column of the
# processed frame, three charts per row.
numeric_cols = wine_df.select_dtypes(include=['float64', 'int64']).columns
n_cols = 3  # charts per row
n_rows = (len(numeric_cols) + n_cols - 1) // n_cols  # ceiling division

plt.figure(figsize=(15, 4 * n_rows))

for i, col in enumerate(numeric_cols, 1):
    plt.subplot(n_rows, n_cols, i)
    sns.histplot(wine_df[col], kde=True)
    plt.title(f"{col}")

# Lay the figure out once, after all subplots exist, instead of
# recomputing the layout on every loop iteration.
plt.tight_layout()

# y > 1 places the figure title above the top row of subplots.
plt.suptitle("Standardized Feature Distributions", fontsize=16, y=1.02)
plt.show()

In [None]:
# Write the log-transformed, standardized frame to disk (no index column).
wine_df.to_csv(path_or_buf="Data/Real_Datasets/wine_processed.csv", index=False)

Checking the Pearson and Spearman correlations for original data

In [None]:
# Reload the combined, unprocessed dataset written earlier in this notebook.
# That file was saved as "wine_unprocessed.csv" with to_csv's default comma
# separator, so it must be read back with the ".csv" extension and the
# default delimiter (not sep=';', which only applies to the source files).
wine_df_org = pd.read_csv("Data/Real_Datasets/wine_unprocessed.csv")

# Only use numeric columns for correlation (drops the string 'type' column)
numeric_df = wine_df_org.select_dtypes(include=['float64', 'int64'])

# Pearson (linear) and Spearman (rank-based) correlation matrices
pearson_corr = numeric_df.corr(method='pearson')
spearman_corr = numeric_df.corr(method='spearman')

# Two heatmaps side by side: 1 row, 2 columns
fig, axes = plt.subplots(1, 2, figsize=(24, 10))

# Pearson plot
sns.heatmap(pearson_corr, annot=True, fmt=".2f", cmap='coolwarm', ax=axes[0])
axes[0].set_title("Pearson Correlation Matrix")

# Spearman plot
sns.heatmap(spearman_corr, annot=True, fmt=".2f", cmap='coolwarm', ax=axes[1])
axes[1].set_title("Spearman Correlation Matrix")

plt.tight_layout()
plt.show()

Pearson and Spearman for the processed data

In [None]:
# Pearson and Spearman Correlation (Standardized Data)
# wine_df still contains the string 'type' column; DataFrame.corr() raises
# on non-numeric columns in pandas >= 2.0, so restrict to numeric columns
# first, the same way the original-data correlation cell does.
processed_numeric_df = wine_df.select_dtypes(include=['float64', 'int64'])

pearson_corr = processed_numeric_df.corr(method='pearson')
spearman_corr = processed_numeric_df.corr(method='spearman')

# Create side-by-side plots: 1 row, 2 columns
fig, axes = plt.subplots(1, 2, figsize=(24, 10))

# Pearson plot
sns.heatmap(pearson_corr, annot=True, fmt=".2f", cmap='coolwarm', ax=axes[0])
axes[0].set_title("Pearson Correlation Matrix (Standardized Data)")

# Spearman plot
sns.heatmap(spearman_corr, annot=True, fmt=".2f", cmap='coolwarm', ax=axes[1])
axes[1].set_title("Spearman Correlation Matrix (Standardized Data)")

plt.tight_layout()
plt.show()

In [2]:
# Debug aid: show the kernel's current working directory, which is what the
# relative "Data/Real_Datasets/..." paths used in this notebook resolve against.
import os
os.getcwd()

'/Users/parisakhosravi/analysis'